# Exploratory Surrogacy

In [1]:
# Deep learning imports.
import torch.nn as nn
import torch 
from collections import OrderedDict
from scipy import io

# Custom classes.
from core import *
from dataset import *
from sklearn import preprocessing

# Visual.
import matplotlib
import matplotlib.pyplot as plt

As the problem pertains to regression, the optimizer to be used is that of stochastic gradient and descent. The loss function for minimisation is set as the mean squared error (MSE).

## Kristen's datasets

In [2]:
# List of datasets provided by Kristen.
k_to_do = [KristenDataset("dataSet2SplineYawFixedSun.mat"),
           KristenDataset("dataSet2SplineYawVaryingSun.mat"),
           KristenDataset("datasetSplineYawVaryingSun.mat"),
           KristenDataset("datasetSplineYawFixedSun.mat")]

Save datasets according to test, train split.

In [None]:
for k in k_to_do:
    k.save_to_mat()
    

In [2]:
# Instantiated robust scaler from scikit learn.
scaler_y1 = preprocessing.RobustScaler()

# Apply RobustScaler preprocessing to output variables.
for k in k_to_do:
#     quantile = np.concatenate(([k.df.loc[:,"f1":"f4"].quantile(.25).values], [k.df.loc[:,"f1":"f4"].quantile(.50).values], [k.df.loc[:,"f1":"f4"].quantile(.75).values]),axis=0).T
#     io.savemat(os.path.join(k.filepath,"quantiles",f"{k.filename.split('.')[0]}_quantiles.mat"), {"Q1":np.array([quantile[:,0]]).reshape(-1, 1), "Q2":np.array([quantile[:,1]]).reshape(-1, 1),  "Q3":np.array([quantile[:,2]]).reshape(-1,1)})

#     print(np.array([quantile[:,0]]).reshape(-1, 1))
    k.add_f_preprocessing(scaler_y1)
    scale = np.array([k._preprocessing_f[0].scale_]).reshape(-1,1)
    center = np.array([k._preprocessing_f[0].center_]).reshape(-1,1)
    io.savemat(os.path.join(k.filepath,"post" , f"{k.filename.split('.')[0]}_post.mat"), {"post_scale":scale, "post_center":center} )
#     print(k._preprocessing_f[0].get_params())

### Model 1 Module

Model 1 will be used to explore the effect of (1 & 2) modifying the quantity of hidden neurons in each of the two fixed hidden layers and (2) the addition of intermediate batch normalisation after each hidden layers activation function.

In [3]:
# Define modular structure of model 1.
def get_model1_module(hidden_size, data_set, batch_norm=False):
    """
    :param hidden_size: number of neurons for the two hidden layers.
    :param data_set: the dataset being used, for defining input and output size.
    :param batch_norm: the inclusion of batch normalisation after each hidden layer.
    """
    _sequence = []
    print(data_set)
    # --- ##  Hidden layer 1
    _sequence.append(('fc1', nn.Linear(data_set.input_size, hidden_size)))
    # - Sigmoid activation
    _sequence.append(('sigmoid1', nn.Sigmoid()))      
    # Conditional
    if batch_norm:
        _sequence.append(('bn1', nn.BatchNorm1d(hidden_size))) 
    # --- ##  Hidden layer 2
    _sequence.append(('fc2', nn.Linear(hidden_size, hidden_size)))      
    # - ReLU activation
    _sequence.append(('relu2', nn.ReLU()))
    # - Conditional 
    if batch_norm:
        _sequence.append(('bn2', nn.BatchNorm1d(hidden_size)))                 
    # --- # Output layer
    _sequence.append(('output', nn.Linear(hidden_size, data_set.output_size))) 
    return nn.Sequential(OrderedDict(_sequence))


todo = []

import os, re
for f in os.listdir('models'):
    if re.match('lr_h200_model_1_kristen', f):
        todo.append(f"models/{f}")
              
def model_checkpoint_to_mat(dataset, model, path):
    print(path)
    trainer = Trainer(path, dataset, model, torch.optim.SGD, torch.nn.MSELoss, batch_size=100)
    trainer.load_checkpoint(path)
    model_state = trainer._model.state_dict()
#     example_x = dataset.x[0].cuda()
#     example_f_real = dataset.f[0].cuda()
#     example_f_pred = trainer._model(example_x.cuda().float())

#     print(example_x.cpu())
#     print(example_f_pred.cpu())
#     print(example_f_real.cpu())
#     print(dataset._preprocessing_f[0].inverse_transform([example_f_real.cpu().detach().numpy()]))
#     print(dataset._preprocessing_f[0].scale_)
#     print(dataset._preprocessing_f[0].center_)
#     print(model)
#     print(dict(model_state))
    dict_return = dict()
    dict_ = dict(model_state)
    for key,value in zip(dict_.keys(), dict_.values()):
        arr =  value.cpu().numpy()
        if len(arr.shape) == 1:
            arr = np.array([arr]).T
        print(arr.shape)
        dict_return[key.replace(".", "_")] = value.cpu().numpy()
    print(f"{path.split('/')[1]}.mat")
    io.savemat(f"data/kristen/models/{path.split('/')[1]}.mat", dict_return)
    
dataset = k_to_do[0]
# print(k)

for dataset, do in zip(k_to_do, todo):
    dataset_path = do.split("_")[-1] + ".mat"
    dataset = KristenDataset(dataset_path)
    dataset.add_f_preprocessing(scaler_y1)
    model_checkpoint_to_mat(dataset, get_model1_module(200, dataset, False), do)

<dataset.KristenDataset object at 0x7f54dca6a410>
models/lr_h200_model_1_kristen_datasetSplineYawVaryingSun
(200, 7)
(200, 1)
(200, 200)
(200, 1)
(4, 200)
(4, 1)
lr_h200_model_1_kristen_datasetSplineYawVaryingSun.mat
<dataset.KristenDataset object at 0x7f54dcfa2790>
models/lr_h200_model_1_kristen_datasetSplineYawFixedSun
(200, 7)
(200, 1)
(200, 200)
(200, 1)
(4, 200)
(4, 1)
lr_h200_model_1_kristen_datasetSplineYawFixedSun.mat
<dataset.KristenDataset object at 0x7f54dcfa2790>
models/lr_h200_model_1_kristen_dataSet2SplineYawVaryingSun
(200, 7)
(200, 1)
(200, 200)
(200, 1)
(4, 200)
(4, 1)
lr_h200_model_1_kristen_dataSet2SplineYawVaryingSun.mat
<dataset.KristenDataset object at 0x7f54dcfa2790>
models/lr_h200_model_1_kristen_dataSet2SplineYawFixedSun
(200, 7)
(200, 1)
(200, 200)
(200, 1)
(4, 200)
(4, 1)
lr_h200_model_1_kristen_dataSet2SplineYawFixedSun.mat


#### (1) Hidden Neurons per layer (h=100), **without** batch normalisation, **without** scheduler
<br>

In [4]:
# Define quantity of neurons in hidden layers.
model1_1_hidden_size = 100
model1_1_batch_norm  = False
model1_1_root_name = f"model1_h{model1_1_hidden_size}_lr0.1_bn{model1_1_batch_norm}"

# Train models and save.
for idx, k in enumerate(k_to_do):
    model_1 = get_model1_bn_module(hidden_size, k, model1_1_batch_norm)
    model_folder = f"models/{model1_1_root_name}_{k.filename.split('.')[0]}"
    trainer = Trainer(model_folder, k, model_1, torch.optim.SGD, torch.nn.MSELoss, num_epochs=150, batch_size=500, learning_rate=1e-1)
    trainer.train()

NameError: name 'get_model1_bn_module' is not defined

#### (1) Hidden Neurons per layer (h=100), **without** batch normalisation, **with** scheduler
<br>

In [None]:
# Define quantity of neurons in hidden layers.
model1_2_hidden_size = 100
model1_2_batch_norm  = False
model1_2_scheduler = True
model1_2_root_name = f"model1_h{model1_2_hidden_size}_lr0.1_bn{model1_2_batch_norm}_scheduler{model1_2_scheduler}"

for idx, k in enumerate(k_to_do):
    model_1 = get_model1_bn_module(hidden_size, k)
    model_folder = f"models/{model1_1_root_name}_{k.filename.split('.')[0]}"
    trainer = Trainer(model_folder, k, model_1, torch.optim.SGD, torch.nn.MSELoss, num_epochs=150, batch_size=500, learning_rate=1e-1)
    trainer.add_scheduler(torch.optim.lr_scheduler.MultiStepLR, milestones=[90,115,140], gamma=0.1)
    trainer.train()

#### (3) Hidden Neurons per layer (h=200), **with** batch normalisation, **with scheduler**
<br>

In [None]:
model1_1_hidden_size = 200
model1_1_batch_norm  = True

def get_model1_bn_module(hidden_size, data_set):
    return nn.Sequential(
    OrderedDict([
                      ('fc1', nn.Linear(data_set.input_size, hidden_size)),
                      ('sigmoid1', nn.Sigmoid()),
                      ('bn1', nn.BatchNorm1d(hidden_size)),
                      ('fc2', nn.Linear(hidden_size, hidden_size)),
                      ('relu2', nn.ReLU()),
                      ('bn2', nn.BatchNorm1d(hidden_size)),
                      ('output', nn.Linear(hidden_size, data_set.output_size))
                ]))

for idx, k in enumerate(k_to_do):
    model_1 = get_model1_module(hidden_size, k)
    model_folder = f"models/bn_lr_h200_model_1_kristen_{k.filename.split('.')[0]}"
    trainer = Trainer(model_folder, k, model_1, torch.optim.SGD, torch.nn.MSELoss, num_epochs=150, batch_size=500, learning_rate=1e-1)
    trainer.add_scheduler(torch.optim.lr_scheduler.MultiStepLR, milestones=[90,115,140], gamma=0.1)
    trainer.train()

#### (4) Hidden Neurons per layer (h=200), **with** batch normalisation, **with** scheduler
<br>

In [None]:
# Define quantity of neurons in hidden layers.
model1_4_hidden_size = 200
model1_4_batch_norm  = True
model1_4_scheduler = True
model1_4_root_name = f"model1_h{model1_4_hidden_size}_lr0.1_bn{model1_4_batch_norm}_scheduler{model1_4_scheduler}"

for idx, k in enumerate(k_to_do):
    model_1 = get_model1_module(model1_4_hidden_size, k, model1_4_batch_norm)
    model_folder = f"models/{model1_4_root_name}_{k.filename.split('.')[0]}"
    trainer = Trainer(model_folder, k, model_1, torch.optim.SGD, torch.nn.MSELoss, num_epochs=120, batch_size=500, learning_rate=1e-1)
    trainer.add_scheduler(torch.optim.lr_scheduler.MultiStepLR, milestones=[12, 24, 84, 96, 108], gamma=0.1)
    trainer.train()

In [7]:
df_return = None
for idx, k in enumerate(k_to_do):
    if idx == 0:
        f = k.f.cpu().numpy()
        x = k.x.cpu().numpy()
    else:
        f = np.concatenate((f, k.f.cpu().numpy()))
        x = np.concatenate((x, k.x.cpu().numpy()))
        
model1_all_root_name = f"model1_h{300}_lr0.1_bn{True}_scheduler{True}"
dataset = BaseDataset(f=f, x=x)
dataset.filename = "allDataGiven.mat"
dataset.add_f_preprocessing(scaler_y1)
model_all = get_model1_module(300, dataset, True)
model_folder = f"models/{model1_all_root_name}_{dataset.filename.split('.')[0]}"
trainer = Trainer(model_folder, dataset, model_all, torch.optim.SGD, torch.nn.MSELoss, num_epochs=200, batch_size=1000, learning_rate=1e-2)
# trainer.add_scheduler(torch.optim.lr_scheduler.MultiStepLR, milestones=[12, 24, 84, 96, 108], gamma=0.1)
# trainer.load_checkpoint(model_folder)
# trainer.train()
scale = np.array([dataset._preprocessing_f[0].scale_]).reshape(-1,1)
center = np.array([dataset._preprocessing_f[0].center_]).reshape(-1,1)
dataset.sub_dir="kristen"
io.savemat(os.path.join(dataset.filepath,"post" , f"{dataset.filename.split('.')[0]}_post.mat"), {"post_scale":scale, "post_center":center} )

# model_checkpoint_to_mat(dataset, get_model1_module(300, dataset, True), "models/model1_h300_lr0.1_bnTrue_schedulerTrue_allDataGiven")

<dataset.BaseDataset object at 0x7f54dca8f710>
