## Data loader
Creates a PyTorch `DataLoader` from a dataframe and a batch size.

In [3]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import copy

In [4]:
def make_data_loader(df,param,test=False):
    """Build a ``torch.utils.data.DataLoader`` yielding (features, targets, temps).

    Args:
        df: DataFrame holding the target columns (``<prefix>_logK``,
            ``<prefix>_A``, ``<prefix>_B``, ``<prefix>_n``), a ``T`` column and
            the feature columns selected by ``param.input_type``.
        param: Object with the attributes ``target`` (one of ``'All'``,
            ``'OH'``, ``'O3'``, ``'NO3'``, ``'Cl'``), ``input_type``
            (``'Morgan'``, ``'Chemprop'``, anything else means both) and
            ``batch_size`` (used when ``test`` is false).
        test: When true, the whole frame is served as one single batch.

    Returns:
        A ``DataLoader`` over a ``TensorDataset(features, targets, temps)``.

    Raises:
        ValueError: If ``param.target`` is not a supported target.
    """
    prefix_by_target = {'OH': 'oh', 'O3': 'o3', 'NO3': 'no3', 'Cl': 'cl'}
    suffixes = ('logK', 'A', 'B', 'n')

    if param.target == 'All':
        # All four reactions are kept; rows with missing values are NOT dropped.
        prefixes = list(prefix_by_target.values())
    elif param.target in prefix_by_target:
        prefix = prefix_by_target[param.target]
        # Single target: keep only rows that have a logK value for it.
        df = df.dropna(subset=[prefix + '_logK'])
        prefixes = [prefix]
    else:
        # Previously this only printed the message and then failed later with
        # an UnboundLocalError on `targets`; fail early and explicitly instead.
        raise ValueError('Target not supported. Please choose from [All,OH,O3,NO3,Cl]')

    target_columns = [p + '_' + s for p in prefixes for s in suffixes]
    targets = torch.Tensor(np.array(df[target_columns], dtype=float))

    feature_blocks = []
    if param.input_type != 'Chemprop':
        # NOTE(review): iloc's end bound is exclusive, so the column labelled
        # 2047 itself is not part of the slice - confirm this is intended.
        # Left unchanged because the feature width is visible to callers.
        start = df.columns.get_loc(0)
        stop = df.columns.get_loc(2047)
        feature_blocks.append(torch.Tensor(np.array(df.iloc[:,start:stop],dtype=float)))
    if param.input_type != 'Morgan':
        # Everything from the "fp_0" column to the last column of the frame.
        first_fp = df.columns.get_loc("fp_0")
        feature_blocks.append(torch.Tensor(np.array(df.iloc[:,first_fp:],dtype=float)))
    features = torch.column_stack(feature_blocks)

    temps = torch.Tensor(np.array(df['T']))
    dataset = torch.utils.data.TensorDataset(features,targets,temps)

    if test:
        # One batch with every row; DataLoader rejects batch_size=0, so an
        # empty frame falls back to 1 (and simply yields no batches).
        batch_size = max(len(dataset), 1)
    else:
        batch_size = param.batch_size

    return torch.utils.data.DataLoader(dataset,batch_size=batch_size)

In [1]:
class Scaler:
    """Scale feature and target columns of a train/test pair of dataframes.

    Features are min-max scaled and targets are standardised; both scalers are
    fitted on the training frame only. The fitted target scaler is kept on
    ``self.tscaler`` so ``torch_inverse`` can undo the standardisation.
    """

    # Column-name prefixes selected by each supported ``param.target``.
    _TARGET_PREFIXES = {
        'All': ('oh', 'o3', 'no3', 'cl'),
        'OH': ('oh',),
        'O3': ('o3',),
        'NO3': ('no3',),
        'Cl': ('cl',),
    }
    # Column-name suffixes that are standardised for each ``param.model_type``.
    _TARGET_SUFFIXES = {
        'Arrhenius': ('A', 'n', 'B'),
        'Point': ('logK',),
    }

    @staticmethod
    def _feature_mask(columns, model_type):
        """Return a boolean mask over ``columns`` marking the feature columns."""
        if model_type not in ('Arrhenius', 'Point'):
            raise ValueError('Model type not supported. Please choose from [Arrhenius,Point]')
        # Features are the "fp_*" columns plus every column with an int label.
        mask = (columns.str.startswith('fp_',na=False) | np.array([isinstance(c,int) for c in columns]))
        if model_type == 'Point':
            # For 'Point' models the 'T' column is scaled with the features.
            mask = mask | (columns == 'T')
        return mask

    @classmethod
    def _target_mask(cls, columns, model_type, target):
        """Return a boolean mask over ``columns`` marking the target columns."""
        if model_type not in cls._TARGET_SUFFIXES:
            raise ValueError('Model type not supported. Please choose from [Arrhenius,Point]')
        if target not in cls._TARGET_PREFIXES:
            raise ValueError('Target not supported. Please choose from [All,OH,O3,NO3,Cl]')
        names = [prefix + '_' + suffix
                 for prefix in cls._TARGET_PREFIXES[target]
                 for suffix in cls._TARGET_SUFFIXES[model_type]]
        # A membership mask keeps the frame's own column order, exactly like
        # OR-ing one `columns == name` comparison per name.
        return columns.isin(names)

    def Scale_data(self,train_mols,test_mols,param):
        """Return scaled deep copies of ``train_mols`` and ``test_mols``.

        Args:
            train_mols: Training DataFrame; the scalers are fitted on it.
            test_mols: Test DataFrame, transformed with the fitted scalers.
                The column masks are built from ``train_mols.columns``, so it
                is assumed to share the training frame's column layout.
            param: Object with the attributes ``scale_features``,
                ``scale_targets``, ``model_type`` (``'Arrhenius'`` or
                ``'Point'``) and ``target`` (``'All'``, ``'OH'``, ``'O3'``,
                ``'NO3'`` or ``'Cl'``).

        Returns:
            Tuple ``(train_scaled, test_scaled)``; the inputs are not modified.

        Raises:
            ValueError: If scaling is requested with an unsupported
                ``model_type`` or ``target``.
        """
        train_scaled = copy.deepcopy(train_mols)
        test_scaled = copy.deepcopy(test_mols)
        self.tscaler = None

        # Feature scaling
        if (param.scale_features == True):
            # Resolve the columns first so a bad model_type fails with a clear
            # ValueError instead of an UnboundLocalError further down.
            labels = self._feature_mask(train_mols.columns, param.model_type)
            scaler = MinMaxScaler()
            train_scaled.loc[:,labels] = scaler.fit_transform(train_mols.loc[:,labels])
            test_scaled.loc[:,labels]  = scaler.transform(test_mols.loc[:,labels])

        # Target scaling
        if (param.scale_targets == True):
            tlabels = self._target_mask(train_mols.columns, param.model_type, param.target)
            self.tscaler = StandardScaler()
            # Disabled experiment (originally inside the 'Arrhenius' branch):
            # separate standardisation of the logK columns.
#                 if param.scale_k==True:
#                     self.t2scaler = StandardScaler()
#                     t2labels= ((train_mols.columns == 'oh_logK')|(train_mols.columns == 'o3_logK')|(train_mols.columns == 'no3_logK')|(train_mols.columns == 'cl_logK'))
#                     train_scaled.loc[:,t2labels] = self.t2scaler.fit_transform(train_mols.loc[:,t2labels])
#                     test_scaled.loc[:,t2labels]  = self.t2scaler.transform(test_mols.loc[:,t2labels])
            train_scaled.loc[:,tlabels] = self.tscaler.fit_transform(train_mols.loc[:,tlabels])
            test_scaled.loc[:,tlabels]  = self.tscaler.transform(test_mols.loc[:,tlabels])
        return train_scaled,test_scaled
    
    def torch_inverse(self,X):
        """Undo the target standardisation: ``X * sqrt(var_) + mean_``.

        Args:
            X: Tensor of standardised targets whose last dimension broadcasts
                against the fitted target columns.

        Returns:
            Tensor on the same device as ``X``.
        """
        if isinstance(X, torch.Tensor):
            # Follow X's device: forcing "cuda if available" breaks for CPU
            # tensors on a CUDA-capable machine (device mismatch).
            device = X.device
        else:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        scale = torch.Tensor(np.sqrt(self.tscaler.var_)).to(device)
        mean = torch.Tensor(self.tscaler.mean_).to(device)
        return((X*scale)+mean)
    
#     def torch_inverse2(self,i,X):
#         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#         return((X*torch.Tensor(np.sqrt(self.t2scaler.var_))[i].to(device))+torch.Tensor(self.t2scaler.mean_)[i].to(device))
    
#     def torch_transform(self,i,X):
#         device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#         return((X-torch.Tensor(self.t2scaler.mean_)[i].to(device))/torch.Tensor(np.sqrt(self.t2scaler.var_))[i].to(device))

In [None]:
def map_to_scale(A,B,n):
    """Interleave ``A``, ``B`` and ``n`` column-wise into one 2-D tensor.

    For 2-D inputs with ``k`` columns each, row ``i`` of the result is
    ``[A[i,0], B[i,0], n[i,0], A[i,1], B[i,1], n[i,1], ...]``, so the output
    has ``3 * k`` columns.

    Args:
        A, B, n: Tensors holding the same number of elements; ``A`` must be
            2-D because its shape fixes the output layout.

    Returns:
        Tensor of shape ``(A.shape[0], 3 * A.shape[1])``.
    """
    rows, cols = A.shape[0], A.shape[1]
    if cols == 1:
        return torch.column_stack((A,B,n))
    # Flatten each input, pair the values up as (A, B, n) triples, then fold
    # the triples back into one row per sample. The width was hard-coded to 12
    # (i.e. exactly 4 columns per input); deriving it from A keeps that case
    # identical and supports any other column count.
    triples = torch.column_stack([A.reshape(-1), B.reshape(-1), n.reshape(-1)])
    return triples.reshape([rows, 3 * cols])