## Evaluate file

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from ipynb.fs.full.Utils import arrhenius, plot_results, plot_arrhenius_params, plot_loss, print_error, set_seed
from ipynb.fs.full.Model import build_model, model_initialize
from ipynb.fs.full.Data_loader import make_data_loader, Scaler, map_to_scale
from ipynb.fs.full.Get_fingerprints import get_rdkit_fingerprint, get_cp_fingerprint
from sklearn.utils import shuffle
from ipynb.fs.full.train_model import train_model
from sklearn.model_selection import KFold,StratifiedKFold

This function takes a model and a test data loader and evaluates the model on the test data.

In [2]:
def evaluate_model(model,test_loader,scaler=None):
    """Evaluate a trained model on the first batch drawn from ``test_loader``.

    Two kinds of model are supported, told apart by the presence of an
    ``fck`` attribute on the model:

    * with ``fck``: the model receives the features concatenated with the
      temperature column and its output is used directly as the prediction;
    * without ``fck``: the model receives the features only and returns the
      tuple ``(A, n, B)``, which is converted to a prediction per output
      through ``arrhenius``.

    Args:
        model: trained network exposing either ``fck`` or ``fcA``
            (both are read for their ``out_features``).
        test_loader: iterable yielding ``(features, target, temp)`` batches;
            only the first batch is evaluated.
        scaler: optional object exposing ``tscaler`` and ``torch_inverse``.
            When it is ``None`` or its ``tscaler`` is ``None`` no inverse
            scaling is applied.

    Returns:
        Tuple ``(kvec, RMS)``: the predictions as a NumPy array and the value
        returned by ``print_error`` for them.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    features, target, temp = next(iter(test_loader))
    features = features.to(device)
    temp = temp.to(device)
    model.eval()

    # The default ``scaler=None`` must simply mean "no inverse scaling".
    use_scaler = scaler is not None and scaler.tscaler is not None

    # Explicit attribute test instead of a try/except around the whole branch:
    # an AttributeError raised while evaluating a k-model must not silently
    # re-route the model through the Arrhenius branch.
    if hasattr(model, 'fck'):
        temp = temp.reshape([temp.shape[0], 1])
        data = torch.cat((features, temp), 1)
        with torch.no_grad():
            logk = model(data)
        if model.fck.out_features == 1:
            target = target[:, 0].reshape([target.shape[0], 1])
        else:
            # Every 4th target column; NOTE(review): presumably the logK
            # entry of each (logK, A, B, n) group - confirm with data loader.
            target = target[:, [0, 4, 8, 12]]
        if use_scaler:
            logk = scaler.torch_inverse(logk.to(device))
            target = scaler.torch_inverse(target.to(device)).cpu()
        kvec = logk.cpu().detach().numpy()
        plot_results(target, kvec, temp.cpu())
    else:
        n_out = model.fcA.out_features
        with torch.no_grad():
            A, n, B = model(features)
        if use_scaler:
            # map_to_scale is called with (A, B, n); the inverse-scaled
            # result is read back with a stride of 3 in that same order.
            scaled = scaler.torch_inverse(map_to_scale(A, B, n).to(device))
            j = np.arange(n_out) * 3
            A = scaled[:, j]
            B = scaled[:, j + 1]
            n = scaled[:, j + 2]

        logk = torch.ones([A.shape[0], n_out])
        for i in range(n_out):
            # The model output A is used as a base-10 exponent here.
            logk[:, i] = arrhenius(torch.pow(10., A[:, i].cpu()), n[:, i].cpu(),
                                   B[:, i].cpu(), temp.cpu())
        kvec = logk.cpu().detach().numpy()
        temp = temp.reshape([temp.shape[0], 1])
        if n_out == 1:
            target = target[:, 0].reshape([target.shape[0], 1])
        else:
            target = target[:, [0, 4, 8, 12]]
        plot_results(target.cpu(), kvec, temp.cpu())

    print(target.shape, kvec.shape)
    RMS = print_error(target.cpu(), kvec)

    return (kvec, RMS)

In [None]:
def evaluate_ABN(model,test_loader,scaler=None):
    """Evaluate a trained model on the first batch drawn from ``test_loader``.

    NOTE(review): the logic here duplicates ``evaluate_model``; the name
    suggests it was meant to additionally report the Arrhenius parameters
    (a ``plot_arrhenius_params`` call was present but disabled). Consider
    consolidating the two functions.

    A model exposing ``fck`` is fed the features concatenated with the
    temperature column and its output is used directly as the prediction.
    Any other model is fed the features only and must return ``(A, n, B)``,
    which is converted to a prediction per output through ``arrhenius``.

    Args:
        model: trained network exposing either ``fck`` or ``fcA``
            (both are read for their ``out_features``).
        test_loader: iterable yielding ``(features, target, temp)`` batches;
            only the first batch is evaluated.
        scaler: optional object exposing ``tscaler`` and ``torch_inverse``.
            When it is ``None`` or its ``tscaler`` is ``None`` no inverse
            scaling is applied.

    Returns:
        Tuple ``(kvec, RMS)``: the predictions as a NumPy array and the value
        returned by ``print_error`` for them.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    features, target, temp = next(iter(test_loader))
    features = features.to(device)
    temp = temp.to(device)
    model.eval()

    # ``scaler=None`` (the default) means that no inverse scaling is wanted.
    apply_inverse = scaler is not None and scaler.tscaler is not None

    # Test for the attribute explicitly: catching AttributeError around the
    # whole branch would hide unrelated errors (including a missing scaler)
    # and silently fall through to the Arrhenius branch.
    if hasattr(model, 'fck'):
        temp = temp.reshape([temp.shape[0], 1])
        data = torch.cat((features, temp), 1)
        with torch.no_grad():
            logk = model(data)
        if model.fck.out_features == 1:
            target = target[:, 0].reshape([target.shape[0], 1])
        else:
            # Every 4th target column; NOTE(review): presumably the logK
            # entry of each (logK, A, B, n) group - confirm with data loader.
            target = target[:, [0, 4, 8, 12]]
        if apply_inverse:
            logk = scaler.torch_inverse(logk.to(device))
            target = scaler.torch_inverse(target.to(device)).cpu()
        kvec = logk.cpu().detach().numpy()
        plot_results(target, kvec, temp.cpu())
    else:
        n_out = model.fcA.out_features
        with torch.no_grad():
            A, n, B = model(features)
        if apply_inverse:
            # map_to_scale is called with (A, B, n); the inverse-scaled
            # result is read back with a stride of 3 in that same order.
            scaled = scaler.torch_inverse(map_to_scale(A, B, n).to(device))
            j = np.arange(n_out) * 3
            A = scaled[:, j]
            B = scaled[:, j + 1]
            n = scaled[:, j + 2]

        logk = torch.ones([A.shape[0], n_out])
        for i in range(n_out):
            # The model output A is used as a base-10 exponent here.
            logk[:, i] = arrhenius(torch.pow(10., A[:, i].cpu()), n[:, i].cpu(),
                                   B[:, i].cpu(), temp.cpu())
        kvec = logk.cpu().detach().numpy()
        temp = temp.reshape([temp.shape[0], 1])
        if n_out == 1:
            target = target[:, 0].reshape([target.shape[0], 1])
        else:
            target = target[:, [0, 4, 8, 12]]
        plot_results(target.cpu(), kvec, temp.cpu())

    print(target.shape, kvec.shape)
    RMS = print_error(target.cpu(), kvec)

    return (kvec, RMS)

## Cross-validation
Takes a dataframe and a parameter object and performs K-fold cross-validation on the data.

In [None]:
def cross_validate(df,param):
    """Run K-fold cross-validation over the unique molecules in ``df``.

    For every fold the data is split with ``split_data``, scaled with a fresh
    ``Scaler``, a new model is initialised and trained, the loss is plotted
    and the fold is scored with ``evaluate_model``. The mean error over the
    folds is printed at the end.

    Args:
        df: dataframe with at least the columns ``smiles`` and ``Type``.
        param: parameter object; this function reads ``split``
            (``'Stratified'`` or ``'Kfold'``), ``num_folds``, ``target``,
            ``seed``, ``n_epochs``, ``weights`` and ``bweights`` and passes
            the object on to the helpers it calls.

    Returns:
        The value returned by ``train_model`` for the last fold.

    Raises:
        ValueError: if ``param.split`` is neither ``'Stratified'`` nor
            ``'Kfold'``.
    """
    if param.split == 'Stratified':
        kf = StratifiedKFold(n_splits=param.num_folds)
    elif param.split == 'Kfold':
        kf = KFold(n_splits=param.num_folds)
    else:
        # Fail here instead of printing and hitting a NameError on ``kf``.
        raise ValueError('Choose either Stratified or Kfold.')

    # One row per molecule. Molecules and their types are shuffled together
    # so that the fold indices (computed from the types when stratifying)
    # refer to the same molecules that are later selected with them.
    unique_mols = df.drop_duplicates(subset=['smiles'])
    molecules, mol_types = shuffle(unique_mols['smiles'].to_numpy(),
                                   unique_mols['Type'].to_numpy())

    error = np.zeros((kf.n_splits, 4 if param.target == 'All' else 1))
    for fold, (_, test_index) in enumerate(kf.split(molecules, mol_types)):
        set_seed(param.seed)
        # Split data to train/test folds
        print('####### Evaluating fold',fold+1,'in a total of',param.num_folds,'folds #######')
        train_fold_mols, test_fold_mols = split_data(df, molecules[test_index], param)
        # Do scaling if needed
        Scobj = Scaler()
        train_fold_mols, test_fold_mols = Scobj.Scale_data(train_fold_mols, test_fold_mols, param)
        # Create and run model
        model, criterion, optimizer, scheduler = model_initialize(df, param)
        loss = train_model(model, optimizer, criterion, n_epochs=param.n_epochs,
                           weight=param.weights, bweight=param.bweights,
                           train_loader=make_data_loader(train_fold_mols, param),
                           scheduler=scheduler,
                           valid_loader=make_data_loader(test_fold_mols, param),
                           scaler=Scobj)
        plot_loss(loss)
        # Evaluate and store this fold's error
        _, error[fold, :] = evaluate_model(
            model, make_data_loader(test_fold_mols, param, test=True), Scobj)
    print('\nCV-RMSE=',error.mean(axis=0))
    return loss

## Ensemble evaluation

In [1]:
def run_ensamble(options_test,param):
    """Train and evaluate ensemble member(s) on the held-out molecules.

    For each member the base table ``./Data/df.csv`` is joined with RDKit
    fingerprints and with the member's ``./Data/VOCensamble<i>.csv`` features,
    split with ``split_data``, scaled, used to train a fresh model and scored
    with ``evaluate_model``.

    NOTE(review): the result array reserves 5 slots along its last axis and
    the progress message mentions 5 models, but the loop runs for a single
    member only, so slots 1-4 stay zero. Confirm whether this is intended.

    Args:
        options_test: collection of SMILES strings that form the test set.
        param: parameter object; ``target``, ``seed``, ``n_epochs``,
            ``weights`` and ``bweights`` are read here and the object is
            passed on to the helpers.

    Returns:
        Array of shape ``(n_test_rows, n_outputs, 5)`` holding the
        predictions of each evaluated member.
    """
    base_table = pd.read_csv('./Data/df.csv')
    #make_df()
    held_out = base_table.loc[base_table['smiles'].isin(options_test)]

    target_mols = ['OH','O3','NO3','Cl','All']
    target_names = ['oh_logK','o3_logK','no3_logK','cl_logK','logK']
    label_column = target_names[target_mols.index(param.target)]

    # Size the result: all rows and 4 outputs for 'All', otherwise only the
    # rows that have a value for the selected label and a single output.
    if param.target == 'All':
        n_rows = held_out.shape[0]
        n_outputs = 4
    else:
        n_rows = held_out.dropna(subset=[label_column]).shape[0]
        n_outputs = 1
    Prediction = np.zeros((n_rows, n_outputs, 5))

    for member in range(1):
        set_seed(param.seed+member*10)
        print('####### Evaluating model',member+1,'in an ensamble of 5 #######')

        # Assemble the feature table for this member
        table = pd.read_csv('./Data/df.csv')
        table = table.join(get_rdkit_fingerprint(table))
        cp_input = pd.read_csv('./Data/VOCensamble{}.csv'.format(member))
        non_smiles = cp_input.columns != 'smiles'
        table = table.join(cp_input.iloc[:, non_smiles])

        # Train/test split followed by optional scaling
        train_mols, test_mols = split_data(table, options_test, param)
        member_scaler = Scaler()
        train_mols, test_mols = member_scaler.Scale_data(train_mols, test_mols, param)

        # Build, train and score the model
        model, criterion, optimizer, scheduler = model_initialize(table, param)
        train_loader = make_data_loader(train_mols, param)
        valid_loader = make_data_loader(test_mols, param)
        loss = train_model(model, optimizer, criterion,
                           n_epochs=param.n_epochs,
                           weight=param.weights,
                           bweight=param.bweights,
                           train_loader=train_loader,
                           scheduler=scheduler,
                           valid_loader=valid_loader,
                           scaler=member_scaler)
        plot_loss(loss)

        test_loader = make_data_loader(test_mols, param, test=True)
        member_prediction, _ = evaluate_model(model, test_loader, member_scaler)
        Prediction[:, :, member] = member_prediction
    return Prediction

In [None]:
def split_data(df,test_options,param):
    """Split ``df`` into shuffled training rows and test rows.

    Args:
        df: dataframe with a ``smiles`` column and the rate columns
            ``<x>_logK``, ``<x>_A``, ``<x>_B``, ``<x>_n`` for the reactants
            that are referenced by ``param``.
        test_options: collection of SMILES strings that form the test set.
        param: parameter object with the attributes

            * ``teston`` - ``'All'`` removes the test molecules from the
              training data entirely; ``'OH'``, ``'O3'``, ``'NO3'`` or
              ``'Cl'`` keeps them but blanks (NaN) that reactant's four
              rate columns for the test molecules;
            * ``target`` - ``'OH'``, ``'O3'``, ``'NO3'`` or ``'Cl'`` keeps
              only rows that have that reactant's logK; any other value
              keeps rows that have at least one of the four logK values.

    Returns:
        Tuple ``(train_mols, test_mols)`` of dataframes.

    Raises:
        ValueError: if ``param.teston`` is not one of the supported values.
    """
    rate_columns = {
        'OH':  ['oh_logK','oh_A','oh_B','oh_n'],
        'O3':  ['o3_logK','o3_A','o3_B','o3_n'],
        'NO3': ['no3_logK','no3_A','no3_B','no3_n'],
        'Cl':  ['cl_logK','cl_A','cl_B','cl_n'],
    }

    # Reject an unsupported option up front; merely printing a message would
    # leave ``train_mols`` undefined and crash further down.
    if param.teston != 'All' and param.teston not in rate_columns:
        raise ValueError('Target not supported. Please choose from [All,OH,O3,NO3,Cl]')

    # split data
    in_test = df['smiles'].isin(test_options)
    if param.teston == 'All':
        train_mols = shuffle(df.loc[~in_test])
    else:
        train_mols = shuffle(df.copy())
        # Build the mask on the shuffled copy itself so that the assignment
        # does not rely on index alignment with ``df``.
        hidden = train_mols['smiles'].isin(test_options)
        train_mols.loc[hidden, rate_columns[param.teston]] = np.nan
    test_mols = df.loc[in_test]

    # trim data: keep only rows that carry at least one requested label
    if param.target in rate_columns:
        label = [rate_columns[param.target][0]]
    else:
        label = ['oh_logK','o3_logK','no3_logK','cl_logK']
    train_mols = train_mols.dropna(subset=label,how='all')
    test_mols  = test_mols.dropna(subset=label,how='all')

    return (train_mols, test_mols)