Import dependencies and define helper functions

In [1]:
# mount Google Drive
# The project package and data files used below are read from
# /content/drive/MyDrive/MLP_final/, so the drive must be mounted first.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
# install required version of rdkit
! pip install rdkit-pypi==2022.3.5



In [3]:
import sys,os
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
from rdkit import Chem
from rdkit.Chem import AllChem
import random
from typing import Literal

from itertools import product

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import MinMaxScaler

from sklearn.feature_selection import f_regression, mutual_info_regression, r_regression, SelectKBest
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from scipy.stats import pearsonr
from math import e
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.svm import SVR

from sklearn.decomposition import PCA

from lightgbm import LGBMRegressor, LGBMClassifier

import json

import sys
sys.path.insert(0, "/content/drive/MyDrive/MLP_final/")
sys.path.insert(0, "/content/drive/MyDrive/MLP_final/ds_packg/")

from ds_packg.data_processing import one_hot_encode, remove_constant_cols, convert_to_canonical_smiles, remove_duplicates, \
                            normalize_columns, scramble_array
from ds_packg.feature_selection import FeatSelector
from ds_packg.model import ModelClass
from ds_packg.method_validation_final import LeaveNoutNx, LeaveNout, CrossVal

import os

def prepare_overall_data(sulfonamide_feats_data_path, chanlam_data_path, structure_cols, df_target_cols, verbose = False):
    """Load the sulfonamide feature table and the Chan-Lam table and merge them for modelling.

    NOTE: this function reads two module-level names at call time,
    ``hand_picked_qm_cols`` and ``rand_state``; both must be defined before
    the function is called.

    Parameters
    ----------
    sulfonamide_feats_data_path
        Path of the sulfonamide feature CSV. It must contain a
        'catalyst_name' column, which is dropped.
    chanlam_data_path
        Path of the Chan-Lam CSV; its first column is used as the index.
    structure_cols
        List of structure column names. Used for de-duplication, SMILES
        canonicalisation, as merge keys and for one-hot encoding. The merged
        table must contain a "sulfonamide" column.
    df_target_cols
        List of target column names; their values are divided by 100.
    verbose
        Forwarded to ``convert_to_canonical_smiles``.

    Returns
    -------
    tuple
        ``(df, struct_ohe_cols, sulfonamide_feats)``: the row-shuffled merged
        table, the second value returned by ``one_hot_encode``, and the list
        of sulfonamide feature column names.
    """
    # --- sulfonamide feature table ---
    sulfonamide_table = pd.read_csv(sulfonamide_feats_data_path).drop(columns=['catalyst_name'])
    sulfonamide_table = remove_constant_cols(sulfonamide_table)
    # everything from column position 5 onwards is treated as a sulfonamide feature
    sulfonamide_feats = list(sulfonamide_table.columns[5:])

    # --- Chan-Lam reaction table ---
    renamed_headers = {'Sulfonamide':"sulfonamide", 'Boronic Acid':"boronic_acid",
                       'Catalyst':"catalyst", 'Catalyst Name':"catalyst_name", 'Base':"base",
                       'Solvent':"solvent"}
    reactions = (
        pd.read_csv(chanlam_data_path, index_col=0)
        .dropna()
        .reset_index(drop=True)
        .rename(columns=renamed_headers)
    )

    # targets are divided by 100 (presumably percent -> fraction; confirm against the data)
    reactions[df_target_cols] = reactions[df_target_cols] / 100
    reactions = remove_duplicates(reactions, structure_cols)
    reactions = reactions[structure_cols + hand_picked_qm_cols + df_target_cols]

    # --- canonicalise SMILES in both tables, then join on the structure columns ---
    reactions, sulfonamide_table = convert_to_canonical_smiles([reactions, sulfonamide_table], structure_cols, verbose)
    merged = pd.merge(reactions, sulfonamide_table, on=structure_cols)

    qm_cols_use = hand_picked_qm_cols + sulfonamide_feats
    merged = merged[structure_cols + qm_cols_use + df_target_cols]

    # --- feature preparation ---
    # keep the raw sulfonamide column so it can be re-attached after encoding
    sulfonamide_smiles = merged["sulfonamide"].copy()
    merged, struct_ohe_cols = one_hot_encode(merged, structure_cols)
    merged = normalize_columns(merged, qm_cols_use)
    merged["sulfonamide"] = sulfonamide_smiles

    # shuffle all rows with the module-level seed
    shuffled = merged.sample(frac=1, random_state=rand_state)
    return shuffled, struct_ohe_cols, sulfonamide_feats

def print_performance(score_dict):
    """Print mean ± standard deviation of R2, MAE and RMSE for each entry.

    ``score_dict`` maps a label to a mapping with the keys 'R2', 'RMSE' and
    'MAE', each holding a sequence of per-run scores. When at least one R2
    value is negative, an additional "Cap Avg" line is printed in which every
    value that is not >= 0 is replaced by 0 before averaging.
    MAE and RMSE are printed with percent formatting.
    """
    for label, metrics in score_dict.items():
        r2_values = metrics['R2']
        rmse_values = metrics['RMSE']
        mae_values = metrics['MAE']

        print(label)

        r2_mean, r2_std = np.mean(r2_values), np.std(r2_values)
        print(f"R2 Score: {r2_mean:.5f} ± {r2_std:.5f}")

        r2_arr = np.array(r2_values)
        if (r2_arr < 0).any():
            # floor at zero; anything failing `>= 0` becomes 0
            capped = np.where(r2_arr >= 0, r2_arr, 0)
            print(f"Cap Avg R2 Score: {np.mean(capped):.5f} ± {np.std(capped):.5f}")

        mae_mean, mae_std = np.mean(mae_values), np.std(mae_values)
        print(f"MAE: {mae_mean:.3%} ± {mae_std:.3%}")

        rmse_mean, rmse_std = np.mean(rmse_values), np.std(rmse_values)
        print(f"RMSE: {rmse_mean:.3%} ± {rmse_std:.3%}")
        print()

def get_reg_metrics(y_true, y_pred):
    """Return ``(R2, MAE, RMSE)`` for a set of regression predictions.

    RMSE is computed as the square root of ``mean_squared_error`` instead of
    passing ``squared=False``: that keyword was deprecated in scikit-learn 1.4
    and later removed, so the old call raises ``TypeError`` on current releases.
    For single-output targets (the only use in this notebook) both forms give
    the same value.
    """
    R2 = r2_score(y_true, y_pred)
    MAE = mean_absolute_error(y_true, y_pred)
    RMSE = np.sqrt(mean_squared_error(y_true, y_pred))
    return R2, MAE, RMSE


def flatten_extend(matrix):
    """Flatten one level of nesting: concatenate the items of every row into a single list."""
    return [item for row in matrix for item in row]

def gen_leave_out_list(lo_values: list,
                       lo_n: int=1,
                       random_state: int=42,
                       )->list:
    '''
    Shuffle the leave-out candidates and split them into groups of ``lo_n``.

    The shuffle is seeded with ``random_state`` and is applied to a copy, so
    the caller's ``lo_values`` is left untouched (previously it was shuffled
    in place as a side effect).

    Only complete groups are returned: when ``len(lo_values)`` is not a
    multiple of ``lo_n`` the trailing remainder is dropped.

    Returns a list of lists, each of length ``lo_n``.
    '''
    shuffled = list(lo_values)
    random.Random(random_state).shuffle(shuffled)

    n_groups = len(shuffled) // lo_n
    return [shuffled[i * lo_n:(i + 1) * lo_n] for i in range(n_groups)]

def read_json(filename: str) -> dict:
    '''
    Read in a json file and return dictionary

    The file is opened explicitly as UTF-8 (the encoding JSON is defined in)
    so that non-ASCII content is decoded the same way regardless of the
    platform's default locale encoding.
    '''
    with open(filename, 'r', encoding='utf-8') as jo:
        return json.load(jo)

# Notebook-wide random seed: passed to the ModelClass / CrossVal / LeaveNout
# calls below and read as a module-level name by prepare_overall_data.
rand_state = 42

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



Read in feature sets identified via feature selection

In [4]:
# Mapping of feature-set name -> list of feature column names
# (e.g. feats['SHAP'] is concatenated into the model column lists below).
feats = read_json('/content/drive/MyDrive/MLP_final/Feature_selection.json')

# show which feature sets are available
feats.keys()

dict_keys(['Only OHE', 'All Properties', 'DFT', 'NoCoLinear', 'F-regression', 'MutualInfo', 'F-MI_Intersection', 'F-MI_Union', 'RandomForest', 'PCA', 'SHAP'])

In [5]:
# Load the full Chan-Lam dataset; `df` is a working copy so that
# `df_chan_lam` keeps the table exactly as read from disk.
df_chan_lam = pd.read_csv('/content/drive/MyDrive/MLP_final/Data/ChanLam_full_dataset_with_features.csv')
df = df_chan_lam.copy()

In [6]:
# generate the list of sulfonamides for LOO
# Copy of the raw 'sulfonamide' column, taken before one-hot encoding so it
# can be written back onto the encoded frame afterwards.

sulfo_series = df['sulfonamide'].copy()

In [7]:
# Define the columns for OHE (structure_cols) and y-output (target_cols)

structure_cols = [
    'sulfonamide',
    'boronic_acid',
    'catalyst',
    'base',
    'solvent',
]

target_cols = ['RAW-MonoYield (%)']

# Columns removed from the encoded frame before modelling
to_drop = [
    # raw structure / naming columns
    'boronic_acid', 'catalyst', 'catalyst_name', 'base', 'solvent',
    # non-RAW yield, product and statistics columns
    'MonoYield (%)', 'BisYield (%)', 'TotalYield (%)',
    'MonoProduct', 'BisProduct',
    'MajorReaction', 'MinorReaction',
    'MonoSTD', 'BisSTD', 'TotalSTD',
    'Repetitions', 'CleanLists',
    # RAW-* columns other than the target
    'RAW-MonoSTD', 'RAW-BisYield (%)', 'RAW-BisSTD',
    'RAW-TotalYield (%)', 'RAW-TotalSTD',
    'RAW-Repetitions', 'RawLists',
    'RAW-MonoYield-STD Ratio', 'RAW-MonoYield-Range', 'RAW-RangeYield-Fraction',
]

In [8]:
# one-hot-encode the appropriate columns

ohe_source_cols = ['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']
df_ssg, struct_ohe_cols = one_hot_encode(df, ohe_source_cols)

# put the raw structure columns back next to their one-hot encodings
df_ssg['sulfonamide'] = sulfo_series
df_ssg[structure_cols] = df[structure_cols].values

# one-hot columns belonging to sulfonamides of the 'Validation' set
is_validation = df_ssg['Set'] == 'Validation'
validation_sulfonamide_ohe_cols = [
    f'sulfonamide_{x}' for x in df_ssg.loc[is_validation, 'sulfonamide'].unique()
]

# remove those columns from both the feature-name list and the frame
for val_ohe in validation_sulfonamide_ohe_cols:
    struct_ohe_cols.remove(val_ohe)
df_ssg.drop(columns=validation_sulfonamide_ohe_cols, inplace=True)

# shuffle rows, then divide the target by 100
df_ssg = df_ssg.sample(frac=1, random_state=42)
df_ssg[target_cols] = df_ssg[target_cols] / 100

df_ssg.drop(columns=to_drop, inplace=True)

Modeling

In [9]:
# set hyperparameters for MLP

hyperparams_single = dict(
    hidden_size=375,
    drop_out_rate=0.55,
    step_schedule=80,
    epochs=200,
    batch_size=125,
    output_size=1,
    l1_reg=0,
    l2_reg=0,
)

# Unpack by key so the assignments do not depend on the dict's insertion order.
hidden_size = hyperparams_single['hidden_size']
drop_out_rate = hyperparams_single['drop_out_rate']
step_schedule = hyperparams_single['step_schedule']
num_epochs = hyperparams_single['epochs']
batch_size = hyperparams_single['batch_size']
output_size = hyperparams_single['output_size']
l1_reg = hyperparams_single['l1_reg']
l2_reg = hyperparams_single['l2_reg']

30 random 80/20 splits

In [13]:
# MLP metric

# 'Set' is only used to filter rows and 'sulfonamide' is dropped before
# training; the three descriptors are added on top of the SHAP feature set.
extra_cols = ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']
cols_to_use = struct_ohe_cols + feats['SHAP'] + target_cols + extra_cols

# keep only the 'Train' rows and discard the split label
df_ = df_ssg.loc[df_ssg['Set'] == 'Train', cols_to_use].drop(columns='Set')

#Initialize model class
# inputs = all columns minus the target(s) and minus 'sulfonamide' (dropped below)
input_size = df_.shape[1] - len(target_cols) - 1
MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize CV
CV = CrossVal(df_.drop(columns=['sulfonamide']), MC, batch_size, rand_state=rand_state, n_cv=30)

#Run training loop with CV
CV.execute_loop(target_cols, output_size)
CV.print_performance()

# attach the original yield and the reaction components to every prediction (joined on row index)
mlp_preds = df[['RAW-MonoYield (%)']].merge(CV.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = mlp_preds.merge(df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)

to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_random.csv')

  0%|          | 0/30 [00:00<?, ?it/s]

Cross-validation fold: 1/30


  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
  3%|▎         | 1/30 [00:48<23:16, 48.15s/it]

Fold 1 R2 Score: 0.80135, MSE: 0.09389, MAE: 0.06591
Cross-validation fold: 2/30


  7%|▋         | 2/30 [01:36<22:27, 48.12s/it]

Fold 2 R2 Score: 0.79637, MSE: 0.09761, MAE: 0.06999
Cross-validation fold: 3/30


 10%|█         | 3/30 [02:23<21:31, 47.85s/it]

Fold 3 R2 Score: 0.77513, MSE: 0.09882, MAE: 0.06959
Cross-validation fold: 4/30


 13%|█▎        | 4/30 [03:11<20:42, 47.78s/it]

Fold 4 R2 Score: 0.80687, MSE: 0.08852, MAE: 0.06396
Cross-validation fold: 5/30


 17%|█▋        | 5/30 [03:59<19:55, 47.81s/it]

Fold 5 R2 Score: 0.79561, MSE: 0.09464, MAE: 0.06713
Cross-validation fold: 6/30


 20%|██        | 6/30 [04:48<19:16, 48.17s/it]

Fold 6 R2 Score: 0.80424, MSE: 0.08883, MAE: 0.06257
Cross-validation fold: 7/30


 23%|██▎       | 7/30 [05:36<18:27, 48.16s/it]

Fold 7 R2 Score: 0.79546, MSE: 0.09922, MAE: 0.06863
Cross-validation fold: 8/30


 27%|██▋       | 8/30 [06:23<17:33, 47.91s/it]

Fold 8 R2 Score: 0.78344, MSE: 0.09686, MAE: 0.06893
Cross-validation fold: 9/30


 30%|███       | 9/30 [07:10<16:38, 47.57s/it]

Fold 9 R2 Score: 0.83077, MSE: 0.08417, MAE: 0.06045
Cross-validation fold: 10/30


 33%|███▎      | 10/30 [07:57<15:48, 47.40s/it]

Fold 10 R2 Score: 0.78896, MSE: 0.09605, MAE: 0.06637
Cross-validation fold: 11/30


 37%|███▋      | 11/30 [08:45<15:05, 47.65s/it]

Fold 11 R2 Score: 0.80947, MSE: 0.09215, MAE: 0.06455
Cross-validation fold: 12/30


 40%|████      | 12/30 [09:33<14:19, 47.73s/it]

Fold 12 R2 Score: 0.77804, MSE: 0.09473, MAE: 0.06427
Cross-validation fold: 13/30


 43%|████▎     | 13/30 [10:20<13:25, 47.37s/it]

Fold 13 R2 Score: 0.77854, MSE: 0.10037, MAE: 0.06921
Cross-validation fold: 14/30


 47%|████▋     | 14/30 [11:06<12:34, 47.18s/it]

Fold 14 R2 Score: 0.78166, MSE: 0.09644, MAE: 0.06908
Cross-validation fold: 15/30


 50%|█████     | 15/30 [11:53<11:45, 47.04s/it]

Fold 15 R2 Score: 0.80979, MSE: 0.08918, MAE: 0.06233
Cross-validation fold: 16/30


 53%|█████▎    | 16/30 [12:41<11:00, 47.16s/it]

Fold 16 R2 Score: 0.79808, MSE: 0.09478, MAE: 0.06795
Cross-validation fold: 17/30


 57%|█████▋    | 17/30 [13:28<10:12, 47.14s/it]

Fold 17 R2 Score: 0.80957, MSE: 0.09031, MAE: 0.06329
Cross-validation fold: 18/30


 60%|██████    | 18/30 [14:14<09:23, 46.96s/it]

Fold 18 R2 Score: 0.81517, MSE: 0.09165, MAE: 0.06459
Cross-validation fold: 19/30


 63%|██████▎   | 19/30 [15:02<08:39, 47.22s/it]

Fold 19 R2 Score: 0.79805, MSE: 0.09367, MAE: 0.06567
Cross-validation fold: 20/30


 67%|██████▋   | 20/30 [15:50<07:55, 47.51s/it]

Fold 20 R2 Score: 0.79051, MSE: 0.09111, MAE: 0.06496
Cross-validation fold: 21/30


 70%|███████   | 21/30 [16:38<07:09, 47.73s/it]

Fold 21 R2 Score: 0.79014, MSE: 0.09528, MAE: 0.06763
Cross-validation fold: 22/30


 73%|███████▎  | 22/30 [17:26<06:20, 47.55s/it]

Fold 22 R2 Score: 0.78163, MSE: 0.09317, MAE: 0.06393
Cross-validation fold: 23/30


 77%|███████▋  | 23/30 [18:14<05:34, 47.71s/it]

Fold 23 R2 Score: 0.80333, MSE: 0.09416, MAE: 0.06511
Cross-validation fold: 24/30


 80%|████████  | 24/30 [19:01<04:45, 47.53s/it]

Fold 24 R2 Score: 0.78294, MSE: 0.09104, MAE: 0.06461
Cross-validation fold: 25/30


 83%|████████▎ | 25/30 [19:48<03:57, 47.40s/it]

Fold 25 R2 Score: 0.79971, MSE: 0.09556, MAE: 0.06968
Cross-validation fold: 26/30


 87%|████████▋ | 26/30 [20:35<03:08, 47.22s/it]

Fold 26 R2 Score: 0.82944, MSE: 0.08607, MAE: 0.06093
Cross-validation fold: 27/30


 90%|█████████ | 27/30 [21:21<02:20, 46.95s/it]

Fold 27 R2 Score: 0.78954, MSE: 0.10136, MAE: 0.07032
Cross-validation fold: 28/30


 93%|█████████▎| 28/30 [22:08<01:34, 47.01s/it]

Fold 28 R2 Score: 0.80058, MSE: 0.09674, MAE: 0.06857
Cross-validation fold: 29/30


 97%|█████████▋| 29/30 [22:56<00:47, 47.14s/it]

Fold 29 R2 Score: 0.80497, MSE: 0.09056, MAE: 0.06420
Cross-validation fold: 30/30


100%|██████████| 30/30 [23:43<00:00, 47.43s/it]


Fold 30 R2 Score: 0.78768, MSE: 0.09728, MAE: 0.06784
R2 Score: 0.79723 ± 0.01369
Cap Avg R2 Score: 0.79723 ± 0.01369
MAE: 6.608% ± 0.274%
RMSE: 9.381% ± 0.405%


In [14]:
# 30 random 80/20 splits with other models: chemically informed; scale each split individually instead of train / test at once

models = [LGBMRegressor(
                 learning_rate=0.01, # 0.01
                 n_estimators=1024, # 1024
                 reg_alpha=0.01, # 0.01
                 reg_lambda=0.01, # 0.01
                 verbose=-1,
                 n_jobs=-1,
                 random_state=rand_state),

          RandomForestRegressor(
                 verbose=False,
                 n_jobs=-1,
                 n_estimators=1024,
                 min_samples_split=2,
                 random_state=rand_state),

          SVR(
                 verbose=False,
                 kernel='rbf',
                 C=0.25,
                 epsilon=0.1,
                 gamma='scale')]

model_names = ['LGBM', 'RF', 'SVM']

cols_to_use = struct_ohe_cols + feats['SHAP'] + target_cols + ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']
df_ = df_ssg[cols_to_use].copy()
df_ = df_[df_['Set']=='Train'].copy().drop('Set', axis=1)

# Per-model metric arrays and pooled test-set predictions over all splits
results = {}
predictions_dict = {}

# zip() has no length, so pass `total` to get a real progress bar for the outer loop
for name, model in tqdm(zip(model_names, models), total=len(models)):
    predictions_dict[name] = {'Index': [], 'True': [], 'Predicted': []}

    r2_list = []
    mae_list = []
    rmse_list = []
    iter_list = []

    for i in tqdm(range(0, 30)):
        iter_list.append(i)

        # Split data (the split index doubles as the random seed)
        df_train, df_test = train_test_split(df_, random_state=i, train_size=0.8)

        # Save test indices
        test_indices = df_test.index.tolist()

        X_train, X_test = df_train.drop(columns=['RAW-MonoYield (%)', 'sulfonamide']).values, df_test.drop(columns=['RAW-MonoYield (%)', 'sulfonamide']).values
        y_train, y_test = np.asarray(df_train['RAW-MonoYield (%)'].tolist(), dtype=np.float64), np.asarray(df_test['RAW-MonoYield (%)'].tolist(), dtype=np.float64)

        ## Scale due to having QM descriptors
        # the scaler is fit on the training split only and then applied to both splits
        scaler = MinMaxScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        # Store test indices, true and predicted values
        predictions_dict[name]['Index'].extend(test_indices)
        predictions_dict[name]['True'].extend(y_test)
        predictions_dict[name]['Predicted'].extend(y_pred)

        # Calculate metrics
        R2, MAE, RMSE = get_reg_metrics(y_true=y_test, y_pred=y_pred)

        r2_list.append(R2)
        mae_list.append(MAE)
        rmse_list.append(RMSE)

    # Convert the per-split metrics once per model, after all splits have run
    r2_array = np.asarray(r2_list, dtype=np.float64)
    mae_array = np.asarray(mae_list, dtype=np.float64)
    rmse_array = np.asarray(rmse_list, dtype=np.float64)

    results[name] = {'CV_Iter': iter_list, 'R2': r2_array, 'MAE': mae_array, 'RMSE': rmse_array}

# Save true vs. predicted values as DataFrames
# (`mlp_preds` is only a reused variable name here; it holds the predictions of the model `name`)
for name in model_names:
    df_pred = pd.DataFrame(predictions_dict[name])
    mlp_preds = pd.merge(left=df['RAW-MonoYield (%)'], right=df_pred.set_index('Index'), left_index=True, right_index=True)
    to_save = pd.merge(left=mlp_preds, right=df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
    to_save.to_csv(f'/content/drive/MyDrive/MLP_final/{name}_preds_random.csv')

0it [00:00, ?it/s]






























100%|██████████| 30/30 [00:45<00:00,  1.51s/it]
1it [00:45, 45.29s/it]






























100%|██████████| 30/30 [11:36<00:00, 23.21s/it]
2it [12:21, 428.21s/it]






























100%|██████████| 30/30 [00:10<00:00,  2.88it/s]
3it [12:31, 250.66s/it]


MLP baseline models

In [15]:
# X-scrambled baseline model
# Same columns and hyperparameters as the chemically informed MLP; the only
# difference is scramble_X=True (presumably shuffles the feature rows
# relative to the targets - confirm in ds_packg.method_validation_final).
extra_cols = ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']
cols_to_use = struct_ohe_cols + feats['SHAP'] + target_cols + extra_cols

# keep only the 'Train' rows and discard the split label
df_ = df_ssg.loc[df_ssg['Set'] == 'Train', cols_to_use].drop(columns='Set')

#Initialize model class
# inputs = all columns minus the target(s) and minus 'sulfonamide' (dropped below)
input_size = df_.shape[1] - len(target_cols) - 1
MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize CV
CV = CrossVal(df_.drop(columns=['sulfonamide']), MC, batch_size, rand_state=rand_state, n_cv=30, scramble_X=True)

#Run training loop with CV
CV.execute_loop(target_cols, output_size)
CV.print_performance()

# attach the original yield and the reaction components to every prediction (joined on row index)
mlp_preds = df[['RAW-MonoYield (%)']].merge(CV.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = mlp_preds.merge(df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_Xscrambled.csv')

  0%|          | 0/30 [00:00<?, ?it/s]

Cross-validation fold: 1/30


  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
  3%|▎         | 1/30 [00:47<23:08, 47.89s/it]

Fold 1 R2 Score: -0.33436, MSE: 0.24334, MAE: 0.18363
Cross-validation fold: 2/30


  7%|▋         | 2/30 [01:34<22:00, 47.16s/it]

Fold 2 R2 Score: -0.48475, MSE: 0.26358, MAE: 0.20112
Cross-validation fold: 3/30


 10%|█         | 3/30 [02:22<21:18, 47.34s/it]

Fold 3 R2 Score: -0.37172, MSE: 0.24407, MAE: 0.18734
Cross-validation fold: 4/30


 13%|█▎        | 4/30 [03:09<20:27, 47.23s/it]

Fold 4 R2 Score: -0.34900, MSE: 0.23396, MAE: 0.18136
Cross-validation fold: 5/30


 17%|█▋        | 5/30 [03:55<19:36, 47.05s/it]

Fold 5 R2 Score: -0.41623, MSE: 0.24912, MAE: 0.19199
Cross-validation fold: 6/30


 20%|██        | 6/30 [04:41<18:37, 46.55s/it]

Fold 6 R2 Score: -0.36345, MSE: 0.23444, MAE: 0.17928
Cross-validation fold: 7/30


 23%|██▎       | 7/30 [05:27<17:43, 46.22s/it]

Fold 7 R2 Score: -0.44647, MSE: 0.26387, MAE: 0.20158
Cross-validation fold: 8/30


 27%|██▋       | 8/30 [06:14<17:04, 46.57s/it]

Fold 8 R2 Score: -0.45780, MSE: 0.25131, MAE: 0.19328
Cross-validation fold: 9/30


 30%|███       | 9/30 [07:00<16:16, 46.52s/it]

Fold 9 R2 Score: -0.24481, MSE: 0.22829, MAE: 0.17424
Cross-validation fold: 10/30


 33%|███▎      | 10/30 [07:46<15:25, 46.27s/it]

Fold 10 R2 Score: -0.40790, MSE: 0.24809, MAE: 0.18973
Cross-validation fold: 11/30


 37%|███▋      | 11/30 [08:31<14:33, 45.97s/it]

Fold 11 R2 Score: -0.39245, MSE: 0.24910, MAE: 0.18936
Cross-validation fold: 12/30


 40%|████      | 12/30 [09:17<13:45, 45.88s/it]

Fold 12 R2 Score: -0.38910, MSE: 0.23698, MAE: 0.18222
Cross-validation fold: 13/30


 43%|████▎     | 13/30 [10:02<12:57, 45.73s/it]

Fold 13 R2 Score: -0.34846, MSE: 0.24767, MAE: 0.18580
Cross-validation fold: 14/30


 47%|████▋     | 14/30 [10:49<12:14, 45.90s/it]

Fold 14 R2 Score: -0.43893, MSE: 0.24756, MAE: 0.19097
Cross-validation fold: 15/30


 50%|█████     | 15/30 [11:35<11:29, 45.96s/it]

Fold 15 R2 Score: -0.33805, MSE: 0.23654, MAE: 0.18409
Cross-validation fold: 16/30


 53%|█████▎    | 16/30 [12:20<10:42, 45.86s/it]

Fold 16 R2 Score: -0.39476, MSE: 0.24910, MAE: 0.19403
Cross-validation fold: 17/30


 57%|█████▋    | 17/30 [13:07<10:00, 46.21s/it]

Fold 17 R2 Score: -0.31636, MSE: 0.23744, MAE: 0.18368
Cross-validation fold: 18/30


 60%|██████    | 18/30 [13:54<09:17, 46.47s/it]

Fold 18 R2 Score: -0.35028, MSE: 0.24771, MAE: 0.18840
Cross-validation fold: 19/30


 63%|██████▎   | 19/30 [14:40<08:29, 46.34s/it]

Fold 19 R2 Score: -0.33945, MSE: 0.24124, MAE: 0.18480
Cross-validation fold: 20/30


 67%|██████▋   | 20/30 [15:27<07:42, 46.28s/it]

Fold 20 R2 Score: -0.32491, MSE: 0.22913, MAE: 0.17759
Cross-validation fold: 21/30


 70%|███████   | 21/30 [16:13<06:56, 46.23s/it]

Fold 21 R2 Score: -0.47686, MSE: 0.25276, MAE: 0.19313
Cross-validation fold: 22/30


 73%|███████▎  | 22/30 [16:59<06:08, 46.11s/it]

Fold 22 R2 Score: -0.42471, MSE: 0.23799, MAE: 0.18041
Cross-validation fold: 23/30


 77%|███████▋  | 23/30 [17:46<05:25, 46.52s/it]

Fold 23 R2 Score: -0.37050, MSE: 0.24856, MAE: 0.18521
Cross-validation fold: 24/30


 80%|████████  | 24/30 [18:32<04:39, 46.51s/it]

Fold 24 R2 Score: -0.33562, MSE: 0.22584, MAE: 0.17965
Cross-validation fold: 25/30


 83%|████████▎ | 25/30 [19:18<03:51, 46.21s/it]

Fold 25 R2 Score: -0.32579, MSE: 0.24586, MAE: 0.18796
Cross-validation fold: 26/30


 87%|████████▋ | 26/30 [20:04<03:04, 46.10s/it]

Fold 26 R2 Score: -0.31594, MSE: 0.23907, MAE: 0.18509
Cross-validation fold: 27/30


 90%|█████████ | 27/30 [20:49<02:17, 45.78s/it]

Fold 27 R2 Score: -0.29624, MSE: 0.25155, MAE: 0.18906
Cross-validation fold: 28/30


 93%|█████████▎| 28/30 [21:34<01:31, 45.65s/it]

Fold 28 R2 Score: -0.34682, MSE: 0.25140, MAE: 0.19292
Cross-validation fold: 29/30


 97%|█████████▋| 29/30 [22:20<00:45, 45.73s/it]

Fold 29 R2 Score: -0.37892, MSE: 0.24080, MAE: 0.18587
Cross-validation fold: 30/30


100%|██████████| 30/30 [23:07<00:00, 46.27s/it]


Fold 30 R2 Score: -0.25314, MSE: 0.23634, MAE: 0.17857
R2 Score: -0.36779 ± 0.05823
Cap Avg R2 Score: 0.00000 ± 0.00000
MAE: 18.674% ± 0.636%
RMSE: 24.376% ± 0.904%


In [16]:
# y-scrambled baseline model
# Same columns and hyperparameters as the chemically informed MLP; the only
# difference is scramble_y=True (presumably shuffles the targets relative to
# the features - confirm in ds_packg.method_validation_final).
extra_cols = ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']
cols_to_use = struct_ohe_cols + feats['SHAP'] + target_cols + extra_cols

# keep only the 'Train' rows and discard the split label
df_ = df_ssg.loc[df_ssg['Set'] == 'Train', cols_to_use].drop(columns='Set')

#Initialize model class
# inputs = all columns minus the target(s) and minus 'sulfonamide' (dropped below)
input_size = df_.shape[1] - len(target_cols) - 1
MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize CV
CV = CrossVal(df_.drop(columns=['sulfonamide']), MC, batch_size, rand_state=rand_state, n_cv=30, scramble_y=True)

#Run training loop with CV
CV.execute_loop(target_cols, output_size)
CV.print_performance()

# attach the original yield and the reaction components to every prediction (joined on row index)
mlp_preds = df[['RAW-MonoYield (%)']].merge(CV.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = mlp_preds.merge(df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_yscrambled.csv')

  0%|          | 0/30 [00:00<?, ?it/s]

Cross-validation fold: 1/30


  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
  3%|▎         | 1/30 [00:48<23:34, 48.79s/it]

Fold 1 R2 Score: -0.35324, MSE: 0.24506, MAE: 0.18915
Cross-validation fold: 2/30


  7%|▋         | 2/30 [01:34<21:57, 47.04s/it]

Fold 2 R2 Score: -0.32208, MSE: 0.24873, MAE: 0.18889
Cross-validation fold: 3/30


 10%|█         | 3/30 [02:20<20:52, 46.37s/it]

Fold 3 R2 Score: -0.29518, MSE: 0.23716, MAE: 0.18262
Cross-validation fold: 4/30


 13%|█▎        | 4/30 [03:06<20:00, 46.17s/it]

Fold 4 R2 Score: -0.38362, MSE: 0.23695, MAE: 0.18246
Cross-validation fold: 5/30


 17%|█▋        | 5/30 [03:52<19:21, 46.44s/it]

Fold 5 R2 Score: -0.33015, MSE: 0.24143, MAE: 0.18485
Cross-validation fold: 6/30


 20%|██        | 6/30 [04:40<18:41, 46.75s/it]

Fold 6 R2 Score: -0.40305, MSE: 0.23782, MAE: 0.18280
Cross-validation fold: 7/30


 23%|██▎       | 7/30 [05:25<17:47, 46.39s/it]

Fold 7 R2 Score: -0.31438, MSE: 0.25153, MAE: 0.19049
Cross-validation fold: 8/30


 27%|██▋       | 8/30 [06:12<17:03, 46.51s/it]

Fold 8 R2 Score: -0.46327, MSE: 0.25178, MAE: 0.19229
Cross-validation fold: 9/30


 30%|███       | 9/30 [06:59<16:18, 46.61s/it]

Fold 9 R2 Score: -0.22362, MSE: 0.22634, MAE: 0.17455
Cross-validation fold: 10/30


 33%|███▎      | 10/30 [07:45<15:29, 46.50s/it]

Fold 10 R2 Score: -0.35273, MSE: 0.24318, MAE: 0.18865
Cross-validation fold: 11/30


 37%|███▋      | 11/30 [08:33<14:50, 46.89s/it]

Fold 11 R2 Score: -0.43460, MSE: 0.25285, MAE: 0.19130
Cross-validation fold: 12/30


 40%|████      | 12/30 [09:20<14:03, 46.85s/it]

Fold 12 R2 Score: -0.38115, MSE: 0.23630, MAE: 0.17989
Cross-validation fold: 13/30


 43%|████▎     | 13/30 [10:05<13:08, 46.41s/it]

Fold 13 R2 Score: -0.40247, MSE: 0.25258, MAE: 0.19017
Cross-validation fold: 14/30


 47%|████▋     | 14/30 [10:51<12:21, 46.32s/it]

Fold 14 R2 Score: -0.40499, MSE: 0.24463, MAE: 0.18738
Cross-validation fold: 15/30


 50%|█████     | 15/30 [11:37<11:33, 46.23s/it]

Fold 15 R2 Score: -0.27014, MSE: 0.23045, MAE: 0.17542
Cross-validation fold: 16/30


 53%|█████▎    | 16/30 [12:23<10:45, 46.11s/it]

Fold 16 R2 Score: -0.41076, MSE: 0.25052, MAE: 0.19413
Cross-validation fold: 17/30


 57%|█████▋    | 17/30 [13:12<10:08, 46.81s/it]

Fold 17 R2 Score: -0.29118, MSE: 0.23516, MAE: 0.18301
Cross-validation fold: 18/30


 60%|██████    | 18/30 [13:58<09:21, 46.83s/it]

Fold 18 R2 Score: -0.27603, MSE: 0.24080, MAE: 0.18058
Cross-validation fold: 19/30


 63%|██████▎   | 19/30 [14:45<08:33, 46.66s/it]

Fold 19 R2 Score: -0.32856, MSE: 0.24026, MAE: 0.18702
Cross-validation fold: 20/30


 67%|██████▋   | 20/30 [15:31<07:44, 46.43s/it]

Fold 20 R2 Score: -0.33463, MSE: 0.22997, MAE: 0.17804
Cross-validation fold: 21/30


 70%|███████   | 21/30 [16:17<06:57, 46.34s/it]

Fold 21 R2 Score: -0.46961, MSE: 0.25214, MAE: 0.19336
Cross-validation fold: 22/30


 73%|███████▎  | 22/30 [17:04<06:12, 46.51s/it]

Fold 22 R2 Score: -0.35088, MSE: 0.23174, MAE: 0.17782
Cross-validation fold: 23/30


 77%|███████▋  | 23/30 [17:52<05:28, 46.90s/it]

Fold 23 R2 Score: -0.24567, MSE: 0.23697, MAE: 0.18186
Cross-validation fold: 24/30


 80%|████████  | 24/30 [18:38<04:40, 46.81s/it]

Fold 24 R2 Score: -0.38115, MSE: 0.22966, MAE: 0.17965
Cross-validation fold: 25/30


 83%|████████▎ | 25/30 [19:24<03:53, 46.68s/it]

Fold 25 R2 Score: -0.35178, MSE: 0.24826, MAE: 0.18946
Cross-validation fold: 26/30


 87%|████████▋ | 26/30 [20:11<03:06, 46.62s/it]

Fold 26 R2 Score: -0.29633, MSE: 0.23729, MAE: 0.18377
Cross-validation fold: 27/30


 90%|█████████ | 27/30 [20:57<02:19, 46.34s/it]

Fold 27 R2 Score: -0.40302, MSE: 0.26171, MAE: 0.19813
Cross-validation fold: 28/30


 93%|█████████▎| 28/30 [21:43<01:32, 46.35s/it]

Fold 28 R2 Score: -0.32651, MSE: 0.24950, MAE: 0.18860
Cross-validation fold: 29/30


 97%|█████████▋| 29/30 [22:29<00:46, 46.34s/it]

Fold 29 R2 Score: -0.33855, MSE: 0.23725, MAE: 0.18099
Cross-validation fold: 30/30


100%|██████████| 30/30 [23:16<00:00, 46.55s/it]


Fold 30 R2 Score: -0.26093, MSE: 0.23708, MAE: 0.18378
R2 Score: -0.34668 ± 0.06109
Cap Avg R2 Score: 0.00000 ± 0.00000
MAE: 18.537% ± 0.573%
RMSE: 24.184% ± 0.849%


In [17]:
# ohe baseline model
# Only the one-hot structure columns are used as inputs (no SHAP-selected or
# extra descriptor columns); 'Set' and 'sulfonamide' are bookkeeping columns.
bookkeeping_cols = ['Set', 'sulfonamide']
cols_to_use = struct_ohe_cols + target_cols + bookkeeping_cols

# keep only the 'Train' rows and discard the split label
df_ = df_ssg.loc[df_ssg['Set'] == 'Train', cols_to_use].drop(columns='Set')

#Initialize model class
# inputs = all columns minus the target(s) and minus 'sulfonamide' (dropped below)
input_size = df_.shape[1] - len(target_cols) - 1
MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize CV
CV = CrossVal(df_.drop(columns=['sulfonamide']), MC, batch_size, rand_state=rand_state, n_cv=30)

#Run training loop with CV
CV.execute_loop(target_cols, output_size)
CV.print_performance()

# attach the original yield and the reaction components to every prediction (joined on row index)
mlp_preds = df[['RAW-MonoYield (%)']].merge(CV.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = mlp_preds.merge(df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_OHE.csv')

  0%|          | 0/30 [00:00<?, ?it/s]

Cross-validation fold: 1/30


  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
  3%|▎         | 1/30 [00:45<22:10, 45.86s/it]

Fold 1 R2 Score: 0.79203, MSE: 0.09607, MAE: 0.06903
Cross-validation fold: 2/30


  7%|▋         | 2/30 [01:32<21:31, 46.14s/it]

Fold 2 R2 Score: 0.78108, MSE: 0.10121, MAE: 0.07268
Cross-validation fold: 3/30


 10%|█         | 3/30 [02:17<20:35, 45.77s/it]

Fold 3 R2 Score: 0.77125, MSE: 0.09967, MAE: 0.07105
Cross-validation fold: 4/30


 13%|█▎        | 4/30 [03:02<19:46, 45.65s/it]

Fold 4 R2 Score: 0.79053, MSE: 0.09219, MAE: 0.06626
Cross-validation fold: 5/30


 17%|█▋        | 5/30 [03:48<18:58, 45.54s/it]

Fold 5 R2 Score: 0.78314, MSE: 0.09748, MAE: 0.07011
Cross-validation fold: 6/30


 20%|██        | 6/30 [04:34<18:15, 45.63s/it]

Fold 6 R2 Score: 0.78266, MSE: 0.09360, MAE: 0.06596
Cross-validation fold: 7/30


 23%|██▎       | 7/30 [05:19<17:25, 45.45s/it]

Fold 7 R2 Score: 0.77707, MSE: 0.10359, MAE: 0.07223
Cross-validation fold: 8/30


 27%|██▋       | 8/30 [06:05<16:42, 45.57s/it]

Fold 8 R2 Score: 0.77846, MSE: 0.09797, MAE: 0.06987
Cross-validation fold: 9/30


 30%|███       | 9/30 [06:50<15:54, 45.47s/it]

Fold 9 R2 Score: 0.81691, MSE: 0.08755, MAE: 0.06184
Cross-validation fold: 10/30


 33%|███▎      | 10/30 [07:35<15:07, 45.39s/it]

Fold 10 R2 Score: 0.76968, MSE: 0.10034, MAE: 0.06985
Cross-validation fold: 11/30


 37%|███▋      | 11/30 [08:21<14:23, 45.44s/it]

Fold 11 R2 Score: 0.79331, MSE: 0.09597, MAE: 0.06815
Cross-validation fold: 12/30


 40%|████      | 12/30 [09:06<13:36, 45.37s/it]

Fold 12 R2 Score: 0.77194, MSE: 0.09602, MAE: 0.06604
Cross-validation fold: 13/30


 43%|████▎     | 13/30 [09:51<12:50, 45.34s/it]

Fold 13 R2 Score: 0.75506, MSE: 0.10556, MAE: 0.07280
Cross-validation fold: 14/30


 47%|████▋     | 14/30 [10:36<12:04, 45.30s/it]

Fold 14 R2 Score: 0.77716, MSE: 0.09742, MAE: 0.07065
Cross-validation fold: 15/30


 50%|█████     | 15/30 [11:23<11:26, 45.76s/it]

Fold 15 R2 Score: 0.79592, MSE: 0.09238, MAE: 0.06543
Cross-validation fold: 16/30


 53%|█████▎    | 16/30 [12:09<10:40, 45.76s/it]

Fold 16 R2 Score: 0.79050, MSE: 0.09654, MAE: 0.06922
Cross-validation fold: 17/30


 57%|█████▋    | 17/30 [12:55<09:55, 45.80s/it]

Fold 17 R2 Score: 0.80192, MSE: 0.09210, MAE: 0.06468
Cross-validation fold: 18/30


 60%|██████    | 18/30 [13:39<09:05, 45.47s/it]

Fold 18 R2 Score: 0.80727, MSE: 0.09358, MAE: 0.06646
Cross-validation fold: 19/30


 63%|██████▎   | 19/30 [14:25<08:19, 45.43s/it]

Fold 19 R2 Score: 0.79034, MSE: 0.09544, MAE: 0.06626
Cross-validation fold: 20/30


 67%|██████▋   | 20/30 [15:10<07:33, 45.36s/it]

Fold 20 R2 Score: 0.77131, MSE: 0.09520, MAE: 0.06755
Cross-validation fold: 21/30


 70%|███████   | 21/30 [15:55<06:47, 45.26s/it]

Fold 21 R2 Score: 0.77207, MSE: 0.09930, MAE: 0.06986
Cross-validation fold: 22/30


 73%|███████▎  | 22/30 [16:41<06:03, 45.41s/it]

Fold 22 R2 Score: 0.76548, MSE: 0.09656, MAE: 0.06652
Cross-validation fold: 23/30


 77%|███████▋  | 23/30 [17:27<05:19, 45.63s/it]

Fold 23 R2 Score: 0.78183, MSE: 0.09917, MAE: 0.06928
Cross-validation fold: 24/30


 80%|████████  | 24/30 [18:13<04:33, 45.64s/it]

Fold 24 R2 Score: 0.77907, MSE: 0.09185, MAE: 0.06473
Cross-validation fold: 25/30


 83%|████████▎ | 25/30 [18:58<03:47, 45.60s/it]

Fold 25 R2 Score: 0.79173, MSE: 0.09745, MAE: 0.06997
Cross-validation fold: 26/30


 87%|████████▋ | 26/30 [19:44<03:02, 45.58s/it]

Fold 26 R2 Score: 0.81935, MSE: 0.08858, MAE: 0.06327
Cross-validation fold: 27/30


 90%|█████████ | 27/30 [20:29<02:16, 45.49s/it]

Fold 27 R2 Score: 0.77511, MSE: 0.10478, MAE: 0.07208
Cross-validation fold: 28/30


 93%|█████████▎| 28/30 [21:14<01:31, 45.52s/it]

Fold 28 R2 Score: 0.79507, MSE: 0.09807, MAE: 0.07016
Cross-validation fold: 29/30


 97%|█████████▋| 29/30 [22:00<00:45, 45.43s/it]

Fold 29 R2 Score: 0.78663, MSE: 0.09472, MAE: 0.06681
Cross-validation fold: 30/30


100%|██████████| 30/30 [22:45<00:00, 45.50s/it]


Fold 30 R2 Score: 0.75572, MSE: 0.10435, MAE: 0.07234
R2 Score: 0.78399 ± 0.01510
Cap Avg R2 Score: 0.78399 ± 0.01510
MAE: 6.837% ± 0.287%
RMSE: 9.682% ± 0.436%


Leave-one-out cross validation

In [18]:
# chemically informed model for LOOCV

cols_to_use = struct_ohe_cols + feats['SHAP'] + target_cols + ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']

df_ = df_ssg[cols_to_use].copy()
df_ = df_[df_['Set']=='Train'].copy().drop('Set', axis=1)

output_size = 1
# inputs = all columns minus the target(s) and minus the 'sulfonamide' grouping column
input_size = len(df_.columns) - len(target_cols) - 1

MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize Leave-N-Out Validation
# 'sulfonamide' stays in df_ here because it is passed as the leave-out column
LNO = LeaveNout(df_, "sulfonamide", MC, batch_size, rand_state=rand_state)

#Run training loop with Leave-N-Out Validation
LNO.execute_loop(target_cols, output_size)

# Build the per-fold table column by column. Stacking the SMILES strings and
# the float metrics into one NumPy array (as before) coerces every metric to
# a string, so R2/RMSE/MAE could not be used numerically afterwards.
# NOTE(review): LNO.mses is stored under 'RMSE', as in the original cell -
# confirm that the attribute really holds RMSE values.
feat_mlp = pd.DataFrame({
    'LO-SMILES': flatten_extend(LNO.test_list),
    'R2': np.asarray(LNO.r2_scores, dtype=np.float64),
    'RMSE': np.asarray(LNO.mses, dtype=np.float64),
    'MAE': np.asarray(LNO.maes, dtype=np.float64),
})

# Same columns as before ('FeatureSet' is left unfilled), without concatenating
# onto an empty frame, which raises a pandas FutureWarning.
feat_results = feat_mlp.reindex(columns=['LO-SMILES','R2','RMSE','MAE']+['FeatureSet'])

LNO.print_performance()

mlp_preds = pd.merge(left=df['RAW-MonoYield (%)'], right=LNO.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = pd.merge(left=mlp_preds, right=df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_loocv.csv')

  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
100%|██████████| 22/22 [19:58<00:00, 54.47s/it]

R2 Score: 0.25859 ± 0.68745
Cap Avg R2 Score: 0.42141 ± 0.28343
MAE: 11.116% ± 5.631%
RMSE: 14.314% ± 6.745%





In [12]:
# One-hot-encoded (OHE) baseline model, leave-one-out cross-validation (LOOCV).
#
# Relies on names defined in earlier cells: df_ssg, df, struct_ohe_cols,
# target_cols, flatten_extend, rand_state, batch_size and the MLP hyperparameters
# (hidden_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg).
# NOTE(review): the hyperparameters are not unpacked in this cell, so it uses
# whatever values the kernel currently holds -- confirm they are the intended ones.

# Baseline uses only the one-hot structure columns as inputs; 'Set' and
# 'sulfonamide' are bookkeeping columns.
cols_to_use = struct_ohe_cols + target_cols + ['Set', 'sulfonamide']

# Keep only the training-set rows; 'Set' is dropped once it has been used as a filter.
df_ = df_ssg[cols_to_use].copy()
df_ = df_[df_['Set']=='Train'].copy().drop('Set', axis=1)

output_size = 1
# Everything except the target column(s) and the 'sulfonamide' grouping column is a model input.
input_size = len(df_.columns) - len(target_cols) - 1

MC = ModelClass("MLP", input_size, hidden_size, output_size, drop_out_rate, step_schedule, num_epochs, l1_reg, l2_reg, rand_state)

#Initialize Leave-N-Out Validation (folds are formed from the "sulfonamide" column)
LNO = LeaveNout(df_, "sulfonamide", MC, batch_size, rand_state=rand_state)

#Run training loop with Leave-N-Out Validation
LNO.execute_loop(target_cols, output_size)

# Per-fold results table. It is built column-wise on purpose: np.stack over a
# list of strings plus lists of floats promotes the whole array to a string
# dtype, which silently stored R2/RMSE/MAE as text.
# NOTE(review): the 'RMSE' column is filled from LNO.mses -- confirm that
# attribute really holds RMSE values rather than MSE.
feat_mlp = pd.DataFrame({
    'LO-SMILES': flatten_extend(LNO.test_list),
    'R2': np.asarray(LNO.r2_scores, dtype=float),
    'RMSE': np.asarray(LNO.mses, dtype=float),
    'MAE': np.asarray(LNO.maes, dtype=float),
})

# Same schema as before (metrics + an unfilled 'FeatureSet' column), obtained
# without concatenating onto an empty frame, which newer pandas warns about and
# which can degrade the numeric columns to object dtype.
feat_results = feat_mlp.reindex(columns=['LO-SMILES','R2','RMSE','MAE']+['FeatureSet'])

LNO.print_performance()

# Attach the measured yield and the reaction components to each prediction
# (index-on-index merges), then persist the table to Drive.
mlp_preds = pd.merge(left=df['RAW-MonoYield (%)'], right=LNO.predictions_df.set_index('Index'), left_index=True, right_index=True)
to_save = pd.merge(left=mlp_preds, right=df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']], left_index=True, right_index=True)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_ohe_loocv.csv')

  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
100%|██████████| 22/22 [20:08<00:00, 54.91s/it]

R2 Score: 0.12281 ± 0.84913
Cap Avg R2 Score: 0.38132 ± 0.26369
MAE: 12.025% ± 6.142%
RMSE: 15.309% ± 7.053%





External validation

In [19]:
# External validation with the chemically informed feature set:
# rows are split on the 'Set' column and the fold whose held-out group is
# 'Validation' is the one reported.

# Column selection: OHE structure columns, SHAP-selected features, targets,
# the two bookkeeping columns and three extra descriptor columns.
cols_to_use = (
    struct_ohe_cols
    + feats['SHAP']
    + target_cols
    + ['Set', 'sulfonamide', 'SO2N-dG_deprotonation_gas', 'C_Hirschfeld', 'N_Hirschfeld']
)
df_ = df_ssg[cols_to_use].copy()

# Containers for the reported metrics (metrics_ is not filled in this cell).
metrics_ = []
r2_list, rmse_list, mae_list = [], [], []

# Hyperparameters come from hyperparams_single in insertion order;
# output_size from the dict is deliberately overridden with 1 right after.
(
    hidden_size,
    drop_out_rate,
    step_schedule,
    num_epochs,
    batch_size,
    output_size,
    l1_reg,
    l2_reg,
) = hyperparams_single.values()
output_size = 1

# Train rows: re-indexed, shuffled reproducibly, shuffled position kept as 'original_index'.
df_train = df_.loc[df_['Set'] == 'Train'].reset_index(drop=True).copy()
df_train = df_train.sample(frac=1, random_state=rand_state)
df_train = df_train.reset_index().rename(columns={'index': 'original_index'})

# Validation rows: the index of df_ is kept as 'original_index'.
df_val = df_.loc[df_['Set'] == 'Validation'].reset_index().copy()
df_val = df_val.rename(columns={'index': 'original_index'})

df_external = pd.concat([df_train, df_val], ignore_index=True).drop(columns='sulfonamide')

# Inputs = all columns of df_external except the targets and the two
# non-feature columns it still carries ('Set' and 'original_index').
# NOTE(review): within this cell df_train/df_val/df_external are only used for
# this column count; the model below is fed df_ directly.
input_size = df_external.shape[1] - len(target_cols) - 2

# Model wrapper
MC = ModelClass(
    "MLP", input_size, hidden_size, output_size, drop_out_rate,
    step_schedule, num_epochs, l1_reg, l2_reg, rand_state=rand_state,
)

# Leave-N-out driver grouped on 'Set' (n = all external molecules as determined by 'Set')
LNO = LeaveNout(df_.drop(columns='sulfonamide'), 'Set', MC, batch_size, rand_state=rand_state)

# Train / evaluate
LNO.execute_loop(target_cols, output_size)

# Pick out the metrics of the fold whose held-out group is 'Validation'.
val_idx = flatten_extend(LNO.test_list).index('Validation')
r2_val = LNO.r2_scores[val_idx]
rmse_val = LNO.mses[val_idx]
mae_val = LNO.maes[val_idx]

r2_list.append(r2_val)
rmse_list.append(rmse_val)
mae_list.append(mae_val)

# Join predictions with the measured yield and the reaction components on the
# row index, then write the table to Drive.
# NOTE(review): predictions_df appears to hold every fold's predictions, not
# only the 'Validation' fold -- confirm downstream code filters as intended.
mlp_preds = pd.merge(
    df['RAW-MonoYield (%)'],
    LNO.predictions_df.set_index('Index'),
    left_index=True,
    right_index=True,
)
to_save = pd.merge(
    mlp_preds,
    df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']],
    left_index=True,
    right_index=True,
)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_external.csv')

  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
100%|██████████| 2/2 [00:57<00:00, 28.72s/it]


In [20]:
# External validation with the one-hot-encoded (OHE) baseline feature set:
# rows are split on the 'Set' column and the fold whose held-out group is
# 'Validation' is the one reported.

# Column selection: OHE structure columns, targets and the two bookkeeping columns.
cols_to_use = (
    struct_ohe_cols
    + target_cols
    + ['Set', 'sulfonamide']
)
df_ = df_ssg[cols_to_use].copy()

# Containers for the reported metrics (metrics_ is not filled in this cell).
metrics_ = []
r2_list, rmse_list, mae_list = [], [], []

# Hyperparameters come from hyperparams_single in insertion order;
# output_size from the dict is deliberately overridden with 1 right after.
(
    hidden_size,
    drop_out_rate,
    step_schedule,
    num_epochs,
    batch_size,
    output_size,
    l1_reg,
    l2_reg,
) = hyperparams_single.values()
output_size = 1

# Train rows: re-indexed, shuffled reproducibly, shuffled position kept as 'original_index'.
df_train = df_.loc[df_['Set'] == 'Train'].reset_index(drop=True).copy()
df_train = df_train.sample(frac=1, random_state=rand_state)
df_train = df_train.reset_index().rename(columns={'index': 'original_index'})

# Validation rows: the index of df_ is kept as 'original_index'.
df_val = df_.loc[df_['Set'] == 'Validation'].reset_index().copy()
df_val = df_val.rename(columns={'index': 'original_index'})

df_external = pd.concat([df_train, df_val], ignore_index=True).drop(columns='sulfonamide')

# Inputs = all columns of df_external except the targets and the two
# non-feature columns it still carries ('Set' and 'original_index').
# NOTE(review): within this cell df_train/df_val/df_external are only used for
# this column count; the model below is fed df_ directly.
input_size = df_external.shape[1] - len(target_cols) - 2

# Model wrapper
MC = ModelClass(
    "MLP", input_size, hidden_size, output_size, drop_out_rate,
    step_schedule, num_epochs, l1_reg, l2_reg, rand_state=rand_state,
)

# Leave-N-out driver grouped on 'Set' (n = all external molecules as determined by 'Set')
LNO = LeaveNout(df_.drop(columns='sulfonamide'), 'Set', MC, batch_size, rand_state=rand_state)

# Train / evaluate
LNO.execute_loop(target_cols, output_size)

# Pick out the metrics of the fold whose held-out group is 'Validation'.
val_idx = flatten_extend(LNO.test_list).index('Validation')
r2_val = LNO.r2_scores[val_idx]
rmse_val = LNO.mses[val_idx]
mae_val = LNO.maes[val_idx]

r2_list.append(r2_val)
rmse_list.append(rmse_val)
mae_list.append(mae_val)

# Join predictions with the measured yield and the reaction components on the
# row index, then write the table to Drive.
# NOTE(review): predictions_df appears to hold every fold's predictions, not
# only the 'Validation' fold -- confirm downstream code filters as intended.
mlp_preds = pd.merge(
    df['RAW-MonoYield (%)'],
    LNO.predictions_df.set_index('Index'),
    left_index=True,
    right_index=True,
)
to_save = pd.merge(
    mlp_preds,
    df[['sulfonamide', 'boronic_acid', 'catalyst', 'base', 'solvent']],
    left_index=True,
    right_index=True,
)
to_save.to_csv('/content/drive/MyDrive/MLP_final/mlp_preds_ohe_external.csv')

  self.predictions_df = pd.concat([self.predictions_df, fold_df], ignore_index=True)
100%|██████████| 2/2 [00:57<00:00, 28.59s/it]
