In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under that mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
! pip install arff

Collecting arff
  Downloading arff-0.9.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: arff
  Building wheel for arff (setup.py) ... [?25l[?25hdone
  Created wheel for arff: filename=arff-0.9-py3-none-any.whl size=4950 sha256=cb657b29b1acc99326201ad2159f1082921e30ff445ea8231253f14eb4c567f9
  Stored in directory: /root/.cache/pip/wheels/0c/39/12/4d8ff491018bce2dd9cc6034298f27399c7f4fd4063187f7be
Successfully built arff
Installing collected packages: arff
Successfully installed arff-0.9


In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import svm, tree
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor, plot_tree, export_graphviz
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_validate, cross_val_score, cross_val_predict, LeaveOneOut
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

import random
from datetime import datetime
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Data reading & preprocessing

In [None]:
# Solvent descriptor table from the 'solvent' worksheet, keyed by solvent name.
solvent_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/material_project/data/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name='solvent',
).set_index('solvent_name')
# Descriptor column names; the Reaction class iterates over them to copy values.
solvent_columns = solvent_data.columns.tolist()

In [None]:
# Nucleophile descriptor table from the 'nucleophiles' worksheet, keyed by nucleophile name.
nucleophile_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/material_project/data/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name='nucleophiles',
).set_index('nucleophile_name')
# Descriptor column names; the Reaction class iterates over them to copy values.
nucleophile_columns = nucleophile_data.columns.tolist()

In [None]:
# Catalyst descriptor table from the 'full_catalyst' worksheet. The header of the
# name column is renamed to 'catalyst_name'.
# NOTE(review): the next cell rebuilds raw_catalyst_data, r_/s_catalyst_data,
# catalyst_data and catalyst_columns from scratch, so this cell looks redundant.
raw_catalyst_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/material_project/data/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name='full_catalyst',
).rename(columns={'last catalyst only S used': 'catalyst_name'})

# Make one copy per enantiomer: in the first 17 rows, the 'R/S' placeholder in the
# second column is replaced by 'R' in one copy and by 'S' in the other.
enantiomer_tables = []
for enantiomer in ('R', 'S'):
    table = raw_catalyst_data.copy()
    for row in range(17):
        table.iloc[row, 1] = table.iloc[row, 1].replace('R/S', enantiomer)
    enantiomer_tables.append(table)
r_catalyst_data, s_catalyst_data = enantiomer_tables

# Stack both copies and key the result by catalyst name.
catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data]).set_index('catalyst_name')
catalyst_columns = catalyst_data.columns.tolist()

In [None]:
# Catalyst descriptor table from the 'full_catalyst' worksheet. The header of the
# name column is renamed to 'catalyst_name'.
raw_catalyst_data = pd.read_excel(
    '/content/drive/My Drive/Colab Notebooks/material_project/data/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
    sheet_name='full_catalyst',
).rename(columns={'last catalyst only S used': 'catalyst_name'})


def _resolve_enantiomer(table, enantiomer, n_rows=17, column=1):
    """Return a copy of `table` with 'R/S' replaced by `enantiomer`.

    Only the first `n_rows` rows of the column at position `column` are touched,
    matching the rows that carry the 'R/S' placeholder in the worksheet.
    """
    resolved = table.copy()
    placeholder_cells = resolved.iloc[:n_rows, column]
    # regex=False: 'R/S' is a literal marker, not a pattern.
    resolved.iloc[:n_rows, column] = placeholder_cells.str.replace(
        'R/S', enantiomer, regex=False
    ).to_numpy()
    return resolved


# Taking care of catalyst R/S: one copy of the table per enantiomer.
r_catalyst_data = _resolve_enantiomer(raw_catalyst_data, 'R')
s_catalyst_data = _resolve_enantiomer(raw_catalyst_data, 'S')

catalyst_data = pd.concat([r_catalyst_data, s_catalyst_data]).set_index('catalyst_name')

# Rows whose name holds no 'R/S' placeholder are identical in both copies, so the
# concatenation lists them twice. A duplicated label makes `.loc[name, column]`
# return a Series instead of a scalar, so keep only the first occurrence.
catalyst_data = catalyst_data[~catalyst_data.index.duplicated(keep='first')]

catalyst_columns = list(catalyst_data.columns)

# catalyst_data

In [None]:
# Iminium descriptor table from the 'iminiums' worksheet, keyed by iminium name.
iminium_data = (
    pd.read_excel(
        '/content/drive/My Drive/Colab Notebooks/material_project/data/Holistic prediction of enantioselectivity in asymmetric catalysis.Supplementary Data.xlsx',
        sheet_name='iminiums',
    )
    .rename(columns={
        'imine': 'iminium_name',
        # The worksheet header ends with a space; the renamed column does not.
        'electronic energy difference (kcal/mol) ': 'electronic energy difference (kcal/mol)',
    })
    .set_index('iminium_name')
    .drop(columns=['Unnamed: 1'])
)

# Copy the electronic energy difference of each (E)-iminium onto its (Z) counterpart.
# The loop variable is kept as `i` on purpose: it stays in the notebook namespace
# and the hyperparameter-search cells read `i` when seeding NumPy's RNG.
energy_column = 'electronic energy difference (kcal/mol)'
for i in range(1, 181):
    z_iminium_name = f'(Z)-Iminium {i}'
    e_iminium_name = f'(E)-Iminium {i}'
    iminium_data.loc[z_iminium_name, energy_column] = iminium_data.loc[e_iminium_name, energy_column]
iminium_columns = iminium_data.columns.tolist()

In [None]:
class Reaction():
    """One reaction entry plus the descriptor values of each of its components.

    On construction, the descriptor values for the given solvent, catalyst,
    nucleophile and both the (E) and (Z) forms of the iminium are copied from
    the notebook-level descriptor tables (`solvent_data`, `catalyst_data`,
    `nucleophile_data`, `iminium_data`) into per-component dictionaries keyed
    by descriptor column name.
    """

    def __init__(self, name, entry, catalyst, nucleophile, substrate, solvent, iminium_type, iminium, majorenantiomer, minorenantiomer, ee, G):
        # Raw record of the reaction, stored as given.
        self.name = name
        self.entry = entry
        self.catalyst = catalyst
        self.nucleophile = nucleophile
        self.substrate = substrate
        self.solvent = solvent
        self.iminium_type = iminium_type
        self.iminium = iminium
        self.majorenantiomer = majorenantiomer
        self.minorenantiomer = minorenantiomer
        self.ee = ee
        self.G = G

        # Descriptor lookups: one scalar per descriptor column for each component.
        self.solvent_properties = {
            col: solvent_data.loc[solvent, col] for col in solvent_columns
        }
        self.catalyst_properties = {
            col: catalyst_data.loc[catalyst, col] for col in catalyst_columns
        }
        self.nucleophile_properties = {
            col: nucleophile_data.loc[nucleophile, col] for col in nucleophile_columns
        }

        # Row labels of the two geometric isomers in the iminium table.
        iminium_label = str(iminium)
        self.e_iminium = '(E)-' + iminium_label
        self.z_iminium = '(Z)-' + iminium_label

        self.e_iminium_properties = {}
        self.z_iminium_properties = {}
        for col in iminium_columns:
            self.e_iminium_properties[col] = iminium_data.loc[self.e_iminium, col]
            self.z_iminium_properties[col] = iminium_data.loc[self.z_iminium, col]

    def __repr__(self):
        return f"Reaction - {self.name}"

In [None]:
# Registry of every loaded reaction, keyed by its generated name.
reactions = {}

def process_data(reaction_number, reaction, iminium_type, sheetname = None):
    """Load one reaction workbook (or one of its sheets) into `reactions`.

    The workbook is '<reaction_number> <reaction>.xlsx' in the
    'reaction information' folder. Its rows are indexed by the 'entry' column
    and entries 1..len(sheet) are each turned into a `Reaction`.

    Parameters
    ----------
    reaction_number : number prefix of the workbook file name.
    reaction : reaction title; part of the file name and of every reaction name.
    iminium_type : geometry label passed through to each `Reaction`.
    sheetname : worksheet to read; when None the default sheet is read and the
        sheet name is left out of the generated reaction names.
    """
    reaction_file = (
        '/content/drive/My Drive/Colab Notebooks/material_project/data/reaction information/'
        f'{reaction_number} {reaction}.xlsx'
    )
    use_default_sheet = sheetname is None
    if use_default_sheet:
        data = pd.read_excel(reaction_file)
        name_prefix = reaction + ' '
    else:
        data = pd.read_excel(reaction_file, sheet_name=sheetname)
        name_prefix = reaction + ' ' + sheetname + ' '
    data = data.set_index('entry')

    for entry in range(1, len(data) + 1):
        reaction_name = name_prefix + str(entry)
        reactions[reaction_name] = Reaction(
            name=reaction_name,
            entry=entry,
            catalyst=data.loc[entry, 'Catalyst'],
            nucleophile=data.loc[entry, 'Nucleophile'],
            substrate=data.loc[entry, 'Substrate'],
            solvent=data.loc[entry, 'Solvent'],
            iminium_type=iminium_type,
            iminium=data.loc[entry, 'Iminium'],
            majorenantiomer=data.loc[entry, 'Major Enantiomer'],
            minorenantiomer=data.loc[entry, 'Minor Enantiomer'],
            ee=data.loc[entry, 'ee'],
            G=data.loc[entry, 'ΔΔG‡'],
        )



In [None]:
# Every worksheet to load, in load order:
# (dataset number, reaction title, iminium geometry, worksheet name).
# Entries kept as comments are not loaded.
REACTION_SHEETS = [
    (1, 'Addition of Alcohols', 'E', 'Scope'),
    (2, 'Addition of thiols', 'E', 'Catalyst & solvent screening da'),
    (2, 'Addition of thiols', 'E', 'Effect of catalyst loading'),
    (2, 'Addition of thiols', 'E', 'Imine scope'),
    (2, 'Addition of thiols', 'E', 'Thiol scope'),
    (3, 'Hydrophosphonylation of imines', 'E', 'Catalyst screening data'),
    (3, 'Hydrophosphonylation of imines', 'E', 'Scope'),
    (4, 'Addition of diazomethylphosphonates', 'E', "Optimization of catalyst and re"),
    (4, 'Addition of diazomethylphosphonates', 'E', "Imine scope"),
    (5, 'Addition of diazoacetamides', 'E', 'Catalyst screening data'),
    (5, 'Addition of diazoacetamides', 'E', 'Solvent screening data'),
    (5, 'Addition of diazoacetamides', 'E', 'Substrate(s) scope'),
    (6, 'Strecker Reaction (with aldimines)', 'E', 'Catalyst screening data'),
    (6, 'Strecker Reaction (with aldimines)', 'E', 'Solvent screening data'),
    (6, 'Strecker Reaction (with aldimines)', 'E', 'Imine scope'),
    (7, 'Peroxidation of imines', 'E', 'Catalyst screening data'),
    (7, 'Peroxidation of imines', 'E', 'Solvent screening data'),
    # (7, 'Peroxidation of imines', 'E', 'Substrate(s) scope'),
    (8, 'Transfer Hydrogenation of b,g-Alkynyl a-Imino Esters', 'E', 'Catalyst screening and reaction'),
    (8, 'Transfer Hydrogenation of b,g-Alkynyl a-Imino Esters', 'E', 'Scope'),
    (9, 'Transfer Hydrogenation of Enamides', 'E', 'Scope'),
    (10, 'Transfer Hydrogenation of N-aryl imines (List)', 'Z', 'Catalyst screening data'),
    (10, 'Transfer Hydrogenation of N-aryl imines (List)', 'Z', 'Imine scope'),
    (11, 'Transfer Hydrogenation of N-aryl imines (Rueping)', 'Z', 'Catalyst screening data'),
    (11, 'Transfer Hydrogenation of N-aryl imines (Rueping)', 'Z', 'Solvent screening data'),
    (11, 'Transfer Hydrogenation of N-aryl imines (Rueping)', 'Z', 'Imine scope'),
    # (12, 'Reductive amination of N-aryl imines (Macmillan)', 'Z', 'Reaction optimization'),
    (13, 'Transfer Hydrogenation of trifluoromethyl ketimines', 'Z', 'Benzothiazoline screening data'),
    (13, 'Transfer Hydrogenation of trifluoromethyl ketimines', 'Z', 'Imine scope'),
    (14, 'Transfer Hydrogenation of N-aryl imines by benzothiazoline', 'Z', 'Catalyst screening data'),
    (14, 'Transfer Hydrogenation of N-aryl imines by benzothiazoline', 'Z', 'Benzothiazoline screening data'),
    (14, 'Transfer Hydrogenation of N-aryl imines by benzothiazoline', 'Z', 'Imine scope'),
    # (15, 'Reductive amination of aliphatic ketones by benzothiazoline', 'Z', 'Catalyst screening data'),
    # (15, 'Reductive amination of aliphatic ketones by benzothiazoline', 'Z', 'Imine scope'),
    (16, 'Transfer Hydrogenation of ethyl ketimines', 'Z', 'Benzothiazoline screening data'),
    (16, 'Transfer Hydrogenation of ethyl ketimines', 'Z', 'Scope 1 benzothiazoline'),
    (16, 'Transfer Hydrogenation of ethyl ketimines', 'Z', 'Scope 2 dihydropyridine'),
    (17, 'Strecker Reaction (with ketimines)', 'Z', 'Catalyst screening data'),
    (17, 'Strecker Reaction (with ketimines)', 'Z', 'Solvent data'),
    (17, 'Strecker Reaction (with ketimines)', 'Z', 'Imine scope'),
    # (18, 'Addition of enecarbamates to benzoyl imines'),
    # (19, 'Hydrogenation of fluorinated alkynyl ketimines'),
    # (20, 'Addition of thiols to imines (Denmark)'),
]

for sheet_spec in REACTION_SHEETS:
    process_data(*sheet_spec)

In [None]:
# Number of reaction entries registered by the process_data calls above.
print(len(reactions))

342


In [None]:
# Catalyst descriptors, one row per reaction (rows in `reactions` order, index
# named 'Reaction'). The frame is built in a single constructor call: filling
# it cell by cell with `.loc` re-allocates on every new row and leaves every
# column as object dtype.
catalyst = pd.DataFrame(
    [rxn.catalyst_properties for rxn in reactions.values()],
    index=pd.Index(list(reactions), name='Reaction'),
    columns=catalyst_columns,
)

#first column is numerical

In [None]:
# Nucleophile descriptors, one row per reaction (rows in `reactions` order,
# index named 'Reaction'). The frame is built in a single constructor call:
# filling it cell by cell with `.loc` re-allocates on every new row and leaves
# every column as object dtype.
nucleophile = pd.DataFrame(
    [rxn.nucleophile_properties for rxn in reactions.values()],
    index=pd.Index(list(reactions), name='Reaction'),
    columns=nucleophile_columns,
)

#first column is numerical

In [None]:
# Solvent descriptors, one row per reaction (rows in `reactions` order, index
# named 'Reaction'). The frame is built in a single constructor call: filling
# it cell by cell with `.loc` re-allocates on every new row and leaves every
# column as object dtype.
solvent = pd.DataFrame(
    [rxn.solvent_properties for rxn in reactions.values()],
    index=pd.Index(list(reactions), name='Reaction'),
    columns=solvent_columns,
)

#first column is numerical

In [None]:
# Iminium descriptors, one row per reaction (rows in `reactions` order, index
# named 'Reaction'). Each reaction contributes the descriptors of the isomer
# named by its `iminium_type`: the (E) values for 'E', the (Z) values for 'Z'.
# A reaction with any other `iminium_type` gets no row, as before.
# The frame is built in one constructor call instead of cell-by-cell `.loc`
# writes, which re-allocate on every new row and leave every column as object.
iminium_rows = {}
for rxn_name, rxn in reactions.items():
    if rxn.iminium_type == 'E':
        isomer_properties = rxn.e_iminium_properties
    elif rxn.iminium_type == 'Z':
        isomer_properties = rxn.z_iminium_properties
    else:
        continue
    iminium_rows[rxn_name] = {'iminium_type': rxn.iminium_type, **isomer_properties}

iminium = pd.DataFrame(
    list(iminium_rows.values()),
    index=pd.Index(list(iminium_rows), name='Reaction'),
    columns=['iminium_type'] + iminium_columns,
)



In [None]:
# Enantiomeric excess of every reaction, one row per reaction (index named
# 'Reaction'). Built in one constructor call rather than one `.loc` write per row.
ee = pd.DataFrame(
    {'ee': [rxn.ee for rxn in reactions.values()]},
    index=pd.Index(list(reactions), name='Reaction'),
)

In [None]:
# Regression target: the ΔΔG‡ value recorded for every reaction, one row per
# reaction (index named 'Reaction'). Built in one constructor call rather than
# one `.loc` write per row.
Y = pd.DataFrame(
    {'ΔΔG‡': [rxn.G for rxn in reactions.values()]},
    index=pd.Index(list(reactions), name='Reaction'),
)


In [None]:
# Sanity check: all five frames should have the same number of rows (one per reaction).
print(catalyst.shape, nucleophile.shape, solvent.shape, iminium.shape, ee.shape)

(342, 85) (342, 15) (342, 160) (342, 22) (342, 1)


In [None]:
# Full feature matrix: catalyst + nucleophile + solvent + iminium descriptors,
# side by side. From each frame the single label column is removed first.
X_iminium = pd.concat(
    [
        frame.drop(columns=[label_column])
        for frame, label_column in (
            (catalyst, 'Ar group'),
            (nucleophile, 'nucleophile'),
            (solvent, 'solvent'),
            (iminium, 'iminium_type'),
        )
    ],
    axis=1,
)
X_iminium.shape

(342, 278)

In [None]:
# Feature matrix without the iminium descriptors: catalyst + nucleophile + solvent.
X_no_iminium = pd.concat(
    [
        frame.drop(columns=[label_column])
        for frame, label_column in (
            (catalyst, 'Ar group'),
            (nucleophile, 'nucleophile'),
            (solvent, 'solvent'),
        )
    ],
    axis=1,
)
X_no_iminium.shape

(342, 257)

In [None]:
# Feature matrix made of the catalyst and solvent descriptors only. Note that it
# leaves out the iminium descriptors as well as the nucleophile descriptors.
X_no_nucleophile = pd.concat(
    [
        frame.drop(columns=[label_column])
        for frame, label_column in ((catalyst, 'Ar group'), (solvent, 'solvent'))
    ],
    axis=1,
)
X_no_nucleophile.shape

(342, 243)

In [None]:
# Reverse the sign of ΔΔG‡ for every reaction whose iminium geometry is 'Z'.
# The signed values are rebuilt from the stored `reactions` records instead of
# negating `Y` in place: an in-place negation flips the signs back whenever
# this cell is run a second time.
signed_ddg = pd.Series({
    rxn_name: (rxn.G * (-1) if rxn.iminium_type == 'Z' else rxn.G)
    for rxn_name, rxn in reactions.items()
})
# Align on the reaction name so the row order of `Y` is left as it is.
Y['ΔΔG‡'] = signed_ddg.reindex(Y.index)

Run 100 replications of Monte Carlo cross-validation

In [None]:
# Use the full feature matrix (iminium descriptors included) for this section.
X = X_iminium

In [None]:
# Hold out 10% of the reactions as a test set; the fixed random_state makes the split repeatable.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y, test_size = 0.1, random_state = 1)

In [None]:
# Size of the 90% training portion.
X_train1.shape

(307, 278)

In [None]:
# Size of the 10% hold-out portion.
X_test1.shape

(35, 278)

In [None]:
import os
import pandas as pd
import numpy as np
import sklearn
import random
import heapq
import matplotlib.pyplot as plt


from numpy import vstack
from numpy import argmax
from numpy import asarray
from numpy.random import normal

from scipy.stats import norm
from warnings import catch_warnings
from warnings import simplefilter
from matplotlib import pyplot


from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.stats import pearsonr
from numpy.random import random


from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Size of the full feature matrix before any split.
X.shape

(342, 278)

In [None]:
# Split the training portion 80/20 into a fitting set and a validation set for
# the hyperparameter search, and hand all four pieces on as NumPy arrays.
X_train_train, X_valid, Y_train_train, Y_valid = (
    part.to_numpy()
    for part in train_test_split(X_train1, Y_train1, test_size=0.2, random_state=100)
)

Bayesian optimization for hyperparameter tuning

In [None]:

# Number of validation scores kept per candidate, and the 1 x N buffer that
# `objective` writes those scores into.
validation_replication = 1
result_valid = np.zeros((1, validation_replication))



def objective(x1):
    """Score a candidate ``n_estimators`` value on the validation split.

    Fits a StandardScaler + GradientBoostingRegressor pipeline on the
    notebook-level ``X_train_train`` / ``Y_train_train`` arrays and measures
    the mean squared error (MSE) of its predictions on ``X_valid`` / ``Y_valid``.
    The MSE is also written into the notebook-level ``result_valid`` buffer.

    Parameters
    ----------
    x1 : scalar or one-element array
        Proposed number of boosting stages. It is truncated to an integer and
        raised to 1 when smaller.

    Returns
    -------
    float
        ``1 / MSE``. The search maximises its objective, so the reciprocal
        turns "smaller error" into "larger score".
    """
    # Callers pass both plain scalars and one-element arrays. Calling int() on
    # an array with ndim > 0 is deprecated in NumPy, so pull the scalar out first.
    a1 = int(np.asarray(x1).item())
    if a1 < 1:
        a1 = 1

    # Performance evaluation on the validation set.
    for j in range(0, 1):
        reg_m = make_pipeline(StandardScaler(), GradientBoostingRegressor(n_estimators=a1, random_state=5))
        reg_m.fit(X_train_train, Y_train_train.ravel())

        predicted_y_m = reg_m.predict(X_valid)
        # Column vector, matching the (n, 1) shape of Y_valid.
        prediction = np.reshape(predicted_y_m, (predicted_y_m.shape[0], 1))
        result_valid[0, j] = mean_squared_error(Y_valid, prediction)
    # Bayesian optimisation here is a maximisation problem: return the reciprocal.
    return 1 / np.mean(result_valid)

def surrogate(model, X):
    """Query the surrogate model at the points `X`.

    Returns whatever ``model.predict(X, return_std=True)`` returns, i.e. the
    predicted mean together with its standard deviation.
    """
    prediction = model.predict(X, return_std=True)
    return prediction

def acquisition(X, Xsamples, model):
    """Expected Improvement of every candidate in `Xsamples`.

    The incumbent is the largest surrogate prediction over the points `X`
    sampled so far; the improvement of each candidate is measured against it.
    """
    # Best surrogate score among the points already sampled.
    sampled_mean, _ = surrogate(model, X)
    best = max(sampled_mean)

    # Surrogate mean and spread at the candidates. The small constant keeps
    # the division defined where the predicted spread is zero.
    mu, std = surrogate(model, Xsamples)
    spread = std + 1E-9
    gain = mu - best
    z = gain / spread

    return gain * norm.cdf(z) + spread * norm.pdf(z)

def opt_acquisition(X, y, model):
    """Pick the next point to evaluate by random search over the acquisition.

    Draws 100 uniform candidates in [0, 100), scores each with `acquisition`,
    and returns the best-scoring candidate as a one-element array. `y` is
    accepted but not used.

    The global NumPy RNG is re-seeded on every call from the notebook-level
    counters `i`, `h` and `i1`; they are read as globals, not passed in.
    """
    np.random.seed(100 * (i + h + i1))
    candidates = 100 * random((100, 1))
    candidate_scores = acquisition(X, candidates, model)
    best_candidate = argmax(candidate_scores)
    return candidates[best_candidate]



# --- Hyperparameter search for n_estimators -------------------------------
# NOTE(review): from here on `X` and `y` hold the sampled hyperparameter values
# and their scores. This rebinds the notebook-level `X`, which an earlier cell
# set to the feature matrix.
# NOTE(review): the seeds below use `i`, which this cell never assigns; it is
# whatever value a loop in an earlier cell left in the namespace.

# First sample: n_estimators = 100.
X1 = [100]
X1 = asarray(X1)


y = asarray([objective(X1)])
X = X1
# Both arrays are kept as column vectors, one row per sampled point.
X = X.reshape(len(X1),1)
y = y.reshape(len(y),1)
# `h` is read by opt_acquisition when seeding, so it must exist before the first call.
h=0




# Add 20 more starting points, each drawn uniformly from [0, 100) with its own seed.
for i1 in range(0,20):
    np.random.seed(i1*100+i*100)
    X1 = 100* random(1)

    ysample = asarray([objective(X1)])
    Xsample = X1
    Xsample = Xsample.reshape(len(X1),1)
    ysample = ysample.reshape(len(ysample),1)

    X = vstack((X,Xsample))
    y = vstack((y,ysample))
# define the surrogate model and fit it to the starting points
model = GaussianProcessRegressor()
model.fit(X, y)



# Sample new points (hyperparameters) with Bayesian Optimization.
# It sequentially samples 100 points based on the optimization.
for h in range(100):
    print(h)
    # select the next point to sample
    x = opt_acquisition(X, y, model)
    x = asarray(x)
    # sample the point
    actual = objective(x[0])
    # surrogate prediction at the new point, taken before the model is refitted
    est, _ = surrogate(model,[x])
    actual = asarray(actual)
    #print(' f()=%3f, actual=%.3f' % (est, actual))
    # add the data to the dataset
    X = vstack((X,[x]))
    y = vstack((y,[[actual]]))
    # update the model
    model.fit(X, y)
    # best result: index of the highest score (1/MSE) seen so far
    ix = argmax(y)
    print("Current")
    # sampled value, predicted MSE, measured MSE (scores are reciprocals of MSE)
    print(x[0],1/est,1/actual)

# Report the best hyperparameter setting found by the search.
print("Best")
print((X[ix, 0],y[ix],1/y[ix]),ix)



# Selected n_estimators: the best sampled value truncated to an integer, at least 1.
a1=int(X[ix,0])

if a1<1:
    a1=1
print(a1)




  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)
  a1 = int(x1)


0
Current
52.20107825290352 [0.36425495] 0.3846738332578247
1
Current
74.46998848616458 [0.4399223] 0.3767566338063534
2
Current
95.9012735325267 [0.33841178] 0.369283817747342
3
Current
73.35483320951323 [0.3636098] 0.3762217204198802
4
Current
75.20352721392534 [0.35407684] 0.37684225450416187
5
Current
48.76649783015875 [0.4322243] 0.3848966885838451
6
Current
63.19094090532853 [0.41071267] 0.37896999001983356
7
Current
99.50375852697037 [0.35881034] 0.36881330849392613
8
Current
97.63409146547156 [0.37672784] 0.3690869458453953
9
Current
98.37982207036285 [0.36721779] 0.36867509367638407
10
Current
57.65036016578951 [0.48195884] 0.3830000092832301
11
Current
70.20583336200335 [0.53916148] 0.37683321149397736
12
Current
50.499707885794585 [0.45581233] 0.38304116179169223
13
Current
57.26294090869678 [0.36417927] 0.3830000092832301
14
Current
71.8622995156166 [0.49226847] 0.37647017089315
15
Current
28.98823788154744 [0.5619217] 0.41540063867463956
16
Current
15.361360393982027 [0.58

In [None]:
# Re-read the selected n_estimators from the best sampled point. This repeats the
# assignment at the end of the previous cell, without its lower bound of 1.
a1=int(X[ix,0])
print(a1)

93


Run 100 replications of Monte Carlo cross-validation and calculate the mean and standard deviation of MSE and R^2

In [None]:
def run_random_forest(iterations,value):
    """Score a gradient-boosting pipeline over repeated random 80/20 splits.

    Despite the name, the fitted estimator is a StandardScaler +
    GradientBoostingRegressor pipeline. Its ``n_estimators`` is the
    notebook-level ``a1`` and the data are the notebook-level ``X_train1`` /
    ``Y_train1``.

    Parameters
    ----------
    iterations : int
        Number of random splits. Split number k (0-based) uses ``random_state=10*k``.
    value :
        Not used; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns 'iteration' (1-based), 'MSE',
        'test r^2', 'train r^2' and 'total r^2' (train and test rows pooled).
    """
    score_columns = ['iteration', 'MSE', 'test r^2', 'train r^2', 'total r^2']
    # Rows are collected in a list and turned into a frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []
    for i in range(iterations):
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_train1, Y_train1, test_size = 0.2, random_state = 10 * i)
        forest = make_pipeline(
            StandardScaler(),
            GradientBoostingRegressor(n_estimators=a1, random_state=100))

        # Targets as flat float arrays; the index plays no part in fitting or scoring.
        y_train = Y_train.to_numpy(dtype=float).ravel()
        y_test = Y_test.to_numpy(dtype=float).ravel()

        forest.fit(X_train, y_train)

        # evaluating performance
        test_pred = forest.predict(X_test)
        train_pred = forest.predict(X_train)

        records.append({
            # Stored as a number: a string here makes DataFrame.mean() concatenate
            # the labels into one meaningless value.
            'iteration': i + 1,
            'MSE': mean_squared_error(y_test, test_pred),
            'test r^2': r2_score(y_test, test_pred),
            'train r^2': r2_score(y_train, train_pred),
            'total r^2': r2_score(np.concatenate([y_train, y_test]),
                                  np.concatenate([train_pred, test_pred])),
        })

    return pd.DataFrame(records, columns=score_columns)

In [None]:
import warnings
# Silences every warning for the rest of the session, not only for this cell.
warnings.filterwarnings("ignore")
# Single pass. The loop variable `i` is kept because it stays in the notebook
# namespace and the hyperparameter-search cells read `i` when seeding NumPy's RNG.
for i in range(1,2):
    # Report the n_estimators value the model actually uses (`a1`).
    print("Parameter")
    print(a1)
    results_rf = run_random_forest(100,1)
    print(results_rf)
    # Summarise the metric columns only; 'iteration' is a label, not a metric.
    metric_scores = results_rf.drop(columns=['iteration']).astype(float)
    print(metric_scores.mean())
    print(metric_scores.std())

Parameter
20
   iteration       MSE  test r^2  train r^2  total r^2
0          1  0.145475  0.944891   0.984045   0.976869
1          2  0.212762  0.932053   0.981166   0.970476
2          3  0.256038  0.908098   0.986945   0.971682
3          4  0.399210  0.844309   0.981676   0.957073
4          5  0.151417  0.948647   0.981009   0.974342
..       ...       ...       ...        ...        ...
95        96  0.107049  0.967180   0.980210   0.977292
96        97  0.218303  0.935916   0.982047   0.971281
97        98  0.357539  0.873871   0.982657   0.961099
98        99  0.134912  0.952747   0.982081   0.976309
99       100  0.116335  0.949131   0.981663   0.976526

[100 rows x 5 columns]
iteration    1.234568e+189
MSE           2.209922e-01
test r^2      9.218703e-01
train r^2     9.828068e-01
total r^2     9.708634e-01
dtype: float64
MSE          0.085461
test r^2     0.030938
train r^2    0.001782
total r^2    0.005560
dtype: float64


Analysis without the iminium features

In [None]:
# This section analyses the model without the iminium descriptors, so the
# feature matrix is the one that leaves them out (not the full X_iminium).
X = X_no_iminium

In [None]:
# Same 90/10 hold-out split as before (same random_state); Y_train1 from here is
# the target used in the rest of this section.
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X, Y, test_size = 0.1, random_state = 1)

In [None]:
# Change the working directory to the project folder and load the saved training
# features from it; the first spreadsheet column becomes the index.
# NOTE(review): X_train2 is later split together with Y_train1, so its rows
# presumably follow the same order as Y_train1 — confirm how X_train2.xlsx was produced.
os.chdir('/content/drive/My Drive/Colab Notebooks/material_project/code/original/chem_brandeis/IEEE_BIBM')
X_train2 = pd.read_excel('X_train2.xlsx', index_col=0)

In [None]:
# Compare the loaded training features with the in-memory matrix without
# iminium descriptors: the column counts should match.
print(X_train2.shape)
print(X_no_iminium.shape)

(307, 257)
(342, 257)


In [None]:
import os
import pandas as pd
import numpy as np
import sklearn
import random
import heapq
import matplotlib.pyplot as plt


from numpy import vstack
from numpy import argmax
from numpy import asarray
from numpy.random import normal

from scipy.stats import norm
from warnings import catch_warnings
from warnings import simplefilter
from matplotlib import pyplot


from scipy import stats
from scipy.stats import pearsonr
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from scipy.stats import pearsonr
from numpy.random import random


from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

Bayesian optimization for hyperparameter tuning

In [None]:

# Split the training portion 80/20 into a fitting set and a validation set for
# the hyperparameter search, and hand all four pieces on as NumPy arrays.
X_train_train, X_valid, Y_train_train, Y_valid = (
    part.to_numpy()
    for part in train_test_split(X_train2, Y_train1, test_size=0.2, random_state=100)
)

In [None]:

# Number of validation scores kept per candidate, and the 1 x N buffer that
# `objective` writes those scores into.
validation_replication = 1
result_valid = np.zeros((1, validation_replication))



def objective(x1):
    """Score a candidate ``n_estimators`` value on the validation split.

    Fits a StandardScaler + GradientBoostingRegressor pipeline on the
    notebook-level ``X_train_train`` / ``Y_train_train`` arrays and measures
    the mean squared error (MSE) of its predictions on ``X_valid`` / ``Y_valid``.
    The MSE is also written into the notebook-level ``result_valid`` buffer.

    Parameters
    ----------
    x1 : scalar or one-element array
        Proposed number of boosting stages. It is truncated to an integer and
        raised to 1 when smaller.

    Returns
    -------
    float
        ``1 / MSE``. The search maximises its objective, so the reciprocal
        turns "smaller error" into "larger score".
    """
    # Callers pass both plain scalars and one-element arrays. Calling int() on
    # an array with ndim > 0 is deprecated in NumPy, so pull the scalar out first.
    a1 = int(np.asarray(x1).item())
    if a1 < 1:
        a1 = 1

    # Performance evaluation on the validation set.
    for j in range(0, 1):
        reg_m = make_pipeline(StandardScaler(), GradientBoostingRegressor(n_estimators=a1, random_state=5))
        reg_m.fit(X_train_train, Y_train_train.ravel())

        predicted_y_m = reg_m.predict(X_valid)
        # Column vector, matching the (n, 1) shape of Y_valid.
        prediction = np.reshape(predicted_y_m, (predicted_y_m.shape[0], 1))
        result_valid[0, j] = mean_squared_error(Y_valid, prediction)
    # Bayesian optimisation here is a maximisation problem: return the reciprocal.
    return 1 / np.mean(result_valid)

def surrogate(model, X):
    """Query the surrogate model at the points `X`.

    Returns whatever ``model.predict(X, return_std=True)`` returns, i.e. the
    predicted mean together with its standard deviation.
    """
    prediction = model.predict(X, return_std=True)
    return prediction

def acquisition(X, Xsamples, model):
    """Expected Improvement of every candidate in `Xsamples`.

    The incumbent is the largest surrogate prediction over the points `X`
    sampled so far; the improvement of each candidate is measured against it.
    """
    # Best surrogate score among the points already sampled.
    sampled_mean, _ = surrogate(model, X)
    best = max(sampled_mean)

    # Surrogate mean and spread at the candidates. The small constant keeps
    # the division defined where the predicted spread is zero.
    mu, std = surrogate(model, Xsamples)
    spread = std + 1E-9
    gain = mu - best
    z = gain / spread

    return gain * norm.cdf(z) + spread * norm.pdf(z)

def opt_acquisition(X, y, model):
    """Pick the next point to evaluate by random search over the acquisition.

    Draws 100 uniform candidates in [0, 100), scores each with `acquisition`,
    and returns the best-scoring candidate as a one-element array. `y` is
    accepted but not used.

    The global NumPy RNG is re-seeded on every call from the notebook-level
    counters `i`, `h` and `i1`; they are read as globals, not passed in.
    """
    np.random.seed(100 * (i + h + i1))
    candidates = 100 * random((100, 1))
    candidate_scores = acquisition(X, candidates, model)
    best_candidate = argmax(candidate_scores)
    return candidates[best_candidate]



# --- Hyperparameter search for n_estimators -------------------------------
# NOTE(review): from here on `X` and `y` hold the sampled hyperparameter values
# and their scores. This rebinds the notebook-level `X`, which an earlier cell
# set to the feature matrix.
# NOTE(review): the seeds below use `i`, which this cell never assigns; it is
# whatever value a loop in an earlier cell left in the namespace.

# First sample: n_estimators = 100.
X1 = [100]
X1 = asarray(X1)


y = asarray([objective(X1)])
X = X1
# Both arrays are kept as column vectors, one row per sampled point.
X = X.reshape(len(X1),1)
y = y.reshape(len(y),1)
# `h` is read by opt_acquisition when seeding, so it must exist before the first call.
h=0




# Add 20 more starting points, each drawn uniformly from [0, 100) with its own seed.
for i1 in range(0,20):
    np.random.seed(i1*100+i*100)
    X1 = 100* random(1)

    ysample = asarray([objective(X1)])
    Xsample = X1
    Xsample = Xsample.reshape(len(X1),1)
    ysample = ysample.reshape(len(ysample),1)

    X = vstack((X,Xsample))
    y = vstack((y,ysample))
# define the surrogate model and fit it to the starting points
model = GaussianProcessRegressor()
model.fit(X, y)



# Sample new points (hyperparameters) with Bayesian Optimization.
# It sequentially samples 100 points based on the optimization.
for h in range(100):
    print(h)
    # select the next point to sample
    x = opt_acquisition(X, y, model)
    x = asarray(x)
    # sample the point
    actual = objective(x[0])
    # surrogate prediction at the new point, taken before the model is refitted
    est, _ = surrogate(model,[x])
    actual = asarray(actual)
    #print(' f()=%3f, actual=%.3f' % (est, actual))
    # add the data to the dataset
    X = vstack((X,[x]))
    y = vstack((y,[[actual]]))
    # update the model
    model.fit(X, y)
    # best result: index of the highest score (1/MSE) seen so far
    ix = argmax(y)
    print("Current")
    # sampled value, predicted MSE, measured MSE (scores are reciprocals of MSE)
    print(x[0],1/est,1/actual)

# Report the best hyperparameter setting found by the search.
print("Best")
print((X[ix, 0],y[ix],1/y[ix]),ix)



# Selected n_estimators: the best sampled value truncated to an integer, at least 1.
a1=int(X[ix,0])

if a1<1:
    a1=1
print(a1)




0
Current
99.21200295448001 [0.20519099] 0.23380480394628222
1
Current
64.34987075099797 [0.22056475] 0.23419251555638715
2
Current
54.92628171817484 [0.23751241] 0.23780926930621724
3
Current
44.20324126661561 [0.22068154] 0.2463717426677679
4
Current
65.89371881453168 [0.22032662] 0.2332292696030328
5
Current
68.65454197977625 [0.24595267] 0.23171622610296153
6
Current
94.55649989577893 [0.23913792] 0.23390982460647602
7
Current
32.874382384130854 [0.23287998] 0.25956243254306083
8
Current
60.32698344565115 [0.2546543] 0.23243641449377572
9
Current
56.7796587572533 [0.22646613] 0.23547158757842152
10
Current
80.52571366463135 [0.23879572] 0.23103376767788225
11
Current
68.9059525605662 [0.22583638] 0.23171622610296153
12
Current
58.642313217134735 [0.31472969] 0.23448410064919778
13
Current
68.1788572966276 [0.22779462] 0.23171622610296153
14
Current
58.2588682983287 [0.22855309] 0.23448410064919778
15
Current
98.61840895921588 [0.23192365] 0.23372349924228647
16
Current
63.077150964

In [None]:
# Re-read the selected n_estimators from the best sampled point. This repeats the
# assignment at the end of the previous cell, without its lower bound of 1.
a1=int(X[ix,0])
print(a1)

76


Run 100 replications of Monte Carlo cross-validation and calculate the mean and standard deviation of MSE and R^2

In [None]:
def run_random_forest(iterations,value):
    """Score a gradient-boosting pipeline over repeated random 80/20 splits.

    Despite the name, the fitted estimator is a StandardScaler +
    GradientBoostingRegressor pipeline. Its ``n_estimators`` is the
    notebook-level ``a1`` and the data are the notebook-level ``X_train2`` /
    ``Y_train1``.

    Parameters
    ----------
    iterations : int
        Number of random splits. Split number k (0-based) uses ``random_state=10*k``.
    value :
        Not used; kept so that existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per split with the columns 'iteration' (1-based), 'MSE',
        'test r^2', 'train r^2' and 'total r^2' (train and test rows pooled).
    """
    score_columns = ['iteration', 'MSE', 'test r^2', 'train r^2', 'total r^2']
    # Rows are collected in a list and turned into a frame once at the end.
    # DataFrame.append was removed in pandas 2.0 and copied the frame on every call.
    records = []
    for i in range(iterations):
        X_train, X_test, Y_train, Y_test = train_test_split(
            X_train2, Y_train1, test_size = 0.2, random_state = 10 * i)
        forest = make_pipeline(
            StandardScaler(),
            GradientBoostingRegressor(n_estimators=a1, random_state=100))

        # Targets as flat float arrays; the index plays no part in fitting or scoring.
        y_train = Y_train.to_numpy(dtype=float).ravel()
        y_test = Y_test.to_numpy(dtype=float).ravel()

        forest.fit(X_train, y_train)

        # evaluating performance
        test_pred = forest.predict(X_test)
        train_pred = forest.predict(X_train)

        records.append({
            # Stored as a number: a string here makes DataFrame.mean() concatenate
            # the labels into one meaningless value.
            'iteration': i + 1,
            'MSE': mean_squared_error(y_test, test_pred),
            'test r^2': r2_score(y_test, test_pred),
            'train r^2': r2_score(y_train, train_pred),
            'total r^2': r2_score(np.concatenate([y_train, y_test]),
                                  np.concatenate([train_pred, test_pred])),
        })

    return pd.DataFrame(records, columns=score_columns)

In [None]:
import warnings
# Silences every warning for the rest of the session, not only for this cell.
warnings.filterwarnings("ignore")
# Single pass. The loop variable `i` is kept because it stays in the notebook
# namespace, and the hyperparameter-search cells of this notebook read `i`
# when seeding NumPy's RNG.
for i in range(1,2):
    # Report the n_estimators value the model actually uses (`a1`).
    print("Parameter")
    print(a1)
    results_rf = run_random_forest(100,1)
    print(results_rf)
    # Summarise the metric columns only; 'iteration' is a label, not a metric.
    metric_scores = results_rf.drop(columns=['iteration']).astype(float)
    print(metric_scores.mean())
    print(metric_scores.std())

Parameter
20
   iteration       MSE  test r^2  train r^2  total r^2
0          1  0.212523  0.919491   0.954636   0.948293
1          2  0.182049  0.941862   0.957204   0.953989
2          3  0.416826  0.850385   0.966660   0.944257
3          4  0.204074  0.920412   0.960587   0.953441
4          5  0.224256  0.923943   0.956035   0.949432
..       ...       ...       ...        ...        ...
95        96  0.221520  0.932084   0.961280   0.954728
96        97  0.398261  0.883088   0.963415   0.944736
97        98  0.252205  0.911030   0.966935   0.955864
98        99  0.187264  0.934410   0.964321   0.958516
99       100  0.177443  0.922411   0.963193   0.956804

[100 rows x 5 columns]
iteration    1.234568e+189
MSE           2.393410e-01
test r^2      9.153573e-01
train r^2     9.614897e-01
total r^2     9.525981e-01
dtype: float64
MSE          0.063711
test r^2     0.024237
train r^2    0.003657
total r^2    0.004187
dtype: float64
