In [None]:
# Written by: Daniel Seungwook Min, Doyle Group 
# For questions, please contact:
# Daniel: mindaniel@chem.ucla.edu
# Prof. Abby Doyle: agdoyle@chem.ucla.edu

In [20]:
import pandas as pd
import numpy as np

In [2]:
# read-only publicly accessible aws s3 bucket
s3 = 's3://reductionprediction/'

kraken_pre_feats = pd.read_csv(s3+"datasets/test_set_raw.csv", index_col = 0)
kraken_pre_feats

Unnamed: 0,reductant,reductant_C,temperature,catalyst_C,cat2lig,solvent,concentration,ligands
3069326,n-BuZnBr,2.0,30.0,10.0,2.0,THF,0.15,67
3098928,n-BuZnBr,2.0,30.0,10.0,2.0,CPME,0.10,67
3111392,n-BuZnBr,2.0,30.0,10.0,3.0,PhMe,0.10,67
124706,n-PrZnBr,2.0,30.0,10.0,3.0,Dioxane,0.05,67
1600132,n-PrMgBr,2.0,30.0,10.0,2.0,dibutyl ether,0.20,67
...,...,...,...,...,...,...,...,...
1555249,n-PrMgBr,2.0,30.0,5.0,3.0,MTBE,0.15,377
15945,n-PrZnBr,2.0,30.0,5.0,2.0,Dioxane,0.15,377
56453,n-PrZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377
3047813,n-BuZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377


In [3]:
# Features used in each of the models. Be sure to update these if these features
# are modified in the original script (h_learning.ipynb)
feats = ['reductant_n_carbons', 'reductant_n_beta_H', 'reductant_metal',
       'ligand_nmrtens_szz_P_boltz', 'ligand_efg_amp_P_boltz',
       'ligand_efgtens_xx_P_boltz', 'ligand_nbo_lp_P_percent_s_boltz',
       'ligand_nbo_lp_P_occ_boltz', 'ligand_nbo_bds_e_avg_boltz',
       'ligand_nbo_bd_occ_avg_boltz', 'ligand_nbo_bds_occ_avg_boltz',
       'ligand_E_solv_total_boltz', 'ligand_pyr_P_boltz', 'ligand_pyr_P_delta',
       'ligand_vbur_far_vbur_boltz', 'ligand_vbur_far_vbur_min',
       'ligand_sterimol_B1_boltz', 'ligand_sterimol_B1_min',
       'ligand_sterimol_B5_delta', 'ligand_sterimol_burB5_min']

feats_delta_par3 = ['reductant_n_carbons', 'reductant_metal',
       'ligand_nbo_lp_P_percent_s_boltz', 'ligand_nbo_bd_e_avg_boltz',
       'ligand_vbur_ovbur_min_vburminconf']

feats_delta_pr3 =  ['ligand_E_solv_cds_boltz','ligand_qpole_amp_boltz','reductant_n_beta_H']

In [4]:
# Choose only features that are necessary for model predictions to reduce computational cost
all_chosen_feats = list(set(feats_delta_pr3 + feats_delta_par3 + feats))
def filter_feats(keyword):
    filtered = []
    for feature in all_chosen_feats:
        if keyword in feature:
            filtered.append(feature[len(keyword)+1:])
    return filtered
chosen_ligand_feats = filter_feats('ligand')
chosen_red_feats = filter_feats('reductant')
chosen_solvent_feats = filter_feats('solvent')

In [5]:
# Add phosphine features
p_feats =pd.read_csv(s3+"datasets/phosphine.csv", index_col = 0)
p_feats = p_feats[chosen_ligand_feats]
columns = "ligand_" + p_feats.columns
p_feats_mapped = kraken_pre_feats["ligands"].map(lambda x: p_feats.loc[x].values)
p_feats_df = pd.DataFrame(np.array(list(p_feats_mapped.values)), columns = columns)
p_feats_df.index = kraken_pre_feats.index
kraken_pre_feats = pd.concat([kraken_pre_feats, p_feats_df], axis = 1)
kraken_pre_feats

Unnamed: 0,reductant,reductant_C,temperature,catalyst_C,cat2lig,solvent,concentration,ligands,ligand_efgtens_xx_P_boltz,ligand_nbo_lp_P_occ_boltz,...,ligand_nbo_bds_e_avg_boltz,ligand_vbur_far_vbur_boltz,ligand_sterimol_B1_min,ligand_pyr_P_boltz,ligand_qpole_amp_boltz,ligand_vbur_far_vbur_min,ligand_E_solv_total_boltz,ligand_E_solv_cds_boltz,ligand_sterimol_burB5_min,ligand_nbo_bd_occ_avg_boltz
3069326,n-BuZnBr,2.0,30.0,10.0,2.0,THF,0.15,67,-0.803842,1.933998,...,0.211184,4.116689,3.465580,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971
3098928,n-BuZnBr,2.0,30.0,10.0,2.0,CPME,0.10,67,-0.803842,1.933998,...,0.211184,4.116689,3.465580,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971
3111392,n-BuZnBr,2.0,30.0,10.0,3.0,PhMe,0.10,67,-0.803842,1.933998,...,0.211184,4.116689,3.465580,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971
124706,n-PrZnBr,2.0,30.0,10.0,3.0,Dioxane,0.05,67,-0.803842,1.933998,...,0.211184,4.116689,3.465580,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971
1600132,n-PrMgBr,2.0,30.0,10.0,2.0,dibutyl ether,0.20,67,-0.803842,1.933998,...,0.211184,4.116689,3.465580,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555249,n-PrMgBr,2.0,30.0,5.0,3.0,MTBE,0.15,377,-0.817920,1.894899,...,0.221425,0.000000,4.092583,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161
15945,n-PrZnBr,2.0,30.0,5.0,2.0,Dioxane,0.15,377,-0.817920,1.894899,...,0.221425,0.000000,4.092583,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161
56453,n-PrZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.221425,0.000000,4.092583,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161
3047813,n-BuZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.221425,0.000000,4.092583,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161


In [6]:
# Add reductant features
r_feats = pd.read_csv(s3+"datasets/reductant.csv", index_col = 0)
r_feats = r_feats[chosen_red_feats]
columns = "reductant_" + r_feats.columns
r_feats_mapped = kraken_pre_feats["reductant"].map(lambda x: r_feats.loc[x].values)
r_feats_df = pd.DataFrame(np.array(list(r_feats_mapped.values)), columns = columns)
r_feats_df.index = kraken_pre_feats.index
kraken_pre_feats = pd.concat([kraken_pre_feats, r_feats_df], axis = 1)
kraken_pre_feats

Unnamed: 0,reductant,reductant_C,temperature,catalyst_C,cat2lig,solvent,concentration,ligands,ligand_efgtens_xx_P_boltz,ligand_nbo_lp_P_occ_boltz,...,ligand_pyr_P_boltz,ligand_qpole_amp_boltz,ligand_vbur_far_vbur_min,ligand_E_solv_total_boltz,ligand_E_solv_cds_boltz,ligand_sterimol_burB5_min,ligand_nbo_bd_occ_avg_boltz,reductant_metal,reductant_n_beta_H,reductant_n_carbons
3069326,n-BuZnBr,2.0,30.0,10.0,2.0,THF,0.15,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
3098928,n-BuZnBr,2.0,30.0,10.0,2.0,CPME,0.10,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
3111392,n-BuZnBr,2.0,30.0,10.0,3.0,PhMe,0.10,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
124706,n-PrZnBr,2.0,30.0,10.0,3.0,Dioxane,0.05,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,3
1600132,n-PrMgBr,2.0,30.0,10.0,2.0,dibutyl ether,0.20,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,0,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555249,n-PrMgBr,2.0,30.0,5.0,3.0,MTBE,0.15,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,0,2,3
15945,n-PrZnBr,2.0,30.0,5.0,2.0,Dioxane,0.15,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,3
56453,n-PrZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,3
3047813,n-BuZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,4


In [7]:
# Add solvent features
s_feats = pd.read_csv(s3+"datasets/solvent.csv", index_col = 0)
s_feats = s_feats[chosen_solvent_feats]
columns = "solvent_" + s_feats.columns
s_feats_mapped = kraken_pre_feats["solvent"].map(lambda x: s_feats.loc[x].values)
s_feats_df = pd.DataFrame(np.array(list(s_feats_mapped.values)), columns = columns)
s_feats_df.index = kraken_pre_feats.index
kraken_pre_feats = pd.concat([kraken_pre_feats, s_feats_df], axis = 1)
kraken_pre_feats

Unnamed: 0,reductant,reductant_C,temperature,catalyst_C,cat2lig,solvent,concentration,ligands,ligand_efgtens_xx_P_boltz,ligand_nbo_lp_P_occ_boltz,...,ligand_pyr_P_boltz,ligand_qpole_amp_boltz,ligand_vbur_far_vbur_min,ligand_E_solv_total_boltz,ligand_E_solv_cds_boltz,ligand_sterimol_burB5_min,ligand_nbo_bd_occ_avg_boltz,reductant_metal,reductant_n_beta_H,reductant_n_carbons
3069326,n-BuZnBr,2.0,30.0,10.0,2.0,THF,0.15,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
3098928,n-BuZnBr,2.0,30.0,10.0,2.0,CPME,0.10,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
3111392,n-BuZnBr,2.0,30.0,10.0,3.0,PhMe,0.10,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,4
124706,n-PrZnBr,2.0,30.0,10.0,3.0,Dioxane,0.05,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,1,2,3
1600132,n-PrMgBr,2.0,30.0,10.0,2.0,dibutyl ether,0.20,67,-0.803842,1.933998,...,0.926201,4.057338,0.0,-12.925720,-8.392010,6.086396,1.954971,0,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1555249,n-PrMgBr,2.0,30.0,5.0,3.0,MTBE,0.15,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,0,2,3
15945,n-PrZnBr,2.0,30.0,5.0,2.0,Dioxane,0.15,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,3
56453,n-PrZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,3
3047813,n-BuZnBr,2.0,30.0,5.0,3.0,MTBE,0.05,377,-0.817920,1.894899,...,0.934315,5.631674,0.0,-13.059046,-6.405159,6.036507,1.955161,1,2,4


In [14]:
# download test set to local computer
#h_learning.ipynb will use test_set on s3 bucket. Same file unless this file is modified

kraken_pre_feats[all_chosen_feats].to_csv("test_set.csv")