### Use the published autoqchem workflow to generate DFT features for the compounds in the dataset

NOTE: This notebook needs to be run with the python environment for autoqchem.

In [1]:
from autoqchem.molecule import molecule
from autoqchem.sge_manager import sge_manager
from autoqchem.draw_utils import draw
from autoqchem.db_functions import descriptors
from rdkit import Chem
import pandas as pd
import numpy as np
import logging
logging.basicConfig(level=logging.INFO)


In [None]:
# Connect to UCLA's computation cluster; every job-management call below goes through `sm`.
# 'XXXX' is a placeholder for the cluster user name.
cluster_login = {"user": 'XXXX', "host": 'hoffman2.idre.ucla.edu'}
sm = sge_manager(**cluster_login)
sm.connect()

### Load the product smiles strings and create input files for the DFT calculations

In [18]:
# Load the previously saved list of reaction products into a one-column ("SMILES") dataframe.
# NOTE(review): passing `names=` without `header=0` makes pandas treat the first line of the
# file as data, so this presumably expects a CSV without a header row -- confirm against the file.
smiles_csv = "./xec_smiles_bromide_scope.csv"
df_bromides = pd.read_csv(smiles_csv, names=["SMILES"])
df_bromides

Unnamed: 0,SMILES
0,C[C@@H](NC(=O)OC(C)(C)C)c1ccc(Br)cc1
1,CCOC(=O)c1nnc(Br)[nH]1
2,O=C(c1cc2cc(Br)cnc2s1)N1CCOCC1
3,COc1ccccc1Br
4,Cc1ccccc1Br
...,...
376,Brc1ccc([C@@H]2CNCCO2)cc1
377,FC(F)(F)c1ccnc(Nc2cccc(Br)c2)n1
378,CC1(C)Cc2cc(Cl)cc(Br)c2O1
379,Cc1cc(Br)cc(Nc2nccc(C3CC3)n2)c1


In [19]:
# Pull the SMILES column out as a plain Python list; the later cells iterate over this list.
smiles_column = df_bromides["SMILES"]
bromide_smiles = smiles_column.tolist()
print(f"Total number of substrates: {len(bromide_smiles)}")

Total number of substrates: 381


In [96]:
# Build one autoqchem molecule object per SMILES string, with up to 3 conformers for each structure
mols = []
for smiles in bromide_smiles:
    mols.append(molecule(smiles, num_conf=3))




In [12]:
# Visual sanity check: draw one of the generated compounds (here the entry at position 33)
example_mol = mols[33]
draw(example_mol.mol)

interactive(children=(Dropdown(description='confId', options=(0, 1), value=0), Output()), _dom_classes=('widge…

<function autoqchem.draw_utils._graph_conf(m, confId=0, energies=[])>

In [13]:
# Create the Gaussian jobs locally, using the same calculation settings for every molecule
gaussian_settings = dict(
    theory="APFD",
    heavy_basis_set="LANL2DZ",
    light_basis_set='6-31G*',
    max_light_atomic_number=36,
)
for mol in mols:
    sm.create_jobs_for_molecule(mol, **gaussian_settings)

INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 3 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 3 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 2 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 3 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 2 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 1 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 1 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 2 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 3 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input files for 3 conformations.
INFO:autoqchem.gaussian_input_generator:Generating Gaussian input file

Note that 10 compounds had already been calculated with the same parameters in other datasets and were therefore not recalculated.

### Manage the DFT jobs on the cluster

In [47]:
# Submit jobs
# The manager logs how many jobs it submits and the cluster job_id assigned to each one.
sm.submit_jobs()

INFO:autoqchem.sge_manager:Submitting 27 jobs.
  0%|          | 0/27 [00:00<?, ?it/s]INFO:autoqchem.sge_manager:Submitted job 3d30411824927e811c05cb5626a838f3, job_id: 10345980.
  4%|▎         | 1/27 [00:01<00:50,  1.93s/it]INFO:autoqchem.sge_manager:Submitted job 2e504a805b00190a3ab5fdfda85f1f85, job_id: 10345981.
  7%|▋         | 2/27 [00:03<00:42,  1.69s/it]INFO:autoqchem.sge_manager:Submitted job 3fcc6d5fc435923088ca8851a336691e, job_id: 10345982.
 11%|█         | 3/27 [00:04<00:38,  1.62s/it]INFO:autoqchem.sge_manager:Submitted job f06f803388e93d4a02e78bf31bdab917, job_id: 10345983.
 15%|█▍        | 4/27 [00:06<00:36,  1.59s/it]INFO:autoqchem.sge_manager:Submitted job ac92c560c3c110961ed26f4cefc47066, job_id: 10345984.
 19%|█▊        | 5/27 [00:08<00:35,  1.61s/it]INFO:autoqchem.sge_manager:Submitted job 4c3dd87e34bfd9cef85de420c2364968, job_id: 10345985.
 22%|██▏       | 6/27 [00:09<00:33,  1.59s/it]INFO:autoqchem.sge_manager:Submitted job c4c6fa4eb39c31e212be7ee1a7cc163a, job_id

In [5]:
# Resubmit jobs that did not finish properly
# (only logs a message when there are no incomplete jobs to resubmit)
sm.resubmit_incomplete_jobs()

INFO:autoqchem.sge_manager:There are no incomplete jobs to resubmit.


In [2]:
# Retrieve finished jobs from the cluster
# (only logs a message when no jobs are currently submitted to the cluster)
sm.retrieve_jobs()

INFO:autoqchem.sge_manager:There are no jobs submitted to cluster. Nothing to retrieve.


In [None]:
# Upload data for finished compounds to the autoqchem database (autoqchem.org)
# The tag given here is the same one used further down to query the descriptors back.
sm.upload_done_molecules_to_db(tags=["SVR_MacMillanXEC_ArBr"])

### Get the descriptors from the autoqchem database

In [43]:
# Download the descriptors of everything stored under this notebook's tag.
# Functional and basis set are the same values that were used when the jobs were created.
descriptor_query = {
    "tags": ["SVR_MacMillanXEC_ArBr"],
    "presets": ["global", "substructure"],
    "conf_option": "boltzmann",
    "solvent": "None",
    "functional": "APFD",
    "basis_set": "6-31G*",
    "substructure": "cBr",
}
data = descriptors(**descriptor_query)

In [44]:
# Process the data so that it is in one dataframe.
# The dataframes stored in `data` are not modified (no in-place drops): new frames are built
# instead, so this cell gives the same result when it is run more than once.
label_dict = {}
processed_frames = {}
for key in data:
    frame = data[key]
    if key == "global":
        processed_frames[key] = frame.drop(columns=["converged", "multiplicity"])
        continue
    # atom descriptor dataframes are by default called atom1, atom2, etc. --> replace with the
    # atom type and a running number (e. g. "C1" and "C2")
    atom_type = frame.iloc[0, -1]  # first row of the last column is used as the atom type
    label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
    label = atom_type + str(label_dict[atom_type])
    atom_frame = frame.drop(columns=["labels", "X", "Y", "Z"])
    atom_frame.columns = [f"{label}_{column}" for column in atom_frame.columns]
    processed_frames[key] = atom_frame

df_combined = pd.concat(processed_frames, axis=1)
# concatenating a dict yields (key, column name) pairs as column labels; keep the column name only
df_combined.columns = [multi_column_index[1] for multi_column_index in df_combined.columns]

In [45]:
# Show the combined descriptor table (global and atom-level descriptors side by side)
df_combined

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,Br1_ES_root_NPA_valence,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_NPA_total,Br1_NPA_valence,Br1_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
B(c1c(ccc(c1Br)OCC)F)(O)O,-3231.540851,3.5075,3630.15185,1726.752875,-3231.628761,0.183063,-3231.554710,-3231.596811,0.127104,-3231.539907,...,6.815777,-0.086074,765.158775,2312.589798,0.01855,0.084334,27.999009,34.915666,6.898106,0.475556
B(c1ccc(s1)Br)(O)O,-3299.457693,2.1397,2593.08990,1126.156000,-3299.402577,0.091292,-3299.466757,-3299.503329,0.045656,-3299.456749,...,6.68514,-0.044191,948.7559,2269.7402,0.02098,0.13192,27.99896,34.86808,6.84814,0.369651
B1(OC(C(O1)(C)C)(C)C)c2ccc(cc2)Br,-3213.049261,2.2397,7563.16400,1709.256000,-3213.247530,0.280880,-3213.064991,-3213.108849,0.221291,-3213.048316,...,6.85791,-0.110674,1074.2565,2216.7303,0.01778,0.06333,27.99911,34.93667,6.91978,0.374921
C#Cc1ccc(cc1)Br,-2878.927637,1.4534,2237.10170,1121.382000,-2879.035644,0.108007,-2878.935466,-2878.968515,0.067129,-2878.926693,...,6.83711,-0.100266,1065.2392,2212.6936,0.01802,0.07411,27.9991,34.92589,6.90878,0.375393
C#Cc1ccccc1Br,-2878.926724,0.8340,1653.06000,1126.144000,-2879.033831,0.108103,-2878.934485,-2878.968033,0.066794,-2878.925780,...,6.72542,-0.085612,986.9976,2225.2734,0.01801,0.09159,27.99906,34.90841,6.89133,0.402297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
c1csc(c1C(=O)O)Br,-3312.017336,3.4126,1857.40080,1201.187000,-3312.094419,0.080964,-3312.025510,-3312.060534,0.037766,-3312.016392,...,6.66052,-0.005087,836.567,2217.0024,0.02115,0.16721,27.99893,34.83279,6.81272,0.40686
c1csc(n1)Br,-3139.649215,3.9538,1052.24420,939.031000,-3139.698151,0.050962,-3139.654401,-3139.684950,0.015227,-3139.648271,...,6.82329,-0.022554,1211.569,2111.428,0.02067,0.13306,27.99894,34.86694,6.84732,0.363436
c1nc([nH]n1)Br,-2812.903766,3.7585,853.33380,910.640000,-2812.957389,0.055401,-2812.908491,-2812.938133,0.021035,-2812.902822,...,6.91168,-0.033335,986.5422,2300.4773,0.02045,0.12789,27.99892,34.87211,6.85274,0.356199
c1nnc(s1)Br,-3155.667110,1.4351,1026.02200,703.107000,-3155.703156,0.038571,-3155.672146,-3155.702621,0.003060,-3155.666166,...,6.81651,0.007124,1050.1952,2186.8076,0.02119,0.16269,27.9989,34.83731,6.81722,0.363279


Get the missing compounds from the other datasets.

In [46]:
# Rewrite the index as RDKit canonical smiles so that it can be compared with the input list
df_combined.index = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles)) for smiles in df_combined.index]

# One smiles is excluded by hand from the search for missing bromides: it gave errors in the
# descriptor calculations due to the isotope. This smiles was manually changed to the smiles
# "n1cscc1Br" for the autoqchem calculations, which is the protiated version of the original
# deuterium-labelled compound. It was assumed that the isotope will have a negligible effect
# on the reactivity.
deuterated_smiles = Chem.MolToSmiles(Chem.MolFromSmiles('[2H]c1nc(cs1)Br'))

# Look for bromides that have no row in the descriptor table.
# NOTE(review): the entries of bromide_smiles are compared as loaded, without canonicalizing
# them first -- this presumably relies on the CSV already holding canonical smiles; confirm.
missing_bromides = [
    bromide
    for bromide in bromide_smiles
    if bromide not in df_combined.index and bromide != deuterated_smiles
]
print(f"There are {len(missing_bromides)} missing bromides.")

There are 10 missing bromides.


In [47]:
other_dsets = ["arylbromides_ssg","ArBr_NiAr_WCG"]
# canonicalize so that the comparison with the canonicalized dataset indices below is like-for-like
missing_bromides = [Chem.MolToSmiles(Chem.MolFromSmiles(bromide),canonical=True) for bromide in missing_bromides]
dfs_missing = {}
for dset in other_dsets:
    # Download the descriptors from the other datasets
    data_missing = descriptors(tags=[dset],presets=["global","substructure"],conf_option="boltzmann",solvent="None",
                               functional="APFD",basis_set="6-31G*",substructure="cBr")
    # Process the data so that it is in one dataframe.
    # Iterate over the keys of *this* download (not over the result of an earlier cell) and
    # build new frames instead of dropping columns in place.
    label_dict = {}
    frames_missing = {}
    for key in data_missing:
        frame = data_missing[key]
        if key == "global":
            frames_missing[key] = frame.drop(columns=["converged", "multiplicity"])
            continue
        # atom descriptor dataframes are by default called atom1, atom2, etc. --> replace with the
        # atom type and a running number (e. g. "C1" and "C2")
        atom_type = frame.iloc[0, -1]  # first row of the last column is used as the atom type
        label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
        label = atom_type + str(label_dict[atom_type])
        atom_frame = frame.drop(columns=["labels", "X", "Y", "Z"])
        atom_frame.columns = [f"{label}_{column}" for column in atom_frame.columns]
        frames_missing[key] = atom_frame

    df_dset = pd.concat(frames_missing, axis=1)
    # keep only the column name of the (key, column name) pairs created by the concatenation
    df_dset.columns = [multi_column_index[1] for multi_column_index in df_dset.columns]
    df_dset.index = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles),canonical=True) for smiles in df_dset.index]
    # only the compounds that are missing in the main dataset are of interest
    dfs_missing[dset] = df_dset.loc[df_dset.index.isin(missing_bromides)]

df_missing = pd.concat(dfs_missing, axis=0)
# drop the dataset name from the (dataset, smiles) index pairs
df_missing.index = [multi_index[1] for multi_index in df_missing.index]
# a compound found in more than one of the other datasets must only be taken over once
df_missing = df_missing.loc[~df_missing.index.duplicated(keep="first")]
df_missing

Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,Br1_ES_root_NPA_valence,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_NPA_total,Br1_NPA_valence,Br1_VBur
Brc1cnc2nccn2c1,-2966.365637,4.8948,2244.333,1144.483,-2966.46306,0.103958,-2966.372867,-2966.406153,0.063443,-2966.364693,...,6.89658,-0.071188,943.4742,2341.6409,0.01895,0.1008,27.99908,34.8992,6.88117,0.375393
CC(C)(C)OC(=O)N1CCCCC1c1ccc(Br)cc1,-3398.669046,3.184877,10137.509207,2627.866793,-3399.043554,0.380832,-3398.688841,-3398.739903,0.309975,-3398.668101,...,6.807738,-0.117857,1073.395062,2251.919513,0.017846,0.05766,27.99911,34.94234,6.925384,0.376253
Cc1cn2cc(F)cc(Br)c2n1,-3088.759492,4.7685,2910.936,1309.099,-3088.894837,0.138535,-3088.769519,-3088.80638,0.091647,-3088.758547,...,6.8847,-0.045783,877.1748,2285.202,0.01907,0.12723,27.99902,34.87277,6.85467,0.385935
Clc1noc2ccc(Br)cc12,-3429.649744,5.6702,3247.5975,1133.556,-3429.742333,0.095052,-3429.658209,-3429.693697,0.051099,-3429.6488,...,6.69454,-0.087104,1113.2342,2219.6704,0.01811,0.08306,27.9991,34.91694,6.89973,0.377124
FC(F)(F)c1nc2ccc(Br)cc2s1,-3629.928558,4.8149,5633.6337,1116.239,-3630.031328,0.109723,-3629.940019,-3629.981208,0.057073,-3629.927614,...,6.83295,-0.087355,1085.5414,2207.6492,0.01812,0.08495,27.9991,34.91505,6.89783,0.377911
O=C1CCc2ncc(Br)cc2N1,-3064.781081,0.8217,3543.2396,1386.651,-3064.930211,0.152538,-3064.790771,-3064.827291,0.106327,-3064.780137,...,6.86037,-0.090537,1000.2284,2283.2781,0.01833,0.08334,27.99909,34.91666,6.89923,0.374685
O=C(O)c1nc(Br)cs1,-3328.063391,0.9678,2296.7037,846.744,-3328.127785,0.068971,-3328.071358,-3328.106147,0.026215,-3328.062447,...,6.87674,-0.04506,1099.5199,2223.5756,0.01968,0.1192,27.999,34.8808,6.86212,0.363436
COC(=O)c1cnc2[nH]cc(Br)c2c1,-3178.002918,8.554152,4494.894471,1656.668248,-3178.163254,0.164616,-3178.014995,-3178.055122,0.112412,-3178.001974,...,6.699005,-0.092303,735.527369,2485.693371,0.019735,0.09782,27.999,34.90218,6.88344,0.373893
FC(F)(F)c1cc2ccc(Br)cc2cn1,-3309.171551,5.3772,6034.5086,1944.921,-3309.305128,0.1433,-3309.183313,-3309.223645,0.091206,-3309.170607,...,6.76438,-0.084894,1087.1564,2201.6786,0.01802,0.08641,27.99909,34.91359,6.89648,0.377281
O=c1cc(O)c2ccc(Br)cc2o1,-3142.569127,3.789804,3896.268543,1433.087856,-3142.695844,0.133261,-3142.579345,-3142.616462,0.085927,-3142.568183,...,6.859183,-0.082459,1070.058017,2191.142049,0.018092,0.090083,27.99909,34.909917,6.892745,0.376964


In [48]:
# Combine the data to get the final dataset.
# Rows that already carry one of the recovered smiles are removed before appending, so that
# running this cell again does not add the recovered compounds a second time.
df_combined = pd.concat(
    [df_combined.drop(index=df_missing.index, errors="ignore"), df_missing],
    axis=0,
)
df_combined

Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,Br1_ES_root_NPA_valence,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_NPA_total,Br1_NPA_valence,Br1_VBur
CCOc1ccc(F)c(B(O)O)c1Br,-3231.540851,3.507500,3630.151850,1726.752875,-3231.628761,0.183063,-3231.554710,-3231.596811,0.127104,-3231.539907,...,6.815777,-0.086074,765.158775,2312.589798,0.01855,0.084334,27.999009,34.915666,6.898106,0.475556
OB(O)c1ccc(Br)s1,-3299.457693,2.139700,2593.089900,1126.156000,-3299.402577,0.091292,-3299.466757,-3299.503329,0.045656,-3299.456749,...,6.68514,-0.044191,948.7559,2269.7402,0.02098,0.13192,27.99896,34.86808,6.84814,0.369651
CC1(C)OB(c2ccc(Br)cc2)OC1(C)C,-3213.049261,2.239700,7563.164000,1709.256000,-3213.247530,0.280880,-3213.064991,-3213.108849,0.221291,-3213.048316,...,6.85791,-0.110674,1074.2565,2216.7303,0.01778,0.06333,27.99911,34.93667,6.91978,0.374921
C#Cc1ccc(Br)cc1,-2878.927637,1.453400,2237.101700,1121.382000,-2879.035644,0.108007,-2878.935466,-2878.968515,0.067129,-2878.926693,...,6.83711,-0.100266,1065.2392,2212.6936,0.01802,0.07411,27.9991,34.92589,6.90878,0.375393
C#Cc1ccccc1Br,-2878.926724,0.834000,1653.060000,1126.144000,-2879.033831,0.108103,-2878.934485,-2878.968033,0.066794,-2878.925780,...,6.72542,-0.085612,986.9976,2225.2734,0.01801,0.09159,27.99906,34.90841,6.89133,0.402297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C1CCc2ncc(Br)cc2N1,-3064.781081,0.821700,3543.239600,1386.651000,-3064.930211,0.152538,-3064.790771,-3064.827291,0.106327,-3064.780137,...,6.86037,-0.090537,1000.2284,2283.2781,0.01833,0.08334,27.99909,34.91666,6.89923,0.374685
O=C(O)c1nc(Br)cs1,-3328.063391,0.967800,2296.703700,846.744000,-3328.127785,0.068971,-3328.071358,-3328.106147,0.026215,-3328.062447,...,6.87674,-0.04506,1099.5199,2223.5756,0.01968,0.1192,27.999,34.8808,6.86212,0.363436
COC(=O)c1cnc2[nH]cc(Br)c2c1,-3178.002918,8.554152,4494.894471,1656.668248,-3178.163254,0.164616,-3178.014995,-3178.055122,0.112412,-3178.001974,...,6.699005,-0.092303,735.527369,2485.693371,0.019735,0.09782,27.999,34.90218,6.88344,0.373893
FC(F)(F)c1cc2ccc(Br)cc2cn1,-3309.171551,5.377200,6034.508600,1944.921000,-3309.305128,0.143300,-3309.183313,-3309.223645,0.091206,-3309.170607,...,6.76438,-0.084894,1087.1564,2201.6786,0.01802,0.08641,27.99909,34.91359,6.89648,0.377281


In [49]:
def feature_preprocessing(df, corr_threshold=0.95, max_unique_values=2):
    """
    Function for removing non-varied and highly correlated features.
    Take a df as input and returns it in processed form.

    Parameters
    ----------
    df : pd.DataFrame
        Feature table with one column per feature. The passed object is not modified.
    corr_threshold : float, default 0.95
        A column is removed when its absolute correlation (as computed by ``df.corr()``)
        with any column further to the left is larger than this value.
    max_unique_values : int, default 2
        Columns with at most this many unique values are removed as non-varied.

    Returns
    -------
    pd.DataFrame
        New dataframe without the removed columns. The names of all removed columns
        are printed.
    """
    # Remove columns that have only one or two unique values (with the default setting).
    low_variation = [
        column for column in df.columns
        if len(np.unique(df[column].values)) <= max_unique_values
    ]
    df = df.drop(low_variation, axis=1)

    # Remove highly correlated features
    corr_matrix = df.corr().abs()
    # Only the strict upper triangle is kept: every pair is looked at once, and a column is
    # only ever compared with the columns to its left.
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if (upper[column] > corr_threshold).any()]
    df = df.drop(to_drop, axis=1)

    # Report the non-varied columns first, then the ones removed due to correlation
    removed_columns = low_variation + to_drop
    print(f"The following features were removed: {removed_columns}")

    return df

# Filter the features of the full dataset; the function prints the names of the removed columns.
df_processed = feature_preprocessing(df_combined)
df_processed

The following features were removed: ['charge', 'E_scf', 'E_zpe', 'G', 'G_thermal_correction', 'H', 'H_thermal_correction', 'electronic_spatial_extent', 'number_of_atoms', 'zero_point_correction', 'C1_ES_root_NPA_total', 'C1_ES_root_NPA_valence', 'C1_Mulliken_charge', 'C1_NPA_core', 'C1_NPA_total', 'C1_NPA_valence', 'Br1_ES_root_NPA_charge', 'Br1_ES_root_NPA_total', 'Br1_ES_root_NPA_valence', 'Br1_NPA_total', 'Br1_NPA_valence']


Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_thermal_correction,dipole,electronegativity,hardness,homo_energy,lumo_energy,...,Br1_ES_root_Mulliken_charge,Br1_ES_root_NPA_Rydberg,Br1_ES_root_NPA_core,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_VBur
CCOc1ccc(F)c(B(O)O)c1Br,-3231.540851,3.507500,3630.151850,1726.752875,0.183063,2.352038,0.141368,0.104404,-0.245772,-0.036964,...,-0.021834,0.020462,27.998934,-0.086074,765.158775,2312.589798,0.01855,0.084334,27.999009,0.475556
OB(O)c1ccc(Br)s1,-3299.457693,2.139700,2593.089900,1126.156000,0.091292,1.166600,0.140780,0.100210,-0.240990,-0.040570,...,0.086668,0.02459,27.99881,-0.044191,948.7559,2269.7402,0.02098,0.13192,27.99896,0.369651
CC1(C)OB(c2ccc(Br)cc2)OC1(C)C,-3213.049261,2.239700,7563.164000,1709.256000,0.280880,2.232500,0.137680,0.107840,-0.245520,-0.029840,...,-0.059843,0.01919,27.99905,-0.110674,1074.2565,2216.7303,0.01778,0.06333,27.99911,0.374921
C#Cc1ccc(Br)cc1,-2878.927637,1.453400,2237.101700,1121.382000,0.108007,1.189900,0.140565,0.100035,-0.240600,-0.040530,...,-0.04278,0.01881,27.99902,-0.100266,1065.2392,2212.6936,0.01802,0.07411,27.9991,0.375393
C#Cc1ccccc1Br,-2878.926724,0.834000,1653.060000,1126.144000,0.108103,1.776500,0.141540,0.102190,-0.243730,-0.039350,...,0.054455,0.01891,27.99889,-0.085612,986.9976,2225.2734,0.01801,0.09159,27.99906,0.402297
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C1CCc2ncc(Br)cc2N1,-3064.781081,0.821700,3543.239600,1386.651000,0.152538,1.004500,0.141895,0.101565,-0.243460,-0.040330,...,-0.06787,0.01989,27.99903,-0.090537,1000.2284,2283.2781,0.01833,0.08334,27.99909,0.374685
O=C(O)c1nc(Br)cs1,-3328.063391,0.967800,2296.703700,846.744000,0.068971,4.374300,0.181120,0.093130,-0.274250,-0.087990,...,-0.070642,0.0198,27.99898,-0.04506,1099.5199,2223.5756,0.01968,0.1192,27.999,0.363436
COC(=O)c1cnc2[nH]cc(Br)c2c1,-3178.002918,8.554152,4494.894471,1656.668248,0.164616,2.466699,0.140353,0.092141,-0.232493,-0.048212,...,0.091154,0.018729,27.998885,-0.092303,735.527369,2485.693371,0.019735,0.09782,27.999,0.373893
FC(F)(F)c1cc2ccc(Br)cc2cn1,-3309.171551,5.377200,6034.508600,1944.921000,0.143300,3.817300,0.165160,0.090360,-0.255520,-0.074800,...,0.031004,0.01792,27.99897,-0.084894,1087.1564,2201.6786,0.01802,0.08641,27.99909,0.377281


In [50]:
# Map the yields: load the labels, index them by canonical smiles and look up each compound
df_labels = (
    pd.read_csv("./../../xec_raw_dset_bromide_scope.csv", index_col=0, header=0)
    .set_index("ArX_Smiles")
)
# both tables need the same (canonical) smiles representation for the lookup to match
df_labels.index = [Chem.MolToSmiles(Chem.MolFromSmiles(label_smiles), canonical=True) for label_smiles in df_labels.index]
df_processed.index = [Chem.MolToSmiles(Chem.MolFromSmiles(feature_smiles), canonical=True) for feature_smiles in df_processed.index]

yield_by_smiles = df_labels["CAD Yield"]
df_processed["yield"] = df_processed.index.map(yield_by_smiles)

In [51]:
# Check that all yields were assigned: list the compounds that still have no yield
has_no_yield = df_processed["yield"].isna()
df_processed[has_no_yield]

Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_thermal_correction,dipole,electronegativity,hardness,homo_energy,lumo_energy,...,Br1_ES_root_NPA_Rydberg,Br1_ES_root_NPA_core,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_VBur,yield
Brc1cscn1,-3139.649994,2.913,1160.4273,679.615,0.050946,2.4799,0.147265,0.107525,-0.25479,-0.03974,...,0.01915,27.99873,-0.064025,1082.4151,2239.8811,0.0196,0.10187,27.99901,0.362807,


In [52]:
# The compound without a yield is the one for which the isotope label was manually removed.
# Its yield is stored under the deuterated smiles in the label table: look it up there and
# assign it by hand to the protiated smiles used in the descriptor table.
deut_smiles = Chem.MolToSmiles(Chem.MolFromSmiles("[2H]c1nc(cs1)Br"), canonical=True)
deut_yield = df_labels.loc[deut_smiles, "CAD Yield"]
df_processed.loc["Brc1cscn1", "yield"] = deut_yield
print(f'Now, there are {len(df_processed[df_processed["yield"].isna()])} compounds without an assigned yield.')

Now, there are 0 compounds without an assigned yield.


In [53]:
# Save the dataset
# (the remaining features plus the "yield" column; the smiles index is written as well)
df_processed.to_csv("./../xec_dft_data_bromide_scope.csv",index=True,header=True)

Also generate the pruned dataset

In [54]:
# Load the labels of the pruned substrate scope and index them by canonical smiles
df_labels = pd.read_csv("./../../xec_raw_dset_bromide_scope_pruned.csv",index_col=0,header=0)
df_labels = df_labels.set_index("ArX_Smiles")
df_labels.index = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles),canonical=True) for smiles in df_labels.index]
df_combined.index = [Chem.MolToSmiles(Chem.MolFromSmiles(smiles),canonical=True) for smiles in df_combined.index]

# The deuterium-labelled bromide was calculated as its protiated analogue, so its descriptor row
# carries the protiated smiles while the label table lists the deuterated one. Without relabelling,
# the filter below would silently drop this compound and the later check for unassigned yields
# could not reveal it. The entry is only renamed if present and if this creates no duplicate.
deuterated_smiles = Chem.MolToSmiles(Chem.MolFromSmiles("[2H]c1nc(cs1)Br"),canonical=True)
protiated_smiles = Chem.MolToSmiles(Chem.MolFromSmiles("n1cscc1Br"),canonical=True)
if deuterated_smiles in df_labels.index and protiated_smiles not in df_labels.index:
    df_labels = df_labels.rename(index={deuterated_smiles: protiated_smiles})

# NOTE: from here on df_combined only holds the pruned subset
df_combined = df_combined.loc[df_combined.index.isin(df_labels.index.to_list())]
print("Remaining substrates in the search space:", len(df_combined))

Remaining substrates in the search space: 258


In [55]:
# Filter the features again on the pruned subset (df_combined was reduced to it above);
# the function prints the names of the removed columns.
df_processed_pruned = feature_preprocessing(df_combined)
df_processed_pruned

The following features were removed: ['charge', 'E_scf', 'E_zpe', 'G', 'G_thermal_correction', 'H', 'H_thermal_correction', 'electronic_spatial_extent', 'number_of_atoms', 'zero_point_correction', 'C1_ES_root_NPA_total', 'C1_ES_root_NPA_valence', 'C1_Mulliken_charge', 'C1_NPA_core', 'C1_NPA_total', 'C1_NPA_valence', 'Br1_ES_root_NPA_charge', 'Br1_ES_root_NPA_total', 'Br1_ES_root_NPA_valence', 'Br1_NPA_total', 'Br1_NPA_valence']


Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_thermal_correction,dipole,electronegativity,hardness,homo_energy,lumo_energy,...,Br1_ES_root_Mulliken_charge,Br1_ES_root_NPA_Rydberg,Br1_ES_root_NPA_core,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_VBur
CCOc1ccc(F)c(B(O)O)c1Br,-3231.540851,3.507500,3630.151850,1726.752875,0.183063,2.352038,0.141368,0.104404,-0.245772,-0.036964,...,-0.021834,0.020462,27.998934,-0.086074,765.158775,2312.589798,0.01855,0.084334,27.999009,0.475556
OB(O)c1ccc(Br)s1,-3299.457693,2.139700,2593.089900,1126.156000,0.091292,1.166600,0.140780,0.100210,-0.240990,-0.040570,...,0.086668,0.02459,27.99881,-0.044191,948.7559,2269.7402,0.02098,0.13192,27.99896,0.369651
CC1(C)OB(c2ccc(Br)cc2)OC1(C)C,-3213.049261,2.239700,7563.164000,1709.256000,0.280880,2.232500,0.137680,0.107840,-0.245520,-0.029840,...,-0.059843,0.01919,27.99905,-0.110674,1074.2565,2216.7303,0.01778,0.06333,27.99911,0.374921
C#Cc1ccc(Br)cc1,-2878.927637,1.453400,2237.101700,1121.382000,0.108007,1.189900,0.140565,0.100035,-0.240600,-0.040530,...,-0.04278,0.01881,27.99902,-0.100266,1065.2392,2212.6936,0.01802,0.07411,27.9991,0.375393
C=CCc1ccc(Br)c(F)c1,-3018.580396,1.392737,3174.053561,1238.438943,0.155232,2.728584,0.132611,0.112955,-0.245566,-0.019655,...,0.00069,0.020571,27.99897,-0.080558,828.768086,2361.732075,0.019085,0.091399,27.99907,0.388583
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CC(C)(C)OC(=O)N1CCCCC1c1ccc(Br)cc1,-3398.669046,3.184877,10137.509207,2627.866793,0.380832,3.943029,0.122275,0.114605,-0.236880,-0.007671,...,-0.019984,0.018624,27.998986,-0.117857,1073.395062,2251.919513,0.017846,0.05766,27.99911,0.376253
Cc1cn2cc(F)cc(Br)c2n1,-3088.759492,4.768500,2910.936000,1309.099000,0.138535,3.026200,0.136640,0.089700,-0.226340,-0.046940,...,-0.088094,0.02148,27.99904,-0.045783,877.1748,2285.202,0.01907,0.12723,27.99902,0.385935
O=C1CCc2ncc(Br)cc2N1,-3064.781081,0.821700,3543.239600,1386.651000,0.152538,1.004500,0.141895,0.101565,-0.243460,-0.040330,...,-0.06787,0.01989,27.99903,-0.090537,1000.2284,2283.2781,0.01833,0.08334,27.99909,0.374685
FC(F)(F)c1cc2ccc(Br)cc2cn1,-3309.171551,5.377200,6034.508600,1944.921000,0.143300,3.817300,0.165160,0.090360,-0.255520,-0.074800,...,0.031004,0.01792,27.99897,-0.084894,1087.1564,2201.6786,0.01802,0.08641,27.99909,0.377281


In [56]:
# Map the yields of the pruned label table onto the pruned feature table via the smiles index
pruned_yield_by_smiles = df_labels["CAD Yield"]
df_processed_pruned["yield"] = df_processed_pruned.index.map(pruned_yield_by_smiles)

In [57]:
# Check that all yields were assigned: list the pruned compounds that still have no yield
pruned_has_no_yield = df_processed_pruned["yield"].isna()
df_processed_pruned[pruned_has_no_yield]

Unnamed: 0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_thermal_correction,dipole,electronegativity,hardness,homo_energy,lumo_energy,...,Br1_ES_root_NPA_Rydberg,Br1_ES_root_NPA_core,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_VBur,yield


In [58]:
# Save the dataset
# (pruned variant: the remaining features plus the "yield" column; the smiles index is written as well)
df_processed_pruned.to_csv("./../xec_dft_data_bromide_scope_pruned.csv",index=True,header=True)