### Use the published autoqchem workflow to generate DFT features for the compounds in the dataset

NOTE: This notebook needs to be run with the python environment for autoqchem.

In [2]:
from autoqchem.molecule import molecule
from autoqchem.sge_manager import sge_manager
from autoqchem.draw_utils import draw
from autoqchem.db_functions import descriptors
from rdkit import Chem
import pandas as pd
import logging
logging.basicConfig(level=logging.INFO)


In [None]:
# connect to UCLA's computation cluster
# NOTE(review): 'XXXX' looks like a redacted placeholder -- presumably it has to be replaced with a real cluster user name.
# `sm` is the manager object used by all job-handling cells below.
sm = sge_manager(user='XXXX', host='hoffman2.idre.ucla.edu')
sm.connect()

### Load the product smiles strings and create input files for the DFT calculations

In [6]:
# Read the previously saved SMILES table; the first line of the CSV is taken as the header row
smiles_csv = "./xec_smiles_acid_scope_subs.csv"
df_smiles = pd.read_csv(smiles_csv, header=0)
df_smiles

Unnamed: 0,0
0,COC(=O)CC1CCc2cc(Br)cc3[nH]c(=O)c(=O)n1c23
1,CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(Br)ccc1-2
2,CN1C(=O)C[C@@](C)(c2cc(Br)cs2)N/C1=N/C(=O)OC(C...
3,O=C(O)C1CC1
4,O=C(O)C1(c2ccc(Cl)cc2)CC1
...,...
382,Cn1ncc2ccc(CC(=O)O)cc21
383,C[C@@H]1CC[C@H](C(=O)O)CN1C(=O)OCc1ccccc1
384,COC(=O)C1CN(C(=O)OC(C)(C)C)CCC1C(=O)O
385,CC(C)(C)OC(=O)N1CC2(C1)CS(=O)(=O)CC2C(=O)O


In [7]:
# Select the column at position 0 of the loaded table; it is used below as the collection of SMILES strings.
# NOTE(review): presumably position 0 is the SMILES column and not a saved index column -- confirm against the CSV.
smiles_list = df_smiles.iloc[:,0]
len(smiles_list)

387

In [10]:
# generate molecule objects with up to 8 conformers for each structure
# Iterate over `smiles_list` (built from the loaded CSV); the previously used name
# `product_smiles` is never defined in this notebook and fails on a fresh kernel.
mols = [molecule(s, num_conf=8) for s in smiles_list]

In [12]:
# check the compounds by drawing them
# (shown here for the molecule at list position 29; change the index to inspect another compound)
draw(mols[29].mol)

interactive(children=(Dropdown(description='confId', options=(0, 1, 2, 3, 4, 5, 6, 7), value=0), Output()), _d…

<function autoqchem.draw_utils._graph_conf(m, confId=0, energies=[])>

In [None]:
# create Gaussian jobs locally -- one call per molecule, with identical calculation settings for all of them
for mol in mols:
    sm.create_jobs_for_molecule(
        mol,
        theory="APFD",
        heavy_basis_set="def2tzvp",
        light_basis_set='def2svp',
        max_light_atomic_number=10,
    )

### Manage the DFT jobs on the cluster

In [5]:
# Submit jobs
# (uses the `sm` manager created at the top of the notebook; the log reports one line per submitted job)
sm.submit_jobs()

INFO:autoqchem.sge_manager:Submitting 5115 jobs.
  0%|          | 0/5115 [00:00<?, ?it/s]INFO:paramiko.transport.sftp:[chan 1] Opened sftp connection (server version 3)
INFO:autoqchem.sge_manager:Submitted job 452c8236418da918c5e6b3c51f99c5d2, job_id: 10194892.
  0%|          | 1/5115 [00:03<4:42:42,  3.32s/it]INFO:autoqchem.sge_manager:Submitted job 293b20cd44ba9349caae40bd75260026, job_id: 10194893.
  0%|          | 2/5115 [00:05<3:32:38,  2.50s/it]INFO:autoqchem.sge_manager:Submitted job d8f63b99afdf9636fc44bfa92ac364b9, job_id: 10194894.
  0%|          | 3/5115 [00:07<3:08:39,  2.21s/it]INFO:autoqchem.sge_manager:Submitted job 33048eb2e10feb11eb536b24f45824d8, job_id: 10194895.
  0%|          | 4/5115 [00:08<2:52:14,  2.02s/it]INFO:autoqchem.sge_manager:Submitted job 72023b64bf749bfe087be399681012b4, job_id: 10194896.
  0%|          | 5/5115 [00:10<2:40:28,  1.88s/it]INFO:autoqchem.sge_manager:Submitted job 4849e6bc554f46516b08bee21d18c13e, job_id: 10194897.
  0%|          | 6/5115

KeyboardInterrupt: 

In [3]:
# Resubmit jobs that did not finish properly
# (the manager logs these as "failed jobs" and reports a new job_id for each resubmission)
sm.resubmit_incomplete_jobs()

INFO:autoqchem.sge_manager:Resubmitting failed jobs:
  0%|          | 0/17 [00:00<?, ?it/s]INFO:autoqchem.sge_manager:Submitted job 78d18a1699498c04fc6e7aebd69e9982, job_id: 10092196.
  6%|▌         | 1/17 [00:02<00:41,  2.58s/it]INFO:autoqchem.sge_manager:Submitted job f4ff46dcd99ac9db16e9b09ec57b69a1, job_id: 10092197.
 12%|█▏        | 2/17 [00:04<00:30,  2.04s/it]INFO:autoqchem.sge_manager:Submitted job 1b6d9e07fe92efcdc980bdbe731011c7, job_id: 10092198.
 18%|█▊        | 3/17 [00:05<00:26,  1.89s/it]INFO:autoqchem.sge_manager:Submitted job 5810cdfd89c0b027fe3af3348c3e27d3, job_id: 10092199.
 24%|██▎       | 4/17 [00:07<00:23,  1.84s/it]INFO:autoqchem.sge_manager:Submitted job 597073b107ba683cc30247887b5574c7, job_id: 10092200.
 29%|██▉       | 5/17 [00:09<00:21,  1.78s/it]INFO:autoqchem.sge_manager:Submitted job b3ca31108e6e1762d2b98c63e49669de, job_id: 10092201.
 35%|███▌      | 6/17 [00:11<00:19,  1.75s/it]INFO:autoqchem.sge_manager:Submitted job f83834be5fbd30397ba197ac6465f675, 

In [None]:
# Retrieve finished jobs from the cluster
# (uses the `sm` manager created at the top of the notebook)
sm.retrieve_jobs()

In [None]:
# Upload data for finished compounds to the autoqchem database (autoqchem.org)
# The tag passed here is the same one used to query the descriptors further below.
sm.upload_done_molecules_to_db(tags=["SVR_MacMillanXEC"])

### Get the descriptors from the autoqchem database

First, get the descriptors for the 3 bromides.

In [95]:
# Download the descriptors for the bromides (compounds tagged "SVR_MacMillanXEC" that match the "cBr" substructure)
data = descriptors(
    tags=["SVR_MacMillanXEC"],
    presets=["global", "substructure"],
    conf_option="boltzmann",
    solvent="None",
    functional="APFD",
    basis_set="def2svp",
    substructure="cBr",
)

In [96]:
# Process the data so that it is in one dataframe.
# The downloaded frames in `data` are NOT modified (no inplace=True): the download lives in a
# separate cell, so mutating `data` here would make a second run of this cell fail because the
# "labels" column would already be gone.
label_dict = {}
data_bromide = {}
for key in data:
    frame = data[key]
    if key == "global":
        # "global" frame: only remove the two columns that are not wanted in the final table
        data_bromide[key] = frame.drop(columns=["converged", "multiplicity"])
        continue
    # atom descriptor dataframes are by default called atom1, atom2, etc. --> replace with the atom type and a running number (e. g. "C1" and "C2")
    # the atom type is read from the last column of the first row (presumably the "labels" column -- confirm)
    atom_type = frame.iloc[0, -1]
    label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
    label = atom_type + str(label_dict[atom_type])
    atom_frame = frame.drop(columns=["labels", "X", "Y", "Z"])
    atom_frame.columns = [f"{label}_{column}" for column in atom_frame.columns]
    data_bromide[key] = atom_frame

# concatenating a dict side by side yields two-level column labels (dict key, column name);
# keep only the second level so that the final table has flat column names
df_bromide = pd.concat(data_bromide, axis=1)
df_bromide.columns = [multi_column_index[1] for multi_column_index in df_bromide.columns]

In [97]:
# display the combined descriptor table for the bromides
df_bromide

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,Br1_ES_root_NPA_valence,Br1_Mulliken_charge,Br1_NMR_anisotropy,Br1_NMR_shift,Br1_NPA_Rydberg,Br1_NPA_charge,Br1_NPA_core,Br1_NPA_total,Br1_NPA_valence,Br1_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CCOC(=O)c1ncn2c1CN(C)C(=O)c1cc(Br)ccc1-2,-3542.222451,7.456172,11392.143394,2262.849124,-3542.511141,0.297904,-3542.242007,-3542.292264,0.228092,-3542.221507,...,6.898692,-0.155115,1243.325483,2063.919134,0.02425,0.081892,27.99911,34.918108,6.894753,0.375827
CN1C(=O)C[C@@](C)(c2cc(Br)cs2)N/C1=N/C(=O)OC(C)(C)C,-3943.891919,6.507722,12911.462626,2613.668236,-3944.244652,0.367085,-3943.915688,-3943.971596,0.287409,-3943.890975,...,6.906679,-0.170085,1166.646775,2162.33304,0.025172,0.0834,27.99906,34.9166,6.892359,0.368236
COC(=O)CC1CCc2cc(Br)cc3[nH]c(=O)c(=O)n1c23,-3524.070081,6.885499,6901.164734,2329.619286,-3524.33937,0.279476,-3524.088138,-3524.13552,0.214037,-3524.069137,...,6.861579,-0.175634,1255.934604,2095.646495,0.02451,0.067722,27.99911,34.932278,6.908648,0.376023


In [98]:
# Save the data
# (CSV with a header row; the dataframe index is written as the first column)
df_bromide.to_csv("./../1_Dataset_Generation/Data_For_Individual_Substrates/xec_dft_descr_bromides.csv",index=True,header=True)

Repeat the same procedure for the acids.

In [236]:
# Download the descriptors for the acids (compounds tagged "SVR_MacMillanXEC" that match the carboxylic acid pattern)
data = descriptors(
    tags=["SVR_MacMillanXEC"],
    presets=["global", "substructure"],
    conf_option="boltzmann",
    solvent="None",
    functional="APFD",
    basis_set="def2svp",
    substructure="[c,C]C(=O)[OH]",
)
# Merge the downloaded frames into a single dataframe
label_dict = {}
for key in data:
    frame = data[key]
    if key == "global":
        # "global" frame: only remove the two columns that are not wanted in the final table
        frame.drop(columns=["converged", "multiplicity"], inplace=True)
        continue
    # atom frames arrive named atom1, atom2, ... --> prefix their columns with the atom type
    # plus a running number instead (e. g. "C1" and "C2")
    atom_type = frame.iloc[0, -1]
    label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
    label = atom_type + str(label_dict[atom_type])
    frame.drop(columns=["labels", "X", "Y", "Z"], inplace=True)
    frame.columns = [f"{label}_{column}" for column in frame.columns]

# side-by-side concatenation gives two-level column labels; keep only the second level
df_acid = pd.concat(data, axis=1)
df_acid.columns = [column_pair[1] for column_pair in df_acid.columns]
df_acid

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,O2_ES_root_NPA_valence,O2_Mulliken_charge,O2_NMR_anisotropy,O2_NMR_shift,O2_NPA_Rydberg,O2_NPA_charge,O2_NPA_core,O2_NPA_total,O2_NPA_valence,O2_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
C#CCCCCCCC(=O)O,-500.798988,0.761858,2861.420521,1536.287003,-501.024103,0.228128,-500.811427,-500.852384,0.174732,-500.798044,...,6.654414,-0.202855,189.077454,146.270783,0.010221,-0.70813,1.999684,8.70813,6.698221,0.370462
C=C1C[C@]23C[C@@]1(O)CC[C@H]2[C@@]12C=C[C@H](O)[C@@](C)(C(=O)O1)[C@H]2[C@@H]3C(=O)O,-1186.441955,3.265400,7237.409300,2326.101000,-1186.842847,0.411340,-1186.463448,-1186.512284,0.341011,-1186.441011,...,6.70485,-0.201909,169.4313,142.8157,0.01009,-0.71354,1.99968,8.71354,6.70378,0.474906
C=CC[C@H](CC(=O)O)NC(=O)OC(C)(C)C,-784.799817,2.130126,4720.991198,2052.312816,-785.109260,0.316699,-784.817996,-784.866027,0.250490,-784.798872,...,6.668095,-0.202661,183.927435,145.370577,0.010238,-0.709632,1.999683,8.709632,6.699712,0.389075
C=C[C@@H]1C[C@]1(NC(=O)OC(C)(C)C)C(=O)O,-783.592022,3.448019,4517.526412,1885.081656,-783.869832,0.292470,-783.609297,-783.655044,0.229449,-783.591078,...,6.694336,-0.192219,165.945639,147.540717,0.009882,-0.703023,1.999683,8.703023,6.693457,0.433593
CC(=O)CC(C(=O)O)c1ccccc1,-650.882866,1.821279,3097.375357,1402.543790,-651.102854,0.223807,-650.896293,-650.938311,0.168362,-650.881921,...,6.68379,-0.184378,170.335674,144.544904,0.010265,-0.69183,1.999661,8.69183,6.681905,0.404615
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C1CC(C(=O)O)CCN1,-513.585362,2.865819,1418.964656,1130.577386,-513.747004,0.165295,-513.594421,-513.629557,0.121100,-513.584418,...,6.713283,-0.196779,179.382001,149.054064,0.010234,-0.706725,1.99968,8.706725,6.696811,0.393586
O=C1CCC(C(=O)O)CC1,-497.539858,2.594684,1458.411943,1235.468180,-497.712096,0.175948,-497.549013,-497.584301,0.131505,-497.538914,...,6.698578,-0.197578,180.944804,148.750766,0.010198,-0.708076,1.99968,8.708076,6.698206,0.40774
O=C1CCC(C(=O)O)Cc2ccccc21,-688.932575,3.299982,2966.877481,1791.833361,-689.161642,0.232772,-688.944782,-688.984092,0.181257,-688.931631,...,6.70012,-0.195811,172.939736,149.64276,0.010072,-0.707039,1.99968,8.707039,6.697287,0.400356
O=C1O[C@]2(CC[C@H](C(=O)O)CC2)c2cnccc21,-857.347135,4.924800,4876.792149,1856.916379,-857.602831,0.263725,-857.361562,-857.404382,0.206478,-857.346191,...,6.698105,-0.199537,175.349453,150.225523,0.010237,-0.707902,1.999681,8.707902,6.697984,0.395898


Please note that four acid substrates had already been calculated for the dataset "SVR_Amide" with the same calculation settings and were therefore not recalculated as part of this dataset.

In [237]:
# Find the missing acids
acid_smiles_incomplete = df_acid.index.tolist()
bromide_smiles = df_bromide.index.tolist()
# smiles_list contains all substrates (acids and bromides); keep only the entries
# that show up in neither of the two descriptor tables
acid_smiles_missing = [
    smiles
    for smiles in smiles_list
    if smiles not in acid_smiles_incomplete and smiles not in bromide_smiles
]
print("Missing compounds that were already calculated for the dataset 'SVR_Amide':")
acid_smiles_missing

Missing compounds that were already calculated for the dataset 'SVR_Amide':


['COc1ccc(CC(=O)O)cc1',
 'O=C(O)Cc1ccc(C(F)(F)F)cc1',
 'CC1(C)C(C(=O)O)C1(C)C',
 'O=C(O)C1CCC(F)(F)CC1']

In [238]:
# download the descriptors for the missing acids (taken from the compounds tagged "SVR_Amide")
data_missing = descriptors(
    tags=["SVR_Amide"],
    presets=["global", "substructure"],
    conf_option="boltzmann",
    solvent="None",
    functional="APFD",
    basis_set="def2svp",
    substructure="[c,C]C(=O)[OH]",
)
# Merge the downloaded frames into a single dataframe
label_dict = {}
for key in data_missing:
    frame = data_missing[key]
    if key == "global":
        # "global" frame: only remove the two columns that are not wanted in the final table
        frame.drop(columns=["converged", "multiplicity"], inplace=True)
        continue
    # atom frames arrive named atom1, atom2, ... --> prefix their columns with the atom type
    # plus a running number instead (e. g. "C1" and "C2")
    atom_type = frame.iloc[0, -1]
    label_dict[atom_type] = label_dict.get(atom_type, 0) + 1
    label = atom_type + str(label_dict[atom_type])
    frame.drop(columns=["labels", "X", "Y", "Z"], inplace=True)
    frame.columns = [f"{label}_{column}" for column in frame.columns]

# side-by-side concatenation gives two-level column labels; keep only the second level
df_missing = pd.concat(data_missing, axis=1)
df_missing.columns = [column_pair[1] for column_pair in df_missing.columns]

# only keep the missing acids
is_missing_acid = df_missing.index.isin(acid_smiles_missing)
df_missing = df_missing.loc[is_missing_acid]
df_missing

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,O2_ES_root_NPA_valence,O2_Mulliken_charge,O2_NMR_anisotropy,O2_NMR_shift,O2_NPA_Rydberg,O2_NPA_charge,O2_NPA_core,O2_NPA_total,O2_NPA_valence,O2_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC1(C)C(C(=O)O)C1(C)C,-462.817211,0.512,1498.4251,1353.173,-463.029059,0.221085,-462.82863,-462.86473,0.173566,-462.816266,...,6.64311,-0.19892,200.9927,135.3815,0.00993,-0.70119,1.9997,8.70119,6.69157,0.355648
COc1ccc(CC(=O)O)cc1,-573.597364,3.153495,2707.38043,1506.025079,-573.782225,0.18837,-573.608446,-573.647751,0.137983,-573.59642,...,6.694966,-0.202234,177.250002,145.792822,0.010411,-0.705854,1.999683,8.705854,6.69576,0.375991
O=C(O)C1CCC(F)(F)CC1,-621.840309,2.635259,1918.758346,1100.191768,-622.018584,0.180612,-621.850101,-621.886045,0.134876,-621.839365,...,6.658938,-0.202033,178.153197,150.492035,0.010197,-0.711129,1.99968,8.711129,6.70125,0.398661
O=C(O)Cc1ccc(C(F)(F)F)cc1,-795.833904,4.088151,3723.640118,1428.520495,-795.990933,0.162045,-795.845977,-795.888103,0.107846,-795.83296,...,6.691862,-0.197326,180.765052,146.055224,0.0102,-0.70358,1.99969,8.70358,6.693695,0.355745


In [248]:
# ensure that the columns are aligned: select exactly the columns of df_acid, in the same order
# (raises a KeyError if one of them is absent from df_missing)
df_missing_acids = df_missing[df_acid.columns]

# concatenate the dataframes -- use the aligned frame; previously the unaligned `df_missing`
# was concatenated and `df_missing_acids` was never used
df_acids_complete = pd.concat([df_missing_acids, df_acid], axis=0)
# remove the "charge" column from the combined table (returns a new frame, so re-running is safe)
df_acids_complete = df_acids_complete.drop(columns="charge")
df_acids_complete

Unnamed: 0_level_0,E,ES_root_dipole,ES_root_electronic_spatial_extent,ES_root_molar_volume,E_scf,E_thermal_correction,E_zpe,G,G_thermal_correction,H,...,O2_ES_root_NPA_valence,O2_Mulliken_charge,O2_NMR_anisotropy,O2_NMR_shift,O2_NPA_Rydberg,O2_NPA_charge,O2_NPA_core,O2_NPA_total,O2_NPA_valence,O2_VBur
can,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CC1(C)C(C(=O)O)C1(C)C,-462.817211,0.512000,1498.425100,1353.173000,-463.029059,0.221085,-462.828630,-462.864730,0.173566,-462.816266,...,6.64311,-0.19892,200.9927,135.3815,0.00993,-0.70119,1.9997,8.70119,6.69157,0.355648
COc1ccc(CC(=O)O)cc1,-573.597364,3.153495,2707.380430,1506.025079,-573.782225,0.188370,-573.608446,-573.647751,0.137983,-573.596420,...,6.694966,-0.202234,177.250002,145.792822,0.010411,-0.705854,1.999683,8.705854,6.69576,0.375991
O=C(O)C1CCC(F)(F)CC1,-621.840309,2.635259,1918.758346,1100.191768,-622.018584,0.180612,-621.850101,-621.886045,0.134876,-621.839365,...,6.658938,-0.202033,178.153197,150.492035,0.010197,-0.711129,1.99968,8.711129,6.70125,0.398661
O=C(O)Cc1ccc(C(F)(F)F)cc1,-795.833904,4.088151,3723.640118,1428.520495,-795.990933,0.162045,-795.845977,-795.888103,0.107846,-795.832960,...,6.691862,-0.197326,180.765052,146.055224,0.0102,-0.70358,1.99969,8.70358,6.693695,0.355745
C#CCCCCCCC(=O)O,-500.798988,0.761858,2861.420521,1536.287003,-501.024103,0.228128,-500.811427,-500.852384,0.174732,-500.798044,...,6.654414,-0.202855,189.077454,146.270783,0.010221,-0.70813,1.999684,8.70813,6.698221,0.370462
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
O=C1CC(C(=O)O)CCN1,-513.585362,2.865819,1418.964656,1130.577386,-513.747004,0.165295,-513.594421,-513.629557,0.121100,-513.584418,...,6.713283,-0.196779,179.382001,149.054064,0.010234,-0.706725,1.99968,8.706725,6.696811,0.393586
O=C1CCC(C(=O)O)CC1,-497.539858,2.594684,1458.411943,1235.468180,-497.712096,0.175948,-497.549013,-497.584301,0.131505,-497.538914,...,6.698578,-0.197578,180.944804,148.750766,0.010198,-0.708076,1.99968,8.708076,6.698206,0.40774
O=C1CCC(C(=O)O)Cc2ccccc21,-688.932575,3.299982,2966.877481,1791.833361,-689.161642,0.232772,-688.944782,-688.984092,0.181257,-688.931631,...,6.70012,-0.195811,172.939736,149.64276,0.010072,-0.707039,1.99968,8.707039,6.697287,0.400356
O=C1O[C@]2(CC[C@H](C(=O)O)CC2)c2cnccc21,-857.347135,4.924800,4876.792149,1856.916379,-857.602831,0.263725,-857.361562,-857.404382,0.206478,-857.346191,...,6.698105,-0.199537,175.349453,150.225523,0.010237,-0.707902,1.999681,8.707902,6.697984,0.395898


In [249]:
# Save the complete acid descriptor table
# (CSV with a header row; the dataframe index is written as the first column)
df_acids_complete.to_csv("./../1_Dataset_Generation/Data_For_Individual_Substrates/xec_dft_descr_acids.csv",index=True,header=True)