In [1]:
import numpy as np
import pandas as pd
import oenotebook as oenb
from openeye.oechem import *
from openmoltools.openeye import *
from openeye.oeiupac import *

## 1. Read in Cleaned Table

In [2]:
#Load the cleaned measurement table from a local pickle file
#NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source
table = pd.read_pickle('table_cleaned.pkl')
#Number of distinct solute names in the table
print(len(table.Solute.unique()))
table.head()

419


Unnamed: 0,Solute,SMILES,Temp,Measured,Uncertain,Method,Ref.,Notes
0,pentane,CCCCC,293.15,945.0,0,GC,47,
1,pentane,CCCCC,303.15,796.0,0,GC,47,
2,pentane,CCCCC,313.15,517.0,0,GC,47,
3,hexane,CCCCCC,293.15,2940.0,0,GC,47,
4,hexane,CCCCCC,298.15,40000.0,0,--,10,


## 2. Filter Compounds

#### Part A: Compounds with Only One Entry

In [3]:
def single_entry(table):
    """Return the solute names that occur in exactly one row of ``table``.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain a ``Solute`` column.

    Returns
    -------
    list
        Names with a single entry, in order of first appearance in ``table``.
    """
    #Count the rows per solute once instead of re-filtering the whole table
    #for every unique name.
    counts = table.Solute.value_counts()
    singles = set(counts.index[counts == 1])
    return [name for name in table.Solute.unique() if name in singles]

In [4]:
#Names of the solutes that have exactly one row in the table
single_names = single_entry(table)
print(len(single_names))

238


In [5]:
#Row labels of every single-measurement solute, selected with one membership test.
#The earlier loop star-unpacked each per-solute index into list.append(), which
#raises as soon as a name matches anything other than exactly one row.
single_index = table[table.Solute.isin(single_names)].index.tolist()

In [6]:
#Drops any compounds with only a single measurement value (hard to gauge the accuracy)
table2 = table.drop(single_index)
#Row counts before and after the drop
print('Old Table:', len(table))
print('New Table:', len(table2))

Old Table: 1374
New Table: 1136


#### Part B: Compounds with Measurements that are Duplicate in Table 1/2


In [7]:
def duplicates_b(table):
    """List the solutes whose rows carry the "also found in Table 1" note.

    Rows are selected by an exact match on the ``Notes`` column; the unique
    ``Solute`` values of those rows are returned as a list.
    """
    has_note = table.Notes == 'This compound can also be found in Table 1'
    return list(table.loc[has_note, 'Solute'].unique())

In [8]:
#Solutes in the filtered table that are noted as also appearing in Table 1
duplicate_names = duplicates_b(table2)
print(len(duplicate_names))

31


#### Part C: Compounds with Measurements that Span Various References 
Caveat: a compound may still be acceptable if the different references give similar values, i.e. values within a set threshold of each other (10% deviation?).

In [9]:
def multiple_ref(table):
    """Return the solutes whose rows cite more than one distinct reference.

    For every unique ``Solute`` the distinct values of the ``Ref.`` column are
    counted; names with two or more are returned in order of first appearance.
    """
    solutes = table.Solute.unique()
    ref_counts = (
        len(table.loc[table.Solute == solute, 'Ref.'].unique())
        for solute in solutes
    )
    return [solute for solute, n_refs in zip(solutes, ref_counts) if n_refs > 1]

In [10]:
#Solutes in the filtered table with measurements from more than one reference
multiple_ref_names = multiple_ref(table2)
print(len(multiple_ref_names))

148


#### Part D: Compounds with Multiple Measurements at the Same Temperature

In [11]:
def mult_temp(table):
    """Return the solutes that have two or more rows at the same temperature.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain ``Solute`` and ``Temp`` columns.

    Returns
    -------
    list
        Each matching name once, in order of first appearance in ``table``
        (a deterministic order, unlike a list built from a set).
    """
    names = []
    for name in table.Solute.unique():
        temps = table.loc[table.Solute == name, 'Temp']
        #One duplicate check per solute replaces re-filtering the subset for
        #every row. Missing temperatures are dropped first because NaN never
        #compares equal to itself and so never counted as a repeat.
        if temps.dropna().duplicated().any():
            names.append(name)
    return names

In [12]:
#Solutes in the filtered table with repeated measurements at one temperature
mult_temp_names = mult_temp(table2)
print(len(mult_temp_names))

140


#### Part E: Combine Lists from All Previous Parts to Get all Flagged Compounds

In [13]:
#Union of the three flag lists, de-duplicated and sorted by name
flagged = sorted(set(duplicate_names) | set(multiple_ref_names) | set(mult_temp_names))
print(len(flagged))

158


In [14]:
#Compounds that were not caught by any of the flags
good_compounds = list(filter(lambda solute: solute not in flagged, table2.Solute.unique()))
len(good_compounds)

23

## 3. Search in Flagged Compounds

In [15]:
def check_difference(temp, subset):
    """Return the relative spread ``1 - min/max`` of the measurements at one temperature.

    ``subset`` is narrowed to the rows whose ``Temp`` equals ``temp``; the
    smallest and largest ``Measured`` values among them are then compared.
    """
    measured = subset[subset.Temp == temp].Measured
    smallest, largest = min(measured), max(measured)
    return 1 - (smallest / largest)

#Map every flagged compound to its list of per-temperature spreads (output: big_check)
big_check = {}
for name in flagged:
    subset = table2[table2.Solute == name]
    big_check[name] = [check_difference(temp, subset) for temp in subset['Temp'].unique()]

In [16]:
#Keep the flagged compounds whose largest per-temperature spread is at most 10% (output: potential)
potential = [key for key, diffs in big_check.items() if max(diffs) <= 0.10]

In [17]:
#Whatever is in 'flagged' but did not make it into 'potential' goes to 'discard'
##'flagged' is already sorted and free of repeats, so a sorted set difference
##gives the same list as filtering 'flagged' in order
#Not used again in the cells shown here; kept as a record of what was discarded
discard = sorted(set(flagged) - set(potential))
print(len(discard))

95


Note: Many of the compounds in the discard pile have measurements from multiple references. When evaluated individually, each reference gives relatively reasonable measurements (typically a monotonic trend). However, the filter eliminated these compounds because the variance across references is too high. If the filter were changed to a cutoff more generous than 10%, further compounds could also be recovered.

In [18]:
#Compounds in 'potential' measured at a single temperature only, i.e. by different references (output: flag_1_test)
flag_1_test = [
    name for name in potential
    if len(table2[table2.Solute == name].Temp.unique()) < 2
]

#Run the 10% check again at that one temperature (output: accepted)
##Every name here already met the same threshold when 'potential' was built
accepted = []
for name in flag_1_test:
    rows = table2[table2.Solute == name]
    only_temp = rows.Temp.unique()[0]
    if check_difference(only_temp, rows) <= 0.10:
        accepted.append(name)
print(len(accepted))

32


In [19]:
#Reapply filter for compounds with multiple references at multiple temperatures (output: okay)
##Note: within 'okay' are some compounds with non-monotonic trends
flag_2_test = [
    name for name in potential
    if len(table2[table2.Solute == name].Temp.unique()) >= 2
]

#Per-temperature spread for each of those compounds (output: flag_2)
flag_2 = {}
for name in flag_2_test:
    subset = table2[table2.Solute == name]
    flag_2[name] = [check_difference(temp, subset) for temp in subset['Temp'].unique()]

#Keep the compounds whose largest spread is at most 10%
okay = [key for key, diffs in flag_2.items() if max(diffs) <= 0.10]
print(len(okay))

31


### 3B. Combine List of Accepted and Recovered Compounds

In [20]:
#Compile list of all accepted compounds
complete = sorted(set().union(good_compounds, accepted, okay))
#Collect the rows of every accepted compound and concatenate them once.
#DataFrame.append was removed in pandas 2.0, and calling it in a loop copies
#the growing frame on every iteration. pd.concat rejects an empty list, hence the guard.
compiled_table = (
    pd.concat([table2[table2.Solute == name] for name in complete])
    if complete
    else pd.DataFrame()
)
compiled_table.head()

Unnamed: 0,Solute,SMILES,Temp,Measured,Uncertain,Method,Ref.,Notes
923,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,288.15,127600.0,0.0,,7,
924,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,298.15,115000.0,0.0,,912,
925,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,298.15,118100.0,0.0,,7,
926,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,308.15,107400.0,0.0,,7,
927,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,318.15,96340.0,0.0,,7,


In [21]:
#Pick the solutes at positions 10 and 11 of the unique-name array for a spot check
#comp_1 is not referenced again in the cells shown here
comp_1 = compiled_table['Solute'].unique()[10]
comp_2 = compiled_table['Solute'].unique()[11]

In [22]:
#Show every compiled row for the second spot-check compound
compiled_table[compiled_table['Solute'] == comp_2]

Unnamed: 0,Solute,SMILES,Temp,Measured,Uncertain,Method,Ref.,Notes
255,1-bromopropane,CCCBr,298.15,2850.0,0.0,GS,38,
256,1-bromopropane,CCCBr,298.15,2900.0,0.0,KGW,2,
1032,1-bromopropane,CCCBr,303.15,2770.0,0.0,,213,This compound can also be found in Table 1


In [23]:
#NOTE(review): 'complete_table' is not defined in the cells shown here; the table built above is 'compiled_table'
#pd.DataFrame.to_pickle(complete_table, 'input/1_complete_table.pkl')

In [24]:
#Map each accepted compound to the sorted list of temperatures it was measured at
temp_dict = {
    name: sorted(compiled_table.loc[compiled_table.Solute == name, 'Temp'].unique())
    for name in complete
}

In [25]:
#Thin each compound's temperature list so that kept temperatures are 5 degrees or more apart
new_temp_dict = {}
for name in complete:
    temps = temp_dict[name]
    #The lowest temperature is always kept
    keep_temp = [temps[0]]
    for entry in temps[1:]:
        #Compare against the most recently kept temperature, not the previous entry
        if entry >= keep_temp[-1] + 5:
            keep_temp.append(entry)
    new_temp_dict[name] = keep_temp

## 4. Create Final Output Dataframe

In [24]:
#Parse down the list of compounds with the newly generated temperature list
#The per-compound pieces are collected first and concatenated once:
#DataFrame.append was removed in pandas 2.0 and copied the whole frame on every call.
final_rows = []
for name in new_temp_dict:
    subset = compiled_table[compiled_table.Solute == name]
    #Keep only the first row found at each temperature
    subset = subset.drop_duplicates(subset = 'Temp')
    final_rows.append(subset[subset.Temp.isin(new_temp_dict[name])])
#pd.concat rejects an empty list, so fall back to an empty frame
final_table = pd.concat(final_rows) if final_rows else pd.DataFrame()

In [25]:
#Row counts before and after thinning the temperatures
print(len(compiled_table))
print(len(final_table))
final_table.head()

322
240


Unnamed: 0,Solute,SMILES,Temp,Measured,Uncertain,Method,Ref.,Notes
923,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,288.15,127600.0,0.0,,7,
924,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,298.15,115000.0,0.0,,912,
926,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,308.15,107400.0,0.0,,7,
927,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,318.15,96340.0,0.0,,7,
487,"1,2-butanediol",CCC(CO)O,299.15,2.0,0.2,DP,62,


In [26]:
import pickle

#Context managers flush and close each file as soon as its dump finishes;
#the bare open() calls never closed their handles explicitly.
with open('input/final_table.p', 'wb') as handle:
    pickle.dump(final_table, handle)
with open('temp_dict.p', 'wb') as handle:
    pickle.dump(new_temp_dict, handle)

In [27]:
def make_tables(new_table, solvent_list, ref_table):
    '''
    Fill new_table with one row per row of ref_table and attach a
    single-conformer molecule to each row.

    Input:
    new_table: DataFrame to populate; it is modified in place and also returned
    solvent_list: list of corresponding smiles for every compound in the table to be created
                  (one entry per row of ref_table, in the same order)
    ref_table: table that contains all relevant information about the compounds
               (its SMILES, Solute, Temp, Measured and Uncertain columns are read)

    Output:
    Returns a table ready to be saved as an .oeb file and submitted to Orion for processing

    Note: smiles_to_oemol, OETriposAtomNames, normalize_molecule and
    generate_conformers are not defined in this notebook; they are expected
    to come from the star imports in the first cell.
    '''
    n_rows = len(ref_table)

    #Plain lists (not Series) are assigned so that values are placed by
    #position instead of being aligned on ref_table's index labels.
    new_table["Molecule"] = [smiles_to_oemol(x) for x in ref_table.SMILES]
    new_table["solute_name"] = list(ref_table.Solute)
    new_table["solvents"] = solvent_list
    new_table["temperature"] = list(ref_table.Temp)
    #density, molar fraction and pressure are stored as the string '1.0' for every row
    new_table["density"] = ['1.0'] * n_rows
    new_table["molar_fractions"] = ['1.0'] * n_rows
    new_table["pressure"] = ['1.0'] * n_rows
    new_table['IDAC_expt'] = list(ref_table.Measured)
    new_table['Error'] = list(ref_table.Uncertain)
    new_table['solute_SMILES'] = list(ref_table.SMILES)

    #Generate a single conformer of each molecule to initialize the object with coordinates
    mols = []
    #Iterate over the column values directly: looking molecules up with
    #new_table.Molecule[i] is a label lookup and only works for a 0..n-1 index.
    for mol in new_table["Molecule"]:
        OETriposAtomNames(mol)
        mol = normalize_molecule(mol)
        mol = generate_conformers(mol, max_confs=1, strictStereo=False, strictTypes=False)
        #Drop every conformer after the first
        for j, conf in enumerate(mol.GetConfs()):
            if j > 0:
                mol.DeleteConf(conf)
        mols.append(mol)
    new_table['Molecule'] = mols
    return new_table

In [28]:
#Solvent SMILES per row: water for the solvation set, the solute itself for self-solvation
solv_list = ['[H]O[H]'] * len(final_table.Solute)
ssolv_list = list(final_table.SMILES)

In [29]:
#Create solvation and self-solvation tables to export as oeb files
#make_tables fills the frame it is given and returns that same object,
#so final_solv/final_ssolv refer to IDAC_solv/IDAC_ssolv
IDAC_solv = pd.DataFrame()
IDAC_ssolv = pd.DataFrame()
final_solv = make_tables(IDAC_solv, solv_list, final_table)
final_ssolv = make_tables(IDAC_ssolv, ssolv_list, final_table)



In [31]:
#Preview of the self-solvation table (the solvents column holds the solute's own SMILES)
final_ssolv.head()

Unnamed: 0,Molecule,solute_name,solvents,temperature,density,molar_fractions,pressure,IDAC_expt,Error,solute_SMILES
0,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,288.15,1.0,1.0,1.0,127600.0,0.0,Cc1ccc(c(c1)C)C
1,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,298.15,1.0,1.0,1.0,115000.0,0.0,Cc1ccc(c(c1)C)C
2,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,308.15,1.0,1.0,1.0,107400.0,0.0,Cc1ccc(c(c1)C)C
3,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",Cc1ccc(c(c1)C)C,318.15,1.0,1.0,1.0,96340.0,0.0,Cc1ccc(c(c1)C)C
4,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2-butanediol",CCC(CO)O,299.15,1.0,1.0,1.0,2.0,0.2,CCC(CO)O


In [32]:
#Preview of the solvation table (the solvents column holds the water SMILES)
final_solv.head()

Unnamed: 0,Molecule,solute_name,solvents,temperature,density,molar_fractions,pressure,IDAC_expt,Error,solute_SMILES
0,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",[H]O[H],288.15,1.0,1.0,1.0,127600.0,0.0,Cc1ccc(c(c1)C)C
1,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",[H]O[H],298.15,1.0,1.0,1.0,115000.0,0.0,Cc1ccc(c(c1)C)C
2,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",[H]O[H],308.15,1.0,1.0,1.0,107400.0,0.0,Cc1ccc(c(c1)C)C
3,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2,4-trimethylbenzene",[H]O[H],318.15,1.0,1.0,1.0,96340.0,0.0,Cc1ccc(c(c1)C)C
4,<oechem.OEMol; proxy of <Swig Object of type '...,"1,2-butanediol",[H]O[H],299.15,1.0,1.0,1.0,2.0,0.2,CCC(CO)O


In [31]:
##Tests to save molecule as .mol2 file and confirm it has coordinates attached to it
#Take the molecule stored under index label 0 of each table
solv_molecule = final_solv['Molecule'][0]
ssolv_molecule = final_ssolv['Molecule'][0]

#The mol2 export calls are currently disabled
#molecule_to_mol2(solv_molecule, tripos_mol2_filename='test/but_solv.mol2')
#molecule_to_mol2(ssolv_molecule, tripos_mol2_filename='test/but_ssolv.mol2')

('1,2,4-trimethylbenzene', 'test/but_ssolv.mol2')

In [33]:
#Output dataframe as .oeb file for input into Orion workfloe
#One file for the full solvation table and one for the full self-solvation table
oenb.write_dataframe_to_file(final_solv, "input/IDAC_complete_solv.oeb")
oenb.write_dataframe_to_file(final_ssolv, "input/IDAC_complete_ssolv.oeb")

### Creating Test Subset from Complete List

In [73]:
#Test subset of the solvation table: row 0 plus every 120th row from label 1 up to label 240
#Built with pd.concat because DataFrame.append was removed in pandas 2.0.
#Selecting row 0 as a one-row frame ([[0]]) keeps the column order and dtypes of final_solv.
test_set_solv = pd.concat([final_solv.loc[[0]], final_solv.loc[1:240:120]], sort=False)
test_set_solv

Unnamed: 0,IDAC_expt,Molecule,density,molar_fractions,name,pressure,solvents,temperature
0,127600.0,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,"1,2,4-trimethylbenzene",1.0,[H]O[H],288.15
1,115000.0,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,"1,2,4-trimethylbenzene",1.0,[H]O[H],298.15
121,0.17,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,dimethylsulfoxide,1.0,[H]O[H],337.85


In [74]:
#Test subset of the self-solvation table: row 0 plus every 120th row from label 1 up to label 240
#Built with pd.concat because DataFrame.append was removed in pandas 2.0.
#Selecting row 0 as a one-row frame ([[0]]) keeps the column order and dtypes of final_ssolv.
test_set_ssolv = pd.concat([final_ssolv.loc[[0]], final_ssolv.loc[1:240:120]], sort=False)
test_set_ssolv

Unnamed: 0,IDAC_expt,Molecule,density,molar_fractions,name,pressure,solvents,temperature
0,127600.0,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,"1,2,4-trimethylbenzene",1.0,Cc1ccc(c(c1)C)C,288.15
1,115000.0,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,"1,2,4-trimethylbenzene",1.0,Cc1ccc(c(c1)C)C,298.15
121,0.17,<oechem.OEMol; proxy of <Swig Object of type '...,1.0,1.0,dimethylsulfoxide,1.0,CS(=O)C,337.85


In [35]:
#Persist both full tables as pickle files
final_solv.to_pickle('input/final_solv.pkl')
final_ssolv.to_pickle('input/final_ssolv.pkl')

In [75]:
#Output dataframe as .oeb file for input into Orion workfloe
#Writes the small test subsets rather than the full tables
oenb.write_dataframe_to_file(test_set_solv, "input/testset_solv.oeb")
oenb.write_dataframe_to_file(test_set_ssolv, "input/testset_ssolv.oeb")