In [43]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from openeye.oechem import *
from openeye.oeiupac import *

In [44]:
def is_number(s):
    """Return True if ``s`` can be converted to a float, otherwise False.

    Tuples are always rejected, so an entry that has already been split into
    a (value, uncertainty) pair is never reported as a number.
    """
    if isinstance(s, tuple):
        return False
    try:
        float(s)
    except (TypeError, ValueError):
        # ValueError: a string that does not parse as a float.
        # TypeError: an object float() cannot handle at all (None, list, ...).
        return False
    return True
    
def check_value(column):
    """Return the positions of every entry in ``column`` that is not a number.

    An entry counts as a number when is_number() accepts it. Positions are
    0-based and follow the iteration order of ``column``.
    """
    return [position for position, entry in enumerate(column) if not is_number(entry)]


def clean_temp(value):
    """Repair common mis-read characters in a temperature string.

    The substitutions were chosen by inspecting the raw table. They are
    applied in a fixed order after dropping one leading non-digit character
    (e.g. '.373.15' -> '373.15'). An empty string is returned unchanged.
    """
    if not value:
        # Nothing to clean; also avoids indexing value[0] on an empty string.
        return value
    if not value[0].isdigit():
        # Stray character in front of the number (e.g. '.373.15').
        value = value[1:]
    substitutions = (
        (',', '.'),
        ('l', '1'),
        ('I', '1'),
        ('i', '1'),
        (' ', ''),
        ('A', '.1'),
    )
    for wrong, right in substitutions:
        value = value.replace(wrong, right)
    return value

def clean_measure(value):
    """Repair common mis-read characters in a measured-value string.

    The substitutions were chosen by inspecting the raw table. Their order
    matters (for example 'x10' must become 'e' before 'es' is rewritten), so
    they are applied strictly top to bottom.
    """
    ordered_fixes = [
        (" ", ""),
        (",", "."),
        ('l', "1"),
        ('I', "1"),
        ("x10", "e"),
        ("×10", "e"),
        ("s10", "e"),
        ("10s", "5"),
        ('es', "e5"),
        ('i', '1'),
        ('>', ""),
        ('¢', "c"),
        ('-.', ""),
        ('O', '0'),
    ]
    cleaned = value
    for wrong, right in ordered_fixes:
        cleaned = cleaned.replace(wrong, right)
    return cleaned

def findOccur(s, ch):
    """Return the list of positions in ``s`` whose element equals ``ch``."""
    positions = []
    for index, letter in enumerate(s):
        if letter == ch:
            positions.append(index)
    return positions

def max_value(inputlist):
    """Return the largest element found in any sublist of ``inputlist``."""
    return max(max(sublist) for sublist in inputlist)

def min_value(inputlist):
    """Return the smallest element found in any sublist of ``inputlist``."""
    return min(min(sublist) for sublist in inputlist)


def split_uncertain(value):
    """Split a 'measured<sep>uncertainty' string into a (measured, uncertainty) tuple.

    The separator is any run of the characters '+', '-', '_' and '±': the
    measured part is everything before the first of them and the uncertainty
    is everything after the last of them.

    ``value`` is returned unchanged when
      * it is not a string,
      * it already parses as a plain number (so the sign of '-2.5' or the
        exponent sign of '1.5e-3' is not mistaken for a separator), or
      * it contains none of the separator characters.
    """
    if not isinstance(value, str):
        return value

    # float() accepts '_' between digits ('8_0.4' -> 80.4), but in this table
    # '_' is a mis-read '±', so only skip the split when no '_' is present.
    if '_' not in value:
        try:
            float(value)
        except ValueError:
            pass
        else:
            return value

    separators = '+-_±'
    positions = [index for index, letter in enumerate(value) if letter in separators]
    if not positions:
        return value

    return value[:positions[0]], value[positions[-1] + 1:]

def extractvalue(measured):
    """Return the measured value of an entry.

    A (measured, uncertainty) pair yields its first element. Anything else
    (a plain string, or a non-string cell such as a float) has no uncertainty
    attached and is returned unchanged.
    """
    if isinstance(measured, (tuple, list)):
        return measured[0]
    return measured
    
def extractuncertain(measured):
    """Return the uncertainty of an entry as a string.

    A (measured, uncertainty) pair yields its second element. Anything else
    (a plain string, or a non-string cell such as a float) carries no
    uncertainty and yields '0'.
    """
    if isinstance(measured, (tuple, list)):
        return measured[1]
    return '0'

def endswithb(value):
    """Return True if ``value`` is a string ending in the footnote marker 'b'.

    Empty strings and non-string cells (e.g. NaN) return False.
    """
    return isinstance(value, str) and value.endswith('b')
    
def endswithc(value):
    """Return True if ``value`` is a string ending in the footnote marker 'c'.

    Empty strings and non-string cells (e.g. NaN) return False.
    """
    return isinstance(value, str) and value.endswith('c')
    
def removeend(string):
    """Return ``string`` without its last character."""
    keep = len(string) - 1
    if keep < 0:
        # Empty input: there is nothing to remove.
        keep = 0
    return string[:keep]

def check_names(names):
    """Return the entries of ``names`` that OEParseIUPACName cannot parse.

    Each name is parsed into a fresh OEMol; names for which the parse does
    not succeed are collected in their original order.
    Code adapted from Pharmsci 175/275.
    """
    unrecognized = []
    for candidate in names:
        molecule = OEMol()  # new, empty molecule for every attempt
        if not OEParseIUPACName(molecule, candidate):
            unrecognized.append(candidate)
    return unrecognized

#Run help(method) to see what each method abbreviation stands for
def method():
    '''
    Abbreviations used for the experimental methods.

    GCR: Gas-liquid chromatography with gas phase correction
    GC: Gas-liquid chromatography with no gas phase correction
    EBUL: Differential ebulliometry
    DP: Dew-point method
    NSGC: Non-steady-state gas chromatography
    RGC: Relative gas-liquid chromatography
    HS: Headspace chromatography
    GS: Gas-stripping
    STAT: Differential static cell equilibrium
    LLC: Liquid-liquid chromatography
    KHW: Hexadecane-water partition coefficient indirect method
    KGW: Estimation from gas-water partition coefficients from solubility measurements
    EXT: Extrapolated VLE
    '''
    return None

In [45]:
#Load the raw table, keep only rows that have both a solvent and a temperature,
#and renumber the remaining rows from 0.
table = (
    pd.read_csv('rawtable3.csv', sep=',')
    .dropna(subset=['Solvent', 'Temp'])
    .reset_index(drop=True)
)

In [48]:
bad_names = check_names(table['Solvent'].unique())
#table.loc[table.Solvent == bad_names[0]] #use this line to view all rows of a given solvent

#Change to IUPAC name, if possible.
#Keys are positions in bad_names (chosen by inspection), so this mapping is only
#valid for the bad_names list produced from the freshly loaded table.
corrected_by_position = {
    0: '1,1,2,2-Tetrachloroethane',
    1: '1,1-Dichloroethane',
    2: '4-(Chloromethyl)-1,3-dioxolan-2-one', #replaced with IUPAC Name
    3: '4-(Chloromethyl)-1,3-dioxolan-2-one',
    4: 'Ethanol',
    5: '1-Propanol',
    6: '2-Propen-1-ol',
    7: 'Isobutyl alcohol',
    8: 'Tert-butyl alcohol',
    9: 'Triethylene glycol',
    14: 'PEG 7500',
    15: 'Tetrahydrofuran',
    21: 'Butyl phosphate',
    22: 'Dibutyl phosphate',
    23: 'Tributyl phosphate',
    24: 'Methyl diphenyl phosphate',
    26: 'Dinonyl benzene-1,2-dicarboxylate',
    27: 'N,N-Dimethylformamide',
}
solvent_fixes = {
    bad_names[position]: corrected
    for position, corrected in corrected_by_position.items()
}
table['Solvent'] = table['Solvent'].replace(solvent_fixes)

In [49]:
#Names still not recognised after the replacements above;
#it is unclear what the proper IUPAC name for these compounds is
bad_names = check_names(table['Solvent'].unique())
bad_names

['PEG 400',
 'PEG 600',
 'PEG 1000',
 'PEG 1500',
 'PEG 7500',
 'Tetraethylene Glycol-Dimethyl Ether',
 'Tripropylene Glycol-Dimethyl Ether',
 'Triethylene Glycol-Dibutyl Ether',
 'Diethylene Glycol-Dibutyl Ether',
 'Tripropylene Glycol-Dibutyl Ether',
 'Phosphoric Acid, Tri-(2-Butoxyethyl) Ester']

In [50]:
#Collect every reference number that is not made up purely of numeric characters
table['Ref.'] = table['Ref.'].astype(str)
bad_ref = [entry for entry in table['Ref.'] if not entry.isnumeric()]
bad_ref

['nan', ';2!', '3:}', ',34', '2,8']

In [51]:
#Fix the reference numbers by inspection: the i-th corrected value replaces bad_ref[i]
corrected_refs = ['7', '22', '33', '34', '28']
ref_fixes = {bad_ref[position]: fixed for position, fixed in enumerate(corrected_refs)}
table['Ref.'] = table['Ref.'].replace(ref_fixes)

In [52]:
def most(series):
    """Return the most frequent element of ``series``.

    When several elements are tied, the one that appears first wins.
    """
    items = list(series)
    return max(items, key=lambda item: items.count(item))

#Values from the same reference use the same method. However, the method string is parsed
#incorrectly for some entries. Taking the most common entry per reference typically gives
#the correct method string.
reference_dict = {
    ref: most(table.loc[table['Ref.'] == ref, 'Method'])
    for ref in table['Ref.'].unique()
}

#Fix manually
reference_dict['10'] = 'EXT'

#Give every row the method that belongs to its reference number
for ref, method_name in reference_dict.items():
    table.loc[table['Ref.'] == ref, 'Method'] = method_name

#Add a note to rows whose method carries the 'b' footnote marker
has_footnote_b = table['Method'].apply(endswithb) == True
table.loc[has_footnote_b, 'Notes'] = 'In the original paper, the experimental method was not clearly indicated. The experimental method indicated is guess from the context.'


### Fixing Temperature Values

In [53]:
#Positions of the rows whose temperature cannot be read as a number, and those rows
temp_probs = check_value(table['Temp'])
table.iloc[temp_probs]

Unnamed: 0,Solvent,Temp,Measured,Method,Ref.,Notes
119,Triethylene glycol,33145,066,GC,19,
132,Triethylene glycol,37815,0.807,GC,9,
149,PEG 600,.348.15,0.428,GC,9,
177,PEG 1500,.343.15,0.222,GC,9,
178,PEG 1500,.348.15,0.238,GC,9,
225,Tetrahydrofuran,".7,43.15",7.77,EXT,28,
282,Nitromethane,.343.35,8.6 + 0.4,EBUL,8,
299,Aniline,32315,255,EXT,14,
300,Aniline,37315,5.81,EXT,28,


In [54]:
#Apply the temperature cleaning function only to the rows whose temperature is not a
#valid number, so that good values are never passed through clean_temp.
bad_temp = ~table['Temp'].apply(is_number).astype(bool)
table.loc[bad_temp, 'Temp'] = table.loc[bad_temp, 'Temp'].apply(clean_temp)
temp_probs = check_value(table['Temp'])

#Row 225 ('.7,43.15') cannot be repaired automatically, so set it by hand.
#Write through table.iloc[row, column] directly: table.iloc[225,:].Temp = ... assigns to a
#temporary row object and is not guaranteed to change the table itself.
table.iloc[225, table.columns.get_loc('Temp')] = '343.15'

### Fixing Measured Values

In [55]:
#Positions of all rows whose measured value cannot be read as a number
measure_probs = check_value(table['Measured'])

#Number of rows with a problematic measured value
table.iloc[measure_probs, :].shape[0]

45

In [56]:
#Apply the cleaning function only to the rows whose value is not recognized as a number.
#Rows that already parse are left untouched and are never passed to clean_measure.
bad_measure = ~table['Measured'].apply(is_number).astype(bool)
table.loc[bad_measure, 'Measured'] = table.loc[bad_measure, 'Measured'].apply(clean_measure)
measure_probs = check_value(table['Measured'])
print(len(measure_probs))


36


In [57]:
#Split every entry that carries an uncertainty into a (measured, uncertainty) pair
split_measured = table['Measured'].apply(split_uncertain)

#Uncertainty column: the second element of the pair, or '0' when there was none
table['Uncertain'] = split_measured.apply(extractuncertain)

#Measured column: the first element of the pair, or the original string
table['Measured'] = split_measured.apply(extractvalue)

measure_probs = check_value(table['Measured'])
print(len(measure_probs)) #Still have some values we can't read well

5


In [58]:
#Rows whose measured value still cannot be read as a number
table.iloc[measure_probs]

Unnamed: 0,Solvent,Temp,Measured,Method,Ref.,Notes,Uncertain
147,PEG 600,333.15,0..343,GC,9,,0
183,PEG 1500,373.15,0..7.48,GC,9,,0
242,Ethyl Acetate,288.15,18.1~0.17,STAT,37,,0
263,Tributyl phosphate,298.15,2..34,GC,6,,0
291,"N,N-Dimethylformamide",382.95,1.33e0.07,EBUL,8,,0


In [59]:
#Remaining values we can't fix automatically are fixed by hand.
#Write through table.iloc[row, column] directly: table.iloc[row,:].Measured = ... assigns to
#a temporary row object and is not guaranteed to change the table itself.
measured_col = table.columns.get_loc('Measured')
uncertain_col = table.columns.get_loc('Uncertain')

table.iloc[147, measured_col] = '0.343'
table.iloc[183, measured_col] = '0.348'
table.iloc[242, measured_col] = '18.1'
table.iloc[242, uncertain_col] = '0.17'
table.iloc[263, measured_col] = '2.34'
table.iloc[291, measured_col] = '1.33'
table.iloc[291, uncertain_col] = '0.07'

#### Checking Uncertainty Values

In [60]:
#Same check for the uncertainty values: find the rows that are not valid numbers
uncertain_probs = check_value(table['Uncertain'])
table.iloc[uncertain_probs,:] #not the last expression of the cell, so it is not rendered

#Manual fix. Write through table.iloc[row, column] directly: table.iloc[223,:].Uncertain = ...
#assigns to a temporary row object and is not guaranteed to change the table itself.
table.iloc[223, table.columns.get_loc('Uncertain')] = '0.3'

### Check Values

In [68]:
#Sanity check so that mis-read values which still parse as numbers do not slip through
#the cracks: flag every solvent whose largest measured value is more than five times
#its smallest one.
error_names = []
compounds = table.Solvent.unique()
for compound in compounds:
    values = list(map(float, table.loc[table.Solvent == compound, 'Measured']))
    if 5*min(values) < max(values): #threshold condition
        error_names.append(compound)

In [69]:
#Same sanity check for the uncertainties: ignoring entries equal to '0', flag every solvent
#whose largest uncertainty is more than ten times its smallest one.
error_names = []
compounds = table.Solvent.unique()
for compound in compounds:
    values = [float(x) for x in table.loc[table.Solvent == compound, 'Uncertain'] if x != '0']
    if values and max(values) > 10*min(values): #threshold condition for uncertainty
        error_names.append(compound)

In [70]:
#Mistakes found while going through the finalized table.
#Write through table.iloc[row, column] directly: table.iloc[row,:].Uncertain = ... assigns to
#a temporary row object and is not guaranteed to change the table itself.
table.iloc[68, table.columns.get_loc('Uncertain')] = '0.07'
table.iloc[163, table.columns.get_loc('Measured')] = '0.324'

In [67]:
#Export finalized table as an Excel file
#table.to_excel("table3.xlsx")