In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from openeye.oechem import *
from openeye.oeiupac import *

In [2]:
def is_number(s):
    """Return True if ``s`` can be converted to a float, False otherwise.

    Anything ``float()`` rejects counts as "not a number": unparsable
    strings (ValueError) as well as tuples, lists, None and other
    unsupported types (TypeError).
    """
    try:
        float(s)
    except (TypeError, ValueError):
        return False
    return True
    
def check_value(column):
    """Return the positions of the entries in ``column`` that are not numbers.

    Positions are 0-based counts in iteration order (as produced by
    ``enumerate``), not index labels. An entry is a problem when
    ``is_number`` returns False for it.
    """
    return [
        position
        for position, item in enumerate(column)
        if not is_number(item)
    ]


#Cleans a single temperature entry (apply it element-wise to the column)
#operations in the function are made by inspection
def clean_temp(value):
    """Return ``value`` with the known temperature typos repaired.

    ``value`` is expected to be a string. An empty string is returned
    unchanged instead of raising IndexError on ``value[0]``.
    """
    if not value:
        return value

    #fixes terms with characters out in front (i.e. .373.15)
    if not value[0].isdigit():
        value = value[1:]

    # Every rule rewrites one character and no replacement text contains a
    # character that another rule matches, so one translate pass gives the
    # same result as applying the rules one after another.
    fixes = str.maketrans({
        ',': '.',
        'l': '1',
        'I': '1',
        'i': '1',
        ' ': None,
        'A': '.1',
        't': '1',
    })
    return value.translate(fixes)

#Cleans a single measured-value entry (apply it element-wise to the column)
def clean_measure(value):
    """Return ``value`` after applying the substitution rules below in order.

    The order matters: later rules act on the output of earlier ones
    (for example '/' becomes 'e' before '105' is collapsed to '5').
    """
    rules = (
        (" ", ""),
        (",", "."),
        ('l', "1"),
        ('I', "1"),
        ('O', '0'),
        ("x10", "e"),
        ("×10", "e"),
        ("s10", "e"),
        ("10s", "5"),
        ('es', "e5"),
        ('i', '1'),
        ('>', ""),
        ('¢', "c"),
        ('-.', ""),
        ('g', '9'),
        ('/.', 'e'),
        ('/,', 'e'),
        ('/', 'e'),
        ('103', '3'),
        ('104', '4'),
        ('105', '5'),
        ('106', '6'),
        ('107', '7'),
        ('10T', '7'),
        ('T', '7'),
        ('_', 'e'),
    )
    cleaned = value
    for wrong, right in rules:
        cleaned = cleaned.replace(wrong, right)
    return cleaned

def findOccur(s, ch):
    """Return the positions in ``s`` whose element equals ``ch``, in order."""
    positions = []
    for position, symbol in enumerate(s):
        if symbol == ch:
            positions.append(position)
    return positions

#Finds the maximum value of a list of lists
def max_value(inputlist):
    """Return the largest element found in any sublist of ``inputlist``."""
    return max(max(sublist) for sublist in inputlist)

#Finds the minimum value of a list of lists
def min_value(inputlist):
    """Return the smallest element found in any sublist of ``inputlist``."""
    return min(min(sublist) for sublist in inputlist)


#If an entry has an uncertainty, return the value and the uncertainty separately
def split_uncertain(value):
    """Split ``value`` around the separator characters '+', '-', '_' and '±'.

    When none of those characters occurs, ``value`` is returned unchanged.
    Otherwise a 2-tuple is returned: the text before the first separator
    and the text after the last one.
    """
    separators = frozenset('+-_±')

    # enumerate yields positions in ascending order, so the first and last
    # hits are the minimum and maximum separator positions.
    hits = [pos for pos, symbol in enumerate(value) if symbol in separators]
    if not hits:
        return value

    first, last = hits[0], hits[-1]
    return value[:first], value[(last + 1):]


def extractvalue(measured):
    """Return the value part of an entry.

    A string is returned unchanged. Anything else is indexed and its
    element 0 is returned.

    ``isinstance`` is used so that str subclasses (for example
    ``numpy.str_``) are treated as strings instead of being indexed, which
    would return only their first character.
    """
    if isinstance(measured, str):
        return measured
    return measured[0]

def extractuncertain(measured):
    """Return the uncertainty part of an entry.

    A string yields '0'. Anything else is indexed and its element 1 is
    returned.

    ``isinstance`` is used so that str subclasses (for example
    ``numpy.str_``) are treated as strings instead of being indexed, which
    would return their second character.
    """
    if isinstance(measured, str):
        return '0'
    return measured[1]

def endswitha(value):
    """Return True if the string ``value`` ends with the character 'a'.

    An empty string gives False instead of raising IndexError.
    """
    return value.endswith('a')

def endswithb(value):
    """Return True if the string ``value`` ends with the character 'b'.

    An empty string gives False instead of raising IndexError.
    """
    return value.endswith('b')

def endswithc(value):
    """Return True if the string ``value`` ends with the character 'c'.

    An empty string gives False instead of raising IndexError.
    """
    return value.endswith('c')
    
def removeend(string):
    """Return ``string`` without its last character."""
    # For an empty string this is -1, and ''[:-1] is still ''.
    cut = len(string) - 1
    return string[:cut]
    
def check_names(names):
    """Return the entries of ``names`` that OEParseIUPACName fails to parse.

    Each name is parsed into its own freshly created OEMol. The result
    keeps the order of ``names``.
    """
    #Code adapted from Pharmsci 175/275
    return [
        candidate
        for candidate in names
        if not OEParseIUPACName(OEMol(), candidate)
    ]

def method():
    '''
    Glossary of the measurement-method abbreviations; does nothing and
    returns None.

    GCR: Gas-liquid chromatography with gas phase correction
    GC: Gas-liquid chromatography with no gas phase correction
    EBUL: Differential ebulliometry
    DP: Dew-point method
    NSGC: Non-steady-state gas chromatography
    RGC: Relative gas-liquid chromatography
    HS: Headspace chromatography
    GS: Gas-stripping
    STAT: Differential static cell equilibrium
    LLC: Liquid-liquid chromatography
    KHW: Hexadecane-water partition coefficient indirect method
    KGW: Estimation from gas-water partition coefficients from solubility measurements
    EXT: Extrapolated VLE
    '''
    return None

In [3]:
# Load the raw table, discard the five unnamed columns, keep only rows that
# have a Solvent, a Temp and a Measured entry, and renumber the rows from 0.
table = (
    pd.read_csv('table2.csv', delimiter=',', encoding='latin-1')
    .drop(columns=['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8'])
    .dropna(subset=['Solvent', 'Temp', 'Measured'])
    .reset_index(drop=True)
)
# NOTE(review): this binds a second name to the same DataFrame object, it does
# not copy it -- later in-place edits of `table` are visible through `table2`.
table2 = table

In [4]:
#Many solvent names had an additional footnote: a trailing 'a'.
#Record the footnote text in Notes first, then drop the trailing character.
table.loc[table['Solvent'].map(endswitha), 'Notes'] = 'This compound can also be found in Table 1'
table.loc[table['Solvent'].map(endswitha), 'Solvent'] = table['Solvent'].map(removeend)


In [5]:
# b lists the unique solvent names that check_names reports as unparsable.
b= check_names(table.Solvent.unique()) #check_names is defined in this notebook; it calls OpenEye's OEParseIUPACName

In [6]:
# Replace each unparsable solvent name b[i] with a hand-written name.
# The mapping is positional: it relies on b having at least 115 entries in
# exactly the order check_names returned them, so any change to the input
# table or to the earlier cleaning steps shifts every key.
#Entries 98, 103 were purposely skipped
# NOTE(review): b[98] and b[103] do appear as keys below, so it is unclear
# what "skipped" refers to -- confirm with the author.
table.Solvent = table.Solvent.replace({
    b[0]: 'Isobutane', b[1]: '2-Methylhexane', b[2]: 'Cycloheptane', 
    b[3]: 'Methylcyclohexane', b[4]: 'Cyclooctane', b[5]: 'Cis-1,2-dimethyl cyclohexane',
    b[6]: 'Pentylcyclopentane', b[7]: '2-Methylpropene', b[8]: '3-Methyl-1-butene',
    b[9]: '2-Methyl-2-butene', b[10]: '2-Methyl-1-pentene', b[11]: '4-Methyl-1-pentene',
    b[12]: '2,3-dimethyl-1-butene', b[13]: '2-Heptene', b[14]: '2-methyl-1,3-butadiene',
    b[15]: '1-Pentyne', b[16]: '1,8-Nonanediyne', b[17]: 'p-Methyl styrene',
    b[18]: '1,2,3-Trimethylbenzene', b[19]: 'Tert-butylbenzene', b[20]: '1-Ethylnapthalene', 
    b[21]: '11~{H}-benzo[a]fluorene', b[22]: '11~{H}-benzo[b]fluorene', b[23]: '1,2-Benzanthracene', 
    b[24]: '7,12-dimethylbenzo[a]anthracene', b[25]: 'Benzo[a]pyrene', b[26]: '3,4-Benzopyrene', 
    b[27]: '3-Methylcholanthrene', b[28]: 'Benzo[ghi]perylene', b[29]: 'Trichlorofluoroethane', 
    b[30]: 'Dichlorodifluoroethane', b[31]: 'Chlorotrifluoromethane', b[32]: 'Nitrotrichloromethane', 
    b[33]: 'Dichlorofluoromethane', b[34]: 'Chlorodifluoromethane', b[35]: 'Difluoromethane', 
    b[36]: 'Chloromethane', b[37]: 'Hexachloroethane', b[38]: 'Chloropentafluoroethane', 
    b[39]: 'Tetrafluoroethene', b[40]: 'Pentachloroethane', b[41]: '1-Bromo-2-chloroethane', 
    b[42]: 'Cis-1,2-dichloroethene', b[43]: '3-Chloro-1-propene', b[44]: '1,2,3-Trichloropropane', 
    b[45]: '1,2-Dibromopropane', b[46]: '1,3-Dichloropropane', b[47]: '2-Iodopropane', 
    b[48]: 'Octafluorocyclobutane', b[49]: '1-Chloro-2-methylpropane', b[50]: '2-Chloro-2-methylbutane', 
    b[51]: 'm-Dichlorobenzene', b[52]: 'o-Dichlorobenzene', b[53]: '1,2,3-Trichlorobenzene', 
    b[54]: '1,2,4-Trichlorobenzene', b[55]: '1,2,3,4-Tetrachlorobenzene', b[56]: '1,2,4,5-Tetrachlorobenzene', 
    b[57]: 'Pentachlorobenzene', b[58]: 'm-Difluorobenzene', b[59]: 'p-Difluorobenzene', 
    b[60]: '1-Pentanol', b[61]: '2-Methyl-1-butanol', b[62]: '2,2-Dimethyl-1-propanol', 
    b[63]: '3-Hexanol', b[64]: '2-Methyl-2-pentanol', b[65]: '2-Methyl-3-pentanol', 
    b[66]: '4-Methyl-2-pentanol', b[67]: '3-Methyl-2-pentanol', b[68]: '3-Methyl-3-pentanol', 
    b[69]: '2,2-Dimethyl-1-butanol', b[70]: '2,3-Dimethyl-2-butanol', b[71]: '3,3-Dimethyl-2-butanol', 
    b[72]: '2-Methyl-4-pentene-3-ol', b[73]: '2-Methyl-2-hexanol', b[74]: '3-Methyl-3-hexanol', 
    b[75]: '2,3-Dimethyl-2-pentanol', b[76]: '2,4-Dimethyl-2-pentanol', b[77]: '2,2-Dimethyl-3-pentanol', 
    b[78]: '2,3-Dimethyl-3-pentanol', b[79]: '2,4-Dimethyl-3-pentanol', b[80]: '3-Ethyl-3-Pentanol', 
    b[81]: '2,2,3-Trimethyl-3-pentanol', b[82]: '1-Nonanol', b[83]: '1-Dodecanol', 
    b[84]: '1-Tetradecanol', b[85]: '1-Hexadecanol', b[86]: '1-Heptadecanol', 
    b[87]: '1,3-Nonanediol', b[88]: '1,4-Dimethyl-2,4-octanediol', b[89]: '2,4-Dimethyl-2,4-nonanediol', 
    b[90]: '2-Propyl-1,3-heptanediol', b[91]: '2-Methyl-3-pentanone', b[92]: '3-Methyl-2-pentanone', 
    b[93]: '4-Methyl-2-pentanone', b[94]: '3,3-Dimethyl-2-butanone', b[95]: '2,4-Dimethyl-3-pentanone', 
    b[96]: '5-Methyl-2-hexanone', b[97]: '2,6-Dimethyl-4-heptanone', b[98]: 'Vinyl acetate',
    b[99]: 'Ethyl propenoate', b[100]: 'Ethyl propionate', b[101]: 'Propyl butyrate', 
    b[102]: 'Isopropyl butyrate', b[103]: '~{tert}-butyl hypochlorite', b[104]: 'Sec-butyl methyl ether', 
    b[105]: 'Sec-butyl methyl, 2-methyl ether', b[106]: '1-propan-2-yloxypropane', b[107]: 'Nonanal', 
    b[108]: '1-Ethyl piperidine', b[109]: '3-Ethyl-4-methylpyridine', b[110]: '1-Propyl piperidine', 
    b[111]: '2-Amino toluene', b[112]: '2-Nitropropane', b[113]: '2-Nitro-1-methoxybenzene', 
    b[114]: 'Carbon bisulfide'
    
})

In [7]:
# Spot check: parse one replacement name into a fresh OEMol. The value of the
# last expression (the return value of OEParseIUPACName) is the cell output.
mol_from_name = OEMol()
name = '11~{H}-benzo[b]fluorene'
OEParseIUPACName(mol_from_name, name)

True

In [8]:
# Display the row(s) whose Solvent is exactly '1,8-Nonanediyne'.
table.loc[table.Solvent == '1,8-Nonanediyne']

Unnamed: 0,Solvent,Temp,Measured,Ref,Notes
102,"1,8-Nonanediyne",298.15,53400,5,


In [9]:
# Re-run the name check after the replacements; whatever is listed here still
# fails to parse.
bad_names = check_names(table.Solvent.unique())
bad_names #couple names to fix

['1,8-Nonanediyne',
 '1,2-Benzanthracene',
 '3,4-Benzopyrene',
 'Benzo[ghi]perylene',
 'Sec-butyl methyl, 2-methyl ether']

In [10]:
#Fixing References
# Each key is a mis-transcribed Ref entry and each value is what replaces it.
# NOTE(review): presumably these are OCR misreads found by inspection -- confirm.
table['Ref'] = table['Ref'].replace({
    "15'": '15', "t6": '16', "I0,12": '10,12',"~ 3": '2,3'})

### Cleaning Temperature Values

In [11]:
#Saves index of all rows with messed up temperature values
temp_probs = check_value(table['Temp'])

#Apply temperature cleaning function on all rows with bad temperature values
table.loc[~table.Temp.apply(is_number), 'Temp'] = table.Temp.apply(clean_temp)

#Manually changing one value
# Write through a single .iloc call on the DataFrame itself. The previous
# form, table.iloc[244,:].Temp = ..., is chained assignment: the row selected
# first may be a temporary copy, in which case the table is never updated.
table.iloc[244, table.columns.get_loc('Temp')] = '298.15'

# Re-check: the printed count is the number of temperature entries that still
# fail is_number.
temp_probs = check_value(table['Temp'])
print(len(temp_probs))

0


### Cleaning Measured Values

In [12]:
# Run clean_measure on the Measured entries that are not numbers yet, then
# report how many entries still fail is_number.
table.loc[~table['Measured'].map(is_number), 'Measured'] = table['Measured'].map(clean_measure)
measure_probs = check_value(table['Measured'])
print(len(measure_probs))


76


In [13]:
# Hand-entered Measured values, keyed by positional row number. Values that
# end in 'b' keep their footnote marker; the endswithb/removeend step that
# follows in this notebook strips it.
measured_overrides = {
    6: '9.09e4',
    9: '7.42e4',
    25: '2.33e6',
    36: '9.08e6',
    46: '6.21e7',
    47: '1.52e8',
    48: '1.58e8',
    49: '3.43e8',
    53: '8.13e4',
    61: '2.02e2',
    62: '1.04e3',
    64: '4.79e3',
    65: '5.07e3',
    67: '2.63e4',
    68: '1.92e4',
    69: '2.50e4',
    79: '2.31e6',
    81: '4.09e3',
    86: '1.21e5',
    122: '3.676e4',
    123: '3.484e4',
    126: '3.006e4',
    135: '3.518e4',
    137: '3.750e4',
    145: '9.250e4',
    149: '1.181e5',
    152: '1.449e5',
    156: '1.134e5',
    163: '1.121e5',
    169: '4.24e5',
    170: '2.53e5',
    171: '2.14e6',
    172: '6.83e4',
    173: '6.42e4b',
    181: '4.69e5',
    182: '6.53e5b',
    185: '4.33e5b',
    187: '4.33e5b',
    190: '5.93e5b',
    191: '1.76e6',
    192: '1.50e6b',
    193: '1.84e6',
    194: '1.72e6b',
    195: '2.35e6',
    196: '4.17e6b',
    197: '1.14e7b',
    198: '5.77e6b',
    199: '4.17e6',
    200: '3.87e6b',
    201: '6.02e6b',
    202: '6.73e6b',
    203: '1.60e7b',
    204: '3.41e7b',
    205: '5.65e6b',
    206: '1.43e7b',
    207: '4.24e7b',
    208: '5.41e7',
    209: '2.60e7b',
    210: '1.15e8b',
    211: '3.75e8',
    212: '1.21e8b',
    213: '1.61e8b',
    214: '1.89e8b',
    215: '9.82e6b',
    232: '1.64e5',
    273: '1.02e5',
    276: '2.77e6',
    278: '2.011e7',
    279: '2.472e7',
    309: '1.765e5',
    310: '3.876e6',
    312: '1.43e8',
    313: '3.85e8',
    314: '1.79e9',
    315: '1.43e10',
    323: '361',
}

# One positional write on the DataFrame itself. The previous form,
# table.iloc[i,:].Measured = ..., is chained assignment: the row selected
# first may be a temporary copy, in which case the table is never updated.
table.iloc[list(measured_overrides), table.columns.get_loc('Measured')] = list(measured_overrides.values())

In [14]:
#Deals with values that have footnotes attached
# Compute the footnote mask once and use it for both writes. Selecting the
# rows to trim by their Notes text instead would trim them again every time
# this cell is re-run, removing a digit from values that are already clean.
has_footnote_b = table.Measured.apply(endswithb)
table.loc[has_footnote_b, 'Notes'] = 'See Mackay and Shiu(1977) for fugacity ratio calculation'
table.loc[has_footnote_b, 'Measured'] = table.loc[has_footnote_b, 'Measured'].apply(removeend)

# The printed count is the number of Measured entries that still fail is_number.
measure_probs = check_value(table['Measured'])
print(len(measure_probs))


0


In [15]:
# Further hand-entered Measured values, keyed by positional row number.
# NOTE(review): presumably these entries already parsed as numbers but were
# still mis-transcribed -- confirm against the source table.
measured_corrections = {
    7: '1.04e5',
    51: '2.558e9',
    60: '1.99e2',
    78: '3.63e5',
    90: '1.03e5',
    94: '1.04e2',
    95: '5.83e2',
    96: '2.41e3',
}

# One positional write on the DataFrame itself. The previous form,
# table.iloc[i,:].Measured = ..., is chained assignment: the row selected
# first may be a temporary copy, in which case the table is never updated.
table.iloc[list(measured_corrections), table.columns.get_loc('Measured')] = list(measured_corrections.values())

In [16]:
#table.to_excel("table2_cleaned.xlsx")