In [1]:
#Author: Chris Zhang
#Rotation Project for Professor David Mobley

In [54]:
import numpy as np
import pandas as pd
from pandas import DataFrame as df
from openeye.oechem import *
from openeye.oeiupac import *

In [252]:
#Returns True if the inputted value can be read as a float
def is_number(s):
    """Report whether ``s`` can be converted with ``float()``.

    Tuples are rejected up front. Any other value that ``float()`` refuses,
    whether with a ValueError (e.g. '12a') or a TypeError (e.g. None, a
    list), is reported as False rather than raising.
    """
    if isinstance(s, tuple):
        return False
    try:
        float(s)
    except (TypeError, ValueError):
        # TypeError covers None and containers, which float() cannot take at all
        return False
    return True
    
#Returns the positions of the entries in a column that are not floats
def check_value(column):
    """Return the 0-based positions (from enumerate) of the entries in
    ``column`` for which is_number() is False."""
    return [position for position, entry in enumerate(column) if not is_number(entry)]


#Cleans a single temperature string (applied element-wise to the Temp column)
#operations in the function are made by inspection
def clean_temp(value):
    """Repair a mis-read temperature string.

    A leading non-digit character is dropped (i.e. .373.15 -> 373.15), then
    the character substitutions below are applied in order. An empty string
    is returned unchanged instead of raising IndexError on ``value[0]``.
    """
    if not value:
        return value
    if not value[0].isdigit(): #fixes terms with characters out in front (i.e. .373.15)
        value = value[1:]
    substitutions = (
        (',', '.'),
        ('l', '1'),
        ('I', '1'),
        ('i', '1'),
        (' ', ''),
        ('A', '.1'),
    )
    for wrong, right in substitutions:
        value = value.replace(wrong, right)
    return value

#Cleans a single measured-value string (applied element-wise to the Measured column)
def clean_measure(value):
    """Apply the substitutions below to ``value`` one after another.

    The order is significant: later patterns (e.g. 'es') are matched against
    the text produced by the earlier replacements.
    """
    substitutions = (
        (" ", ""),
        (",", "."),
        ('l', "1"),
        ('I', "1"),
        ("x10", "e"),
        ("×10", "e"),
        ("s10", "e"),
        ("10s", "5"),
        ('es', "e5"),
        ('i', '1'),
        ('>', ""),
        ('¢', "c"),
        ('-.', ""),
        ('O', '0'),
    )
    for old, new in substitutions:
        value = value.replace(old, new)
    return value

def findOccur(s, ch):
    """Return the list of positions in ``s`` whose element equals ``ch``."""
    positions = []
    for index, letter in enumerate(s):
        if letter == ch:
            positions.append(index)
    return positions

#Finds the maximum value of a list of lists
def max_value(inputlist):
    """Return the largest element found in any sublist of ``inputlist``."""
    return max([max(sublist) for sublist in inputlist])

#Finds the minimum value of a list of lists
def min_value(inputlist):
    """Return the smallest element found in any sublist of ``inputlist``."""
    return min([min(sublist) for sublist in inputlist])


#If an entry has an uncertainty, return the value and the uncertainty as a tuple
def split_uncertain(value):
    """Split ``value`` around its separator characters.

    The separators are '+', '-', '_' and '±'. With no separator present the
    input is returned unchanged. Otherwise a 2-tuple is returned: the text
    before the first separator and the text after the last one.
    """
    separators = ('+', '-', '_', '±')
    # positions come out in ascending order, so first/last are min/max
    hits = [index for index, letter in enumerate(value) if letter in separators]
    if not hits:
        return value
    return value[:hits[0]], value[(hits[-1] + 1):]


def extractvalue(measured):
    """Return ``measured`` itself when it is a str, otherwise its first item."""
    return measured if type(measured) is str else measured[0]

def extractuncertain(measured):
    """Return '0' when ``measured`` is a str, otherwise its second item."""
    return '0' if type(measured) is str else measured[1]

def endswithb(value):
    """Return True when the last character of ``value`` is 'b'.

    An empty ``value`` has no last character and yields False instead of
    raising IndexError.
    """
    return len(value) > 0 and value[-1] == 'b'

def endswithc(value):
    """Return True when the last character of ``value`` is 'c'.

    An empty ``value`` has no last character and yields False instead of
    raising IndexError.
    """
    return len(value) > 0 and value[-1] == 'c'
    
def removeend(string):
    """Return ``string`` without its final character."""
    trimmed = string[0:-1]
    return trimmed
    
def check_names(names):
    """Return the entries of ``names`` for which OEParseIUPACName reports failure.

    Each name is parsed into its own freshly created OEMol.
    """
    #Code adapted from Pharmsci 175/275
    return [name for name in names if not OEParseIUPACName(OEMol(), name)]

def method():
    '''
    Key to the measurement-method abbreviations (shown by help(method)).

    GCR: Gas-liquid chromatography with gas phase correction
    GC: Gas-liquid chromatography with no gas phase correction
    EBUL: Differential ebulliometry
    DP: Dew-point method
    NSGC: Non-steady-state gas chromatography
    RGC: Relative gas-liquid chromatography
    HS: Headspace chromatography
    GS: Gas-stripping
    STAT: Differential static cell equilibrium
    LLC: Liquid-liquid chromatography
    KHW: Hexadecane-water partition coefficient indirect method
    KGW: Estimation from gas-water partition coefficients from solubility measurements
    EXT: Extrapolated VLE
    '''
    return None

In [227]:
#Read in prepared .csv file
table = pd.read_csv('table1.csv', delimiter=',')

#Rename temperature column to something nicer
table = table.rename(columns={"T/ K": "Temp"})

#Removes empty columns leftover from csv import
leftover_columns = ['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9']
table = table.drop(columns=leftover_columns)

#Removes rows that have no solute name or no temperature
has_solute_and_temp = pd.notnull(table['Solute']) & pd.notnull(table['Temp'])
table = table[has_solute_and_temp]

### Cleaning up Compound Names

In [228]:
#check_names (defined above) lists the solute names OpenEye cannot parse
bad_names = check_names(table['Solute'].unique())
#table.loc[table.Solute == bad_names[0]]
#the above line was run while changing the index and looking at the actual PDF to verify
#compound names were being altered correctly

In [284]:
#Some names were parsed wrong; others were read correctly but could not
#be converted into a molecule by OpenEye

#Couldn't think of a better way than to double-check with the original
#document and hard-code the edits...
#NOTE(review): the keys are positions in bad_names, so this mapping is only
#valid for the order check_names returned when the mapping was written;
#positions 5, 6 and 11 are left unmapped. Verify if table1.csv changes.
table.Solute = table.Solute.replace(
    {bad_names[0]: "Chlorodibromomethane",
     bad_names[1]: "Bromodichloromethane",
     bad_names[2]: "1,1,2,2-Tetrachloroethane",
     bad_names[3]: "1,1,1-Trichloroethane",
     bad_names[4]: "1,1-Dichloroethene",
     bad_names[7]: "1-Iodopropane",
     bad_names[8]: "Chlorohexane",
     bad_names[9]: "Ethanol",
     bad_names[10]: "2-Propen-1-ol",
     bad_names[12]: "Tert-Butanol",
     bad_names[13]: "o-Cresol",
     bad_names[14]: "Valeraldehyde",
     bad_names[15]: "3-Hexanone",
     bad_names[16]: "Acetic Acid",
     bad_names[17]: "Butyric Acid",
     bad_names[18]: "Propionic Acid, 2-Methyl, -Methyl",
     bad_names[19]: "Isopropyl Acetate",
     bad_names[20]: "Isobutyl Acetate",
     bad_names[21]: "Isopropyl Ether",
     bad_names[22]: "Diisopropylamine",
     bad_names[23]: "N,N-Dimethylformamide",
     bad_names[24]: "Acrylonitrile",
     bad_names[25]: "Butyronitrile",
})

In [230]:
#Re-run the name check on the corrected Solute column
bad_names2 = check_names(table['Solute'].unique())
bad_names2 #leftover names OpenEye still doesn't like

['Cis 1,2-Dichloroethene',
 'Trans 1,2-Dichloroethene',
 'Iso-Butyl alcohol',
 'Tert-Butanol',
 'Propionic Acid, 2-Methyl, -Methyl',
 'Carbon disulfide']

### Fixing Methods and Reference Numbers

Type help(method) to get a description of all the different abbreviations!

In [231]:
#Finding Reference Values that weren't properly parsed
bad_ref = [entry for entry in table['Ref.'] if not entry.isnumeric()]

#Manually replacing Faulty Entries
ref_fixes = {'I1': '11', 'q': '9'}
table['Ref.'] = table['Ref.'].replace(ref_fixes)

In [232]:
def most(series):
    """Return the most frequent element of ``series``.

    On a tie the element that appears first wins, because max() keeps the
    first maximum it meets.
    """
    entries = list(series)
    occurrences = entries.count
    return max(entries, key=occurrences)

#All reference numbers have the same method. However, the method string is parsed incorrectly
#for some entries. By taking the most common entry, we typically get the correct method string.

#Maps each reference number to the method string seen most often for it
reference_dict = {}
for entry in table['Ref.'].unique():
    series = table.loc[table['Ref.'] == entry, 'Method']
    reference_dict[entry] = most(series)

In [318]:
#After this was done, a couple entries still needed to be fixed manually.
reference_dict['21'] = 'GC'
reference_dict['49'] = 'EBUL c'
reference_dict['52'] = 'HS'
reference_dict['6'] = 'EXT'


#This loop replaces all methods with the proper string.
for key, method_name in reference_dict.items():
    table.loc[table['Ref.'] == key, 'Method'] = method_name

#Add note for entries with extra footnote
#Raw string: in a normal literal the "\r" of "\rightarrow" is read as a carriage
#return, which would corrupt the LaTeX stored in Notes
table.loc[table.Method.apply(endswithc) == True, 'Notes'] = r'Data obtained by extrapolating $\gamma_{i}$ as $x_{i} \rightarrow 0$'


### Cleaning Temperature Values

In [234]:
#Saves the positions of all rows with messed up temperature values
temp_probs = check_value(table.Temp)
table.iloc[temp_probs]

Unnamed: 0,Solute,Temp,Measured,Method,Ref.,Notes
35,Benzene,.373.15,1080,--,10,
37,Toluene,29615,10400,GS,44,
91,Dibromomethane,323.I5,740-+32,STAT,41,
105,Carbon Tetrachloride,31315,8706± 860,RGC,66,
296,Chlorobenzene,32315,4952,GCR,8,
297,Bromobenzene,29815,22460,KHW,37,
298,Bromobenzene,29815,2.09 × 104,KGW,2,
299,Iodobenzene,29815,54130,KHW,37,
474,"1,2-Ethanediol",.348.05,I. 2-+0. 1,DP,62,
484,"1,3-Propanediol",32865,1.8+0.2,DP,62,


In [235]:
#Apply temperature cleaning function on all rows with bad temperature values
needs_temp_fix = table.Temp.apply(is_number) == False
cleaned_temps = table.Temp.apply(clean_temp)
#Assignment aligns on the index, so only the flagged rows are overwritten
table.loc[needs_temp_fix, 'Temp'] = cleaned_temps
temp_probs = check_value(table['Temp'])
print(temp_probs) #All rows fixed!

[]


## Part B: Cleaning Measured Values

In [237]:
#Saves the positions of all rows with messed up measured values
measure_probs = check_value(table.Measured)

#Display the first few problematic measured values
table.iloc[measure_probs].head()

Unnamed: 0,Solute,Temp,Measured,Method,Ref.,Notes
12,Cyclohexane,313.15,1 0 2 6 . 5 + 3 0,GS,14,
18,Benzene,296.15,2360--.52,GS,14,
28,Benzene,303.15,2402 ¢,HS,23,
32,Benzene,313.15,1635-+49,GS,14,
41,Toluene,313.15,3249+-38,GS,14,


In [238]:
#Number of Measured entries flagged by check_value
print(len(measure_probs)) #Lots of messed up measured values!

348


In [239]:
#Apply measured cleaning function on all rows with bad measured values
needs_measure_fix = table.Measured.apply(is_number) == False
cleaned_measures = table.Measured.apply(clean_measure)
#Assignment aligns on the index, so only the flagged rows are overwritten
table.loc[needs_measure_fix, 'Measured'] = cleaned_measures
measure_probs = check_value(table['Measured'])
print(len(measure_probs))

240


In [240]:
#Values with uncertainties aren't being dealt with properly
table.iloc[measure_probs].head()

Unnamed: 0,Solute,Temp,Measured,Method,Ref.,Notes
12,Cyclohexane,313.15,1026.5+30,GS,14,
18,Benzene,296.15,2360-52,GS,14,
28,Benzene,303.15,2402c,HS,23,
32,Benzene,313.15,1635-+49,GS,14,
41,Toluene,313.15,3249+-38,GS,14,


In [241]:
#Splits entries with an uncertainty into two
split_measured = table.Measured.apply(split_uncertain)
table['Measured'] = split_measured

#Creates an uncertainty column with uncertainty if applicable ('0' otherwise)
table['Uncertain'] = split_measured.apply(extractuncertain)

#Replaces measured column with a single value if it was previously split
table['Measured'] = split_measured.apply(extractvalue)


In [242]:
#Re-count the unreadable measured values after the split
measure_probs = check_value(table.Measured)
print(len(measure_probs)) #Still have some values we can't read well

28


In [243]:
#Deals with values that have footnotes attached
#The footnote masks are taken from Measured itself, before Notes is touched, so
#only values that really end in a footnote letter lose their last character.
#Selecting on "Notes is not null" would also truncate rows whose note was set
#from the Method footnote, and would strip again every time the cell is re-run.
has_b_footnote = table.Measured.apply(endswithb) == True
has_c_footnote = table.Measured.apply(endswithc) == True
table.loc[has_b_footnote, 'Notes'] = 'Data interpolated from other temperature measurements'
#Raw string: in a normal literal the "\r" of "\rightarrow" is read as a carriage return
table.loc[has_c_footnote, 'Notes'] = r'Data obtained by extrapolating $\gamma_{i}$ as $x_{i} \rightarrow 0$'
has_footnote = has_b_footnote | has_c_footnote
table.loc[has_footnote, 'Measured'] = table.loc[has_footnote, 'Measured'].apply(removeend)

In [244]:
#Re-count the unreadable measured values after removing footnote letters
measure_probs = check_value(table.Measured)
print(len(measure_probs)) #Still have lots of values we can't read well

20


In [245]:
#Positions of rows whose value or uncertainty still cannot be read as a float
measure_probs = check_value(table.Measured)
uncertain_probs = check_value(table.Uncertain)
total_probs = measure_probs + uncertain_probs
total_probs.sort()

In [246]:
#check_value reports positions (it uses enumerate) and .iloc is positional, so
#total_probs selects the same rows before and after this reset.
#drop = True discards the old index instead of keeping it as a column.
table = table.reset_index(drop = True)
table.iloc[total_probs,:]

Unnamed: 0,Solute,Temp,Measured,Method,Ref.,Notes,Uncertain
50,Butylbenzene,298.15,5.33~10~,KHW,37,,0
57,m-Xylene,298.15,3.90e~,KHW,37,,0
67,Mesitylene,298.15,1.28e~,KHW,37,,0
100,Carbon Tetrachloride,298.15,3.45/,HS,9,,103
155,"1,1,1,2-Tetrachloroethane",293.15,396t0,GCR,4,,0
201,"1,1-Dichloroethene",303.15,1930,RGC,66,,=270
219,"Trans 1,2-Dichloroethene",313.15,13704:75,STAT,41,,0
260,"1,3-Dichloropropylene",298.15,1400u,STAT,41,,0
289,Fluorobenzene,298.15,"3.44""10:~",KGW,2,,0
293,Chlorobenzene,298.15,1.27,KGW,2,,"""104"


In [273]:
## At this point, there isn't much of a way to fix these problems except for hard-coding
#the correct values

#Each entry is (row position, column, corrected value).
#The write uses one .iloc call per cell: `table.iloc[row,:].Measured = x` assigns
#to a temporary row Series (chained assignment) and is not guaranteed to modify
#`table` itself.
manual_fixes = [
    (50, 'Measured', '5.33e5'),
    (57, 'Measured', '3.90e4'),
    (67, 'Measured', '1.28e5'),
    (100, 'Measured', '3.45e3'),
    (100, 'Uncertain', '0'),
    (155, 'Measured', '3960'),
    (201, 'Uncertain', '270'),
    (219, 'Measured', '1370'),
    (219, 'Uncertain', '75'),
    (260, 'Measured', '1400'),
    (260, 'Notes', 'Data interpolated from other temperature measurements'),
    (289, 'Measured', '3.44e3'),
    (293, 'Measured', '1.27e4'),
    (293, 'Uncertain', '0'),
    (302, 'Measured', '6.39e4'),
    (304, 'Measured', '3.59e4'),
    (312, 'Measured', '1.46'),
    (325, 'Measured', '1.49'),
    (325, 'Uncertain', '0.05'),
    (480, 'Measured', '1.3'),
    (533, 'Measured', '35.8'),
    (545, 'Measured', '65.38'),
    (584, 'Measured', '30.2'),
    (614, 'Measured', '2.48e4'),
    (651, 'Measured', '233.8'),
    (727, 'Measured', '0.81'),
    (727, 'Uncertain', '0.08'),
    (734, 'Measured', '1.35'),
]
for row, column, fixed in manual_fixes:
    table.iloc[row, table.columns.get_loc(column)] = fixed



## Part C: Collecting all Compound Names with Potential Error

In [330]:
#Collect compound names with errors
err = table.iloc[total_probs,:].Solute.unique()
error_names = list(err)

#Remove whatever compound names were added above from the list we are testing
all_compounds = table.Solute.unique().tolist()
compounds = list(set(all_compounds) - set(error_names))

In [340]:
#Flagging compounds with potentially incorrect measured values
#A compound is flagged when its largest measured value exceeds 5x its smallest
error_names = []
for compound in compounds:
    values = [float(x) for x in table.loc[table.Solute == compound, 'Measured'].tolist()]
    if max(values) > 5*min(values): #threshold condition
        error_names.append(compound)

In [342]:
#Compounds not already flagged by the measured-value check
all_compounds = table.Solute.unique().tolist()
compounds = list(set(all_compounds) - set(error_names))

In [343]:
#Flagging compounds with potentially incorrect uncertainty values
#Entries equal to '0' (no uncertainty reported) are left out of the comparison
for compound in compounds:
    reported = table.loc[table.Solute == compound, 'Uncertain'].tolist()
    values = [float(x) for x in reported if x != '0']
    if values and max(values) > 10*min(values): #threshold condition for uncertainty
        error_names.append(compound)

In [335]:
#Hard-coded corrections for the flagged compounds.
#Writes go through a single .iloc call per cell or slice:
#`table.iloc[row,:].Measured = x` assigns to a temporary row Series (chained
#assignment) and is not guaranteed to modify `table` itself.

#Rows 153..164 all get the same solute name
table.iloc[153:165, table.columns.get_loc('Solute')] = '1,1,2,2-Tetrachloroethane'

#Each entry is (row position, column, corrected value)
flagged_fixes = [
    (145, 'Measured', '9280'),
    (156, 'Measured', '3460'),
    (259, 'Measured', '1360'),
    (259, 'Uncertain', '30'),
    (262, 'Measured', '1460'),
    (262, 'Uncertain', '85'),
    (422, 'Measured', '68.03'),
    (452, 'Measured', '126'),
    (557, 'Measured', '61.86'),
    (582, 'Measured', '29.5'),
]
for row, column, fixed in flagged_fixes:
    table.iloc[row, table.columns.get_loc(column)] = fixed