# Data mining of the Karr thesis 

## Reactions

The reaction data in the Excel file are mined with Pandas. The "S3O-Reactions" spreadsheet is opened in Pandas and processed into a JSON file. The final JSON file contains the full reaction name, the reaction stoichiometry, and the available forward and backward kinetic information for each enzyme.

In [None]:
#import statement
import pandas
import numpy
import json
import re

# Read the reactions sheet of the supplementary workbook.
data_file = "tableS3.xlsx"
reactions_spreadsheet = pandas.read_excel(data_file, sheet_name='S3O-Reactions')


# Row 4 holds the column titles.  Each kinetics title is suffixed with the
# direction it belongs to: titles up to and including the first 'Vmax Unit'
# are labelled forward, every later kinetics title is labelled backward.
kinetics_headings = ['Rate Law', 'Km', 'Vmax', 'Vmax Unit']
direction_suffixes = (' - Forward', ' - Backward')
kinetics_headings_current = []
iteration = 0  # index into direction_suffixes: 0 = forward, 1 = backward
for column in reactions_spreadsheet.columns:
    header = reactions_spreadsheet.at[4, column]
    if header not in kinetics_headings:
        continue
    relabelled = header + direction_suffixes[iteration]
    reactions_spreadsheet.at[4, column] = relabelled
    kinetics_headings_current.append(relabelled)
    # The first 'Vmax Unit' title closes the forward group.
    if iteration == 0 and header == 'Vmax Unit':
        iteration = 1


# Promote the relabelled row to column headers and discard rows 0-4.
reactions_spreadsheet.columns = reactions_spreadsheet.iloc[4]
reactions_spreadsheet = reactions_spreadsheet.drop(index=range(5))
#display(reactions_spreadsheet)


# Develop the data dictionary for each reaction that reports kinetic data.
# NOTE: an abandoned regex-based draft of this loop used to sit here inside a
# triple-quoted string.  It never executed, and its non-raw regex escapes
# (\s, \w) raise invalid-escape-sequence warnings, so it was removed.


def _is_empty_cell(cell):
    """Return True when *cell* is a blank spreadsheet value.

    ``pandas.isna`` recognises None and every NaN object, whereas a membership
    test against a list holding ``numpy.nan`` only matches that exact object
    (NaN never compares equal to itself).
    """
    return bool(pandas.isna(cell)) or cell == 'NaN'


# Output field name -> spreadsheet column it is copied from.
enzyme_field_columns = {
    'Enzyme Name': 'Name',
    'BiGG ID': 'BiGG',
    'Reaction': 'Stoichiometry',
    'Forward Rate Law': 'Rate Law - Forward',
    'Forward Km': 'Km - Forward',
    'Forward Vmax': 'Vmax - Forward',
    'Forward Vmax Unit': 'Vmax Unit - Forward',
    'Backward Rate Law': 'Rate Law - Backward',
    'Backward Km': 'Km - Backward',
    'Backward Vmax': 'Vmax - Backward',
    'Backward Vmax Unit': 'Vmax Unit - Backward',
}

enzyme_data = {}
for index in reactions_spreadsheet.index:
    # Keep a row only if at least one kinetics column holds a value.
    has_kinetics = any(
        not _is_empty_cell(reactions_spreadsheet.loc[index, column])
        for column in kinetics_headings_current
    )
    if not has_kinetics:
        continue
    enzyme_data[reactions_spreadsheet.loc[index, 'ID']] = {
        field: reactions_spreadsheet.loc[index, column]
        for field, column in enzyme_field_columns.items()
    }


# Export the dictionary as a JSON file.
with open('2021-03-17_APF_Karr et al. kinetic data.json', 'w') as output:
    json.dump(enzyme_data, output, indent = 5)

## Metabolites

The metabolite data are mined with the Pandas library. The "S3G-Metabolites" spreadsheet is opened in Pandas and processed into a JSON file. The final JSON file contains the metabolite name, the associated reactions, the chemical formula, the charge, the hydrophobicity, and the molecular weight.

In [None]:
#import statement
import pandas
import numpy
import json
import re

# Read the metabolites sheet of the supplementary workbook.
data_file = "tableS3.xlsx"
metabolites_spreadsheet = pandas.read_excel(data_file, sheet_name='S3G-Metabolites')
#display(metabolites_spreadsheet)


# Row 2 holds the column titles: promote it to the header, then discard
# rows 0-2 so that only data rows remain.
metabolites_spreadsheet.columns = metabolites_spreadsheet.iloc[2]
metabolites_spreadsheet = metabolites_spreadsheet.drop(index=range(3))

# Columns inspected when deciding whether a row carries any data.
desired_metabolite_headers = [
    'ID',
    'Name',
    'Category',
    'Empirical Formula',
    'Charge',
    'Hydrophobic',
    'Molecular Weight',
    'Reactions',
]
#display(metabolites_spreadsheet)
               
                
             
# Develop the data dictionary for each metabolite.


def _is_empty_cell(cell):
    """Return True when *cell* is a blank spreadsheet value.

    ``pandas.isna`` recognises None and every NaN object, whereas a membership
    test against a list holding ``numpy.nan`` only matches that exact object
    (NaN never compares equal to itself).
    """
    return bool(pandas.isna(cell)) or cell == 'NaN'


# Output field name -> spreadsheet column it is copied from.
metabolite_field_columns = {
    'Name': 'Name',
    'BiGG ID': 'BiGG',
    'Class': 'Category',
    'Chemical Formula': 'Empirical Formula',
    'Charge': 'Charge',
    'Hydrophobic': 'Hydrophobic',
    'Molecular Mass (amu)': 'Molecular Weight',
    'Associated Reactions': 'Reactions',
}

metabolite_data = {}
for index in metabolites_spreadsheet.index:
    # Keep a row only if at least one of the desired columns holds a value.
    has_data = any(
        not _is_empty_cell(metabolites_spreadsheet.loc[index, column])
        for column in desired_metabolite_headers
    )
    if not has_data:
        continue
    metabolite_data[metabolites_spreadsheet.loc[index, 'ID']] = {
        field: metabolites_spreadsheet.loc[index, column]
        for field, column in metabolite_field_columns.items()
    }


# Export the dictionary as a JSON file.
with open('2021-03-17_APF_Karr et al. metabolite data.json', 'w') as output:
    json.dump(metabolite_data, output, indent = 5)

## References 

The Karr et al. supplementary excel file was mined and organized. The information was exported in a JSON file. The reference information was combined with the citations of the reactions information in the " ...reactions + references" JSON file that is used as a singular representative data file for the Karr et al. reactions database.  

In [None]:
# import statement
import pandas
import numpy
import json
import re

# Read the references sheet of the supplementary workbook.
data_file = "tableS3.xlsx"
references_spreadsheet = pandas.read_excel(data_file, sheet_name='S3S-References')


# Row 1 holds the column titles: promote it to the header, then discard
# rows 0-1 so that only data rows remain.
references_spreadsheet.columns = references_spreadsheet.iloc[1]
references_spreadsheet = references_spreadsheet.drop(index=range(2))
#display(references_spreadsheet)

# Develop the data dictionary for each reference.
# For every supported reference type: output field name -> spreadsheet column.
# Rows whose 'Type' is not listed here are skipped.
reference_fields_by_type = {
    'article': {
        'PubMed': 'PubMed',
        'author(s)': 'Authors',
        'title': 'Title',
        'year': 'Year',
        'journal': 'Publication',
        'volume': 'Volume',
        'issue': 'Issue',
        'pages': 'Pages',
        'comments': 'Comments',
    },
    'book': {
        'ISBN': 'ISBN',
        'author(s)': 'Authors',
        'title': 'Title',
        'editor': 'Editors',
        'year': 'Year',
        'publication': 'Publication',
        'publisher': 'Publisher',
        'pages': 'Pages',
        'url': 'URL',
        'comments': 'Comments',
    },
    'misc': {
        'author(s)': 'Authors',
        'title': 'Title',
        'year': 'Year',
        'url': 'URL',
        'comments': 'Comments',
    },
    'thesis': {
        'author(s)': 'Authors',
        'title': 'Title',
        'year': 'Year',
        'publisher': 'Publisher',
        'url': 'URL',
        'comments': 'Comments',
    },
}

reference_information = {}
# One pass per row is enough: the record depends only on the row, so the
# former inner loop over the columns merely rebuilt the same record repeatedly.
for index in references_spreadsheet.index:
    reference_type = references_spreadsheet.at[index, 'Type']
    field_columns = reference_fields_by_type.get(reference_type)
    if field_columns is None:
        continue
    reference_id = references_spreadsheet.at[index, 'ID']
    reference_information[reference_id] = {
        field: references_spreadsheet.at[index, column]
        for field, column in field_columns.items()
    }


# Export the dictionary as a JSON file.
with open('2021-03-24_APF_Karr et al. references.json', 'w') as output:
    json.dump(reference_information, output, indent = 5)

## Initial concentrations

The initial media conditions of the WCM are extracted. Each concentration that cites a reference is merged with the matching entry from the references JSON file.

In [None]:
# import statement
import pandas
import numpy
import json
import re

# Load the references written by the references cell.  The context manager
# closes the file handle as soon as the JSON has been parsed.
with open('2021-03-24_APF_Karr et al. references.json') as references_file:
    references = json.load(references_file)

# Read the media composition sheet of the supplementary workbook.
data_file = "tableS3.xlsx"
concentrations_spreadsheet = pandas.read_excel(data_file, sheet_name = 'S3H-Media composition')

# Sheet-level description of the media composition.
comment = """In silico media composition. The media composition is based on the experimentally characterized 
composition of the SP-4 media on which M. genitalium is generally cultured, with additional essential molecules
added to support in silico cellular growth. Non-gaseous molecules are added to the extracellular environment at the 
beginning of the simulation and either drain or accumulate over the length of the simulation. Baseous molecules are 
continuously perfused into the extracellular environment and are maintained at a constant concentration throughout 
the simulation."""

# Row 1 holds the column titles: promote it to the header, then discard
# rows 0-1 so that only data rows remain.
concentrations_spreadsheet.columns = concentrations_spreadsheet.iloc[1]
concentrations_spreadsheet = concentrations_spreadsheet.drop(labels=range(2), axis=0)

def _is_empty_cell(cell):
    """Return True when *cell* is a blank spreadsheet value.

    ``pandas.isna`` recognises None and every NaN object, whereas a membership
    test against a list holding ``numpy.nan`` only matches that exact object
    (NaN never compares equal to itself).
    """
    return bool(pandas.isna(cell)) or cell == 'NaN'


# Develop the data dictionary for each media component.
media_concentrations = {}
for index in concentrations_spreadsheet.index:
    component_id = concentrations_spreadsheet.at[index, 'ID']
    # A distinct name keeps the sheet-level `comment` description defined
    # above from being overwritten by each row's comment.
    row_comment = concentrations_spreadsheet.at[index, 'Comments']
    reference = concentrations_spreadsheet.at[index, 'reference']
    # Replace a cited reference ID with the list of that reference's recorded
    # details; an ID with no entry in `references` is kept unchanged.
    if not _is_empty_cell(reference) and reference in references:
        reference = list(references[reference].values())

    media_concentrations[component_id] = {
        'name': concentrations_spreadsheet.at[index, 'Name'],
        'concentration': concentrations_spreadsheet.at[index, 'Concentration (mM)'],
        'unit': 'mM',
        'comment': row_comment,
        'reference': reference,
    }


# Export the dictionary as a JSON file.
with open('2021-03-24_APF_Karr et al. media.json', 'w') as output:
    json.dump(media_concentrations, output, indent = 5)