# Karr et al. data processing

## Consolidating the reactions data

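The reactions JSON file exported from the WholeCellKB (WCKB) was consolidated so that each reaction name maps to the unique values of its references, stoichiometry, optimal temperature, optimal pH, Gibbs free energy, equilibrium constant, and forward and backward kinetics. The consolidated data was exported as a new JSON file.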

In [34]:
import json
import numpy

# Load the reactions exported from the WCKB. The context manager closes the
# file handle, which a bare ``open`` call inside ``json.load`` leaves open.
with open('./json_data/reactions.json') as reactions_file:
    reactions = json.load(reactions_file)

# Output key -> field of a reaction entry whose unique values are collected.
# The insertion order fixes the key order of every enzyme in the exported JSON.
consolidated_fields = {
    'reference': 'references',
    'stoichiometry': 'stoichiometry',
    'optimal temperature': 'optimal_temperature',
    'optimum pH': 'optimal_ph',
    'Gibbs free energy': 'delta_g',
    'keq': 'keq',
    'forward kinetics': 'kinetics_forward',
    'backward kinetics': 'kinetics_backward',
}

# distinct entry names (referred to as enzymes below), in order of first appearance
enzyme_names = []
for entry in reactions['data']:
    if entry['name'] not in enzyme_names:
        enzyme_names.append(entry['name'])

enzyme_data = {}
for enzyme in enzyme_names:
    collected = {output_key: [] for output_key in consolidated_fields}
    for entry in reactions['data']:
        # Names are matched case-insensitively, so entries whose names differ
        # only by case contribute to each of those names.
        if entry['name'].lower() != enzyme.lower():
            continue
        for output_key, field in consolidated_fields.items():
            # list membership keeps only the first occurrence of each value
            if entry[field] not in collected[output_key]:
                collected[output_key].append(entry[field])

    # store the consolidated record once, after every entry has been scanned
    enzyme_data[enzyme] = collected

# export the dictionary as a JSON file
with open('2021-03-25_APF_WCKB reactions, reorganized.json', 'w') as output:
    json.dump(enzyme_data, output, indent = 5)

## Combining the references and reactions JSON files

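The reference identifiers in the reformatted WCKB reactions JSON file were replaced with the corresponding full reference records. The references were embedded both at the level of the entire reaction and within the individual evidence entries of the forward and backward kinetics data, each in its own section of the code below.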

In [35]:
# import the libraries and JSON files
import json
import numpy
import re

# Load the consolidated reactions and the reference records keyed by identifier.
# The context managers close the handles that ``json.load(open(...))`` leaks.
with open('2021-03-25_APF_WCKB reactions, reorganized.json') as reactions_file:
    reactions = json.load(reactions_file)
with open('2021-03-24_APF_Karr et al. references.json') as references_file:
    references = json.load(references_file)

# values that mark a missing kinetics record or a missing evidence collection
empty_cells = ['NaN', None, numpy.nan, '', ' ', [], {}]


def _resolve_references(reference_ids):
    """Return the reference records for the known identifiers in ``reference_ids``.

    Identifiers that are not keys of ``references`` are dropped, and a missing
    (``None``) collection resolves to an empty list.
    """
    # JSON object keys are always strings, so only string identifiers can match;
    # the isinstance guard also keeps unhashable items away from the dict lookup.
    return [references[reference_id] for reference_id in (reference_ids or [])
            if isinstance(reference_id, str) and reference_id in references]


def _embed_kinetics_references(kinetics_entries):
    """Replace the reference identifiers of every evidence item, in place.

    Every evidence item receives a ``'references'`` list, which is empty when
    the item had no references or none of them could be resolved.
    """
    for kinetics in kinetics_entries:
        if kinetics in empty_cells or kinetics['evidence'] in empty_cells:
            continue
        for evidence in kinetics['evidence']:
            evidence['references'] = _resolve_references(evidence.get('references'))


# iteratively insert the references into the corresponding section of the reactions JSON
for information in reactions.values():
    # Resolve every group of main references before replacing the list.
    # Rebinding ``information['reference']`` inside the loop over its groups
    # kept only the records of the last group; the groups are concatenated
    # here so that no group is lost.
    resolved_references = []
    for reference_ids in information['reference']:
        resolved_references.extend(_resolve_references(reference_ids))
    information['reference'] = resolved_references

    _embed_kinetics_references(information['forward kinetics'])
    _embed_kinetics_references(information['backward kinetics'])

# export the dictionary as a JSON file
with open('2021-03-25_APF_WCKB reactions + references.json', 'w') as output:
    json.dump(reactions, output, indent = 5)

## Contrasting the Karr et al. data sources 

The reactions from the supplemental Excel file of Karr et al. were compared with the data exported from WholeCellKB.org and with the data scraped from WholeCellKB.org. The exported data possessed 27 more kinetically described enzymes than the scraped data, while the two datasets were equivalent with respect to thermodynamic data. The exported data was therefore concluded to be representative of the WholeCellKB.org database.

In [None]:
# import the libraries and the JSON data files        
import pandas
import numpy
import json
import re

# Load the scraped CSV, the exported WCKB JSON and the Karr et al. Excel-derived
# JSON. The context managers close the JSON handles that a bare ``open`` leaks.
wckb_scraped = pandas.read_csv('reactions.csv')
# NOTE(review): this file name is dated 2021-03-22, whereas the reference-embedding
# cell of this notebook writes a file dated 2021-03-25 -- confirm the intended input.
with open('2021-03-22_APF_WCKB reactions + references.json') as wckb_file:
    wckb_json = json.load(wckb_file)
with open('2021-03-17_APF_Karr et al. kinetic data.json') as karr_file:
    karr_excel_reactions = json.load(karr_file)
empty_cells = ['NaN', None, numpy.nan, '', ' ', [], {}, 'null', 'None']


def _has_data(values):
    """Return True when at least one of ``values`` is not an empty-cell marker."""
    return any(value not in empty_cells for value in values)


# examine the downloaded reactions from the WholeCellKB.org
# The keys of the JSON object are unique, so each enzyme is visited exactly once
# and is listed at most once per category. The keq branch previously appended
# once per non-empty Keq value, which inflated the printed thermodynamic count.
wckb_json_enzymes = list(wckb_json)
thermodynamic_reactions = [
    enzyme for enzyme, value in wckb_json.items()
    if _has_data(value['keq']) or _has_data(value['Gibbs free energy'])
]
kinetics_reactions = [
    enzyme for enzyme, value in wckb_json.items()
    if _has_data(value['forward kinetics']) or _has_data(value['backward kinetics'])
]
thermodynamics_and_kinetics_export = set(thermodynamic_reactions).intersection(kinetics_reactions)
print('thermodynamics and kinetics:\t', len(thermodynamics_and_kinetics_export))
print('WCKB exported quantity:\t\t', len(wckb_json_enzymes))
print('WCKB exported thermodynamic quantity:\t', len(thermodynamic_reactions))
print('WCKB exported kinetic quantity:\t', len(kinetics_reactions))
        
        
# examine the scraped reactions of the WholeCellKB.org 
def _is_empty_cell(cell):
    """Return True when a scraped CSV cell holds no data.

    Missing cells read by pandas are NaN objects. NaN never compares equal to
    itself and such a cell is generally not the ``numpy.nan`` object stored in
    ``empty_cells``, so list membership alone reports it as populated;
    ``pandas.isna`` covers that case.
    """
    return bool(pandas.isna(cell)) or cell in empty_cells


# examine the scraped reactions of the WholeCellKB.org
wckb_scraped_enzymes = []
thermodynmic_reactions_scraped = []
kinetic_reactions_scraped = []
for index, row in wckb_scraped.iterrows():
    enzyme = row['Name']
    # only the first row of each enzyme name is evaluated
    if enzyme in wckb_scraped_enzymes:
        continue
    wckb_scraped_enzymes.append(enzyme)
    if not (_is_empty_cell(row['deltaG']) and _is_empty_cell(row['Keq'])):
        thermodynmic_reactions_scraped.append(enzyme)
    if not (_is_empty_cell(row['Forward kinetics']) and _is_empty_cell(row['Backward kinetics'])):
        kinetic_reactions_scraped.append(enzyme)
print('WCKB scraped quantity:\t\t', len(wckb_scraped_enzymes))
print('WCKB scraped thermodynamic quantity:\t', len(thermodynmic_reactions_scraped))
print('WCKB scraped kinetic quantity:\t', len(kinetic_reactions_scraped))
    
    
# Enzymes that carry thermodynamic data in only one of the two WCKB datasets
# (exported JSON versus scraped CSV), computed in both directions.
exported_thermodynamic_set = set(thermodynamic_reactions)
scraped_thermodynamic_set = set(thermodynmic_reactions_scraped)
thermodynamic_difference_scraped_export = scraped_thermodynamic_set.difference(exported_thermodynamic_set)
thermodynamic_difference_export_scraped = exported_thermodynamic_set.difference(scraped_thermodynamic_set)
# The string literal below is disabled diagnostic code; it is evaluated as a
# bare expression and discarded, so none of it runs.
'''print('thermodynamic_difference_export_scraped:\t', len(thermodynamic_difference_export_scraped))
print('\nthermodynamic_difference_scraped_export:\t', len(thermodynamic_difference_scraped_export))
for enzyme in thermodynamic_difference_scraped_export:
    print(enzyme)
for index, row in wckb_scraped.iterrows():
    enzyme = wckb_scraped.at[index, 'Name']
    if enzyme in thermodynamic_difference_scraped_export:
        deltag = wckb_scraped.at[index, 'deltaG']
            for enzyme2, value in wckb_json.items():
                if enzyme2 == enzyme:
                    value['Gibbs free energy'] = deltag
'''


# Enzymes that carry kinetic data in only one of the two WCKB datasets,
# computed in both directions.
exported_kinetic_set = set(kinetics_reactions)
scraped_kinetic_set = set(kinetic_reactions_scraped)
kinetic_difference_export_scraped = exported_kinetic_set.difference(scraped_kinetic_set)
kinetic_difference_scraped_export = scraped_kinetic_set.difference(exported_kinetic_set)
# disabled diagnostic code, kept as an unused string literal
'''print('\nkinetic_difference_export_scraped:\t', len(kinetic_difference_export_scraped))
for enzyme in kinetic_difference_export_scraped:
    print(enzyme)
print('kinetic_difference_scraped_export:\t', len(kinetic_difference_scraped_export))
for enzyme in kinetic_difference_scraped_export:
    print(enzyme)'''
    

# Compare the enzyme names of the scraped and exported datasets and test whether
# the names found in only one dataset differ solely by commas.
scraped_minus_export = set(wckb_scraped_enzymes) - set(wckb_json_enzymes)
export_minus_scraped = set(wckb_json_enzymes) - set(wckb_scraped_enzymes)

# Export-only names with their commas removed. The cleaned name is stored; the
# uncleaned name was stored previously, which made the intersection below a
# comparison of two disjoint sets that could only ever be empty.
comma_removed_from_export = [
    enzyme2.replace(',', '') for enzyme2 in export_minus_scraped
    if enzyme2 not in empty_cells
]

# scraped-only names that equal an export-only name once its commas are removed
noncomma_differences = scraped_minus_export.intersection(comma_removed_from_export)
print('\nnoncomma_differences:\t\t', len(noncomma_differences))

# scraped-only names that match a comma-stripped export-only name
stripped_export_names = {enzyme2.replace(',', '') for enzyme2 in export_minus_scraped}
comma_removed_duplicates_from_export = [
    enzyme for enzyme in scraped_minus_export if enzyme in stripped_export_names
]

# export-only names that match a comma-stripped scraped-only name
comma_removed_duplicates_from_scraped = []
for enzyme in scraped_minus_export:
    # a missing name read from the CSV is a NaN float, which has no commas to strip
    if enzyme in empty_cells or not isinstance(enzyme, str):
        continue
    cleaned_enzyme = enzyme.replace(',', '')
    if cleaned_enzyme in export_minus_scraped and cleaned_enzyme not in comma_removed_duplicates_from_scraped:
        comma_removed_duplicates_from_scraped.append(cleaned_enzyme)

comma_removed_duplicates = set(comma_removed_duplicates_from_scraped).union(comma_removed_duplicates_from_export)
print('comma_removed_duplicates:\t',len(comma_removed_duplicates))
print('comma_removed_duplicates_from_export:\t', len(comma_removed_duplicates_from_export))
print('comma_removed_duplicates_from_scraped:\t', len(comma_removed_duplicates_from_scraped))



# Compare the enzymes of the Karr et al. Excel-derived data with the exported
# WCKB JSON: report the Excel enzymes that are absent from the JSON, and collect
# the enzymes for which both sources carry forward or backward kinetic data.
# NOTE(review): the Excel names have their commas removed while the JSON names
# are compared as-is -- confirm that JSON names containing commas are meant to
# go unmatched.
all_karr_enzymes = list(dict.fromkeys(wckb_json_enzymes))  # order-preserving de-duplication
not_in_json_files = []
missing_thermodynamic_enzymes = []
missing_kinetic_enzymes = []

# (key of the JSON record, key of the Excel record) for each kinetic direction
kinetic_key_pairs = (
    ('forward kinetics', 'Forward Km'),
    ('backward kinetics', 'Backward Km'),
)

for excel_record in karr_excel_reactions.values():
    excel_name = re.sub(',', '', excel_record['Enzyme Name'])
    if excel_name not in all_karr_enzymes:
        print(excel_name)
        not_in_json_files.append(excel_name)

    if excel_name not in wckb_json:
        continue
    json_record = wckb_json[excel_name]
    for json_key, excel_key in kinetic_key_pairs:
        for kinetics in json_record[json_key]:
            # the Excel value is only read when the JSON kinetics entry is populated
            if kinetics in empty_cells or excel_record[excel_key] in empty_cells:
                continue
            if excel_name not in missing_kinetic_enzymes:
                missing_kinetic_enzymes.append(excel_name)

print('not_in_json_files:\t\t', len(not_in_json_files))
print('missing_kinetic_enzymes:\t', len(missing_kinetic_enzymes))


# the excel file dataset was determined 
#print('Karr total quantity:\t', len(karr_excel_reactions_enzymes))   
#print('Karr total thermodynamic quantity:\t', len())
#print('Karr total kinetic quantity:\t', len())

#print('Scraped minus export data:\t', len(scraped_minus_export.union(export_minus_scraped)))
#print('\n\n\n\n')

'''for enzyme in scraped_minus_export.union(export_minus_scraped):
      print(enzyme)'''

## format-data.py

The exported JSON files from the WCKB were reformatted and saved as CSV files.

In [None]:
# -*- coding: utf-8 -*-
"""
@authors: Ethan Chan, Matthew Freiburger
"""

# import libraries
import csv
import json
import string  # String for removing non-encodable characters

import pandas as pd

# Import scraped data
# Each JSON file under ./data is read into its own pandas object. The sections
# visible below iterate the "data" column of the reactions, metabolites and
# compartments imports; the remaining imports are not used in those sections.
reactions_import = pd.read_json("./data/reactions.json")
metabolites_import = pd.read_json("./data/metabolites.json")
processes_import = pd.read_json("./data/processes.json")
stimuli_import = pd.read_json("./data/stimuli.json")
states_import = pd.read_json("./data/states.json")
compartments_import = pd.read_json("./data/compartments.json")
protein_monomers_import = pd.read_json("./data/protein_monomers.json")
protein_complexes_import = pd.read_json("./data/protein_complexes.json")


#==========================================================================================================================

# create the output file
# Columns of the reactions CSV; every row written below has exactly these
# fields, in this order. The header used to be a two-line string with embedded
# indentation, and the rows used to carry 16 fields for these 13 columns.
# NOTE(review): "optimal_tempature" is misspelled; it is kept unchanged as the
# declared column name -- rename it only together with any consumer of the file.
reaction_columns = [
    "name", "enzyme", "enzyme_compartment", "delta_g", "direction", "keq",
    "kinetics_backward_rate_law", "kinetics_forward_rate_law", "model",
    "optimal_tempature", "pathways", "processes", "stoichiometry",
]
printable = set(string.printable)


def _clean_field(value, strip_commas=False):
    """Return ``value`` as printable text, with "None" standing in for a missing value.

    The value is converted with ``str`` first, so numeric fields no longer
    fail on ``.replace``. Characters outside ``string.printable`` are dropped.
    """
    text = "None" if value is None else str(value)
    if strip_commas:
        text = text.replace(",", "")
    return "".join(character for character in text if character in printable)


def _nested(record, *keys):
    """Follow ``keys`` into ``record``; return None as soon as a level is None."""
    for key in keys:
        if record is None:
            return None
        record = record[key]
    return record


def _reaction_row(data):
    """Build the CSV fields of one reaction record, ordered like ``reaction_columns``."""
    # coefficient:compartment:molecule triplets separated by ";"
    stoichiometry = ";".join(
        ":".join(str(molecule[key]) for key in ("coefficient", "compartment", "molecule"))
        for molecule in data["stoichiometry"]
    ) or None
    pathways = data["pathways"]

    return [
        _clean_field(data["name"], strip_commas=True),
        _clean_field(_nested(data["enzyme"], "protein")),
        _clean_field(_nested(data["enzyme"], "compartment"), strip_commas=True),
        _clean_field(data["delta_g"], strip_commas=True),
        _clean_field(data["direction"], strip_commas=True),
        _clean_field(_nested(data["keq"], "value"), strip_commas=True),
        # NOTE(review): the keys "forward kinetics"/"backward kinetics" and the
        # evidence -> value path are taken over unchanged -- confirm them
        # against the schema of ./data/reactions.json.
        _clean_field(_nested(data["backward kinetics"], "evidence", "value")),
        _clean_field(_nested(data["forward kinetics"], "evidence", "value")),
        # NOTE(review): the row builder referenced an undefined ``model_string``;
        # a "model" key is assumed here and yields "None" when it is absent.
        _clean_field(data.get("model")),
        _clean_field(_nested(data["optimal_temperature"], "value")),
        # only the first pathway is exported
        _clean_field(pathways[0] if pathways else None),
        _clean_field(data["processes"]),
        _clean_field(stoichiometry),
    ]


# Create the output file. The context manager closes it even when a record
# fails, and csv.writer quotes any field that still contains a delimiter.
with open("./data/reactions.csv", "w", newline="") as reactions_out:
    reactions_writer = csv.writer(reactions_out, lineterminator="\n")
    reactions_writer.writerow(reaction_columns)

    # Loop through all imported reactions
    for data in reactions_import["data"]:
        reactions_writer.writerow(_reaction_row(data))

#==========================================================================================================================

#Open files to write results to
# Write the metabolites to a CSV file. The context manager closes the file even
# when a record fails, and csv.writer quotes any field that contains a delimiter.
with open("./data/molecules.csv", "w", newline="") as metabolites_out:
    metabolites_writer = csv.writer(metabolites_out, lineterminator="\n")

    # Write headers to file (without the trailing comma that declared an
    # unnamed seventh column)
    metabolites_writer.writerow(["WID", "charge", "delta_g", "pi", "logp", "logd"])

    # Loop through all metabolites imported
    for df in metabolites_import["data"]:
        # One field per header column. The rows used to contain three extra
        # empty fields, which shifted every value after the WID out of its column.
        metabolites_writer.writerow([
            str(df["wid"]),
            str(df["charge"]),
            str(df["deltag_formation"]),
            str(df["pi"]),
            str(df["log_p"]),
            str(df["log_d"]),
        ])

#==========================================================================================================================

#Open files to write results to
# Write the compartments to a CSV file. The context manager closes the file even
# when a record fails, and csv.writer quotes any field that contains a delimiter.
with open("./data/compartments.csv", "w", newline="") as compartments_out:
    compartments_writer = csv.writer(compartments_out, lineterminator="\n")

    # Write headers to file
    compartments_writer.writerow(["WID", "name", "protein_monomers", "biomass_compositions"])

    # Loop through all compartments imported
    for df in compartments_import["data"]:

        # protein monomers separated by ":"; "None" when the compartment has none
        protein_monomers = df["protein_monomers"]
        if protein_monomers:
            protein_monomers_string = ":".join(str(protein) for protein in protein_monomers)
        else:
            protein_monomers_string = "None"

        # concentration:compartment:metabolites triplets separated by ";".
        # Each part is converted with str, so numeric concentrations no longer
        # raise a TypeError on concatenation.
        biomass_compositions = df["biomass_compositions"]
        if biomass_compositions:
            biomass_compositions_string = ";".join(
                ":".join(str(molecules[key]) for key in ("concentration", "compartment", "metabolites"))
                for molecules in biomass_compositions
            )
        else:
            biomass_compositions_string = "None"

        # str() renders a missing WID or name as "None"
        compartments_writer.writerow([
            str(df["wid"]),
            str(df["name"]),
            protein_monomers_string,
            biomass_compositions_string,
        ])
