# Consolidating the NIST Keq data 

The Keqs of the NIST CSV file are condensed into a JSON file. The Keq values are complemented in the JSON with the experimental temperatures and pH values that will guide user selection of the appropriate Keq value to describe the system. The values are respectively listed with the references to facilitate verification of the reference source for each provided datum. The JSON may readily be imported to a Python scripted and used in calculations or model development.

In [4]:
#import libraries
import pandas
import numpy
import math
import json
import re



#import the final CSV file
final_csv = pandas.read_csv('2021-03-21_vetted + reorganized NIST_1.csv')


#acquire a list of all enzymes
enzyme_list = []
empty_cell = ['nan', 'NaN', 'none', 'not given', '', ' ', None, numpy.nan]
for index, row in final_csv.iterrows():
    if final_csv.at[index, 'Enzyme'] not in enzyme_list and final_csv.at[index, 'Enzyme'] not in empty_cell:
        enzyme_list.append(final_csv.at[index, 'Enzyme'])      
        
enzymes = []
for original_enzyme in enzyme_list:
    enzyme_name = re.search('(\w.*)',original_enzyme)
    enzymes.append(enzyme_name.group())

data_per_enzyme = {}
for enzyme in enzymes:
    # lists of the database varialbes
    keq_values_per_enzyme = []
    km_values_per_enzyme = []
    enthalpy_values_per_enzyme = []
    temperatures_per_enzyme = []
    phs_per_enzyme = []
    
    # lists of identifying whether the reference contains the identified variable 
    references_of_an_enzyme = []
    km_in_the_reference = []
    enthalpy_in_the_reference = []
    keqs_in_a_reference = []
    for index, row in final_csv.iterrows():
        iteration = 0
        if final_csv.at[index, 'Enzyme'] == ' %s' %(enzyme):
            references_of_an_enzyme.append(final_csv.at[index, 'Reference:'])
            
            # clean keqs are added to a list
            if final_csv.at[index, 'Keq'] not in empty_cell:
                #print(final_csv.at[index, 'Keq'])
                cleaned_keq = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Keq']))
                #print(cleaned_keq.group())
                keq_values_per_enzyme.append(float(cleaned_keq.group())) 
                keqs_in_a_reference.append('True')
                temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                if final_csv.at[index, 'pH '] not in empty_cell:
                    phs_per_enzyme.append(final_csv.at[index, 'pH '])
                elif final_csv.at[index, 'pH '] in empty_cell:
                    phs_per_enzyme.append('nan')
            
            elif final_csv.at[index, 'Keq'] in empty_cell:
                keqs_in_a_reference.append('False')    
                
            # clean kms are added to a list
            if final_csv.at[index, 'Km'] not in empty_cell:
                #print(final_csv.at[index, 'Km\'])
                cleaned_km = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Km']))
                #print(cleaned_km.group())
                km_values_per_enzyme.append(float(cleaned_km.group())) 
                if final_csv.at[index, 'Keq'] in empty_cell:
                    temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                    if final_csv.at[index, 'pH '] not in empty_cell:
                        phs_per_enzyme.append(final_csv.at[index, 'pH '])
                    elif final_csv.at[index, 'pH '] in empty_cell:
                        phs_per_enzyme.append('nan')
                    
            elif final_csv.at[index, 'Km'] in empty_cell:
                km_in_the_reference.append('False')
                
            # clean enthalpy values are added to a list
            if final_csv.at[index, 'Enthalpy [kJ / mol]'] not in empty_cell:
                #print(final_csv.at[index, 'Km\'])
                cleaned_enthalpy = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Enthalpy [kJ / mol]']))
                #print(cleaned_km.group())
                enthalpy_in_a_reference.append('True')
                enthalpy_values_per_enzyme.append(float(cleaned_enthalpy.group())) 
                if final_csv.at[index, 'Keq'] in empty_cell and final_csv.at[index, 'Km'] in empty_cell:
                    temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                    if final_csv.at[index, 'pH '] not in empty_cell:
                        phs_per_enzyme.append(final_csv.at[index, 'pH '])
                    elif final_csv.at[index, 'pH '] in empty_cell:
                        phs_per_enzyme.append('nan')
                    
            elif final_csv.at[index, 'Enthalpy [kJ / mol]'] in empty_cell:
                enthalpy_in_the_reference.append('False')
                
                
            #loop through the unlabeled rows of each enzyme
            while final_csv.at[index + iteration, 'Enzyme'] in empty_cell:
                if final_csv.at[index, 'Keq'] not in empty_cell:
                    #clean keqs are added to a list
                    cleaned_keq = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Keq']))
                    #print(cleaned_keq)
                    keq_values_per_enzyme.append(float(cleaned_keq.group())) 
                    keqs_in_a_reference.append('True')
                    temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                    if final_csv.at[index, 'pH '] not in empty_cell:
                        phs_per_enzyme.append(final_csv.at[index, 'pH '])
                    elif final_csv.at[index, 'pH '] in empty_cell:
                        phs_per_enzyme.append('nan')

                elif final_csv.at[index, 'Keq'] in empty_cell:
                    keqs_in_a_reference.append('False')  
                    
                #clean kms are added to a list
                if final_csv.at[index, 'Km'] not in empty_cell:
                    #print(final_csv.at[index, 'Km\'])
                    cleaned_km = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Km']))
                    #print(cleaned_km.group())
                    km_in_a_reference.append('True')
                    km_values_per_enzyme.append(float(cleaned_km.group())) 
                    if final_csv.at[index, 'Keq'] in empty_cell:
                        temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                        if final_csv.at[index, 'pH '] not in empty_cell:
                            phs_per_enzyme.append(final_csv.at[index, 'pH '])
                        elif final_csv.at[index, 'pH '] in empty_cell:
                            phs_per_enzyme.append('nan')

                elif final_csv.at[index, 'Km'] in empty_cell:
                    km_in_the_reference.append('False')
                    
                # clean enthalpy values are added to a list
                if final_csv.at[index, 'Enthalpy [kJ / mol]'] not in empty_cell:
                    #print(final_csv.at[index, 'Km\'])
                    cleaned_ethalpy = re.search('(\-?\d+\.?\d*)', '%s' %(final_csv.at[index, 'Enthalpy [kJ / mol]']))
                    #print(cleaned_km.group())
                    enthalpy_in_a_reference.append('True')
                    enthalpy_values_per_enzyme.append(float(cleaned_enthalpy.group())) 
                    if final_csv.at[index, 'Keq'] in empty_cell and final_csv.at[index, 'Km'] in empty_cell:
                        temperatures_per_enzyme.append(final_csv.at[index, 'T [K]'])
                        if final_csv.at[index, 'pH '] not in empty_cell:
                            phs_per_enzyme.append(final_csv.at[index, 'pH '])
                        elif final_csv.at[index, 'pH '] in empty_cell:
                            phs_per_enzyme.append('nan')

                elif final_csv.at[index, 'Enthalpy [kJ / mol]'] in empty_cell:
                    enthalpy_in_the_reference.append('False')
                    
                    
                #proceed to the next loop
                if iteration + index < 3979:
                    iteration += 1

    #processing the average and standard deviation Keq values
    if len(keq_values_per_enzyme) != 0:
        average_keq_per_enzyme = sum(keq_values_per_enzyme) / len(keq_values_per_enzyme)
        standard_deviation_keq_per_enzyme = math.sqrt(sum([(x - average_keq_per_enzyme)**2 for x in keq_values_per_enzyme]) / len(keq_values_per_enzyme))

    elif len(keq_values_per_enzyme) == 0:
        average_keq_per_enzyme = 'nan'
        standard_deviation_keq_per_enzyme = 'nan'
        
    #processing the average and standard deviation Km values
    if len(km_values_per_enzyme) != 0:
        average_km_per_enzyme = sum(km_values_per_enzyme) / len(km_values_per_enzyme)
        standard_deviation_km_per_enzyme = math.sqrt(sum([(x - average_km_per_enzyme)**2 for x in km_values_per_enzyme]) / len(km_values_per_enzyme))

    elif len(km_values_per_enzyme) == 0:
        average_km_per_enzyme = 'nan'
        standard_deviation_km_per_enzyme = 'nan' 
        
    #processing the average and standard deviation enthalpy values
    if len(enthalpy_values_per_enzyme) != 0:
        average_enthalpy_per_enzyme = sum(enthalpy_values_per_enzyme) / len(enthalpy_values_per_enzyme)
        standard_deviation_enthalpy_per_enzyme = math.sqrt(sum([(x - average_enthalpy_per_enzyme)**2 for x in enthalpy_values_per_enzyme]) / len(enthalpy_values_per_enzyme))

    elif len(enthalpy_values_per_enzyme) == 0:
        average_enthalpy_per_enzyme = 'nan'
        standard_deviation_enthalpy_per_enzyme = 'nan' 
        
        
    #store the information into a nested dictionary structure
    data_per_enzyme[enzyme] = {'experimental temperatures':temperatures_per_enzyme,
                               'experimental phs':phs_per_enzyme,
                               'keq reference':references_of_an_enzyme,
                               'Keq':{'keq values in the reference':keqs_in_a_reference,
                                       'keqs':keq_values_per_enzyme, 
                                       'keq quantity':len(keq_values_per_enzyme), 
                                       'keq average':average_keq_per_enzyme, 
                                       'keq standard deviation':standard_deviation_keq_per_enzyme},
                               'Km':{'km values in the reference':km_in_the_reference,
                                    'km values':km_values_per_enzyme,
                                    'km average':average_km_per_enzyme,
                                    'km standard deviation':standard_deviation_km_per_enzyme},
                              'Enthalpy':{'enthalpy values in the reference':enthalpy_in_the_reference,
                                         'enthalpy values':enthalpy_values_per_enzyme,
                                         'enthalpy average':average_enthalpy_per_enzyme,
                                         'enthalpy standard deviation':standard_deviation_enthalpy_per_enzyme}}


    '''    elif final_csv[index, 'Enzyme'] in empty_cell:
        cleaned_keq = re.search('(\d\.?\d*)', '%s' %(final_csv.at[index, 'Keq']))
        #print('yes')
        keq_values_per_enzyme.append(float(cleaned_keq.group(1))) '''

#print(enzyme, data_per_enzyme[enzyme])


#export the dictionary as a JSON file
with open('2021-03-21_APF_NIST consolidated_01.json', 'w') as output:
    json.dump(data_per_enzyme, output, indent = 5)

NameError: name 'enthalpy_in_a_reference' is not defined

## Interpreting the Keq NIST data

The enzymatic information of the Keq data from the NIST database is investigated. The quantity and identity of enzymes from the NIST database that are also present in the core metabolism of the E. coli model data from the BiGG database is printed from the following code. The results suggest that ~1/3 enzymes of the E. coli model are described by the NIST database; thus, other resources must be used to establish a complete thermodynamic description of E. coli.

In [None]:
import json
import re

data = json.load(open('2021-03-04_APF_NIST consolidated.json'))
data2 = json.load(open('2021-03-04_APF_NIST consolidated_01.json'))
data3 = json.load(open('2021-03-21_APF_NIST consolidated_01.json'))


bigg_model_ecoli_ids = ['ACALD', 'ACALDt', 'ACKr','ACONTa','ACONTb','ACt2r','ADK1','AKGDH','AKGt2r','ALCD2x',
                        'ATPM','ATPS4r','BIOMASS_Ecoli_core_w_GAM','CO2t','CS','CYTBD','D_LACt2','ENO','ETOHt2r',
                        'EX_acald_e','EX_ac_e','EX_akg_e','EX_co2_e','EX_etoh_e','EX_for_e','EX_fru_e',
                        'EX_fum_e','EX_glc__D_e','EX_gln__L_e','EX_glu__L_e','EX_h2o_e','EX_h_e','EX_lac__D_e',
                        'EX_mal__L_e','EX_nh4_e','EX_o2_e','EX_pi_e','EX_pyr_e','EX_succ_e','FBA','FBP',
                        'FORt','FORt2','FRD7','FRUpts2','FUM','FUMt2_2','G6PDH2r','GAPD','GLCpts',
                        'GLNabc','GLNS','GLUDy','GLUN','GLUSy','GLUt2r','GND','H2Ot','ICDHyr','ICL',
                        'LDH_D','MALS','MALt2_2','MDH','ME1','ME2','NADH16','NADTRHD','NH4t','O2t','PDH',
                        'PFK','PFL','PGI','PGK','PGL','PGM','PIt2r','PPC','PPCK','PPS','PTAr','PYK','PYRt2','RPE',
                        'RPI','SUCCt2_2','SUCCt3','SUCDi','SUCOAS','TALA','THD2','TKT1','TKT2','TPI']



bigg_model_ecoli_names = ['Acetaldehyde dehydrogenase (acetylating)', 'Acetaldehyde reversible transport',
                         'Acetate kinase','Aconitase (half-reaction A, Citrate hydro-lyase)',
                         'Aconitase (half-reaction B, Isocitrate hydro-lyase)',
                         'Acetate reversible transport via proton symport', 'Adenylate kinase',
                         '2-Oxogluterate dehydrogenase', '2 oxoglutarate reversible transport via symport',
                         'Alcohol dehydrogenase (ethanol)', 'ATP maintenance requirement', 
                         'ATP synthase (four protons for one ATP)', 'Biomass Objective Function with GAM',
                         'CO2 transporter via diffusion', 'Citrate synthase', 
                         'Cytochrome oxidase bd (ubiquinol-8: 2 protons)','D lactate transport via proton symport',
                         'Enolase','Ethanol reversible transport via proton symport','Acetaldehyde exchange',
                         'Acetate exchange', '2-Oxoglutarate exchange', 'CO2 exchange', 'Ethanol exchange',
                         'Formate exchange', 'D-Fructose exchange', 'Fumarate exchange', 'D-Glucose exchange',
                         'L-Glutamine exchange', 'L-Glutamate exchange', 'H2O exchange','H+ exchange','D-lactate exchange',
                         'L-Malate exchange','Ammonia exchange','O2 exchange','Phosphate exchange','Pyruvate exchange',
                         'Succinate exchange','Fructose-bisphosphate aldolase','Fructose-bisphosphatase',
                         'Formate transport via diffusion','Formate transport in via proton symport','Fumarate reductase',
                         'Fructose transport via PEP:Pyr PTS (f6p generating)','Fumarase',
                         'Fumarate transport via proton symport (2 H)','Glucose 6-phosphate dehydrogenase',
                         'Glyceraldehyde-3-phosphate dehydrogenase','D-glucose transport via PEP:Pyr PTS',
                         'L-glutamine transport via ABC system','Glutamine synthetase','Glutamate dehydrogenase (NADP)',
                         'Glutaminase','Glutamate synthase (NADPH)','L glutamate transport via proton symport reversible',
                         'Phosphogluconate dehydrogenase', 'H2O transport via diffusion','Isocitrate dehydrogenase (NADP)',
                         'Isocitrate lyase','D-lactate dehydrogenase','Malate synthase',
                         'Malate transport via proton symport (2 H)','Malate dehydrogenase','Malic enzyme (NAD)',
                         'Malic enzyme (NADP)','NADH dehydrogenase (ubiquinone-8 & 3 protons)',
                         'NAD transhydrogenase','Ammonia reversible transport','O2 transport diffusion',
                         'Pyruvate dehydrogenase','Phosphofructokinase','Pyruvate formate lyase',
                         'Glucose-6-phosphate isomerase','Phosphoglycerate kinase','6-phosphogluconolactonase',
                         'Phosphoglycerate mutase','Phosphate reversible transport via symport',
                         'Phosphoenolpyruvate carboxylase','Phosphoenolpyruvate carboxykinase',
                         'Phosphoenolpyruvate synthase','Phosphotransacetylase','Pyruvate kinase',
                         'Pyruvate transport in via proton symport','Ribulose 5-phosphate 3-epimerase',
                         'Ribose-5-phosphate isomerase','Succinate transport via proton symport (2 H)',
                         'Succinate transport out via proton antiport','Succinate dehydrogenase (irreversible)',
                         'Succinyl-CoA synthetase (ADP-forming)','Transaldolase','NAD(P) transhydrogenase',
                         'Transketolase','Transketolase','Triose-phosphate isomerase']



#print(data[' chorismate mutase'])
#print('2:')
#print(data2['chorismate mutase'])


#data2['arginase']


keys = []
for key in data3:
    keys.append(key)   

    
described_enzymes = []
for enzyme in bigg_model_ecoli_names:
    for key in keys:
        if re.search('%s'%(enzyme), key, flags=re.IGNORECASE):
            described_enzymes.append(enzyme)
    
kinetic_described_enyzymes = []
for enzyme in described_enzymes:
    if 
    
undescribed_enzymes = []
for enzyme in bigg_model_ecoli_names:
    if enzyme not in described_enzymes:
        undescribed_enzymes.append(enzyme)


print('Total enzymes:', len(bigg_model_ecoli_names))
print('Described enzymes:', len(described_enzymes))
print(described_enzymes)
print('Undescribed enzymes:', len(undescribed_enzymes))
print(undescribed_enzymes)







'''keys.sort()
for key in keys:
    print(key)'''
