# Consolidating the NIST Keq data 

The Keq values of the NIST CSV file are condensed into a JSON file. In the JSON, the Keq values are complemented by the experimental temperatures and pH values, which guide the user in selecting the Keq value that is appropriate for the system being described. The values are listed alongside their references to facilitate verification of the source of each datum. The JSON can readily be imported into a Python script and used in calculations or model development.

In [1]:
#import libraries
import pandas
import numpy
import math
import json
import re



#column headers of the vetted NIST CSV, copied verbatim (the pH header carries a trailing space)
ENZYME_COLUMN = 'Enzyme'
REFERENCE_COLUMN = 'Reference:'
REACTION_COLUMN = 'Reaction'
KEQ_COLUMN = 'Keq'
KM_COLUMN = 'Km'
ENTHALPY_COLUMN = 'Enthalpy [kJ / mol]'
TEMPERATURE_COLUMN = 'T [K]'
PH_COLUMN = 'pH '

#placeholder texts that mark a cell as empty; cells are stripped and lower-cased before the lookup
EMPTY_TEXT = frozenset({'nan', 'none', 'not given', ''})

#first number in a cell: optional sign, digits with optional decimals (or a bare '.5'),
#and an optional exponent so that '1.5e-3' is not truncated to '1.5'
NUMBER_PATTERN = re.compile(r'-?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?')

#enzyme name: everything from the first word character to the end of the line
ENZYME_NAME_PATTERN = re.compile(r'\w.*')


def _is_empty(cell):
    """Return True when a CSV cell holds no data.

    Strings are compared against EMPTY_TEXT after stripping and lower-casing;
    every other value is empty when pandas regards it as missing (None, NaN).
    A membership test against a list containing numpy.nan is not used because
    NaN never compares equal to itself.
    """
    if isinstance(cell, str):
        return cell.strip().lower() in EMPTY_TEXT
    return bool(pandas.isna(cell))


def _parse_number(cell):
    """Return the first number found in a cell as a float, or None.

    None is returned both for empty cells and for text without any number,
    so the caller records such a cell as "no value" instead of failing.
    """
    if _is_empty(cell):
        return None
    if isinstance(cell, (int, float)) and not isinstance(cell, bool):
        return float(cell)
    found = NUMBER_PATTERN.search(str(cell))
    return float(found.group()) if found else None


def _clean_enzyme_name(label):
    """Return the enzyme label without leading non-word characters.

    None is returned for empty cells and for labels without a word character;
    the caller treats those rows as unlabelled.
    """
    if _is_empty(label):
        return None
    found = ENZYME_NAME_PATTERN.search(str(label))
    return found.group() if found else None


def _mean_and_std(values):
    """Return (mean, population standard deviation), or ('nan', 'nan') for an empty list."""
    if not values:
        return 'nan', 'nan'
    mean = sum(values) / len(values)
    variance = sum((value - mean) ** 2 for value in values) / len(values)
    return mean, math.sqrt(variance)


def _new_record():
    """Return the empty accumulator that gathers the rows of one enzyme."""
    return {'reactions': [], 'references': [], 'temperatures': [], 'phs': [],
            'keq values': [], 'km values': [], 'enthalpy values': [],
            'keq flags': [], 'km flags': [], 'enthalpy flags': []}


def _summarise(record):
    """Arrange one enzyme accumulator into the nested layout that is written to the JSON."""
    keq_average, keq_deviation = _mean_and_std(record['keq values'])
    km_average, km_deviation = _mean_and_std(record['km values'])
    enthalpy_average, enthalpy_deviation = _mean_and_std(record['enthalpy values'])
    return {'reaction': record['reactions'],
            'experimental temperatures': record['temperatures'],
            'experimental phs': record['phs'],
            'keq reference': record['references'],
            'Keq': {'keq values in the reference': record['keq flags'],
                    'keqs': record['keq values'],
                    'keq quantity': len(record['keq values']),
                    'keq average': keq_average,
                    'keq standard deviation': keq_deviation},
            'Km': {'km values in the reference': record['km flags'],
                   'km values': record['km values'],
                   'km average': km_average,
                   'km standard deviation': km_deviation},
            'Enthalpy': {'enthalpy values in the reference': record['enthalpy flags'],
                         'enthalpy values': record['enthalpy values'],
                         'enthalpy average': enthalpy_average,
                         'enthalpy standard deviation': enthalpy_deviation}}


def consolidate_enzyme_data(dataframe):
    """Condense the rows of the NIST table into one nested dictionary per enzyme.

    The table is read once from top to bottom. A row with an enzyme label
    contributes its reference and reaction to that enzyme; the row itself and
    the unlabelled rows that follow it contribute their Keq, Km and enthalpy
    values. Rows that precede the first labelled row are skipped. The
    temperature and pH of a row are stored once whenever the row provides at
    least one of the three quantities, and a missing pH is stored as 'nan'.

    Parameters:
        dataframe: pandas DataFrame holding the columns named by the
            *_COLUMN constants.

    Returns:
        dict keyed by cleaned enzyme name, in order of first appearance, with
        the layout that is exported to the JSON file.

    Raises:
        KeyError: when one of the expected columns is absent.
    """
    column_names = (ENZYME_COLUMN, REFERENCE_COLUMN, REACTION_COLUMN, KEQ_COLUMN,
                    KM_COLUMN, ENTHALPY_COLUMN, TEMPERATURE_COLUMN, PH_COLUMN)
    #tolist() yields plain Python scalars, which keeps the result JSON serialisable
    columns = [dataframe[name].tolist() for name in column_names]

    records = {}
    current = None
    for label, reference, reaction, keq_cell, km_cell, enthalpy_cell, temperature, ph in zip(*columns):
        name = _clean_enzyme_name(label)
        if name is not None:
            #a labelled row opens (or re-opens) the record of its enzyme
            current = records.setdefault(name, _new_record())
            current['references'].append(reference)
            current['reactions'].append(reaction)
        if current is None:
            continue

        parsed = (('keq', _parse_number(keq_cell)),
                  ('km', _parse_number(km_cell)),
                  ('enthalpy', _parse_number(enthalpy_cell)))
        for quantity, number in parsed:
            if number is None:
                current[quantity + ' flags'].append('False')
            else:
                current[quantity + ' flags'].append('True')
                current[quantity + ' values'].append(number)

        #the conditions of a row are stored once, however many quantities the row reports
        if any(number is not None for _, number in parsed):
            current['temperatures'].append(temperature)
            current['phs'].append('nan' if _is_empty(ph) else ph)

    return {name: _summarise(record) for name, record in records.items()}


#the file access is guarded so that the functions above can be imported without touching the disk
if __name__ == '__main__':
    #import the final CSV file
    final_csv = pandas.read_csv('2021-03-21_vetted + reorganized NIST_1.csv')

    #consolidate the data of every enzyme
    data_per_enzyme = consolidate_enzyme_data(final_csv)

    #export the dictionary as a JSON file
    with open('2021-03-29_APF_NIST consolidated_01.json', 'w') as output:
        json.dump(data_per_enzyme, output, indent = 5)