# Contrasting the TECR files

In [19]:
# import the requisite content
from numpy import nan
import pandas
import re

winter_2021 = pandas.read_csv('2021-08-04_vetted & reorganized NIST database_01.csv')
winter_2021 = winter_2021.fillna(' ')
equilibrator_2008 = pandas.read_csv('TECRDB_Elad_Noor.csv')
equilibrator_2008 = equilibrator_2008.fillna(' ')
master_file = pandas.read_csv('2021-08-12_master_TECR_3.csv')
master_file = master_file.fillna(' ')


# define the printing function
def set_contrast(data_description, master_set, set_1, set_1_description, set_2, set_2_description, verbose_printing = False, total_values = True):
    # print the original sets
    if total_values:
        print('\n{} in the master file: '.format(data_description), len(master_set))
        print('{} in the {} file: '.format(data_description, set_1_description), len(set_1))
        print('{} in the {} file: '.format(data_description, set_2_description), len(set_2))
    
    # contrast the sets
    extra_set_1 = master_set - set_1
    missing_set_1 = set_1 - master_set
    extra_set_2 = master_set - set_2
    missing_set_2 = set_2 - master_set
    if verbose_printing:
        print('\nExtra {} in the master file, versus {}: '.format(data_description, set_1_description), len(extra_set_1), '\n', extra_set_1)
        print('\nExtra {} in the master file, versus {}: '.format(data_description, set_2_description), len(extra_set_2), '\n', extra_set_2)
        print('\nMissing {} in the master file, versus {}: '.format(data_description, set_1_description), len(missing_set_1), '\n', missing_set_1)
        print('\nMissing {} in the master file, versus {}: '.format(data_description, set_2_description), len(missing_set_2), '\n', missing_set_2)
    else:
        print('\nExtra {} in the master file, versus {}: '.format(data_description, set_1_description), len(extra_set_1))
        print('Extra {} in the master file, versus {}: '.format(data_description, set_2_description), len(extra_set_2))
        print('Missing {} in the master file, versus {}: '.format(data_description, set_1_description), len(missing_set_1))
        print('Missing {} in the master file, versus {}: '.format(data_description, set_2_description), len(missing_set_2))
        
    return missing_set_1, missing_set_2

# define the series of enzymes
winter_enzyme_names = set(winter_2021['Enzyme:'])
striped_enzymes = set()
for enzyme in winter_enzyme_names:
    enzyme = enzyme.strip()
    striped_enzymes.add(enzyme)
    
equilibrator_enzyme_names = set(equilibrator_2008['enzyme_name'])  
master_enzyme_names = set(master_file['Enzyme:'])

# contrast the enzymes
set_contrast('enzymes', master_enzyme_names, striped_enzymes, 'Winter 2021', equilibrator_enzyme_names, 'eQuilibrator 2008', verbose_printing = True)

# define the series of references
winter_references = set(winter_2021['Reference ID:'])
striped_references = set()
for reference in winter_references:
    substituted_reference = re.sub('_.+', '', reference)
    striped_references.add(substituted_reference)
    
equilibrator_references = set(equilibrator_2008['reference'])
master_references = set(master_file['Reference ID:'])
master_striped_references = set()
for reference in master_references:
    if reference not in [' ']:
        substituted_reference = re.sub('_.+', '', reference)
        master_striped_references.add(substituted_reference)

# contrast the references
set_contrast('references', master_striped_references, striped_references, 'Winter 2021', equilibrator_references, 'eQuilibrator 2008')


enzymes in the master file:  492
enzymes in the Winter 2021 file:  425
enzymes in the eQuilibrator 2008 file:  431

Extra enzymes in the master file, versus Winter 2021:  69 
 {'(2-aminoethyl)phosphonate-pyruvate transferase', 'UDP-N-acetylglucosamine acyltransferase', '3-Hexulose-6-Phosphate synthase', 'saccharopine dehydrogenase (NAD+, L-lysine-forming)', 'inosine triphosphate pyrophosphohydrolase', 'phosphoenolpyruvate mutase', '3-dehydroquinate synthase', 'isopentenyl-diphosphate-isomerase', 'phosphoribosylaminoimidazole carboxylase', 'dihydropyrimidinase', 'anandamide amidohydrolase', 'carnitine dehydratase', 'xanthine oxidase', 'NAD(P)+ transhydrogenase (B-specific)', 'glutathione reductase (NADPH)', 'lysine-tRNA ligase', '-glu-X carboxypeptidase', 'fructose-bisphosphate aldolase', 'nitrilase', '2-dehydro-3-deoxyheptonate aldolase', "5 '-methylthioadenosine phosphorylase", 'histidine t-RNA ligase', 'anthranilate synthase', 'trehalose-6-phosphate phosphorylase', '2,5-diketo-D-glu

# Brainstorming

In [None]:
import pandas
import numpy

old = pandas.read_csv('2021-03-21_vetted + reorganized NIST_1.csv')
old = old.fillna(' ')
#display(old)
new = pandas.read_csv('2021-05-06_vetted & reorganized NIST database_01.csv')
#display(new)

empty_cell = ['nan', 'NaN', 'none', 'not given', '', ' ', None, numpy.nan]

# match the indexes for the reference IDs
reference_ids = {}
old_ids = old['Reference ID:'].tolist()
new_ids = new['Reference ID:'].tolist()
for id in old_ids:
    if id in empty_cell:
        continue
        
    old_id = old_ids.index(id)
    new_id = new_ids.index(id)
    reference_ids[id] = {'Old index': old_id,
                         'New index': new_id}

for id in reference_ids:
    print('\n')
    problem = False
    valid = True
    old_index = reference_ids[id]['Old index']
    new_index = reference_ids[id]['New index']
    '''display(old.iloc[old_index])
    display(new.iloc[new_index])'''
    error_string = []
    print('Old Reference ID:', old.at[old_index, 'Reference ID:'])
    if old.at[old_index, 'Reference ID:'] != new.at[new_index, 'Reference ID:']:
        error_string.append('Reference ID consistency: False')
        
    while valid:
        '''if old.at[old_index, 'Enzyme'] != new.at[new_index, 'Enzyme:']:
            if old.at[old_index, 'Enzyme'] not in empty_cell:
                print('Old {}, index {}:'.format(old_index), old.at[old_index, 'Enzyme'])
                print('New {}, index {}:'.format(new_index), new.at[new_index, 'Enzyme:'])
                valid = False
        
        if old.at[old_index, 'Reaction'] != new.at[new_index, 'Reaction:']:
            if old.at[old_index, 'Reaction'] not in empty_cell:
                print('Old reaction, index {}:'.format(old_index), old.at[old_index, 'Reaction'])
                print('New reaction, index {}:'.format(new_index), new.at[new_index, 'Reaction:'])
                valid = False'''
        
        if old.at[old_index, 'T [K]'] != new.at[new_index, 'T [K]']:
            if old.at[old_index, 'T [K]'] not in empty_cell:
                error_string.extend(['Old temperature, index {}: {}'.format(old_index, old.at[old_index, 'T [K]']),
                                    'New temperature, index {}: {}'.format(new_index, new.at[new_index, 'T [K]'])]
                                 )
                
                #valid = False
                problem = True
        
        if old.at[old_index, 'Keq'] != new.at[new_index, 'Keq']:
            if old.at[old_index, 'Keq'] not in empty_cell:
                error_string.extend(['Old Keq, index {}: {}'.format(old_index, old.at[old_index, 'Keq']),
                                    'New Keq, index {}: {}'.format(new_index, new.at[new_index, 'Keq'])]
                                 )
                
                #valid = False
                problem = True
        
        if old.at[old_index, 'Enthalpy [kJ / mol]'] != new.at[new_index, 'Enthalpy [kJ / mol]']:
            if old.at[old_index, 'Enthalpy [kJ / mol]'] not in empty_cell:
                error_string.extend(['Old enthalpy, index {}: {}'.format(old_index, old.at[old_index, 'Enthalpy [kJ / mol]']),
                                    'New enthalpy, index {}: {}'.format(new_index, new.at[new_index, 'Enthalpy [kJ / mol]'])]
                                 )

                #valid = False
                problem = True
        
        if old.at[old_index, 'Km'] != new.at[new_index, 'Km']:
            if old.at[old_index, 'Km'] not in empty_cell:
                error_string.extend(['Old Km, index {}: {}'.format(old_index, old.at[old_index, 'Km']),
                                    'New Km, index {}: {}'.format(new_index, new.at[new_index, 'Km'])]
                                 )

                #valid = False
                problem = True
            
        if old.at[old_index, 'pH '] != new.at[new_index, 'pH ']:
            if old.at[old_index, 'pH '] not in empty_cell:
                error_string.extend(['Old pH, index {}: {}'.format(old_index, old.at[old_index, 'pH ']),
                                    'New pH, index {}: {}'.format(new_index, new.at[new_index, 'pH '])]
                                 )

                #valid = False
                problem = True
        
        
        if valid:
            old_index += 1
            new_index += 1
            
        if old.at[old_index, 'Reference ID:'] != (id or ' '):
            valid = False
            if not problem:
                print('{} is equivalent.'.format(id))
        
    if problem:
        print('ERROR: {}\n'.format(id), '=' * len('{} ERROR'.format(id)))
        print('\n'.join(error_string))
        
    '''for index, row in old.iterrows():
        if id == old.at[index, row['Reference ID:']]: 
            for index2, row2 in new.iterrows():
                while old.at[index, row] == new.at[index2, row2]:'''            

In [1]:
import pandas

# import Elad's spreadsheet
elad_file = pandas.read_csv('TECRDB_Elad_Noor.csv')
display(elad_file.drop(elad_file.columns[[0, 1, 2]], axis = 1))

length = [1 for datum in range(len(elad_file))]
print(len(length))

Unnamed: 0,eval,EC,enzyme_name,reaction,description,K,K_prime,temperature,ionic_strength,p_h,p_mg
0,A,1.1.1.87,homoisocitrate dehydrogenase,kegg:C05662 + kegg:C00003 = kegg:C00322 + kegg...,"(1R,2S)-1-hydroxybutane-1,2,4-tricarboxylate(a...",,0.450,298.15,,7.5,
1,C,3.5.4.9,methenyltetrahydrofolate cyclohydrolase,kegg:C00445 + kegg:C00001 = kegg:C00234,"5,10-methenyltetrahydrofolate(aq) + H2O(l) = 1...",,4.200,298.15,,6.5,
2,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,18.000,310.15,,7.3,2.96
3,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,25.000,310.15,,7.3,2.80
4,B,4.2.1.3,aconitate hydratase,kegg:C00311 = kegg:C00158,isocitrate(aq) = citrate(aq),,33.000,310.15,,7.3,2.30
...,...,...,...,...,...,...,...,...,...,...,...
4539,E,4.1.1.21,phosphoribosylaminoimidazole carboxylase,kegg:C03373 + kegg:C00288 = kegg:C04751 + kegg...,Aminoimidazole ribotide + CO2 = 1-(5-Phospho-D...,,1.800,310.15,,7.8,
4540,E,2.1.2.3,phosphoribosylaminoimidazolecarboxamide formyl...,kegg:C00234 + kegg:C04677 = kegg:C00101 + kegg...,10-Formyltetrahydrofolate + 1-(5'-Phosphoribos...,,0.024,298.15,,7.5,
4541,E,4.3.-.-,formaldehyde condensation with THF,kegg:C00101 + kegg:C00067 = kegg:C00143,"THF(aq) + formaldehyde(aq) = 5,10-CH2-THF(aq)",,7700.000,293.15,,7.2,
4542,E,4.1.2.43,3-Hexulose-6-Phosphate synthase,kegg:C00199 + kegg:C00067 = kegg:C06019,D-Ribulose 5-phosphate + Formaldehyde = D-arab...,,25000.000,303.15,,7.0,


4544
