# Import the scrapings

### Refining the Freiburger scraping

In [1]:
from numpy import unique, nan 
import re 

# Cell values that are treated as "no data" throughout the refinement.
empty = [nan, 'nan', None, ' ', '', 'NaN']

# Characters trimmed from both ends of every cell.
_STRIP_CHARS = ' ?~'

# First run of digits, dots, hyphens and pipes inside an EC cell.
# NOTE(review): '|' is kept in the character class so existing results do not
# change; it was probably intended as regex alternation -- confirm and drop.
_EC_PATTERN = re.compile(r'[\d|.\-]+')

# Columns that are copied from the last row that carried a Reference ID.
_REFERENCE_COLUMNS = ('Reference ID:', 'Reference:', 'Method:', 'Buffer:', 'EC Value:')

_SOLUTES = 'solutes [mol / dm^3]'
_IONIC_MOLAR = 'Ionic strength [mol / dm^3]'
_IONIC_MOLAL = 'Ionic strength [mol / kg]'


def _strip_cells(freiburger_file):
    """Replace every cell with its string form, trimmed of ' ', '?' and '~'."""
    for column in list(freiburger_file.columns):
        # Whole-column assignment avoids writing strings cell by cell into a
        # numeric column, which recent pandas versions refuse to upcast.
        freiburger_file[column] = [
            str(entry).strip(_STRIP_CHARS) for entry in freiburger_file[column]
        ]


def _propagate_reference_metadata(freiburger_file):
    """Copy reference metadata onto rows whose 'Reference ID:' is empty."""
    carried = None  # metadata of the most recent row that had a Reference ID
    for index in freiburger_file.index:
        if freiburger_file.at[index, 'Reference ID:'] not in empty:
            carried = {
                column: freiburger_file.at[index, column]
                for column in _REFERENCE_COLUMNS
            }
        elif carried is not None:
            for column, value in carried.items():
                freiburger_file.at[index, column] = value
        # Rows that precede the first Reference ID have nothing to inherit
        # and are left as they are.


def _clean_ec_values(freiburger_file):
    """Reduce each non-empty 'EC Value:' cell to its first EC-like token."""
    for index, master_ec in freiburger_file['EC Value:'].items():
        if master_ec in empty:
            continue
        match = _EC_PATTERN.search(master_ec)
        if match is not None:  # cells without any EC-like token are kept
            freiburger_file.at[index, 'EC Value:'] = match.group()


def _consolidate_solutes(freiburger_file):
    """De-duplicate ' & '-separated solutes and blank out '#NAME' cells."""
    for index, solute in freiburger_file[_SOLUTES].items():
        if solute in empty:
            continue
        solute_list = solute.split(' & ')
        if len(solute_list) > 1:
            solute_list = unique(solute_list)  # sorted, duplicates removed
            if len(solute_list) > 2:
                # Larger sets are only reported; the cell is left untouched.
                print('Large solute set {}'.format(solute_list))
            else:
                freiburger_file.at[index, _SOLUTES] = ' & '.join(solute_list)
        elif solute_list[0] == '#NAME':
            freiburger_file.at[index, _SOLUTES] = ''


def _add_index_columns(freiburger_file):
    """Insert the index and reaction columns that later steps fill in."""
    blanks = [''] * len(freiburger_file)
    freiburger_file.insert(0, 'noor_index', blanks)
    freiburger_file.insert(0, 'du_index', blanks)
    freiburger_file.insert(0, 'freiburger_index', freiburger_file.index)
    freiburger_file.insert(4, 'KEGG Reaction:', blanks)
    freiburger_file.insert(5, 'CID Reaction:', blanks)


def _reconcile_ionic_strength(freiburger_file):
    """Fold the mol/dm^3 ionic strength into the mol/kg column."""
    for index in freiburger_file.index:
        molar = freiburger_file.at[index, _IONIC_MOLAR]
        molal = freiburger_file.at[index, _IONIC_MOLAL]
        if molal in empty:
            freiburger_file.at[index, _IONIC_MOLAL] = molar
        elif molar not in empty and molar != molal:
            print(molar, '\t', molal)
            try:
                mean = (float(molal) + float(molar)) / 2
            except ValueError:
                # At least one value is not a plain number; keep mol/kg as is.
                continue
            # Stored as text like every other cell; 12 significant digits
            # hide binary floating-point noise such as 0.30000000000000004.
            freiburger_file.at[index, _IONIC_MOLAL] = f'{mean:.12g}'


def refine_freiburger(freiburger_file):
    """Clean the Freiburger scraping in place and return it.

    The steps run in this order:

    1. every cell is converted to a string and stripped of ' ', '?' and '~';
    2. rows without a 'Reference ID:' inherit the reference, method, buffer
       and EC value of the closest preceding row that has one;
    3. 'EC Value:' cells are reduced to their first EC-like token;
    4. duplicated solutes are merged and '#NAME' solutes are blanked;
    5. 'freiburger_index', 'du_index', 'noor_index', 'KEGG Reaction:' and
       'CID Reaction:' columns are inserted;
    6. an empty mol/kg ionic strength is taken from the mol/dm^3 column, and
       two differing values are printed and replaced by their mean.

    Parameters
    ----------
    freiburger_file : pandas.DataFrame
        Must contain the columns 'Reference ID:', 'Reference:', 'Method:',
        'Buffer:', 'EC Value:', 'solutes [mol / dm^3]',
        'Ionic strength [mol / dm^3]' and 'Ionic strength [mol / kg]'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place.

    Raises
    ------
    KeyError
        If a required column is missing.
    ValueError
        If one of the inserted columns already exists, e.g. when the
        function is applied twice to the same frame.
    """
    _strip_cells(freiburger_file)
    _propagate_reference_metadata(freiburger_file)
    _clean_ec_values(freiburger_file)
    _consolidate_solutes(freiburger_file)
    _add_index_columns(freiburger_file)
    _reconcile_ionic_strength(freiburger_file)
    return freiburger_file

### Import the CSV files

In [2]:
import pandas

# Load the three scrapings and the two manual-curation tables from the working directory.

# Noor scraping; every missing cell is replaced by a single space.
noor_dataframe = pandas.read_csv('TECRDB_Elad_Noor.csv').fillna(' ')
# Manual curation table that is handed to the noor merge in a later cell.
manual_noor_curation = pandas.read_csv('Manual curation of Noor et al., comma delimited.txt', header = 0)

# Du scraping, read from the 'Table S1. TECRDB Keqs' sheet; unlike the two CSV scrapings, missing cells are not filled here.
du_dataframe = pandas.read_excel('mmc2.xlsx', sheet_name = 'Table S1. TECRDB Keqs')
# NOTE(review): 'Reaction.1' is presumably the name pandas gives to a second 'Reaction' column in the sheet -- confirm against the workbook.
du_dataframe.rename(columns = {'Reaction.1': 'reaction_string'}, inplace = True)
# Manual curation table that is handed to the du merge in a later cell.
manual_du_curation = pandas.read_csv('Manual curation of Du et al., comma delimited.txt', header = 0)

# Freiburger scraping; missing cells become ' ' before refine_freiburger cleans the frame.
freiburger_dataframe = pandas.read_csv('2021-08-04_vetted & reorganized NIST database_01.csv').fillna(' ')
freiburger_dataframe = refine_freiburger(freiburger_dataframe)
# Last expression of the cell: displays the first rows of the refined frame.
freiburger_dataframe.head()

  warn(msg)


Unnamed: 0,freiburger_index,du_index,noor_index,Enzyme:,KEGG Reaction:,CID Reaction:,Reaction:,Reference:,Reference ID:,T [K],...,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
0,0,,,aspartate ammonia-lyase,,,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Quastel J.H.; Woolf B.; Biochem. J.; 20 545 (1...,26QUA/WOO_1205,310.15,...,,chemical analysis,phosphate,,4.3.1.1,,,,,
1,1,,,aspartate ammonia-lyase,,,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Woolf B.; Biochem. J.; 23 472 (1929).,29WOO_1206,310.15,...,,chemical analysis and polarimetry,phosphate,,4.3.1.1,,,,,
2,2,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,298.15,...,,electrochemistry,,,4.2.1.2,,,,,
3,3,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,298.15,...,,electrochemistry,,,4.2.1.2,,,,,
4,4,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Jacobsohn K.P.; Biochem. Z.; 274 167 (1934).,34JAC_1142,278.15,...,,polarimetry,barbital,,4.2.1.2,,,,,


# Merge the scrapings

In [3]:
%run ../../merging_datasets.py

# Two-stage merge: the noor scraping is merged into the Freiburger frame first,
# and the du scraping is then merged into the result of that first merge.
# NOTE(review): merge_package is presumably defined by the merging_datasets.py script run above -- confirm.

# merging the noor scraping
# Column names of the enzyme and reference fields in the noor scraping.
elad_enzymes = 'enzyme_name'
elad_references = 'reference'
scraping_name = 'noor'
mrgpkg = merge_package(freiburger_dataframe, noor_dataframe, scraping_name)
# mrgpkg.add_new(elad_enzymes, elad_references) #, export = True)
# mrgpkg.merge_existing()#export = True)
# mrgpkg.incorporate_manual_curation(manual_noor_curation, export = True)
# mrgpkg.confirm_merging()
# The intermediate result is kept in memory (export = False) and used as the master for the du merge.
noor_master_merge = mrgpkg.merge(elad_enzymes, elad_references, manual_noor_curation, export = False)

# merging the du scraping
# Column names of the enzyme and reference fields in the du scraping.
du_enzymes = 'Reaction'
du_references = 'Reference_id'
scraping_name = 'du'
mrgpkg = merge_package(noor_master_merge, du_dataframe, scraping_name) #, first = False)
# mrgpkg.add_new(du_enzymes, du_references, export = False)
# mrgpkg.merge_existing()#export = True)
# mrgpkg.incorporate_manual_curation(manual_du_curation, export = False)
# The return value is discarded here; only the export = True side effect is used.
mrgpkg.merge(du_enzymes, du_references, manual_du_curation, export = True)
missing_indices = mrgpkg.confirm_merging()

# Print the number of missing indices, then one comma/tab-separated stub line per index.
if missing_indices is not None:
    print(len(missing_indices))
    for index in missing_indices:
        print(f'{index},\t\tReaction string,\t\t\t')
# final_master_file = mrgpkg.merge(du_enzymes, du_references, manual_du_curation, export = True)


enzymes in the master file:  425
enzymes in the new file:  431
Extra enzymes in the master file, versus new file:  60
Missing enzymes in the master file, versus new file:  66

references in the master file:  911
references in the new file:  919
Extra references in the master file, versus new file:  96
Missing references in the master file, versus new file:  104
before 4041
after 4934
total additions 893
--> Failed index to match:  38 ___ hexokinase
--> Failed index to match:  79 ___ shikimate dehydrogenase
--> Failed index to match:  80 ___ shikimate dehydrogenase
--> Failed index to match:  114 ___ fructose-biphosphate aldolase and triose-phosphate isomerase
--> Failed index to match:  270 ___ alkaline phosphatase
--> Failed index to match:  271 ___ alkaline phosphatase
--> Failed index to match:  283 ___ glucose-6-phosphate 1-epimerase
--> Failed index to match:  315 ___ adenylate kinase
--> Failed index to match:  358 ___ creatine kinase
--> Failed index to match:  449 ___ phosphos

--> Failed index to match:  2186 ___ aconitate hydratase
--> Failed index to match:  2229 ___ prephenate dehydratase
--> Failed index to match:  2259 ___ alkaline phosphatase
--> Failed index to match:  2331 ___ glyceraldehyde-3-phosphate dehydrogenase
--> Failed index to match:  2373 ___ adenosine deaminase
--> Failed index to match:  2374 ___ adenosine deaminase
--> Failed index to match:  2375 ___ adenosine deaminase
--> Failed index to match:  2376 ___ adenosine deaminase
--> Failed index to match:  2377 ___ adenosine deaminase
--> Failed index to match:  2378 ___ adenosine deaminase
--> Failed index to match:  2379 ___ adenosine deaminase
--> Failed index to match:  2380 ___ adenosine deaminase
--> Failed index to match:  2381 ___ adenosine deaminase
--> Failed index to match:  2382 ___ adenosine deaminase
--> Failed index to match:  2477 ___ creatine kinase
--> Failed index to match:  2498 ___ creatine kinase
--> Failed index to match:  2519 ___ alanine racemase
--> Failed index 

--> Failed index to match:  4170 ___ dihydrolipoamide dehydrogenase
--> Failed index to match:  4191 ___ dihydrolipoamide dehydrogenase
--> Failed index to match:  4367 ___ alcohol dehydrogenase
--> Failed index to match:  4420 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4421 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4422 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4423 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4424 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4425 ___ 1-piperidine-2-carboxylate reductase
--> Failed index to match:  4467 ___ aconitate hydratase
--> Failed index to match:  4493 ___ glucose-6-phosphate isomerase
Unmatched indices:  298
The [38] new_id is a duplicate.
The [77] new_id is a duplicate.
79
[79, 80]
0
[708, 709]
80
[79, 80]
1
[708, 709]
The [114] new_id is a duplicate.
--> ERROR: Repeated noor index < 242 >
270
[270]
0
[1762]
271
[271

--> Failed index to match:  103 ___ alcohol dehydrogenase
--> Failed index to match:  104 ___ alcohol dehydrogenase
--> Failed index to match:  105 ___ alcohol dehydrogenase
--> Failed index to match:  106 ___ alcohol dehydrogenase
--> Failed index to match:  107 ___ alcohol dehydrogenase
--> Failed index to match:  108 ___ alcohol dehydrogenase
--> Failed index to match:  109 ___ alcohol dehydrogenase
--> Failed index to match:  110 ___ alcohol dehydrogenase
--> Failed index to match:  111 ___ alcohol dehydrogenase
--> Failed index to match:  112 ___ alcohol dehydrogenase
--> Failed index to match:  113 ___ alcohol dehydrogenase
--> Failed index to match:  114 ___ alcohol dehydrogenase
--> Failed index to match:  115 ___ alcohol dehydrogenase
--> Failed index to match:  116 ___ alcohol dehydrogenase
--> Failed index to match:  117 ___ alcohol dehydrogenase
--> Failed index to match:  118 ___ alcohol dehydrogenase
--> Failed index to match:  120 ___ isocitrate lyase
--> Failed index to

--> Failed index to match:  790 ___ adenylate kinase
--> Failed index to match:  791 ___ adenylate kinase
--> Failed index to match:  792 ___ adenylate kinase
--> Failed index to match:  793 ___ adenylate kinase
--> Failed index to match:  794 ___ adenylate kinase
--> Failed index to match:  795 ___ adenylate kinase
--> Failed index to match:  796 ___ adenylate kinase
--> Failed index to match:  797 ___ adenylate kinase
--> Failed index to match:  798 ___ adenylate kinase
--> Failed index to match:  803 ___ ribose-5-phosphate isomerase
--> Failed index to match:  812 ___ retinal isomerase
--> Failed index to match:  816 ___ alcohol dehydrogenase
--> Failed index to match:  817 ___ alcohol dehydrogenase
--> Failed index to match:  818 ___ alcohol dehydrogenase
--> Failed index to match:  819 ___ alcohol dehydrogenase
--> Failed index to match:  820 ___ alcohol dehydrogenase
--> Failed index to match:  821 ___ alcohol dehydrogenase
--> Failed index to match:  828 ___ myo-inositol 2-dehyd

--> Failed index to match:  1515 ___ pyruvate carboxylase
--> Failed index to match:  1516 ___ pyruvate carboxylase
--> Failed index to match:  1517 ___ pyruvate carboxylase
--> Failed index to match:  1518 ___ pyruvate carboxylase
--> Failed index to match:  1519 ___ phosphoenolpyruvate carboxykinase (diphosphate)
--> Failed index to match:  1526 ___ glutamate dehydrogenase
--> Failed index to match:  1527 ___ glutamate dehydrogenase
--> Failed index to match:  1529 ___ glutamate dehydrogenase
--> Failed index to match:  1532 ___ glutamate dehydrogenase
--> Failed index to match:  1534 ___ glutamate dehydrogenase
--> Failed index to match:  1544 ___ glutamate dehydrogenase
--> Failed index to match:  1547 ___ glutamate dehydrogenase
--> Failed index to match:  1556 ___ glutamate dehydrogenase
--> Failed index to match:  1560 ___ glutamate dehydrogenase
--> Failed index to match:  1561 ___ glutamate dehydrogenase
--> Failed index to match:  1562 ___ glutamate dehydrogenase
--> Failed i

--> Failed index to match:  2103 ___ adenylate kinase
--> Failed index to match:  2104 ___ adenylate kinase
--> Failed index to match:  2106 ___ adenylate kinase
--> Failed index to match:  2108 ___ adenylate kinase
--> Failed index to match:  2109 ___ adenylate kinase
--> Failed index to match:  2110 ___ adenylate kinase
--> Failed index to match:  2111 ___ adenylate kinase
--> Failed index to match:  2115 ___ adenylate kinase
--> Failed index to match:  2120 ___ adenylate kinase
--> Failed index to match:  2121 ___ adenylate kinase
--> Failed index to match:  2123 ___ adenylate kinase
--> Failed index to match:  2124 ___ adenylate kinase
--> Failed index to match:  2126 ___ adenylate kinase
--> Failed index to match:  2131 ___ adenylate kinase
--> Failed index to match:  2132 ___ adenylate kinase
--> Failed index to match:  2134 ___ adenylate kinase
--> Failed index to match:  2135 ___ adenylate kinase
--> Failed index to match:  2136 ___ adenylate kinase
--> Failed index to match:  

--> Failed index to match:  2996 ___ phosphoserine transaminase
--> Failed index to match:  2998 ___ phosphoserine transaminase
--> Failed index to match:  3000 ___ phosphoserine transaminase
--> Failed index to match:  3003 ___ phosphoserine transaminase
--> Failed index to match:  3025 ___ phosphoserine transaminase
--> Failed index to match:  3027 ___ phosphoserine transaminase
--> Failed index to match:  3028 ___ pyrroline-5-carboxylate reductase
--> Failed index to match:  3055 ___ aminoacylase
--> Failed index to match:  3081 ___ glycerate dehydrogenase
--> Failed index to match:  3083 ___ glycerate dehydrogenase
--> Failed index to match:  3084 ___ glycerate dehydrogenase
--> Failed index to match:  3085 ___ glycerate dehydrogenase
--> Failed index to match:  3086 ___ glycerate dehydrogenase
--> Failed index to match:  3087 ___ glycerate dehydrogenase
--> Failed index to match:  3088 ___ glycerate dehydrogenase
--> Failed index to match:  3089 ___ glycerate dehydrogenase
--> Fai

--> Failed index to match:  3452 ___ dihydrolipoamide dehydrogenase
--> Failed index to match:  3453 ___ dihydrolipoamide dehydrogenase
--> Failed index to match:  3454 ___ dihydrolipoamide dehydrogenase
--> Failed index to match:  3455 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3456 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3457 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3458 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3459 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3460 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3461 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3462 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3463 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:  3464 ___ glycine dehydrogenase (decarboxylating)
--> Failed index to match:

--> Failed index to match:  4035 ___ creatine kinase
--> Failed index to match:  4038 ___ creatine kinase
--> Failed index to match:  4043 ___ creatine kinase
--> Failed index to match:  4047 ___ creatine kinase
--> Failed index to match:  4053 ___ homoserine dehydrogenase
--> Failed index to match:  4055 ___ glycine hydroxymethyltransferase
--> Failed index to match:  4056 ___ glycine hydroxymethyltransferase
--> Failed index to match:  4057 ___ glycine hydroxymethyltransferase
--> Failed index to match:  4059 ___ glycine hydroxymethyltransferase
--> Failed index to match:  4060 ___ D-(-)-phenylglycyl-_-lactamide amidohydrolase
--> Failed index to match:  4061 ___ D-(-)-phenylglycyl-_-lactamide amidohydrolase
--> Failed index to match:  4063 ___ alkaline phosphatase
--> Failed index to match:  4068 ___ sinapate 1-glucosyltransferase
--> Failed index to match:  4069 ___ 2-arylpropionyl-CoA epimerase
--> Failed index to match:  4070 ___ nicotinate phosphoribosyltransferase
--> Failed in

647
[647, 648, 649, 650]
0
[596, 597, 598, 599]
648
[647, 648, 649, 650]
1
[596, 597, 598, 599]
649
[647, 648, 649, 650]
2
[596, 597, 598, 599]
650
[647, 648, 649, 650]
3
[596, 597, 598, 599]
661
[661, 662]
0
[613, 614]
662
[661, 662]
1
[613, 614]
663
[663]
0
[610]
664
[664]
0
[612]
665
[665]
0
[611]
714
[714]
0
[637]
761
[761]
0
[687]
764
[764]
0
[689]
769
[769]
0
[705]
883
[883]
0
[854]
940
[940]
0
[910]
943
[943]
0
[918]
944
[944]
0
[917]
948
[948]
0
[919]
1064
[1064]
0
[4913]
1065
[1065]
0
[954]
1072
[1072, 1073, 1074]
0
[959, 960, 961]
1073
[1072, 1073, 1074]
1
[959, 960, 961]
1074
[1072, 1073, 1074]
2
[959, 960, 961]
1075
[1075]
0
[958]
1082
[1082]
0
[971]
1083
[1083]
0
[970]
1084
[1084]
0
[969]
1085
[1085]
0
[968]
1088
[1088]
0
[973]
1094
[1094]
0
[982]
1095
[1095]
0
[981]
1096
[1096]
0
[980]
1097
[1097]
0
[978]
1098
[1098]
0
[977]
1099
[1099]
0
[976]
1100
[1100]
0
[983]
1101
[1101]
0
[984]
1125
[1125]
0
[1008]
--> ERROR: Repeated du index < 1155 >
1158
[1158]
0
[1043]
1166
[116

3400
[3400]
0
[3203]
3401
[3401]
0
[3200]
3402
[3402]
0
[3206]
3403
[3403, 3404, 3405]
0
[3186, 3187, 3188]
3404
[3403, 3404, 3405]
1
[3186, 3187, 3188]
3405
[3403, 3404, 3405]
2
[3186, 3187, 3188]
3407
[3407]
0
[3169]
3408
[3408]
0
[3192]
3409
[3409]
0
[3171]
3410
[3410]
0
[3170]
3411
[3411]
0
[3190]
3412
[3412]
0
[3194]
3413
[3413]
0
[3189]
3414
[3414]
0
[3190]
proposed new index 3414
--> ERROR: The master_index < 3190 > is predefined as < 3411 >.
3415
[3415]
0
[3194]
proposed new index 3415
--> ERROR: The master_index < 3194 > is predefined as < 3412 >.
3416
[3416]
0
[3193]
3417
[3417]
0
[3172]
3418
[3418, 3419]
0
[3196, 3197]
3419
[3418, 3419]
1
[3196, 3197]
3420
[3420]
0
[3177]
3421
[3421]
0
[3198]
3422
[3422]
0
[3173]
3423
[3423]
0
[3202]
3424
[3424]
0
[3174]
3425
[3425]
0
[3207]
3426
[3426]
0
[3174]
proposed new index 3426
--> ERROR: The master_index < 3174 > is predefined as < 3424 >.
3427
[3427]
0
[3199]
3428
[3428]
0
[3201]
3429
[3429]
0
[3181]
3430
[3430]
0
[3176]
3431
[3431

### Master file statistics

In [5]:
from datetime import date
# Count the datums, distinct enzymes and distinct references in the master
# file exported under today's date.
master = pandas.read_csv(f'{date.today()}_master_TECR_3.csv')
enzymes = set(master['Enzyme:'].tolist())
# A reference is identified by the part of its ID that precedes the first
# underscore followed by further text, so suffixed IDs collapse onto one entry.
unique_references = {
    re.sub('(_.+)', '', str(reference)) for reference in master['Reference ID:']
}

for label, count in (
    ('Master file datums: ', len(master)),
    ('Master file enzymes: ', len(enzymes)),
    ('Master file references: ', len(unique_references)),
):
    print(label, count)

Master file datums:  5813
Master file enzymes:  505
Master file references:  1028


# Contrast the scrapings

In [7]:
%run ../../comparisons.py

# Load the BiGG model JSON; the context manager closes the file handle as
# soon as parsing is done instead of leaving it to the garbage collector.
with open('e_coli_core.json') as bigg_file:
    ecoli_bigg = json.load(bigg_file)

# Pairwise differences between the three scrapings, for enzymes and references.
comp = comparison({'freiburger': freiburger_dataframe}, {'noor': noor_dataframe}, {'du': du_dataframe})
enzyme_diff = comp.three_way_comparison('enzymes')
reference_diff = comp.three_way_comparison('references')

# Each result maps a label to a collection; only the collection sizes are shown.
print('Enzymes')
for key, value in enzyme_diff.items():
    print(key, ': ', len(value))
print('\nReferences')
for key, value in reference_diff.items():
    print(key, ': ', len(value))

print('\n')
# Compare the BiGG model against the master file exported under today's date.
master_file = pandas.read_csv(f'{date.today()}_master_TECR_3.csv')
comp.bigg_comparison(ecoli_bigg, master_file)

import pandas
# Tally, for every enzyme in the glycolysis table, how many datums of the
# master file and of the Freiburger scraping carry a matching EC value.
# The table is the same for both tallies, so it is read only once.
glycolysis = pandas.read_csv('EC values of glycolysis enzymes.csv')

for heading, frame in (
    ('master file', master_file),
    ('\n\nfreiburger scraping', freiburger_dataframe),
):
    print(heading)
    for index, row in glycolysis.iterrows():
        # NOTE(review): str.contains treats the EC as a regular expression and
        # matches substrings, so '2.7.1.1' also counts '2.7.1.11' -- confirm
        # that this is the intended matching rule.
        mask = frame['EC Value:'].str.contains(row['EC'])
        if row['Enzyme'] == 'triose phosphate isomerase':
            # Kept as a notebook global for later inspection.
            questionable_mask = mask
        print(row['Enzyme'], '\t', row['EC'])
        # Counting the True entries directly prints 0 when nothing matches;
        # value_counts()[True] raises KeyError in that case. Missing EC
        # values compare unequal to True and are therefore not counted.
        print(int(mask.eq(True).sum()))

Enzymes
noor, not in du :  65
noor, not in freiburger :  0
du, not in noor :  47
du, not in freiburger :  1
freiburger, not in noor :  74
freiburger, not in du :  93

References
noor, not in du :  122
noor, not in freiburger :  0
du, not in noor :  79
du, not in freiburger :  0
freiburger, not in noor :  109
freiburger, not in du :  152


Phosphofructokinase 	 ['2.7.1.11']
20
Pyruvate formate lyase 	 ['2.3.1.54']
2
Glucose-6-phosphate isomerase 	 ['5.3.1.9']
64
Phosphoglycerate kinase 	 ['2.7.2.3']
80
6-phosphogluconolactonase 	 ['3.1.1.31']
6
Acetaldehyde dehydrogenase (acetylating) 	 ['1.2.1.10']
7
Phosphoglycerate mutase 	 ['5.4.2.1', '5.4.2.11', '5.4.2.12']
73
Alcohol dehydrogenase (ethanol) 	 ['1.1.1.71', '1.1.1.1']
271
Acetate kinase 	 ['2.7.2.15', '2.7.2.1']
29
Phosphoenolpyruvate carboxylase 	 ['4.1.1.31']
0
0
Aconitase (half-reaction A, Citrate hydro-lyase) 	 ['4.2.1.3']
94
Aconitase (half-reaction B, Isocitrate hydro-lyase) 	 ['4.2.1.3']
94
ATP maintenance requirement 	 ['3.6

# Add reference mapping

In [8]:
%run ../../reference_mapping.py

# Attach DOI and PMID identifiers to the master file through the reference mapping table.

# import the files
mappings = pandas.read_csv('references_with_abstracts.csv')
# NOTE(review): fillna returns a new frame and the result is discarded, so 'mappings' still holds NaN here -- confirm whether an assignment was intended.
mappings.fillna(' ')

# master_file = httpx.request("GET", "https://raw.githubusercontent.com/freiburgermsu/Biochemical-databases/main/openTECR/TECR_files/core_scripts/Notebooks/2021-09-11_master_TECR_3.csv")
# master_file = StringIO(master_file.content.decode("UTF-8"))
# master_file = pandas.read_csv(master_file)
# The file name embeds today's date, so the master file must have been exported on the same day this cell runs.
master_file = pandas.read_csv(f'{date.today()}_master_TECR_3.csv')
# NOTE(review): same as above -- the filled frame is discarded and 'master_file' is unchanged.
master_file.fillna(' ')

# define the parameters
reference_ids_column = 'reference_code_in_online_database'
# NOTE(review): mappings_path is not used anywhere in this cell; the mapping is read from the local CSV above.
mappings_path = "https://raw.githubusercontent.com/roberts-farm-of-ideas/8/56bab7c99ef340a6c3853b595f4d777ac7288c54/materials/reference_code_mappings/reference_code_mappings.csv"
doi_column = 'doi_from_robert'
pmid_column = 'pmid'

# apply the reference mapping
# NOTE(review): reference_mapping is presumably defined by the reference_mapping.py script run above -- confirm.
refmap = reference_mapping()
mapped_master = refmap.apply_mapping(master_file, mappings, reference_ids_column, doi_column, pmid_column, export = True)
# Last expression of the cell: displays the first rows of the mapped master file.
mapped_master.head()

References added to 3045 datums


Unnamed: 0,freiburger_index,du_index,noor_index,Enzyme:,KEGG Reaction:,CID Reaction:,PMID,DOI,Reaction:,Reference:,...,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
0,0.0,140.0,4336.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,,16743691.0,10.1042/bj0200545,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Quastel J.H.; Woolf B.; Biochem. J.; 20 545 (1...,...,,chemical analysis,phosphate,,4.3.1.1,,,,,
1,1.0,141.0,2910.0,aspartate ammonia-lyase,kegg:C00049 = kegg:C00122 + kegg:C00014,,16744231.0,10.1042/bj0230472,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Woolf B.; Biochem. J.; 23 472 (1929).,...,,chemical analysis and polarimetry,phosphate,,4.3.1.1,,,,,
2,2.0,142.0,2129.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,...,,electrochemistry,,,4.2.1.2,,,,,
3,3.0,143.0,2130.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,...,,electrochemistry,,,4.2.1.2,,,,,
4,4.0,148.0,791.0,fumarate hydratase,kegg:C00122 + kegg:C00001 = kegg:C00149,,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Jacobsohn K.P.; Biochem. Z.; 274 167 (1934).,...,,polarimetry,barbital,,4.2.1.2,,,,,


### Phosphofructokinase content for Table 1

In [9]:
from datetime import date
# Display every 6-phosphofructokinase datum of today's mapped master file
# with all columns visible.
master_file = pandas.read_csv(f'{date.today()}_mapped_master.csv')
pandas.set_option('display.max_columns', None)

is_pfk = master_file['Enzyme:'] == '6-phosphofructokinase'
pfk_content = master_file.loc[is_pfk]
display(pfk_content)

Unnamed: 0.1,Unnamed: 0,freiburger_index,du_index,noor_index,Enzyme:,KEGG Reaction:,CID Reaction:,PMID,DOI,Reaction:,Reference:,Reference ID:,T [K],pH,Keq,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
2103,2103,2103.0,2194.0,1774.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,4270771.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Hanson R.L.; Rudolph F.B.; Lardy H.A.; J. Biol...,73HAN/RUD,303.15,8.0,2290.0,,enzymatic assay; spectrophotometry,Tris (0.033 mol dm-3) + HCl,,2.7.1.11,,,,,
2282,2282,2282.0,2506.0,3993.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,298.15,7.0,800.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2283,2283,2283.0,2509.0,3997.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,298.15,8.0,2800.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2284,2284,2284.0,2507.0,3994.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,303.15,7.0,1000.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2285,2285,2285.0,2510.0,3992.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,303.15,8.0,2900.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2286,2286,2286.0,2508.0,3995.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,310.15,7.0,2700.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2287,2287,2287.0,2511.0,3996.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,310.15,8.0,4800.0,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,
2288,2288,2288.0,,3998.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,298.15,7.0,,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,-84.2
2289,2289,2289.0,,4003.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,298.15,8.0,,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,-70.4
2290,2290,2290.0,,4001.0,6-phosphofructokinase,kegg:C00002 + kegg:C00085 = kegg:C00008 + kegg...,,241184.0,,ATP(aq) + D-fructose 6-phosphate(aq) = ADP(aq)...,Bvhme H.-J.; Schellenberger W.; Hofmann E.; Ac...,75BOH/SCH,303.15,7.0,,,calorimetry,Tris (0.1 mol dm-3) + HCl,,2.7.1.11,,,,,-77.4


# Convert the master file to JSON files

In [None]:
%run ../../csv_to_json.py

# Convert today's mapped master file into JSON using the TECRDB template.
master_file = pandas.read_csv(f'{date.today()}_mapped_master.csv')
# The context manager closes the template file as soon as it has been parsed.
with open('tecrdb_template.json') as template_file:
    json_template = json.load(template_file)

tojson = to_json(master_file, json_template)
tojson.parse_to_json()