# Import the scrapings

### Refining the Freiburger scraping

In [1]:
from numpy import unique, nan 
import re 

empty = [nan, 'nan', None, ' ', '', 'NaN']


def _is_empty(value):
    """Return True when ``value`` is one of the ``empty`` placeholders."""
    if value is None:
        return True
    if isinstance(value, float):
        # NaN never compares equal to itself, so a membership test against
        # ``empty`` only catches the exact ``numpy.nan`` object; this
        # self-comparison catches every NaN instance.
        return value != value
    if isinstance(value, str):
        return value in empty or value.strip() in empty
    return False


def _strip_entry(entry):
    """Strip spaces, '?' and '~' from text entries; return other values as-is."""
    if isinstance(entry, str):
        return entry.strip(' ?~')
    return entry


def refine_freiburger(freiburger_file):
    """Clean the Freiburger scraping in place and add the bookkeeping columns.

    The steps are, in order:

    1. Strip leading/trailing spaces, ``?`` and ``~`` from every text entry.
       Numeric entries keep their numeric type.
    2. Copy the reference metadata ('Reference ID:', 'Reference:', 'Method:',
       'Buffer:', 'EC Value:') from the most recent row that has a
       'Reference ID:' into the following rows that lack one.
    3. Reduce every text 'EC Value:' entry to its first run of digits, dots
       and dashes.
    4. De-duplicate the ' & '-separated 'solutes [mol / dm^3]' entries and
       blank out '#NAME' entries.
    5. Insert the 'freiburger_index', 'du_index', 'noor_index',
       'KEGG Reaction:' and 'CID Reaction:' columns.

    Parameters
    ----------
    freiburger_file : pandas.DataFrame
        Must contain the columns named in steps 2-4. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.insert`` when one of the columns of step 5
        already exists, e.g. when the frame has been refined before.
    """
    # strip spaces from the freiburger_file
    for column in list(freiburger_file.columns):
        freiburger_file[column] = freiburger_file[column].map(_strip_entry)

    # extrapolate metadata to all datums
    metadata_columns = ['Reference ID:', 'Reference:', 'Method:', 'Buffer:', 'EC Value:']
    carried = None
    # the items are materialised first because the frame is edited in the loop
    for index, reference_id in list(freiburger_file['Reference ID:'].items()):
        if not _is_empty(reference_id):
            carried = {name: freiburger_file.at[index, name] for name in metadata_columns}
        elif carried is not None:
            for name, value in carried.items():
                freiburger_file.at[index, name] = value
        else:
            # a row without a reference and without any preceding reference
            print('ERROR: The reference has an unidentified structure.')

    # strip the EC values
    # '|' is not needed inside a character class and is not part of an EC number
    ec_pattern = re.compile(r'[\d.\-]+')
    for index, master_ec in list(freiburger_file['EC Value:'].items()):
        if not isinstance(master_ec, str) or _is_empty(master_ec):
            continue
        match = ec_pattern.search(master_ec)
        if match is not None:  # entries without any digit are left untouched
            freiburger_file.at[index, 'EC Value:'] = match.group()

    # consolidate the solutes column
    solutes_column = 'solutes [mol / dm^3]'
    for index, solute in list(freiburger_file[solutes_column].items()):
        if not isinstance(solute, str) or _is_empty(solute):
            continue
        solute_list = solute.split(' & ')
        if len(solute_list) > 1:
            solute_list = unique(solute_list)  # sorted, duplicates removed
            if len(solute_list) > 2:
                print('Large solute set {}'.format(solute_list))
            else:
                freiburger_file.at[index, solutes_column] = ' & '.join(solute_list)
        elif solute_list[0] in ['#NAME']:
            # a spreadsheet '#NAME?' error with its '?' already stripped above
            freiburger_file.at[index, solutes_column] = ''

    # create the additional columns
    column_nans = [''] * len(freiburger_file)
    freiburger_file.insert(0, 'noor_index', column_nans)
    freiburger_file.insert(0, 'du_index', column_nans)
    freiburger_file.insert(0, 'freiburger_index', freiburger_file.index)
    freiburger_file.insert(4, 'KEGG Reaction:', column_nans)
    freiburger_file.insert(5, 'CID Reaction:', column_nans)

    return freiburger_file

### Import the CSV files

In [2]:
import pandas

# load the noor scraping; missing cells are replaced with a single space
noor_dataframe = pandas.read_csv('TECRDB_Elad_Noor.csv').fillna(' ')
# manual curation table for the noor scraping (first line of the file is the header)
manual_noor_curation = pandas.read_csv('Manual curation of the programmatically unmatched datums, comma delimited.txt', header = 0)

# load the du scraping from the named sheet of the Excel workbook
du_dataframe = pandas.read_excel('mmc2.xlsx', sheet_name = 'Table S1. TECRDB Keqs')
# rename the 'Reaction.1' column to 'reaction_string'
du_dataframe.rename(columns = {'Reaction.1': 'reaction_string'}, inplace = True)
# manual curation table for the du scraping (first line of the file is the header)
manual_du_curation = pandas.read_csv('Manual curation of Du et al., comma delimited.txt', header = 0)

# load the freiburger scraping (missing cells become a single space), then clean it with refine_freiburger
freiburger_dataframe = pandas.read_csv('2021-08-04_vetted & reorganized NIST database_01.csv').fillna(' ')
freiburger_dataframe = refine_freiburger(freiburger_dataframe)
# preview the first rows of the refined frame
freiburger_dataframe.head()

  warn(msg)


Unnamed: 0,freiburger_index,du_index,noor_index,Enzyme:,KEGG Reaction:,CID Reaction:,Reaction:,Reference:,Reference ID:,T [K],...,Km,Method:,Buffer:,Experimental conditions,EC Value:,solutes [mol / kg],solutes [mol / dm^3],Ionic strength [mol / dm^3],Ionic strength [mol / kg],Enthalpy [kJ / mol]
0,0,,,aspartate ammonia-lyase,,,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Quastel J.H.; Woolf B.; Biochem. J.; 20 545 (1...,26QUA/WOO_1205,310.15,...,,chemical analysis,phosphate,,4.3.1.1,,,,,
1,1,,,aspartate ammonia-lyase,,,L-aspartate(aq) = fumarate(aq) + ammonia(aq),Woolf B.; Biochem. J.; 23 472 (1929).,29WOO_1206,310.15,...,,chemical analysis and polarimetry,phosphate,,4.3.1.1,,,,,
2,2,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,298.15,...,,electrochemistry,,,4.2.1.2,,,,,
3,3,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Borsook H.; Schott H.F.; J. Biol. Chem.; 92 55...,31BOR/SCH_1141,298.15,...,,electrochemistry,,,4.2.1.2,,,,,
4,4,,,fumarate hydratase,,,fumarate(aq) + H2O(l) = (S)-malate(aq),Jacobsohn K.P.; Biochem. Z.; 274 167 (1934).,34JAC_1142,278.15,...,,polarimetry,barbital,,4.2.1.2,,,,,


# Merge the scrapings

In [9]:
# execute ../merging_datasets.py in this notebook (IPython magic);
# presumably this is what defines merge_package -- TODO confirm
%run ../merging_datasets.py

# merging the noor scraping
# names passed to add_new below; presumably the enzyme and reference column
# names of the noor scraping -- TODO confirm against merging_datasets.py
elad_enzymes = 'enzyme_name'
elad_references = 'reference'
scraping_name = 'noor'
# the refined freiburger frame is passed first and the noor frame second
mrgpkg = merge_package(freiburger_dataframe, noor_dataframe, scraping_name)
mrgpkg.add_new(elad_enzymes, elad_references) #, export = True)
mrgpkg.incorporate_manual_curation(manual_noor_curation)
mrgpkg.merge_existing() #export = True)
# noor_master_merge = mrgpkg.merge(elad_enzymes, elad_references, manual_noor_curation, export = True)

# merging the du scraping
# (disabled: it depends on noor_master_merge, which is only assigned in the commented-out line above)
# du_enzymes = 'Reaction'
# du_references = 'Reference_id'
# scraping_name = 'du'
# mrgpkg = merge_package(noor_master_merge, du_dataframe, scraping_name) #, first = False)
# final_master_file = mrgpkg.merge(du_enzymes, du_references, scraping_name, manual_du_curation, export = True)


enzymes in the master file:  491
enzymes in the new file:  431
Extra enzymes in the master file, versus new file:  60
Missing enzymes in the master file, versus new file:  0
set()

references in the master file:  1015
references in the new file:  919
Extra references in the master file, versus new file:  96
Missing references in the master file, versus new file:  0
set()
before 4934
after 4934
total additions 0
[38]
['--']
The [38] new_id is a duplicate.
[77]
['--']
The [77] new_id is a duplicate.
[79, 80]
[708, 709]
79

matched pair: 
new_index 79
master_index 708 

80

matched pair: 
new_index 80
master_index 709 

[114]
['--']
The [114] new_id is a duplicate.
[270]
[1762]
270

matched pair: 
new_index 270
master_index 1762 

[271]
[1763]
271

matched pair: 
new_index 271
master_index 1763 

[283]
[2009]
283

matched pair: 
new_index 283
master_index 2009 

[315]
['--']
The [315] new_id is a duplicate.
[358]
[' \t\t--']
The [358] new_id is a duplicate.
[449]
[' \t\t--']
The [449] ne

ValueError: invalid literal for int() with base 10: 'New'

# Contrast the scrapings

In [4]:
# %run ../comparisons.py

# ecoli_bigg = json.load(open('e_coli_core.json'))

# print(len(freiburger_dataframe))

# comp = comparison({'freiburger':freiburger_dataframe},{'noor': noor_dataframe},{'du': du_dataframe})
# enzyme_diff = comp.three_way_comparison('enzymes')
# reference_diff = comp.three_way_comparison('references')
# print('Enzymes')
# for key, value in enzyme_diff.items():
#     print(key, ': ', len(value))
# print('\nReferences')
# for key, value in reference_diff.items():
#     print(key, ': ', len(value))  

# print('\n')
# comp.bigg_comparison(ecoli_bigg, noor_master_merge)