# NIST Web Scraping

The following code documents the methods of combining and organizing the scraped data.

## Scraping code

The web scraping was conducted through the following code. Increments of 27 reference ID URLs were scraped to maintain accuracy of the scraper while executing a timely web scrape.

In [None]:
#Import Statements
from bs4 import BeautifulSoup
import requests #Pulling webpages
import pandas 
import numpy
import re
import math

#defining the website
root_url = "https://randr.nist.gov/enzyme/DataDetails.aspx?ID="
end_url = "&finalterm=&data=enzyme"
#===========================================================================================================================

#identify the table and rows of pertinent data
#the saved search-results page is read inside a context manager so that the file handle is always closed
with open('Enzyme Thermodynamic Database.html') as html_file:
    bs = BeautifulSoup(html_file, 'lxml')
table = bs.find("table", attrs = {'id': 'MainBody_gvSearch'})
body = table.find_all("tr")

#defining the initial boundaries of the aggregate dataframe
index_count = 0         #number of data rows that were already placed into the aggregate dataframe
loop_count = 0          #number of references that were successfully merged
lower_bound = 1 #math.floor(1*len(body)/50)
upper_bound = math.floor(1*len(body)/50)
old_dataframe = None    #aggregate dataframe; remains None until the first reference is scraped successfully

#loop through the enzyme id values
for row in range(lower_bound,upper_bound):
    id_value = body[row].find("a")

    #refining the reference IDs: the ID is parsed from the string form of the link tag
    id_refined = re.findall(r"(\d+\w+.\w+.\d+)(?<=)",str(id_value))
    scraped_id = str(id_refined).strip('[]')
    scraped_id2 = re.sub(r'(\')', '', scraped_id)
    total_url = root_url + scraped_id2 + end_url

    #defining the soup of the reference page
    soup = requests.get(total_url).text
    reference_page = BeautifulSoup(soup, 'lxml')

    #identify the data table; references without exactly one data table are skipped
    tables1 = reference_page.find_all("table", attrs = {"id": "MainBody_extraData"})
    print(scraped_id2, '\t: ', loop_count, '\t, ', len(tables1))
    if len(tables1) != 1:
        continue

    #the first table row holds the headers and the remaining rows hold the data
    body1 = tables1[0].find_all("tr")
    body_rows1 = body1[1:]
    headings = [head.text.rstrip("\n") for head in body1[0].find_all("th")]

    #identify entries of the body rows, with non-breaking spaces, newlines, and commas removed
    total_rows = []
    for table_row in body_rows1:
        each_row = [re.sub("(\xa0)|(\n)|,","",row_element.text) for row_element in table_row.find_all("td")]
        total_rows.append(each_row)

    #create a dataframe of the data table and drop its last column
    first_index = index_count+lower_bound-1
    index_list_body = range(first_index, len(body_rows1)+first_index)
    bs_dataframe_table1 = pandas.DataFrame(data = total_rows, columns = headings, index = index_list_body)
    bs_dataframe_table1.drop(bs_dataframe_table1.columns[len(bs_dataframe_table1.columns)-1], axis=1, inplace=True)

#===========================================================================================================================

    #identify the reference-information table; references without this table are skipped
    tables = reference_page.find_all("table", attrs={"id": "MainBody_DataList1"})
    if len(tables) == 0:
        continue
    body_rows2 = tables[0].find_all("tr")[1:]

    #flatten all cells of the information table into a single list
    each_row2 = []
    for information_row in body_rows2:
        for row_element in information_row.find_all("td"):
            each_row2.append(re.sub("(\xa0)|(\n)|,","",row_element.text))

    #the flattened cells alternate between an entry name (even positions) and its value (odd positions)
    information_entries_list = each_row2[0::2]
    information_values_list = each_row2[1::2]

    #create a one-row dataframe that is indexed at the first data row of this reference
    index_list_reference = range(first_index, first_index+1)
    bs_dataframe_pretable2 = pandas.DataFrame(data = [information_values_list], columns = information_entries_list, index = index_list_reference)
    bs_dataframe_pretable2.drop(bs_dataframe_pretable2.columns[len(bs_dataframe_pretable2.columns)-2], axis=1, inplace=True)

#===========================================================================================================================

    #merge the dataframes of this loop; the reference information lands on the first data row only
    this_dataframe = bs_dataframe_table1.join(bs_dataframe_pretable2)
    this_dataframe.index.name = 'index'

    #iteratively coalesce the new dataframe into the old dataframe
    if old_dataframe is None:
        old_dataframe = this_dataframe

    else:
        #the shared columns are listed in the order of the aggregate dataframe, since the outer merge
        #sorts by these keys and an unordered set would make the row order vary between runs
        common_columns = [column for column in old_dataframe if column in this_dataframe.columns]

        #amalgamate the dataframe with the pre-existing dataframe
        old_dataframe = old_dataframe.merge(this_dataframe, on = common_columns, how = 'outer')

    #advance the counters for the next reference
    index_count += len(body_rows1)
    loop_count += 1


#export total_df to csv
if old_dataframe is None:
    raise RuntimeError('No reference page could be scraped; there is no dataframe to export.')
old_dataframe.to_csv("1.csv")

#print(bs_dataframe_table1)
old_dataframe

## CSV data combination

The following code combined the individual CSV files. The code was dynamically applied to aggregate the CSV files into the segments 1-5, 6-16, 17-36, and 37-50, and subsequently combined these four segments into the complete CSV file. The columns were reorganized according to subjective preference, with the reference information on the left and the data on the right.

In [None]:
import pandas as pandas
import datetime
import re 

#dataframes definition: the four segment files are loaded and stacked into one dataframe
#(these definitions were previously disabled inside a string literal, which left
#`dataframes` and `combined_csv` undefined for the code below)
df1 = pandas.read_csv("1-5.csv", encoding='cp1252')
df2 = pandas.read_csv("6-16.csv", encoding='cp1252')
df3 = pandas.read_csv("17-36.csv", encoding='cp1252')
df4 = pandas.read_csv("37-50.csv", encoding='cp1252')

dataframes = [df1,df2,df3,df4]
combined_csv = pandas.concat(dataframes)

#defining the columns: every distinct column name, in order of first appearance
total_columns = []
for df in dataframes:
    for this_column in df:
        if this_column not in total_columns:
            total_columns.append(this_column)

#columns whose name begins with one or two words followed by a colon are placed on the left
left_dataframe_columns = []
for column in total_columns:
    if re.match(r"(\w+)(?=:)", column) or re.match(r"(\w+\s\w+)(?=:)", column):
        left_dataframe_columns.append(column)

#the principal data columns follow directly after the left-hand columns
principal_data_columns = ('T(K)',
                          'pH ',
                          'K<sub>c</sub>\' ',
                          'δ<sub>r</sub>H\'<sup>o</sup>(kJ.mol<sup>-1</sup>)')
right_dataframe_columns = [column for column in total_columns if column in principal_data_columns]

#all other columns keep their order of first appearance, which makes the final layout reproducible
defined_columns = left_dataframe_columns + right_dataframe_columns
remaining_columns = [column for column in total_columns if column not in defined_columns]

#reorganizing the dataframe
final_dataframe_columns = defined_columns + remaining_columns
final_dataframe = combined_csv.reindex(columns = final_dataframe_columns)

#export total_df to csv
final_dataframe.to_csv("{}_vetted & complete NIST database.csv".format(datetime.date.today()))

## Column combination

The following code combined columns that contain similar data in the complete CSV. Suffixes were used to further specify the data in columns that combined myriad data sources. The columns were finally renamed with generic conventions.

In [None]:
import datetime 
import pandas 
import re

# dataframes definition
combined_dataframe = pandas.read_csv("2021-05-06_vetted & complete NIST database.csv")
df = combined_dataframe.astype(str)

# cell contents that are treated as "no value"; note that a literal '0' is also regarded as empty
empty_cell = ['nan', 'NaN', 'none', 'not given', '0', '', None]

# target columns that receive the merged content of each family of columns
keq_column = 'K<sub>c</sub>\' '
enthalpy_column = 'ë«<sub>r</sub>H(kJ.mol<sup>-1</sup>)'
ionic_strength_column = 'I<sub>c</sub>(mol dm<sup>-3</sup>)'
molarity_column = 'c(glycerol,mol dm<sup>-3</sup>)'
molality_column = 'm(MgCl2,mol.kg<sup>-1</sup>)'
buffer_target_column = 'Buffer:'
conditions_column = 'solvent '

# source columns for the buffer details and for the environmental conditions
buffer_columns = ['buffer(mol dm<sup>-3</sup>)', 'buffer and/or salt ', 'media ', 'buffer ']
solution_columns = ['salt ', 'cosolvent ', 'added solute ', 'protein ', 'percent(dimethyl sulfoxide) ', 'p(MPa)', 'pMg ']

# labels that are attached to bare numbers before they enter the conditions column
condition_suffixes = {'p(MPa)': ' megapascals',
                      'pMg ': ' = -log[Mg+2]',
                      'percent(dimethyl sulfoxide) ': ' % DMSO'}

# the solute name is the text directly after "c(" (molarity) or "m(" (molality) in the column name
molarity_pattern = re.compile(r'(?<=c\()(\w+\d?\+?)(?<!,)')
molality_pattern = re.compile(r'(?<=m\()(\w+\d?\+?)(?<!,)')


def merge_distinct(index, target_column, value, equivalents = ()):
    """Store value in the target cell, or append it with ' & ' when the cell holds something else.

    The cell is left untouched when it already equals value or one of the equivalents.
    """
    current = str(df.at[index, target_column])
    if current in empty_cell:
        df.at[index, target_column] = value
    elif current != value and current not in equivalents:
        df.at[index, target_column] = current + ' & ' + value


def append_missing(index, target_column, value, labelled_value = None):
    """Store the entry in the target cell, or append it with '  +  ' when value is not yet contained.

    labelled_value is the text that is written (value plus a unit label); it defaults to value.
    """
    current = str(df.at[index, target_column])
    entry = value if labelled_value is None else labelled_value
    if current in empty_cell:
        df.at[index, target_column] = entry
    elif value not in current:
        df.at[index, target_column] = current + '  +  ' + entry


combined_columns = []

for this_column in df:
    # the family of a column depends only upon its name, so it is determined once per column
    is_keq = bool(re.search(r'(?i)(^K)', this_column)) and not re.search(r"(Km')", this_column)
    is_enthalpy = 'ë«' in this_column
    is_ionic_strength = 'I<sub>c' in this_column
    molarity_match = molarity_pattern.search(this_column)
    molality_match = molality_pattern.search(this_column)
    is_buffer = this_column in buffer_columns
    is_condition = this_column in solution_columns

    families = [(is_keq, keq_column),
                (is_enthalpy, enthalpy_column),
                (is_ionic_strength, ionic_strength_column),
                (molarity_match is not None, molarity_column),
                (molality_match is not None, molality_column),
                (is_buffer, buffer_target_column),
                (is_condition, conditions_column)]

    # every merged source column, except for the targets themselves, is deleted after the merge
    for matched, target_column in families:
        if matched and this_column != target_column and this_column not in combined_columns:
            combined_columns.append(this_column)

    if not any(matched for matched, target_column in families):
        continue

    for index in df.index:
        value = str(df.at[index, this_column])
        if value in empty_cell:
            continue

        # combine the equilibrium constant columns
        if is_keq:
            merge_distinct(index, keq_column, value)

        # combine the enthalpy columns
        if is_enthalpy:
            merge_distinct(index, enthalpy_column, value)

        # combine the concentration ionic strength columns
        # (the appended text is built from the ionic strength cell itself, not from the enthalpy column)
        if is_ionic_strength:
            merge_distinct(index, ionic_strength_column, value)

        # combine the solute molar concentrations, where each value is suffixed with its solute
        if molarity_match:
            solute = str(molarity_match.group(1))
            merge_distinct(index, molarity_column, value + ' ' + solute, equivalents = (value,))

        # combine the solute molality concentrations
        # (the solute is parsed with the "m(" pattern, so it is never left over from a molarity column)
        if molality_match:
            solute = str(molality_match.group(1))
            merge_distinct(index, molality_column, value + ' ' + solute, equivalents = (value,))

        # combine buffer and solution details
        if is_buffer:
            append_missing(index, buffer_target_column, value)

        # combine environmental conditions; pressure, pMg, and DMSO values always carry their label
        if is_condition:
            append_missing(index, conditions_column, value, value + condition_suffixes.get(this_column, ''))


# rename the base columns
df.rename(columns = {'c(glycerol,mol dm<sup>-3</sup>)':'solutes [mol / dm^3]',
                     'I<sub>c</sub>(mol dm<sup>-3</sup>)':'Ionic strength [mol / dm^3]',
                     'T(K)':'T [K]',
                     'I<sub>m</sub>(mol.kg<sup>-1</sup>)':'Ionic strength [mol / kg]',
                     'm(MgCl2,mol.kg<sup>-1</sup>)':'solutes [mol / kg]',
                     'solvent ':'Experimental conditions',
                     'K<sub>c</sub>\' ':'Keq',
                     'ë«<sub>r</sub>H(kJ.mol<sup>-1</sup>)':'Enthalpy [kJ / mol]',
                     'Km\' ':'Km'},
          inplace = True)

# remove the source columns whose content was merged into a target column
for column in combined_columns:
    print(column)
    del df[column]

#export total_df to csv
df.to_csv("{}_vetted & reorganized NIST database_01.csv".format(datetime.date.today()))


## Enzyme names and reactions

The enzyme names and corresponding reactions from each reference are added in an adjacent column. The generated file is the final file of the NIST web scraped database.

In [3]:
#Import Statements
from bs4 import BeautifulSoup
import pandas 
import numpy
import datetime
import os
import re


#import the csv file
total_csv = pandas.read_csv("2021-03-21_APF_vetted and reorganized complete NIST enzyme database_001.csv")

#the saved search-results page is read inside a context manager so that the file handle is always closed
with open('Enzyme Thermodynamic Database.html') as html_file:
    bs = BeautifulSoup(html_file, 'lxml')
#print(bs.prettify())

#identify the table and rows of pertinent data
table = bs.find("table", attrs = {'id': 'MainBody_gvSearch'})
body = table.find_all("tr")
print(len(body))


# loop through the enzyme id values; the span ids are numbered from 0, beginning at the second table row
reference_ids = []
enzyme_names = []
reactions = []
for name_iteration, table_row in enumerate(body[1:]):
    reference_ids.append(table_row.find("a").text)

    enzyme_span = table_row.find('span', attrs = {'id': 'MainBody_gvSearch_lblEnzyme_%s' %(name_iteration)})
    enzyme_names.append(enzyme_span.text)

    reaction_span = table_row.find('span', attrs = {'id': 'MainBody_gvSearch_lblReaction_%s' %(name_iteration)})
    reactions.append(reaction_span.text)

#print(enzyme_names)
#print('\n\n', reactions)

# create the columns of enzyme names and reactions
# NOTE(review): the names are paired with the CSV rows purely by position -- the n-th row with a
# reference ID receives the n-th enzyme of the HTML table -- so both must list the references in
# the same order without omissions; confirm this against the scraped file.
enzyme_iteration = 0
enzyme_column = []
reaction_column = []
for loop_id in total_csv['Reference ID:']:
    print(loop_id)

    # rows without a reference ID (e.g. NaN) and rows beyond the parsed enzymes receive blank cells,
    # which keeps both new columns exactly as long as the dataframe
    has_reference = isinstance(loop_id, str) and re.search(r'(\w+)', loop_id) is not None
    if has_reference and enzyme_iteration < len(enzyme_names):
        enzyme = enzyme_names[enzyme_iteration]
        print(enzyme)
        reaction = reactions[enzyme_iteration]
        print(reaction)

        enzyme_column.append(enzyme)
        reaction_column.append(reaction)

        enzyme_iteration += 1

    else:
        enzyme_column.append('')
        reaction_column.append('')

# add the enzyme names and reactions to the CSV file
total_csv.insert(1, 'Enzyme', enzyme_column)
total_csv.insert(2, 'Reaction', reaction_column)

#generating the final CSV file under the first file number that is not yet in use
#(the existence check and the export share one file name pattern)
file_number = 1
output_name = '%s_vetted + reorganized NIST_%s.csv' %(datetime.date.today(), file_number)
while os.path.exists(output_name):
    file_number += 1
    output_name = '%s_vetted + reorganized NIST_%s.csv' %(datetime.date.today(), file_number)

total_csv.to_csv(output_name)

1433
26QUA/WOO_1205
['26QUA/WOO_1205']
29WOO_1206
['29WOO_1206']
31BOR/SCH_1141
['31BOR/SCH_1141']
34JAC_1142
['34JAC_1142']
34LOH/MEY_1169
['34LOH/MEY_1169']
34MEY/LOH_1100
['34MEY/LOH_1100']
34MEY/LOH_1288
['34MEY/LOH_1288']
35AKA_1170
['35AKA_1170']
35JAC/TAP_1207
['35JAC/TAP_1207']
35MEY/KIE_1385
['35MEY/KIE_1385']
35MEY/KIE2_1386
['35MEY/KIE2_1386']
35MEY/LOH_1102
['35MEY/LOH_1102']
35MEY/LOH_1290
['35MEY/LOH_1290']
35MEY/SCH_797
['35MEY/SCH_797']
35MEY/SCH_798
['35MEY/SCH_798']
35MEY/SCH_802
['35MEY/SCH_802']
35MEY/SCH_805
['35MEY/SCH_805']
35MEY_1101
['35MEY_1101']
35MEY_1289
['35MEY_1289']
36EUL/ADL_7
['36EUL/ADL_7']
36LEH_589
['36LEH_589']
36LEH_614
['36LEH_614']
36MEY/LOH_1099
['36MEY/LOH_1099']
36MEY/SCH_1096
['36MEY/SCH_1096']
36VEI_914
['36VEI_914']
36VEI_915
['36VEI_915']
36VEI_917
['36VEI_917']
36VEI_922
['36VEI_922']
36VEI_923
['36VEI_923']
36VEI_924
['36VEI_924']
36VEI_925
['36VEI_925']
36VEI_926
['36VEI_926']
37ADL/SRE_8
['37ADL/SRE_8']
37EUL/ADL_127
['37EUL/ADL_127']

75GRI/CAR_1396
['75GRI/CAR_1396']
75IZU/REE_1376
['75IZU/REE_1376']
75JEN/NYG_460
['75JEN/NYG_460']
75JES_999
['75JES_999']
75KAP/BAR_980
['75KAP/BAR_980']
75KAP/BAR_981
['75KAP/BAR_981']
75KAP/BAR_982
['75KAP/BAR_982']
75KRI_1285
['75KRI_1285']
75KUR/KON_1095
['75KUR/KON_1095']
75KUR/KON_1098
['75KUR/KON_1098']
75MAN/LAN_336
['75MAN/LAN_336']
75MCG/JOR_547
['75MCG/JOR_547']
75MUR/TSU_461
['75MUR/TSU_461']
75PIE/GUY_388
['75PIE/GUY_388']
75PIE/GUY_391
['75PIE/GUY_391']
75SCH/GRE_218
['75SCH/GRE_218']
75SCH/RIF_138
['75SCH/RIF_138']
75SCH/RIF_150
['75SCH/RIF_150']
75SHI/BEA_560
['75SHI/BEA_560']
75SUN_700
['75SUN_700']
75WYR/GRI_223
['75WYR/GRI_223']
76BER/KLY_1009
['76BER/KLY_1009']
76GOL_548
['76GOL_548']
76GRE/BRI_328
['76GRE/BRI_328']
76GUY_807
['76GUY_807']
76HIL/ATT_1397
['76HIL/ATT_1397']
76JES_139
['76JES_139']
76LAW/GUY_649
['76LAW/GUY_649']
76LAW/GUY_773
['76LAW/GUY_773']
76LAW/GUY_830
['76LAW/GUY_830']
76LLO/KHA_1309
['76LLO/KHA_1309']
76MUR_1413
['76MUR_1413']
76RAO/BUT_618


54CHA_1086
 3(or 17)_-hydroxysteroid dehydrogenase
 4-androstene-17Î²-ol-3-one(aq) + NAD(aq) = 4-androstene-3,17-dione(aq) + NADH(aq)
54GIN_771
 ribose-5-phosphate isomerase
 D-ribose 5-phosphate(aq) = D-ribulose 5-phosphate(aq)
54GIN_779
 nucleoside-diphosphate kinase
 ATP(aq) + inosine 5'-diphosphate(aq) = ADP(aq) + inosine 5'-triphosphate(aq)
nan
nan
nan
nan
54GOL_406
 adenylate kinase
 2 ADP(aq) = AMP(aq) + ATP(aq)
nan
54GRE/MII_264
 ketotetraose-phosphate aldolase
 erythrulose 1-phosphate(aq) = formaldehyde(aq) + glycerone phosphate(aq)
54HAN/CRA_1261
 inorganic pyrophosphatase
 pyrophosphate(aq) + H2O(l) = 2 orthophosphate(aq)
54HEL_1433
 alkaline phosphatase
 D-glucose 6-phosphate(aq) + H2O(l) = D-glucose(aq) + orthophosphate(aq)
nan
nan
54HEL_1435
 alkaline phosphatase
 L-Î±-glycerophosphate(aq) + H2O(l) = glycerol(aq) + orthophosphate(aq)
54LEV/MEI_1439
 acetyl-CoA C-acetyltransferase
 2 acetyl-CoA(aq) = CoA(aq) + acetoacetyl-CoA(aq)
nan
nan
nan
54LIE/KOR_1036
 butyryl-CoA deh

nan
71WIL/ROC_1115
 phosphodiesterase I
 uridine 3':5'-(cyclic)phosphate(aq) + H2O(l) = UMP(aq)
71WOH_709
 ribonuclease T2
 adenosine 2':3'-(cyclic)phosphate(aq) + H2O(l) = adenosine 3'-monophosphate(aq)
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
nan
72BAK/JEN_290
 ribonuclease T2
 guanosine 2':3'-(cyclic)phosphate(aq) + H2O(l) = guanosine 3'-monophosphate(aq)
nan
nan
nan
nan
nan
nan
72CAG/FRI_570
 pancreatic ribonuclease
 cytidine 2':3'-(cyclic)phosphate(aq) + H2O(l) = cytidine 3'-monophosphate(aq)
nan
nan
nan
nan
nan
nan
72COO/MEI_521
 pancreatic ribonuclease
 uridine 2':3'-(cyclic)phosphate(aq) + H2O(l) = uridine 3'-monophosphate(aq)
72COO/MEI_522
 guanylate kinase
 ATP(aq) + GMP(aq) = ADP(aq) + GDP(aq)
72COO/MEI_523
 guanylate kinase
 ATP(aq) + dGMP(aq) = ADP(aq) + dGDP(aq)
72DAH/AND_1108
 adenylate cyclase
 ATP(aq) = adenosine 3':5'-(cyclic)phosphate(aq) + diphosphate(aq)
72DEL_427
 phosphate acetyltransferase and formate C-acetyltransferase
 pyruvate(aq) + orthophosphate(aq) = a

nan
nan
89TEW/GOL_912
 _-galactosidase
 lactose(aq) + H2O(l) = D-galactose(aq) + D-glucose(aq)
nan
89TEW/GOL_916
 _-galactosidase
 lactose(aq) + H2O(l) = D-galactose(aq) + D-glucose(aq)
nan
nan
89TEW/GOL_918
 _-fructofuranosidase
 sucrose(aq) + H2O(l) = D-glucose(aq) + D-fructose(aq)
nan
nan
nan
nan
nan
89TEW/GOL_919
 NAD(P)+ nucleosidase
 NAD(aq) + 3-acetylpyridine(aq) = 3-acetylpyridine adenine dinucleotide(aq) + nicotinamide(aq)
nan
nan
90LIU_1424
 NAD(P)+ nucleosidase
 Î²-nicotinamide mononucleotide(aq) + 3-acetylpyridine(aq) = 3-acetylpyridine mononucleotide(aq) + nicotinamide(aq)
90LUN/APR_431
 alcohol dehydrogenase
 ethanol(aq) + NAD(aq) = acetaldehyde(aq) + NADH(aq)
nan
nan
nan
90OCO/BUT_752
 a-mannosidase
 disaccharide(aq) + H2O(l) = 2 D-mannose(aq)
90SAN/SIN_1363
 a-mannosidase
 tetrasaccharide(aq) + H2O(l) = trisaccharide(aq) + D-mannose(aq)
91AND/KAT_1232
 a-mannosidase
 trisaccharide(aq) + H2O(l) = disaccharide(aq) + D-mannose(aq)
91GOL/BEL_882
 creatine kinase
 ATP(aq) + 

The above scraping protocols were refined and consolidated. The final dataframe consists of the desired columns from all components of the NIST website and the static HTML file. A 30-second time delay was introduced to the end of each scraping loop, which theoretically allows the NIST server to recover and critically improves the accuracy of the scraped values to an acceptable threshold. 

In [None]:
# Import Statements
from bs4 import BeautifulSoup
import requests #Pulling webpages
import pandas
import datetime
import numpy
import os
import re
import time
import math

# defining the website
root_url = "https://randr.nist.gov/enzyme/DataDetails.aspx?ID="
end_url = "&finalterm=&data=enzyme"
#===========================================================================================================================

# identify the table and rows of pertinent data
# the saved search-results page is read inside a context manager so that the file handle is always closed
with open('Enzyme Thermodynamic Database.html') as html_file:
    bs = BeautifulSoup(html_file, 'lxml')
table = bs.find("table", attrs = {'id': 'MainBody_gvSearch'})
body = table.find_all("tr")
#print(body[1])

# defining the boundaries of the dataframe section
index_count = 0     # number of data rows that were scraped before the current reference
loop_count = 0      # number of references that are held in the current, unwritten batch
lower_bound = 1 #math.floor(1*len(body)/50)
upper_bound = math.floor(1*len(body))

# defining the scraping parameters
# NOTE(review): the accompanying notes describe a 30-second delay per loop, whereas this cell was
# saved with a delay of 0 seconds -- confirm the intended value before a complete scrape.
time_delay = 0
max_references_per_csv = 0      # a CSV is exported after every (max_references_per_csv + 1) references
output_directory = './individual scraping/'
os.makedirs(output_directory, exist_ok = True)


def unused_output_path(reference_id):
    """Return a CSV path for the reference that does not overwrite an existing file.

    The slashes of the reference ID are replaced by hyphens, and a counter is appended to the
    file name whenever the plain name is already taken.
    """
    safe_id = reference_id.replace('/', '-')
    output = './individual scraping/{}, {}.csv'.format(datetime.date.today(), safe_id)
    output_loop = 1
    while os.path.exists(output):
        output_loop += 1
        output = './individual scraping/{}, {}_{}.csv'.format(datetime.date.today(), safe_id, output_loop)
    return output


# loop through the enzyme id values
# the span ids of the table are numbered from 0 at the second table row, hence the offset of one
name_iteration = lower_bound - 1
for row in range(lower_bound,upper_bound):
    # parsing the reaction names and strings
    enzyme_name = body[row].find('span', attrs = {'id': 'MainBody_gvSearch_lblEnzyme_%s' %(name_iteration)}).text
    reaction = body[row].find('span', attrs = {'id': 'MainBody_gvSearch_lblReaction_%s' %(name_iteration)}).text
    id_value = body[row].find("a").text
    name_iteration += 1

    # defining the soup of the reference page
    total_url = root_url + id_value + end_url
    soup = requests.get(total_url).text
    reference_page = BeautifulSoup(soup, 'lxml')

    # scrape the table and header information; references without exactly one data table are skipped
    tables1 = reference_page.find_all("table", attrs = {"id": "MainBody_extraData"})
    print(id_value, '\t: ', loop_count, '\t, ', len(tables1))
    if len(tables1) != 1:
        continue

    # the first table row holds the headers and the remaining rows hold the data
    body1 = tables1[0].find_all("tr")
    body_rows1 = body1[1:]

    headings = ['Enzyme:', 'Reaction:']
    headings.extend(head.text.rstrip("\n") for head in body1[0].find_all("th"))
    #print(headings)

    # every data row is prefixed with the enzyme name and the reaction of its reference
    total_rows = []
    for table_row in body_rows1:
        each_row = [enzyme_name, reaction]
        for row_element in table_row.find_all("td"):
            each_row.append(re.sub("(\xa0)|(\n)|,","",row_element.text))
        total_rows.append(each_row)
    #print(total_rows)

    # create a dataframe of the data table and drop its last column
    first_index = index_count+lower_bound-1
    index_list_body = range(first_index, len(body_rows1)+first_index)
    bs_dataframe_table1 = pandas.DataFrame(data = total_rows, columns = headings, index = index_list_body)
    bs_dataframe_table1.drop(bs_dataframe_table1.columns[len(bs_dataframe_table1.columns)-1], axis=1, inplace=True)

#===========================================================================================================================

    # scrape additional information; references without the information table are skipped
    tables = reference_page.find_all("table", attrs={"id": "MainBody_DataList1"})
    if len(tables) == 0:
        continue
    body_rows2 = tables[0].find_all("tr")[1:]

    # flatten all cells of the information table into a single list
    each_row2 = []
    for information_row in body_rows2:
        for row_element in information_row.find_all("td"):
            each_row2.append(re.sub("(\xa0)|(\n)|,","",row_element.text))

    # the flattened cells alternate between an entry name (even positions) and its value (odd positions)
    information_entries_list = each_row2[0::2]
    information_values_list = each_row2[1::2]

    # create the one-row dataframe and refine the columns: the second-to-last column is dropped,
    # and then the last of the remaining columns is dropped
    index_list_reference = range(first_index, first_index+1)
    bs_dataframe_pretable2 = pandas.DataFrame(data = [information_values_list], columns = information_entries_list, index = index_list_reference)
    bs_dataframe_pretable2.drop(bs_dataframe_pretable2.columns[len(bs_dataframe_pretable2.columns)-2], axis=1, inplace=True)
    bs_dataframe_pretable2.drop(bs_dataframe_pretable2.columns[len(bs_dataframe_pretable2.columns)-1], axis=1, inplace=True)

#===========================================================================================================================

    # merge the dataframes of this loop; the reference information lands on the first data row only
    this_dataframe = bs_dataframe_table1.join(bs_dataframe_pretable2)
    this_dataframe.index.name = 'index'

    # iteratively coalesce the new dataframe into the old dataframe
    if loop_count == 0:
        # a new batch begins with the current reference
        old_dataframe = this_dataframe

    else:
        # the shared columns are listed in the order of the batch dataframe, since the outer merge
        # sorts by these keys and an unordered set would make the row order vary between runs
        common_columns = [column for column in old_dataframe if column in this_dataframe.columns]

        # amalgamate the dataframe with the pre-existing dataframe
        old_dataframe = old_dataframe.merge(this_dataframe, on = common_columns, how = 'outer')

    index_count += len(body_rows1)

    time.sleep(time_delay)

    # export the batch once it holds the specified number of references
    if loop_count == max_references_per_csv:
        old_dataframe.to_csv(unused_output_path(id_value))
        loop_count = 0

    else:
        loop_count += 1

# export the references of a final, partially filled batch
if loop_count > 0:
    old_dataframe.to_csv(unused_output_path(id_value))

26QUA/WOO_1205 	:  0 	,  1
29WOO_1206 	:  0 	,  1
31BOR/SCH_1141 	:  0 	,  1
34JAC_1142 	:  0 	,  1
34LOH/MEY_1169 	:  0 	,  1
34MEY/LOH_1100 	:  0 	,  1
34MEY/LOH_1288 	:  0 	,  1
35AKA_1170 	:  0 	,  1
35JAC/TAP_1207 	:  0 	,  1
35MEY/KIE_1385 	:  0 	,  1
35MEY/KIE2_1386 	:  0 	,  1
35MEY/LOH_1102 	:  0 	,  1
35MEY/LOH_1290 	:  0 	,  1
35MEY/SCH_797 	:  0 	,  1
35MEY/SCH_798 	:  0 	,  1
35MEY/SCH_802 	:  0 	,  1
35MEY/SCH_805 	:  0 	,  1
35MEY_1101 	:  0 	,  1
35MEY_1289 	:  0 	,  1
36EUL/ADL_7 	:  0 	,  1
36LEH_589 	:  0 	,  1
36LEH_614 	:  0 	,  1
36MEY/LOH_1099 	:  0 	,  1
36MEY/SCH_1096 	:  0 	,  1
36VEI_914 	:  0 	,  1
36VEI_915 	:  0 	,  1
36VEI_917 	:  0 	,  1
36VEI_922 	:  0 	,  1
36VEI_923 	:  0 	,  1
36VEI_924 	:  0 	,  1
36VEI_925 	:  0 	,  1
36VEI_926 	:  0 	,  1
37ADL/SRE_8 	:  0 	,  1
37EUL/ADL_127 	:  0 	,  1
37EUL/ADL_95 	:  0 	,  1
37EUL/ADL2_42 	:  0 	,  1
37EUL/ADL3_96 	:  0 	,  1
37NEG/WUL_9 	:  0 	,  1
38EUL/ADL_273 	:  0 	,  1
38MEY/SCH_1387 	:  0 	,  1
38SCH/HE

In [None]:
# import libraries
import pandas
import numpy
import glob
import os

# the dated name of the export requires datetime, which the imports of this cell do not provide
import datetime

#path_original = "./sabio_scraped/"
path = './individual scraping/'

# the files are sorted so that the row order of the combined dataframe is reproducible
files = sorted(glob.glob(os.path.join(path, '*.csv')))

# create the total list of dataframes; the scraped files are CSV files and are thus parsed with read_csv
total_dataframes = [pandas.read_csv(file) for file in files]

# combine the total set of dataframes
combined_df = pandas.concat(total_dataframes)
display(combined_df)

# replace the NaN values with blank spaces
combined_df = combined_df.fillna(' ')

# export the dataframe
combined_df.to_csv('{}_concatenated scraped NIST enzymes_03.csv'.format(datetime.date.today()))