# Whole Cell KB web scraping

The Whole Cell KB is an impressive repository of bacterial biochemistry. The website was scraped to acquire data that is not provided by the website's download feature. The scraped information was converted into CSV files that can be readily imported into Python scripts through Pandas for processing in bacterial models.

In [None]:
# -*- coding: utf-8 -*-
"""
@authors: Ethan Chan, Matthew Freiburger
"""

#Import Statements
from bs4 import BeautifulSoup #Website Scraping Library
import requests #Pulling webpages
import pandas as pd #Dataframes
import string #String for removing non-encodable characters
import re #Regex

#Paths
#prefix of every detail-page URL; scrape_section appends the text scraped from a listing row to it
root_url = "https://www.wholecellkb.org/detail/Mgenitalium/"
#directory that receives every generated CSV file
output_directory = "./formatted_data"

#Output paths
#file names carry a leading "/" because they are concatenated directly onto output_directory
#NOTE(review): reactions_out_path and metabolites_out_path are not referenced by the visible code; the
#reactions and metabolites files are opened with literal names ("/reactions.csv" and "/molecules.csv")
reactions_out_path = "/reactions.csv"
metabolites_out_path = "/metabolites.csv"
genes_out_path = "/genes.csv"
processes_out_path = "/processes.csv"
chromosome_features_out_path = "/chromosome_features.csv"
parameters_out_path = "/parameters.csv"
protein_complexes_out_path = "/protein_complexes.csv"
protein_monomers_out_path = "/protein_monomers.csv"
references_out_path = "/references.csv"
states_out_path = "/states.csv"
stimuli_out_path = "/stimuli.csv"
transcription_units_out_path = "/transcription_units.csv"
transcriptional_regulation_out_path = "/transcriptional_regulation.csv"
types_out_path = "/types.csv"

#JSON file in paths
#locations of the JSON files whose fields are merged with the scraped data
reactions_json_path = "./data/reactions.json"
compartments_json_path = "./data/compartments.json"
metabolites_json_path = "./data/metabolites.json"

#Reading JSON files
#each file is loaded into a DataFrame at import time; the merge code further down iterates over
#the "data" column of each frame, one record per entry
reactions_import = pd.read_json(reactions_json_path)
metabolites_import = pd.read_json(metabolites_json_path)
compartments_import = pd.read_json(compartments_json_path)

#Dataframes for scraping
#one empty DataFrame per section; scrape_section stores a scraped value only when its header label,
#after normalization by correlate_string, equals one of these column names, and prints the label otherwise
#NOTE(review): the doubled spaces in the "Promoter  35/10 box" labels presumably stand for a non-ASCII
#minus sign that correlate_string replaces with a space - confirm against the website
#NOTE(review): "Transcripton factor" is spelled without the second "i"; presumably it mirrors the
#website's own header label - confirm before correcting it
reactions_scrape = pd.DataFrame(columns=["WID", "Modification", "Name", "Cross references", "Type", "Stoichiometry", "Is spontaneous", "ΔG", "Keq", "Pathways", "Process", "References", "Metadata", "Created", "Last updated", "Coenzymes", "Optimal pH", "Optimal temperature", "Enzyme", "Forward kinetics", "Backward kinetics", "Comments", "State"])
metabolites_scrape = pd.DataFrame(columns=["WID", "Name", "Traditional name", "IUPAC name", "Cross references", "Type", "Empirical formula", "Charge", "Is hydrophobic", "Molecular weight", "van der Waals volume", "ΔfG", "pI", "logP", "logD", "Biomass composition", "Reaction participant", "Created", "Last updated", "Media composition", "References", "Comments", "SMILES", "pKa", "Complex subunit"])
genes_scrape = pd.DataFrame(columns=["WID", "Name", "Cross references", "Type", "Structure", "Sequence", "Transcription unit", "Empirical formula", "Molecular weight", "Is essential", "Relative expression", "Half life", "Codons", "Amino acid", "Extinction coefficient", "pI", "Comments", "References", "Created", "Last updated", "Protein product", "Homologs", "Symbol"])
processes_scrape = pd.DataFrame(columns=["WID", "References", "Name", "Initialization order", "Chemical reactions", "Complex formation reactions", "Parameters", "Comments", "Created", "Last updated"])
chromosome_features_scrape = pd.DataFrame(columns=["WID", "Name", "Type", "Structure", "Genes", "Transcription unit", "Created", "Last updated", "Comments", "References", "Sequence"])
parameters_scrape = pd.DataFrame(columns=["WID", "Name", "Synonyms", "Value", "State", "References", "Created", "Last updated", "Comments", "Molecules", "Process"])
protein_complexes_scrape = pd.DataFrame(columns=["WID", "Name", "Cross references", "Biosynthesis", "No. subunits", "DNA footprint", "Empirical formula", "Molecular weight", "Extinction coefficient", "Half life", "Formation process", "Localization", "Regulatory rule", "Enzyme", "Reaction participant", "Complex subunit", "Parameters", "Comments", "References", "Created", "Last updated"])
protein_monomers_scrape = pd.DataFrame(columns=["WID", "Name", "Cross references", "Gene", "Is N terminal methionine cleaved", "Empirical formula", "Molecular weight", "Extinction coefficient", "Instability index", "Is stable", "Aliphatic index", "GRAVY", "Half life", "Localization", "Chaperones", "Complex subunit", "Comments", "References", "Created", "Last updated", "DNA footprint", "Enzyme", "Reaction participant", "Prosthetic groups", "Parameters", "Sequence"])
references_scrape = pd.DataFrame(columns=["WID", "Name", "Cross references", "Type", "Citation", "Cited by", "Created", "Last updated", "Comments"])
states_scrape = pd.DataFrame(columns=["WID", "Name", "Reactions", "Parameters", "Created", "Last updated", "Comments"])
stimuli_scrape = pd.DataFrame(columns=["WID", "Name", "Value", "Reaction participant", "Comments", "References", "Created", "Last updated"])
transcription_units_scrape = pd.DataFrame(columns=["WID", "Name", "Structure", "Genes", "Promoter  35 box coordinate (nt)", "Promoter  35 box length (nt)", "Promoter  10 box coordinate (nt)", "Promoter  10 box length (nt)", "Transcription start site coordinate (nt)", "Created", "Last updated", "Sequence", "Regulation"])
transcriptional_regulation_scrape = pd.DataFrame(columns=["WID", "Name", "Affinity", "Transcription unit", "Transcripton factor", "Binding site", "Fold change activity", "Comments", "References", "Created", "Last updated"])
types_scrape = pd.DataFrame(columns=["WID", "Name", "Members", "Created", "Last updated", "Children", "Parent"])

#URLs for sections
#every listing page shares one prefix, so each URL is that prefix followed by the section's page name
_list_root_url = "https://www.wholecellkb.org/list/Mgenitalium/"
reactions_url = _list_root_url + "Reaction"
metabolites_url = _list_root_url + "Metabolite"
genes_url = _list_root_url + "Gene"
processes_url = _list_root_url + "Process"
chromosome_features_url = _list_root_url + "ChromosomeFeature"
parameters_url = _list_root_url + "Parameter"
protein_complexes_url = _list_root_url + "ProteinComplex"
protein_monomers_url = _list_root_url + "ProteinMonomer"
references_url = _list_root_url + "Reference"
states_url = _list_root_url + "State"
stimuli_url = _list_root_url + "Stimulus"
transcriptional_units_url = _list_root_url + "TranscriptionUnit"
transcriptional_regulation_url = _list_root_url + "TranscriptionalRegulation"
types_url = _list_root_url + "Type"


#====================================================================================================================================
#sub-urls for reactions, metabolites, and genes are mined
def scrape_sub_urls(url, timeout=60):
    """Return the text of every header cell in the listing table of a section page.

    The page at ``url`` is downloaded and parsed, the table with ``id="list"`` is
    located, and the text of each ``<th>`` cell found in the rows of its body is
    collected in document order. scrape_section appends each returned string to
    root_url to address one detail page.

    Args:
        url: address of the listing page of one section.
        timeout: seconds to wait for the server before giving up; without a
            timeout an unresponsive server would stall the scrape indefinitely.

    Returns:
        A list of strings, one per header cell.

    Raises:
        requests.HTTPError: when the server answers with an error status.
        ValueError: when the page holds no listing table with a body.
    """
    response = requests.get(url, timeout=timeout)
    #an error page would otherwise be parsed as if it were a listing
    response.raise_for_status()

    #items of the url are found in the table on the page
    table = BeautifulSoup(response.text, "lxml").find("table", attrs={"id": "list"})
    if table is None or table.tbody is None:
        raise ValueError("no listing table found at " + url)

    #the sub-urls are collected into a list for return
    return [th.text for row in table.tbody.find_all("tr") for th in row.find_all("th")]
    

#====================================================================================================================================

#specifies whether the argument character is a nucleotide base
def is_base(c):
    """Return True when c equals one of "A", "C", "T" or "G", and False otherwise."""
    #a tuple, not the string "ACTG": substring matching would also accept "" and "AC"
    return c in ("A", "C", "T", "G")
    
#cleans non-ascii characters, and correlates string to df column
def correlate_string(th):
    """Map a scraped header label onto the matching dataframe column name.

    The label is compared against an ordered list of rules. A rule applies when
    every one of its substrings occurs in the label (case-sensitive), and the
    first rule that applies decides the returned column name. A label that no
    rule applies to is returned with each non-ASCII character replaced by a
    single space.

    Args:
        th: header label text (a string).

    Returns:
        The canonical column name, or the ASCII-only version of the label.
    """
    #order matters: the first rule whose substrings are all present wins
    header_rules = (
        (("Is", "spontaneous"), "Is spontaneous"),
        (("ΔG",), "ΔG"),
        (("ΔfG",), "ΔfG"),
        (("logD",), "logD"),
        (("Charge",), "Charge"),
        (("SMILES",), "SMILES"),
        (("Empirical", "formula"), "Empirical formula"),
        (("Molecular",), "Molecular weight"),
        (("volume", "van", "der", "Waals"), "van der Waals volume"),
        (("Biomass", "composition"), "Biomass composition"),
        (("Is", "hydrophobic"), "Is hydrophobic"),
        (("Traditional", "name"), "Traditional name"),
        (("IUPAC", "name"), "IUPAC name"),
        (("Cross", "references"), "Cross references"),
        (("Reaction", "participant"), "Reaction participant"),
        (("Last", "updated"), "Last updated"),
        (("Media", "composition"), "Media composition"),
        (("Complex", "subunit"), "Complex subunit"),
        (("Optimal", "pH"), "Optimal pH"),
        (("Optimal", "temperature"), "Optimal temperature"),
        (("Forward", "kinetics"), "Forward kinetics"),
        (("Backward", "kinetics"), "Backward kinetics"),
        (("Transcription", "unit"), "Transcription unit"),
        (("Is", "essential"), "Is essential"),
        (("Relative", "expression"), "Relative expression"),
        (("Half", "life"), "Half life"),
        (("Amino", "acid"), "Amino acid"),
        (("Extinction", "coefficient"), "Extinction coefficient"),
        (("GRAVY",), "GRAVY"),
    )

    for required_words, column_name in header_rules:
        if all(word in th for word in required_words):
            return column_name

    #no rule applied: keep the label, blanking out every character outside the ASCII range
    return "".join(character if ord(character) < 128 else " " for character in th)
    
#strips the strings of newlines and spaces
def clean_string(s):
    """Return s without leading and trailing whitespace, newlines included."""
    #a bare strip() removes newlines together with all other surrounding whitespace
    return s.strip()

#scrapes the website and returns results into the dataframe
def scrape_section(url, dataframe):
    """Scrape every entry of one section into dataframe, one row per entry.

    For each string returned by scrape_sub_urls(url) the detail page at
    root_url + that string is downloaded. A row of None values is stored at the
    entry's position (counting from 0, so existing rows at those labels are
    overwritten), and every header/value pair found in the ``tbody.data``
    groups of the ``id="detail"`` table is written into the column named by
    the normalized header. Headers that match no column are printed so dropped
    fields are visible.

    Args:
        url: address of the listing page of the section.
        dataframe: DataFrame whose columns name the fields to keep; it is
            modified in place.

    Returns:
        None.
    """
    #a raw string keeps "\B" a regex token instead of an invalid string escape; compiled once for all pages
    sequence_pattern = re.compile(r".*Sequence: [0-9]*\B([ACTG0-9]*)")

    #scrape each sub-url of section
    for df_index, sub_url in enumerate(scrape_sub_urls(url)):

        #every field starts out empty and is filled in as its header is encountered
        dataframe.loc[df_index] = len(dataframe.columns) * [None]

        #the timeout keeps one unresponsive page from stalling the whole scrape
        response = requests.get(root_url + sub_url, timeout=60)
        response.raise_for_status()
        detail_table = BeautifulSoup(response.text, "lxml").find("table", attrs={"id": "detail"})

        for tbody in detail_table.find_all("tbody", attrs={"class": "data"}):

            #for every property:value pair extract data
            for tr in tbody.find_all("tr"):
                th = tr.find("th")
                td = tr.find("td")

                #rows lacking either cell carry no property:value pair
                if th is None or td is None:
                    continue

                #both cells are trimmed, and the header is mapped onto its column name
                th = correlate_string(clean_string(th.text))
                td = clean_string(td.text)

                if th not in dataframe.columns:
                    print(th)
                    continue

                if th == "Sequence":
                    #only the base letters of the matched run are kept; digits are dropped
                    match = sequence_pattern.search(td)
                    if match is None:
                        #no recognizable sequence: the field keeps its initial None
                        continue
                    td = "".join([base for base in match.group(1) if is_base(base)])

                #a single .loc[row, column] call writes into the frame itself; the chained form
                #dataframe.loc[row][column] may assign to a temporary copy and lose the value
                dataframe.loc[df_index, th] = td

#scrapes the website and writes results to a csv file
def scrape_write(url, dataframe, out_path):
    """Scrape one section into dataframe and save the result as a CSV file.

    Args:
        url: address of the listing page of the section.
        dataframe: DataFrame that receives the scraped rows (modified in place).
        out_path: file name, with its leading "/", appended to output_directory.

    Returns:
        None.
    """
    import os  #local import: os is not among the imports at the top of this file

    #the directory is created before scraping so that a missing folder cannot
    #discard the results of a long scrape at the very last step
    os.makedirs(output_directory, exist_ok=True)

    scrape_section(url, dataframe)
    #empty fields are written as the text "None"
    dataframe.to_csv(output_directory + out_path, na_rep="None")
    
#====================================================================================================================================

#the reactions and metabolites sections are only scraped here; their CSV files are assembled
#further down, where the scraped rows are combined with the imported JSON records
for _section_url, _section_frame in (
    (reactions_url, reactions_scrape),
    (metabolites_url, metabolites_scrape),
):
    scrape_section(_section_url, _section_frame)

#every other section is scraped and written straight to its own CSV file, in this order
for _section_url, _section_frame, _section_out_path in (
    (genes_url, genes_scrape, genes_out_path),
    (processes_url, processes_scrape, processes_out_path),
    (chromosome_features_url, chromosome_features_scrape, chromosome_features_out_path),
    (parameters_url, parameters_scrape, parameters_out_path),
    (protein_complexes_url, protein_complexes_scrape, protein_complexes_out_path),
    (protein_monomers_url, protein_monomers_scrape, protein_monomers_out_path),
    (references_url, references_scrape, references_out_path),
    (states_url, states_scrape, states_out_path),
    (stimuli_url, stimuli_scrape, stimuli_out_path),
    (transcriptional_units_url, transcription_units_scrape, transcription_units_out_path),
    (transcriptional_regulation_url, transcriptional_regulation_scrape, transcriptional_regulation_out_path),
    (types_url, types_scrape, types_out_path),
):
    scrape_write(_section_url, _section_frame, _section_out_path)

#====================================================================================================================================

#files with fields from downloaded json files and scraping
def _without_commas(value):
    """Return value as text with every comma removed, so it cannot split a hand-built CSV row."""
    #str() first: the JSON records may hold booleans, numbers or None, which have no .replace
    return str(value).replace(",", "")


#the characters that survive the ASCII-only filter below; built once instead of once per row
printable = set(string.printable)

#NOTE(review): JSON records and scraped rows are paired purely by position (the i-th record with
#the i-th scraped row), which presumes both sources list the reactions in the same order - confirm
#NOTE(review): string.printable contains newline characters, so a line break inside a scraped
#field would still split its CSV row - confirm whether such fields occur
#the with-block closes the file even when a row raises
with open(output_directory + "/reactions.csv", "w") as reactions_out:
    reactions_out.write("WID,Modification,Name,Cross references,Type,Stoichiometry,Is spontaneous,deltaG,Keq,Pathways,Process,References,Metadata,Created,Last updated,Coenzymes,Optimal pH,Optimal temperature,Enzyme,Forward kinetics,Backward kinetics,Comments,State")

    #all imported reactions are iteratively processed
    for index, df in enumerate(reactions_import["data"]):

        #optional nested records contribute one value each, or the placeholder "None" when absent
        kinetics_forward = df["kinetics_forward"]
        kinetics_forward_vmax = kinetics_forward["vmax"] if kinetics_forward is not None else "None"

        kinetics_backward = df["kinetics_backward"]
        kinetics_backward_string = kinetics_backward["vmax"] if kinetics_backward is not None else "None"

        optimal_ph = df["optimal_ph"]
        optimal_ph_string = optimal_ph["value"] if optimal_ph is not None else "None"

        optimal_temperature = df["optimal_temperature"]
        optimal_temperature_string = optimal_temperature["value"] if optimal_temperature is not None else "None"

        #participants are written as coefficient:compartment:molecule, separated by ";"
        participants = df["stoichiometry"]
        if participants:
            stoichiometry_string = ";".join(
                str(molecule["coefficient"]) + ":" + str(molecule["compartment"]) + ":" + str(molecule["molecule"])  #!!! ticket 109
                for molecule in participants
            )
        else:
            stoichiometry_string = "None"

        #NOTE(review): the "Enzyme" column is filled with the enzyme's compartment, not its protein - confirm
        enzyme = df["enzyme"]
        enzyme_compartment_string = enzyme["compartment"] if enzyme is not None else "None"

        #a missing record and a missing value both end up as the text "None"
        keq = df["keq"]
        keq_string = keq["value"] if keq is not None else None

        #the scraped row at the same position supplies the fields absent from the JSON record
        scrape_row = reactions_scrape.loc[index]

        #one entry per header column, in header order; commas are removed from exactly the
        #fields that had them removed before, so existing output is unchanged
        fields = [
            str(scrape_row["WID"]),
            str(scrape_row["Modification"]),
            _without_commas(df["name"]),
            _without_commas(scrape_row["Cross references"]),
            str(scrape_row["Type"]),
            str(stoichiometry_string),
            _without_commas(df["is_spontaneous"]),
            str(scrape_row["ΔG"]),
            _without_commas(keq_string),
            str(scrape_row["Pathways"]),
            str(scrape_row["Process"]),
            _without_commas(scrape_row["References"]),
            _without_commas(scrape_row["Metadata"]),
            _without_commas(scrape_row["Created"]),
            _without_commas(scrape_row["Last updated"]),
            _without_commas(scrape_row["Coenzymes"]),
            str(optimal_ph_string),
            str(optimal_temperature_string),
            _without_commas(enzyme_compartment_string),
            str(kinetics_forward_vmax),
            _without_commas(kinetics_backward_string),
            _without_commas(scrape_row["Comments"]),
            _without_commas(scrape_row["State"]),
        ]
        out_string = ",".join(fields)
        #characters outside string.printable are dropped from the row
        out_string = "".join(filter(lambda x: x in printable, out_string))   #!!! ticket 112

        #the data is placed in an output file
        reactions_out.write("\n")
        reactions_out.write(out_string)

#====================================================================================================================================

def _scraped_field(row, column):
    """Return the scraped value of column in row as text with every comma removed."""
    return str(row[column]).replace(",", "")


#the characters that survive the ASCII-only filter below; built once instead of once per row
printable = set(string.printable)

#NOTE(review): the file is named molecules.csv although metabolites_out_path names
#metabolites.csv - confirm which name the consumers of this file expect
#NOTE(review): JSON records and scraped rows are paired purely by position - confirm that both
#sources list the metabolites in the same order
#the with-block closes the file even when a row raises
with open(output_directory + "/molecules.csv", "w") as metabolites_out:
    metabolites_out.write("WID,Name,Traditional name,IUPAC name,Cross references,Type,Empirical formula,Charge,Is hydrophobic,Molecular weight,van der Waals volume,fG,pI,logP,logD,Biomass composition,Reaction participant,Created,Last updated,Media composition,References,Comments,SMILES,pKa,Complex subunit")

    #all imported metabolites are iteratively processed
    for index, df in enumerate(metabolites_import["data"]):

        #the media concentration is optional; the placeholder "None" marks its absence
        media_composition = df["media_composition"]
        media_composition_string = "None" if media_composition is None else media_composition["concentration"]

        #the scraped row at the same position supplies the fields absent from the JSON record
        scrape_row = metabolites_scrape.loc[index]

        #one entry per header column, in header order
        fields = [
            str(df["wid"]),
            _scraped_field(scrape_row, "Name"),
            _scraped_field(scrape_row, "Traditional name"),
            _scraped_field(scrape_row, "IUPAC name"),
            _scraped_field(scrape_row, "Cross references"),
            _scraped_field(scrape_row, "Type"),
            _scraped_field(scrape_row, "Empirical formula"),
            str(df["charge"]),
            str(df["is_hydrophobic"]),
            _scraped_field(scrape_row, "Molecular weight"),
            str(df["volume"]),
            str(df["deltag_formation"]),
            str(df["pi"]),
            str(df["log_p"]),
            str(df["log_d"]),
            _scraped_field(scrape_row, "Biomass composition"),
            _scraped_field(scrape_row, "Reaction participant"),
            _scraped_field(scrape_row, "Created"),
            _scraped_field(scrape_row, "Last updated"),
            str(media_composition_string),
            _scraped_field(scrape_row, "References"),
            #these two positions used to hold SMILES and a second copy of "Reaction participant",
            #which put every SMILES string under the Comments header and dropped the comments
            _scraped_field(scrape_row, "Comments"),
            _scraped_field(scrape_row, "SMILES"),
            _scraped_field(scrape_row, "pKa"),
            _scraped_field(scrape_row, "Complex subunit"),
        ]
        out_string = ",".join(fields)
        #characters outside string.printable are dropped from the row
        out_string = "".join(filter(lambda x: x in printable, out_string))

        #the data is printed to the output file
        metabolites_out.write("\n")
        metabolites_out.write(out_string)

#====================================================================================================================================

#compartments.csv is built from the imported JSON records alone
#the rows are not filtered to ASCII, so the encoding is fixed rather than left to the platform
#default; the with-block closes the file even when a row raises
with open(output_directory + "/compartments.csv", "w", encoding="utf-8") as compartments_out:
    compartments_out.write("WID,Name,Protein monomers,Biomass compositions")

    #loop through all imported compartments
    for df in compartments_import["data"]:   #!!! ticket 113

        wid_string = df["wid"]
        name_string = df["name"]

        #monomers are joined with ":"; an empty list is written as "None"
        protein_monomers = df["protein_monomers"]
        if protein_monomers:
            protein_monomers_string = ":".join(str(protein) for protein in protein_monomers)   #!!! ticket 108
        else:
            protein_monomers_string = "None"

        #each composition contributes concentration;compartment;first metabolite
        #NOTE(review): ";" separates the fields of one composition and also successive
        #compositions, so a reader has to regroup the values in threes - confirm this is intended
        #str() lets numeric concentrations through, which plain "+" concatenation rejected
        biomass_compositions = df["biomass_compositions"]
        if biomass_compositions:
            biomass_compositions_string = ";".join(   #!!! ticket 108
                str(molecules["concentration"]) + ";" + str(molecules["compartment"]) + ";" + str(molecules["metabolites"][0])
                for molecules in biomass_compositions
            )
        else:
            biomass_compositions_string = "None"

        #NOTE(review): commas inside a name are not removed here, unlike in the other hand-built files
        out_string = str(wid_string) + "," + str(name_string) + "," + str(protein_monomers_string) + "," + str(biomass_compositions_string)

        #the data is written to an output file
        compartments_out.write("\n")
        compartments_out.write(out_string)
