In [3]:
# -*- coding: utf-8 -*-
"""
@authors: Ethan Chan, Matthew Freiburger
"""
# Import libraries
import pandas as pd
import glob
import os
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
import json
from itertools import islice
import re
import os


class SABIO_scraping():
#     __slots__ = (str(x) for x in [progress_file_prefix, xls_download_prefix, scraped_xls_prefix, scraped_entryids_prefix, sel_xls_download_path, processed_xls, entry_json, scraped_model, bigg_model_name_suffix, sub_directory_path, progress_file_path, xls_download_path, scraped_xls_file_path, scraped_entryids_file_path, xls_csv_file_path, entryids_json_file_path, scraped_model_json_file_path, bigg_model, step_number, cwd])
    
    def __init__(self):
        self.parameters = {}
        self.parameters['general_delay'] = 2
        self.variables = {}
        self.paths = {}

    #Clicks a HTML element with selenium by id
    def click_element_id(self,n_id):
        element = self.driver.find_element_by_id(n_id)
        element.click()
        time.sleep(self.parameters['general_delay'])

    #Selects a choice from a HTML dropdown element with selenium by id
    def select_dropdown_id(self,n_id, n_choice):
        element = Select(self.driver.find_element_by_id(n_id))
        element.select_by_visible_text(n_choice)
        time.sleep(self.parameters['general_delay'])

    def fatal_error_handler(self,message):
        print("Error: " + message)
        print("Exiting now...")
        exit(0)

    """
    --------------------------------------------------------------------
        STEP 0: GET BIGG MODEL TO SCRAPE AND SETUP DIRECTORIES AND PROGRESS FILE
    --------------------------------------------------------------------    
    """

    def start(self,):
        global scraped_entryids
        global entry_id_json_out

        # find the BiGG model that will be scraped
        while True:
            bigg_model_path = input("Specify the BIGG Model JSON file path: ")

            if os.path.exists(bigg_model_path) and bigg_model_path[-5:] == ".json":
                try:
                    self.model = json.load(open(bigg_model_path))
                    bigg_model_name = re.search("([\w+\.?\s?]+)(?=\.json)", bigg_model_path).group()
                    break
                except:
                    pass

        # define the paths
        self.paths['cwd'] = os.path.dirname(os.path.realpath(bigg_model_path))
        self.paths['sub_directory_path'] = os.path.join(self.paths['cwd'],f"scraping-{bigg_model_name}")
        if not os.path.isdir(self.paths['sub_directory_path']):        
            os.mkdir(self.paths['sub_directory_path'])
            
        progress_file_prefix = "current-progress"
        xls_download_prefix = "xls-download-"
        scraped_xls_prefix = "scraped-xls-"
        scraped_entryids_prefix = "scraped-entryids-"
        processed_xls = "proccessed-xls-"
        entry_json = "entryids-json-"
        scraped_model = "scraped-model-"
        
        self.variables['scraped_xls'] = {}
        self.variables['scraped_entryids'] = {}
        self.paths['scraped_model_json_file_path'] = os.path.join(self.paths['sub_directory_path'], scraped_model) + ".json"
        self.paths['sel_xls_download_path'] = os.path.join(self.paths['sub_directory_path'],xls_download_prefix)
        
        self.step_number = -1
        
        self.paths['progress_file_path'] = os.path.join(self.paths['sub_directory_path'], progress_file_prefix) + '.txt'
        if os.path.exists(self.paths['progress_file_path']):
            f = open(self.paths['progress_file_path'], "r")
            self.step_number = int(f.read(1))
            if not self.step_number in [1, 2, 3, 4, 5]:
                self.fatal_error_handler("Progress file malformed. Please delete and restart")
        else:
            self.step_number = 1
            self.progress_update(self.step_number)

        self.paths['xls_download_path'] = os.path.join(self.paths['sub_directory_path'], xls_download_prefix) 
        if not os.path.isdir(self.paths['xls_download_path']):
            os.mkdir(self.paths['xls_download_path'])

        self.paths['scraped_xls_file_path'] = os.path.join(self.paths['sub_directory_path'], scraped_xls_prefix) + ".json"
        if os.path.exists(self.paths['scraped_xls_file_path']):
            f = open(self.paths['scraped_xls_file_path'], "r")
            self.variables['scraped_xls'] = json.load(f)
            f.close()

        self.paths['scraped_entryids_file_path'] = os.path.join(self.paths['sub_directory_path'], scraped_entryids_prefix) + ".json"
        if os.path.exists(self.paths['scraped_entryids_file_path']):
            f = open(self.paths['scraped_entryids_file_path'], "r")
            self.variables['scraped_entryids'] = json.load(f)
            f.close()

        self.paths['entryids_json_file_path'] = os.path.join(self.paths['sub_directory_path'], entry_json) + ".json"
        if os.path.exists(self.paths['entryids_json_file_path']):
            f = open(self.paths['entryids_json_file_path'], "r")
            self.variables['entry_id_json_out'] = json.load(f)
            f.close()

    """
    --------------------------------------------------------------------
        STEP 1: SCRAPE SABIO WEBSITE BY DOWNLOAD XLS FOR GIVEN REACTIONS IN BIGG MODEL
    --------------------------------------------------------------------    
    """

    def scrape_xls(self,reaction_identifier, search_option):
        global enzymes
        global xls_download_path

        chrome_options = webdriver.ChromeOptions()
        prefs = {'download.default_directory' : self.paths['cwd'] + self.paths['sel_xls_download_path']}
        chrome_options.add_experimental_option('prefs', prefs)
        self.driver = webdriver.Chrome(chrome_options=chrome_options)    
        self.driver.get("http://sabiork.h-its.org/newSearch/index")

        time.sleep(self.parameters['general_delay'])


        self.click_element_id("option")
        self.select_dropdown_id("searchterms", search_option)
        text_area = driver.find_element_by_id("searchtermField")
        text_area.send_keys(reaction_identifier)  
        time.sleep(self.parameters['general_delay'])  
        self.click_element_id("addsearch")

        time.sleep(self.parameters['general_delay'])

        result_num = ""
        try: 
            result_num_ele = self.driver.find_element_by_id("numberofKinLaw")
            for char in result_num_ele.text:
                if re.search('[0-9]', char):
                    result_num = result_num + char

            result_num = int(result_num)
        except:
            driver.close()
            return False

        time.sleep(self.parameters['general_delay'])

        self.select_dropdown_id("max", "100")
        element = Select(self.driver.find_element_by_id("max"))
        element.select_by_visible_text("100")

        time.sleep(self.parameters['general_delay'])

        if result_num > 0 and result_num <= 100:
            self.click_element_id("allCheckbox")
            time.sleep(self.parameters['general_delay'])
        elif result_num > 100:
            self.click_element_id("allCheckbox")
            for i in range(int(result_num/100)):
                element = self.driver.find_element_by_xpath("//*[@class = 'nextLink']")
                element.click()
                time.sleep(self.parameters['general_delay'])
                self.click_element_id("allCheckbox")
                time.sleep(self.parameters['general_delay'])
        else:
            self.driver.close()
            return False

        self.driver.get("http://sabiork.h-its.org/newSearch/spreadsheetExport")

        time.sleep(self.parameters['general_delay']*7.5)

        self.click_element_id("excelExport")

        time.sleep(self.parameters['general_delay']*2.5)

        self.driver.close()

        return True


    def scrape_bigg_xls(self,):
        for reaction in self.model["reactions"]:
            if not reaction["name"] in self.variables['scraped_xls']:
                ids_to_try = reaction["annotation"]

                success_flag = False
                annotation_search_pairs = {"sabiork":"SabioReactionID", "metanetx.reaction":"MetaNetXReactionID", "ec-code":"ECNumber", "kegg.reaction":"KeggReactionID", "rhea":"RheaReactionID"}
                for annotation in annotation_search_pairs:
                    if not success_flag:
                        if annotation in ids_to_try:
                            for id_to_try in ids_to_try[annotation]:
                                try:
                                    success_flag = self.variables['scraped_xls'](id_to_try, annotation_search_pairs[annotation])
                                except:
                                    success_flag = False
                    else:
                        break

                if not success_flag:
                    try:
                        success_flag = self.variables['scraped_xls'](reaction["name"], "Enzymename")
                    except:
                        success_flag = False

                json_dict_key = reaction["name"].replace("\"", "")
                if success_flag:
                    self.variables['scraped_xls'][json_dict_key] = "yes"
                else:
                    self.variables['scraped_xls'][json_dict_key] = "no"

            with open(self.paths['scraped_xls_file_path'], 'w') as outfile:
                json.dump(self.variables['scraped_xls'], outfile, indent = 4)   
                outfile.close()
                
        self.step_number = 2
        self.progress_update(self.step_number)

    """
    --------------------------------------------------------------------
        STEP 2: GLOB EXPORTED XLS FILES TOGETHER
    --------------------------------------------------------------------
    """

    def glob_xls_files(self,):
        scraped_sans_parentheses_enzymes = glob.glob('./{}/*.xls'.format(self.paths['xls_download_path'])))
        total_dataframes = []
        for file in scraped_sans_parentheses_enzymes:
            #file_name = os.path.splitext(os.path.basename(file))[0]
            dfn = pd.read_excel(file)
            total_dataframes.append(dfn)

        # combine the total set of dataframes
        combined_df = pd.DataFrame()
        combined_df = pd.concat(total_dataframes)
        combined_df = combined_df.fillna(' ')
        combined_df = combined_df.drop_duplicates()

        # export the dataframe
        csv_path = os.path.join(self.paths['sub_directory_path'], processed_xls) + ".csv"
        combined_df.to_csv(csv_path)
        
        # update the 
        self.step_number = 3
        self.progress_update(self.step_number)

    """
    --------------------------------------------------------------------
        STEP 3: SCRAPE ADDITIONAL DATA BY ENTRYID
    --------------------------------------------------------------------    
    """

    def scrape_entry_id(self,entry_id):
        global entry_id_json_out

        entry_id = str(entry_id)

        self.driver = webdriver.Chrome(executable_path=r".\chromedriver.exe")
        self.driver.get("http://sabiork.h-its.org/newSearch/index")

        time.sleep(general_delay)

        self.click_element_id("option")
        self.select_dropdown_id("searchterms", "EntryID")
        text_area = self.driver.find_element_by_id("searchtermField")
        text_area.send_keys(entry_id)

        time.sleep(general_delay)

        self.click_element_id("addsearch")

        time.sleep(general_delay)

        self.click_element_id(entry_id + "img")

        time.sleep(general_delay)

        self.driver.switch_to.frame(self.driver.find_element_by_xpath("//iframe[@name='iframe_" + entry_id + "']"))
        element = self.driver.find_element_by_xpath("//table")
        html_source = element.get_attribute('innerHTML')
        
        table_df = pd.read_html(html_source)
        reaction_parameters_df = pd.DataFrame()
        counter = 0
        parameters_json = {}
        for df in table_df:
            try:
                if df[0][0] == "Parameter":
                    reaction_parameters_df = table_df[counter]
            except:
                self.driver.close()
                return parameters_json
            counter += 1
            
        parameter_name = ""
        for i in range(len(reaction_parameters_df[0])-2):
            parameter_name = reaction_parameters_df[0][i+2]
            inner_parameters_json = {}
            for j in range(len(reaction_parameters_df)-3):
                inner_parameters_json[reaction_parameters_df[j+1][1]] = reaction_parameters_df[j+1][i+2]

            parameters_json[parameter_name] = inner_parameters_json

        self.driver.close()

        return parameters_json


    def scrape_entryids(self,):
        global entry_id_json_out

        sabio_xls_df = pd.read_csv(xls_csv_file_path)
        entryids = sabio_xls_df["EntryID"].unique().tolist()

        for entryid in entryids:
            if not entryid in scraped_entryids:
                try:
                    entry_id_json_out[str(entryid)] = scrape_entry_id(entryid)
                    self.variables['scraped_entryids'][entryid] = "yes"
                except:
                    self.variables['scraped_entryids'][entryid] = "no"
            with open(scraped_entryids_file_path, 'w') as outfile:
                json.dump(self.variables['scraped_entryids'], outfile, indent = 4)   
                outfile.close()
            with open(entryids_json_file_path, 'w') as f:
                json.dump(entry_id_json_out, f, indent = 4)        
                f.close()
        
        self.step_number = 4
        self.progress_update(self.step_number)

    """
    --------------------------------------------------------------------
        STEP 4: COMBINE ENZYME AND ENTRYID DATA INTO JSON FILE
    --------------------------------------------------------------------
    """

    def combine_data(self,):

        sabio_xls_df = pd.read_csv(self.paths['xls_csv_file_path'])

        # Opening JSON file
        with open(self.paths['entryids_json_file_path']) as json_file:
            entry_id_data = json.load(json_file)

        enzymenames = sabio_xls_df["Enzymename"].unique().tolist()
        enzyme_dict = {}
        missing_entry_ids = []
        parameters = {}

        for enzyme in enzymenames:
            sabio_grouped_enzyme_df = sabio_xls_df.loc[sabio_xls_df["Enzymename"] == enzyme]
            dict_to_append = {}
            reactions = sabio_grouped_enzyme_df["Reaction"].unique().tolist()
            for reaction in reactions:
                dict_reactions_to_append = {}
                sabio_grouped_reactions_df = sabio_grouped_enzyme_df.loc[sabio_grouped_enzyme_df["Reaction"] == reaction]
                entryids = sabio_grouped_reactions_df["EntryID"].unique().tolist()

                for entryid in entryids:
                    entry_ids_df = sabio_grouped_reactions_df.loc[sabio_grouped_reactions_df["EntryID"] == entryid]
                    dict_entryid_to_append = {}
                    head_of_df = entry_ids_df.head(1).squeeze()
                    entry_id_flag = True
                    parameter_info = {}

                    try:
                        parameter_info = entry_id_data[str(entryid)]
                        dict_entryid_to_append["Parameters"] = parameter_info
                    except:
                        missing_entry_ids.append(str(entryid))
                        entry_id_flag = False
                        dict_entryid_to_append["Parameters"] = "NaN"

                    rate_law = head_of_df["Rate Equation"]
                    bad_rate_laws = ["unknown", "", "-"]

                    if not rate_law in bad_rate_laws:                    
                        dict_entryid_to_append["RateLaw"] = rate_law
                        dict_entryid_to_append["SubstitutedRateLaw"] = rate_law
                    else:
                        dict_entryid_to_append["RateLaw"] = "NaN"
                        dict_entryid_to_append["SubstitutedRateLaw"] = "NaN"

                    if entry_id_flag:

                        fields_to_copy = ["Buffer", "Product", "PubMedID", "Publication", "pH", "Temperature", "Enzyme Variant", "UniProtKB_AC", "Organism", "KineticMechanismType", "SabioReactionID"]
                        for field in fields_to_copy:  
                            dict_entryid_to_append[field] = head_of_df[field]
                        dict_reactions_to_append[entryid] = dict_entryid_to_append
                        dict_entryid_to_append["Substrates"] = head_of_df["Substrate"].split(";")
                        out_rate_law = rate_law
                        if not rate_law in bad_rate_laws:                    
                            substrates = head_of_df["Substrate"].split(";")

                            stripped_string = re.sub('[0-9]', '', rate_law)

                            variables = re.split("\^|\*|\+|\-|\/|\(|\)| ", stripped_string)
                            variables = ' '.join(variables).split()

                            start_value_permutations = ["start value", "start val."]
                            substrates_key = {}
                            for var in variables:
                                if var in parameter_info:
                                    for permutation in start_value_permutations:
                                        try:
                                            if var == "A" or var == "B":
                                                substrates_key[var] = parameter_info[var]["species"]
                                            else:
                                                value = parameter_info[var][permutation]
                                                if value != "-" and value != "" and value != " ":
                                                    out_rate_law = out_rate_law.replace(var, parameter_info[var][permutation])
                                        except:
                                            pass

                            dict_entryid_to_append["RateLawSubstrates"] = substrates_key
                            dict_entryid_to_append["SubstitutedRateLaw"] = out_rate_law

                dict_to_append[reaction] = dict_reactions_to_append

            enzyme_dict[enzyme] = dict_to_append

        with open(scraped_model_json_file_path, 'w') as f:
            json.dump(enzyme_dict, f, indent=4)
        
        self.step_number = 5
        self.progress_update(self.step_number)
        
    def progress_update(step):
        if not re.search('[0-5]', step):
            print(f'--> ERROR: The {step} step is not acceptable.')
        f = open(self.paths['progress_file_path'], "w")
        f.write(str(step))
        f.close()

    def main(self,):
        self.start()

        while True:
            if self.step_number == 1:
                self.scrape_bigg_xls()
            elif self.step_number == 2:
                self.glob_xls_files()
            elif self.step_number == 3:
                self.scrape_entryids()
            elif self.step_number == 4:
                self.combine_data()
            elif self.step_number == 5:
                print("Execution complete. Scraper finished.")
                break

In [4]:
# C:\Users\Andrew Freiburger\Dropbox\My PC (DESKTOP-M302P50)\Documents\UVic Civil Engineering\dFBA\BiGG_models\BiGG model of S. aureus.json
scraping = SABIO_scraping()
scraping.main()

Specify the BIGG Model JSON file path: C:\Users\Andrew Freiburger\Dropbox\My PC (DESKTOP-M302P50)\Documents\UVic Civil Engineering\dFBA\BiGG_models\BiGG model of S. aureus.json


ValueError: No objects to concatenate