# Refine BiGG IDs 

In [22]:
from pprint import pprint
import pandas
import json, re

# define the reaction translation
# reactions = pandas.read_table(open('BiGG_reactions.txt', 'r'), sep = '\t')
# reactions_dict = {}
# for index, met in reactions.iterrows():
#     reactions_dict[met['bigg_id']] = {
#         'name':met['name'],
#         'reaction_string':met['reaction_string']
#     }
# with open('BiGG_reactions, parsed.json', 'w') as out:
#     json.dump(reactions_dict, out, indent = 2)

# define the metabolite translation: BiGG -> SABIO
# metabolites = pandas.read_table(open('BiGG_metabolites.txt', 'r'), sep = '\t')
# metabolites_dict = {}
# for index, met in metabolites.iterrows():
#     bigg_name = str(met['name'])
#     bigg_name = bigg_name.strip()
#     bigg_id = met['universal_bigg_id']
    
#     name = re.sub('\s[A-Z0-9]{3,}$', '', str(met['name']))
#     name = name.strip()
#     metabolites_dict[bigg_id] = {
#         'name':name
#     }
#     if bigg_name != name:
#         metabolites_dict[bigg_id]['bigg_name'] = bigg_name
        
# with open('BiGG_metabolites, parsed.json', 'w') as out:
#     json.dump(metabolites_dict, out, indent = 2)
    
    
# define the metabolite translation: SABIO -> BiGG
metabolites = pandas.read_table(open('BiGG_metabolites.txt', 'r'), sep = '\t')
metabolites_dict = {}
for index, met in metabolites.iterrows():
    bigg_name = str(met['name'])
    bigg_name = bigg_name.strip()
    bigg_id = met['universal_bigg_id']
    
    name = re.sub('\s[A-Z0-9]{3,}$', '', str(met['name']))
    name = name.strip()
    metabolites_dict[name] = {
        'id':bigg_id
    }
    if bigg_name != name:
        metabolites_dict[name]['bigg_name'] = bigg_name
        
with open('BiGG_metabolite_names, parsed.json', 'w') as out:
    json.dump(metabolites_dict, out, indent = 2)
    
# pprint(metabolites_dict)
# pprint(reactions_dict)

# Refine the BiGG model

In [146]:
def split_reaction(reaction_string, bigg_metabolites):
    def _parse_stoich(met):
        stoich = ''
        ch_number = 0
        denom = False
        while re.search('[0-9\./]', met[ch_number]): 
            stoich += met[ch_number]
            if met[ch_number] == '/':
                numerator = stoich
                denom = True
            if denom:
                denominator += met[ch_number]
            ch_number += 1
            
        if denom:
            stoich = f'{numerator}/{denominator}'
        return stoich
    
    def met_parsing(met):
#         print(met)
        met = met.strip()
        if re.search('(\d\s\w|\d\.\d\s|\d/\d\s)', met):
            coefficient = _parse_stoich(met)
            coefficient = '{} '.format(coefficient)
        else:
            coefficient = ''
        met = re.sub(coefficient, '', met)
#         print(met, coefficient)
        return met, coefficient   

    def reformat_met_name(met_name, sabio = False):
        met_name = re.sub(' - ', '-', met_name)
        if not sabio:
            met_name = re.sub(' ', '_', met_name)
        return met_name
    
        
    # parse the reactants and products for the specified reaction string
    reaction_split = reaction_string.split('<->')
    reactants_list = reaction_split[0].split(' + ')
    products_list = reaction_split[1].split(' + ')
    
    # parse the reactants
    reactants = []
    sabio_reactants = []
    for met in reactants_list:
#         print(met)
        met = met.strip()
        met = re.sub('_\w$', '', met)
        met, coefficient = met_parsing(met)
        reactants.append(coefficient + reformat_met_name(bigg_metabolites[met]['name']))
        sabio_reactants.append(coefficient + reformat_met_name(bigg_metabolites[met]['name'], True))

    # parse the products
    products = []
    sabio_products = []
    for met in products_list:
        if not re.search('[a-z]', met, flags = re.IGNORECASE):
            continue
        met = met.strip()
        met = re.sub('_\w$', '', met)
        met, coefficient = met_parsing(met)
        products.append(coefficient + reformat_met_name(bigg_metabolites[met]['name']))
        sabio_products.append(coefficient + reformat_met_name(bigg_metabolites[met]['name'], True))

#     compounds = reactants + products
    reactant_string = ' + '.join(reactants)
    product_string = ' + '.join(products)
    reaction_string = ' <-> '.join([reactant_string, product_string])
    
    # construct the set of compounds in the SABIO format
    sabio_compounds = sabio_reactants + sabio_products
    
    return reaction_string, sabio_compounds

In [149]:
from pprint import pprint
import pandas
import json
%run ../dfbapy/dfba.py

bigg_reactions = json.load(open('BiGG_reactions, parsed.json'))
bigg_metabolites = json.load(open('BiGG_metabolites, parsed.json'))

# substitute the reaction and metabolite names
model = json.load(open('Ecoli core, BiGG, indented.json'))
model_contents = {}
for reaction in model['reactions']:
    # define the reaction identification
    reaction_id = reaction['id'] 
    reaction_name = bigg_reactions[reaction_id]['name']
    
    # substitute the reaction string
    og_reaction_string = bigg_reactions[reaction_id]['reaction_string']
#     print('\n\n', og_reaction_string)
    reaction_string, compounds = split_reaction(og_reaction_string, bigg_metabolites)
#     print(reaction_string)

    model_contents[reaction_name] = {
        'reaction': {
            'original': og_reaction_string,
            'substituted': reaction_string,
        },
        'chemicals': compounds,
        'annotations': reaction['annotation']
    }
    
# pprint(model_contents)
with open('processed_Ecoli_model.json', 'w') as out:
    json.dump(model_contents, out, indent = 3)

# Refine the data combination function

In [44]:
import pandas
import json, re

class CaseInsensitiveDict(dict):
    @classmethod
    def _k(cls, key):
        return key.lower() if isinstance(key, str) else key

    def __init__(self, *args, **kwargs):
        super(CaseInsensitiveDict, self).__init__(*args, **kwargs)
        self._convert_keys()
    def __getitem__(self, key):
        return super(CaseInsensitiveDict, self).__getitem__(self.__class__._k(key))
    def __setitem__(self, key, value):
        super(CaseInsensitiveDict, self).__setitem__(self.__class__._k(key), value)
    def __delitem__(self, key):
        return super(CaseInsensitiveDict, self).__delitem__(self.__class__._k(key))
    def __contains__(self, key):
        return super(CaseInsensitiveDict, self).__contains__(self.__class__._k(key))
    def has_key(self, key):
        return super(CaseInsensitiveDict, self).has_key(self.__class__._k(key))
    def pop(self, key, *args, **kwargs):
        return super(CaseInsensitiveDict, self).pop(self.__class__._k(key), *args, **kwargs)
    def get(self, key, *args, **kwargs):
        return super(CaseInsensitiveDict, self).get(self.__class__._k(key), *args, **kwargs)
    def setdefault(self, key, *args, **kwargs):
        return super(CaseInsensitiveDict, self).setdefault(self.__class__._k(key), *args, **kwargs)
    def update(self, E=None, **F):
        super(CaseInsensitiveDict, self).update(self.__class__(E))
        super(CaseInsensitiveDict, self).update(self.__class__(**F))
    def _convert_keys(self):
        for k in list(self.keys()):
            v = super(CaseInsensitiveDict, self).pop(k)
            self.__setitem__(k, v)

class test():
    def __init__(self):
        self.sabio_df = pandas.read_csv(open('proccessed-xls.csv', encoding="utf-8"))
        self.bigg_to_sabio_metabolites = json.load(open('BiGG_metabolites, parsed.json'))
        self.sabio_to_bigg_metabolites = json.load(open('BiGG_metabolite_names, parsed.json'))
        self.bigg_reactions = json.load(open('BiGG_reactions, parsed.json'))
        
        self.paths = {}
        self.paths['entryids_path'] = 'entryids_progress.json'
        self.paths["scraped_model_path"] = 'scraped_model.json'
        
        self.printing = True
        
    def _split_reaction(self, 
                        reaction_string, # the sabio or bigg reaction string
                        sabio = False   # specifies how the reaction string will be split
                        ):
        def _parse_stoich(met):
            stoich = ''
            ch_number = 0
            denom = False
            numerator = denominator = 0
            while re.search('[0-9\./]', met[ch_number]): 
                stoich += met[ch_number]
                if met[ch_number] == '/':
                    numerator = stoich
                    denom = True
                if denom:
                    denominator += met[ch_number]
                ch_number += 1
                
            if denom:
                stoich = f'{numerator}/{denominator}'
            return stoich
        
        def met_parsing(met):
    #         print(met)
            met = met.strip()
            met = re.sub('_\w$', '', met)
            if re.search('(\d\s\w|\d\.\d\s|\d/\d\s)', met):
                coefficient = _parse_stoich(met)
                coefficient = '{} '.format(coefficient)
            else:
                coefficient = ''
            met = re.sub(coefficient, '', met)
    #         print(met, coefficient)
            return met, coefficient   
    
        def reformat_met_name(met_name, sabio = False):
            met_name = re.sub(' - ', '-', met_name)
            return met_name
        
        def parsing_chemical_list(chemical_list):
            bigg_chemicals = []
            sabio_chemicals = []
            for met in chemical_list:
#                 print('metabolite', met, type(met))
                if not re.search('[A-Za-z]', met):
                    continue
                met, coefficient = met_parsing(met)
                # assign the proper chemical names
                if not sabio:
                    sabio_chemicals.append(coefficient + reformat_met_name(self.bigg_to_sabio_metabolites[met]['name'], True))     
                    if 'bigg_name' in self.bigg_to_sabio_metabolites[met]:
                        bigg_chemicals.append(coefficient + reformat_met_name(self.bigg_to_sabio_metabolites[met]['bigg_name']))
                    else:
                        bigg_chemicals.append(coefficient + reformat_met_name(self.bigg_to_sabio_metabolites[met]['name']))
                    
                elif sabio:
                    sabio_chemicals.append(coefficient + reformat_met_name(met, True))   
#                     if met in list(self.sabio_to_bigg_metabolites.keys()):
#                         print('yes')
#                     try:
#                         print(self.sabio_to_bigg_metabolites[met])
#                     except:
# #                         print(met, '\n\n\n', self.sabio_to_bigg_metabolites.keys())
#                         for ch in met:
#                             print(ch, '\t', ord(ch))
            
                    dic = CaseInsensitiveDict(self.sabio_to_bigg_metabolites)
                    if 'bigg_name' in dic.get(met):
                        bigg_chemicals.append(coefficient + reformat_met_name(dic.get(met)['bigg_name']))
                    else:
                        bigg_chemicals.append(coefficient + reformat_met_name(met))
                                              
#                 bigg_chemicals.append(coefficient + reformat_met_name(self.bigg_metabolites[met]['name']))
#                 sabio_chemicals.append(coefficient + reformat_met_name(self.bigg_metabolites[met]['name'], True))
#                 if 'bigg_name' in self.bigg_metabolites[met]:
#                     bigg_chemicals[-1] = coefficient + reformat_met_name(self.bigg_metabolites[met]['name'])
            
            return bigg_chemicals, sabio_chemicals
        
            
        # parse the reactants and products for the specified reaction string
        if not sabio:
            reaction_split = reaction_string.split(' <-> ')
        else:
            reaction_split = reaction_string.split(' = ')
            
        reactants_list = reaction_split[0].split(' + ')
        products_list = reaction_split[1].split(' + ')
        
        # parse the reactants and products
        bigg_reactants, sabio_reactants = parsing_chemical_list(reactants_list)
        bigg_products, sabio_products = parsing_chemical_list(products_list)
        
        # assemble the chemicals list and reaction string
        bigg_compounds = bigg_reactants + bigg_products
        sabio_chemicals = sabio_reactants + sabio_products
        reactant_string = ' + '.join(bigg_reactants)
        product_string = ' + '.join(bigg_products)
        reaction_string = ' <-> '.join([reactant_string, product_string])
#        if sabio:
#            reaction_string = ' = '.join([reactant_string, product_string])        
        
        return reaction_string, sabio_chemicals, bigg_compounds
        
    def combine_data(self,):
        # import previously parsed content
        self.model_contents = json.load(open('processed_Ecoli_model.json'))
        with open(self.paths['entryids_path']) as json_file: 
            entry_id_data = json.load(json_file)

        # combine the scraped data into a programmable JSON  
        enzyme_dict = {}
        missing_entry_ids = []
        enzymes = self.sabio_df["Enzymename"].unique().tolist()
        for enzyme in enzymes:
            print('enzyme', enzyme)
            enzyme_df = self.sabio_df.loc[self.sabio_df["Enzymename"] == enzyme]
            enzyme_dict[enzyme] = {}
            reactions = enzyme_df["Reaction"].unique().tolist()
            for reaction in reactions:
                enzyme_dict[enzyme][reaction] = {}
                
                # ensure that the reaction chemicals match before accepting kinetic data
                print('reaction', reaction)
                rxn_string, sabio_chemicals, expected_bigg_chemicals= self._split_reaction(reaction, sabio = True)
                bigg_chemicals = self.model_contents[enzyme]['bigg_compounds']
                
                extra_bigg = set(bigg_chemicals) - set(expected_bigg_chemicals) 
                extra_bigg = set(re.sub('(H\+|H2O)', '', chem) for chem in extra_bigg)           
                if len(extra_bigg) != 1:
                    missed_reaction = f'The || {rxn_string} || reaction with {expected_bigg_chemicals} chemicals does not match the BiGG reaction of {bigg_chemicals} chemicals.'
                    if self.printing:
                        print(missed_reaction)
                    enzyme_dict[enzyme][reaction] = missed_reaction
                    continue
                
                # parse each entryid of each reaction
                enzyme_reactions_df = enzyme_df.loc[enzyme_df["Reaction"] == reaction]
                entryids = enzyme_reactions_df["EntryID"].unique().tolist()
                for entryid in entryids:
                    enzyme_reaction_entryids_df = enzyme_reactions_df.loc[enzyme_reactions_df["EntryID"] == entryid]
                    entryid_string = f'condition_{entryid}'
                    enzyme_dict[enzyme][reaction][entryid_string] = {}
                    head_of_df = enzyme_reaction_entryids_df.head(1).squeeze()
                    entry_id_flag = True
                    parameter_info = {}

                    try:
                        parameter_info = entry_id_data[str(entryid)]
                        enzyme_dict[enzyme][reaction][entryid_string]["Parameters"] = parameter_info
                    except:
                        missing_entry_ids.append(str(entryid))
                        entry_id_flag = False
                        enzyme_dict[enzyme][reaction][entryid_string]["Parameters"] = "NaN"

                    rate_law = head_of_df["Rate Equation"]
                    bad_rate_laws = ["unknown", "", "-"]

                    if not rate_law in bad_rate_laws:                    
                        enzyme_dict[enzyme][reaction][entryid_string]["RateLaw"] = rate_law
                        enzyme_dict[enzyme][reaction][entryid_string]["SubstitutedRateLaw"] = rate_law
                    else:
                        enzyme_dict[enzyme][reaction][entryid_string]["RateLaw"] = "NaN"
                        enzyme_dict[enzyme][reaction][entryid_string]["SubstitutedRateLaw"] = "NaN"

                    if entry_id_flag:
                        fields_to_copy = ["Buffer", "Product", "PubMedID", "Publication", "pH", "Temperature", "Enzyme Variant", "UniProtKB_AC", "Organism", "KineticMechanismType", "SabioReactionID"]
                        for field in fields_to_copy:  
                            enzyme_dict[enzyme][reaction][entryid_string][field] = head_of_df[field]
                            
                        enzyme_dict[enzyme][reaction][entryid_string]["Substrates"] = head_of_df["Substrate"].split(";")
                        out_rate_law = rate_law
                        if not rate_law in bad_rate_laws:                    
                            substrates = head_of_df["Substrate"].split(";")

                            stripped_string = re.sub('[0-9]', '', rate_law)

                            variables = re.split("\^|\*|\+|\-|\/|\(|\)| ", stripped_string)
                            variables = ' '.join(variables).split()

                            start_value_permutations = ["start value", "start val."]
                            substrates_key = {}
                            for var in variables:
                                if var in parameter_info:
                                    for permutation in start_value_permutations:
                                        try:
                                            if var == "A" or var == "B":
                                                substrates_key[var] = parameter_info[var]["species"]
                                            else:
                                                value = parameter_info[var][permutation]
                                                if value != "-" and value != "" and value != " ":           # The quantities must be converted to base units
                                                    out_rate_law = out_rate_law.replace(var, parameter_info[var][permutation])
                                        except:
                                            pass

                            enzyme_dict[enzyme][reaction][entryid_string]["RateLawSubstrates"] = substrates_key
                            enzyme_dict[enzyme][reaction][entryid_string]["SubstitutedRateLaw"] = out_rate_law

        with open(self.paths["scraped_model_path"], 'w', encoding="utf-8") as f:
            json.dump(enzyme_dict, f, indent=4, sort_keys=True, separators=(', ', ': '), ensure_ascii=False, cls=NumpyEncoder)

In [45]:
tes = test()
tes.combine_data()  

enzyme 6-phosphogluconolactonase
reaction H2O + 6-Phospho-D-glucono-1,5-lactone = 6-Phospho-D-gluconate
The || H2O H2O + 6-Phospho-D-glucono-1,5-lactone <-> 6-Phospho-D-gluconate || reaction with ['H2O H2O', '6-Phospho-D-glucono-1,5-lactone', '6-Phospho-D-gluconate'] chemicals does not match the BiGG reaction of ['6-phospho-D-glucono-1,5-lactone', 'H2O', '6-Phospho-D-gluconate', 'H+'] chemicals.
enzyme  
reaction Formate = Formate


KeyError: ' '