In [19]:
import cobra
import pandas as pd
import sys

from random import choice 
from string import ascii_uppercase, digits
from typing import Literal

from tqdm import tqdm
from tqdm.auto import tqdm
tqdm.pandas()

import urllib.error
from Bio.KEGG import REST
from Bio.KEGG import Compound

from refinegems.io import load_a_table_from_database, load_model_cobra, kegg_reaction_parser

In [24]:
# Path to the genes_mapped.csv table stored in the step1-extension directory.
# NOTE(review): absolute, user-specific path - adjust before running on another machine.
mapped_genes_path = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/thesis/Kp_std/03_refinement/step1-extension/genes_mapped.csv'

# Paths to the MetaNetX tables (chemical / reaction properties and cross-references).
# NOTE(review): absolute, user-specific paths - adjust before running on another machine.
chem_prop_file = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/data/MetaNetX/chem_prop.tsv'
chem_xref_file = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/data/MetaNetX/chem_xref.tsv'
reac_prop_file = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/data/MetaNetX/reac_prop.tsv'
reac_xref_file = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/data/MetaNetX/reac_xref.tsv'

In [61]:
# Path to the draft model XML file (Kp_std_draft.xml).
# NOTE(review): absolute, user-specific path - adjust before running on another machine.
draft_model_path = '/Users/carolinb/Documents/104 Masterthesis/klebsiella-pipeline/example/thesis/Kp_std/02_generate_draft_model/Kp_std_draft.xml'

### generally useful functions

-> These functions may be candidates for inclusion in refinegems.  
--> If so, decide which refinegems module they belong in.

In [158]:
# put in entities in refinegems
def make_reaction_annotation_dict(model:cobra.Model, db:Literal['KEGG','BiGG']) -> dict:
    """Map every reaction ID of a model to its annotation for one database.

    The database is selected with the same strings that are used for the
    namespace options of other functions. Reactions that carry no annotation
    for the chosen database are mapped to '-'.

    Args:
        model (cobra.Model): A model loaded with COBRApy.
        db (Literal['KEGG','BiGG']): The string denoting the database to map to.

    Raises:
        ValueError: Unknown database string for parameter db

    Returns:
        dict: The mapping of the reaction IDs to the database IDs found in the
            annotations (taken over as stored in the annotation).
    """

    # translate the database name into the key used in the reaction annotations
    if db == 'KEGG':
        annotation_key = 'kegg.reaction'
    elif db == 'BiGG':
        annotation_key = 'bigg.reaction'
    else:
        raise ValueError(f'Unknown database string for parameter db: {db}')

    # collect the annotation of every reaction, '-' marks a missing annotation
    return {
        reaction.id: (
            reaction.annotation[annotation_key]
            if annotation_key in reaction.annotation.keys()
            else '-'
        )
        for reaction in model.reactions
    }



def create_random_id(model:cobra.Model, entity_type:Literal['reac','meta']='reac', prefix:str='') -> str:
    """Generate a unique, random ID for a model entity for a model.

    Args:
        model (cobra.Model): A model loaded with COBRApy.
        entity_type (Literal['reac','meta'], optional): Type of model entity.  
            Can be 'reac' for Reaction or 'meta' for Metabolite.
            Defaults to 'reac'.
        prefix (str, optional): prefix to set for the randomised part.
            Useful to identify the random IDs later on. 
            Defaults to ''.

    Raises:
        ValueError: Unknown entity_type

    Returns:
        str: The generate new and unique ID.
    """

    match entity_type:
        case 'reac':
            all_ids = [_.id for _ in model.reactions]
        case 'meta':
            all_ids = [_.id for _ in model.metabolites]
        case _:
            mes = f'Unkown entity_type: {entity_type}'
            raise ValueError(mes)

    prefix = f'{prefix}{entity_type}'
    var = ''.join(choice(ascii_uppercase + digits) for i in range(6))
    label = prefix + var
    j = 6
    
    while True:
        
        for i in range(36**6): # make sure it does not run endlessly
            if label in all_ids:
                label = prefix + ''.join(choice(ascii_uppercase + digits) for x in range(j))
            else:
                return label
            
        j = j + 1


# @TODO: 
#     more namespace options
def match_id_to_namespace(model_entity:[cobra.Reaction, cobra.Metabolite], namespace:Literal['BiGG']) -> None:
    """Set the ID of a model entity to the ID it has in the given namespace.

    The new ID is read from the annotations of the entity. If the entity has
    no annotation for the namespace, its ID is left untouched.

    Currently working namespaces:
        - BiGG

    For reactions the 'bigg.reaction' annotation is used. For metabolites the
    'bigg.metabolite' annotation is used: a single (string) annotation gets the
    compartment appended as '_<compartment>', while for a list annotation the
    first entry is taken as it is.

    Args:
        model_entity (cobra.Reaction, cobra.Metabolite]): The model entity.
            Can be either a cobra.Reaction or cobra.Metabolite object.
        namespace (Literal['BiGG']): The chosen namespace.

    Raises:
        ValueError: Unknown input for namespace
        TypeError: Unknown type for model_entity
    """

    # Reaction
    # --------
    if isinstance(model_entity, cobra.Reaction):
        if namespace == 'BiGG':
            if 'bigg.reaction' in model_entity.annotation.keys():
                # @TODO : currently takes first entry if annotation is a list
                bigg_ids = model_entity.annotation['bigg.reaction']
                if isinstance(bigg_ids, str):
                    model_entity.id = bigg_ids
                else:
                    model_entity.id = bigg_ids[0]
        else:
            raise ValueError(f'Unknown input for namespace: {namespace}')

    # Metabolite
    # ----------
    elif isinstance(model_entity, cobra.Metabolite):
        if namespace == 'BiGG':
            if 'bigg.metabolite' in model_entity.annotation.keys():
                bigg_ids = model_entity.annotation['bigg.metabolite']
                if isinstance(bigg_ids, str):
                    # single ID: append the compartment
                    model_entity.id = bigg_ids + '_' + model_entity.compartment
                else:
                    # list of IDs: first entry, no compartment is appended
                    model_entity.id = bigg_ids[0]
        else:
            raise ValueError(f'Unknown input for namespace: {namespace}')

    # Error
    # -----
    else:
        raise TypeError(f'Unknown type for model_entity: {type(model_entity)}')

### reworking functions for extension

#### mapping

In [21]:
# @NOTE changed
def map_BiGG_reactions_row(row, namespace):
    """Map a single entry from the table in map_BiGG_reactions() to the BiGG reaction namespace.

    A namespace entry matches if its 'EC Number' contains the EC number of the
    row AND its 'KEGG Reaction' contains the KEGG reaction ID of the row. Both
    values are searched as literal text. Namespace entries without a value in
    one of the two columns never match. Rows lacking an EC number or a KEGG
    reaction ID are returned unchanged.

    :param row:       A single row of the table. Needs the entries 'EC number',
                      'KEGG.reaction' and 'bigg_id'.
    :type  row:       pd.Series
    :param namespace: The BiGG reaction namespace table. Needs the columns
                      'id', 'EC Number' and 'KEGG Reaction'.
    :type  namespace: pd.DataFrame
    :returns:         The edited row. 'bigg_id' holds the matching BiGG ID or,
                      for multiple matches, all IDs separated by a blank.
    :rtype:           pd.Series
    """

    # @TODO
    #     NOTE: only works for cases, where KEGG.reaction in row contains EXACTLY one entry
    #           in the rare case that multiple reactions belong to one enzyme, they are omitted
    #           in this search

    ec_number = row['EC number']
    kegg_reaction = row['KEGG.reaction']

    # nothing to search for, if one of the identifiers is missing (e.g. NaN)
    if not isinstance(ec_number, str) or not isinstance(kegg_reaction, str):
        return row

    # match by EC number AND KEGG id
    #   regex=False : the dots of an EC number are no wildcards
    #   na=False    : namespace entries without a value do not match
    ec_hits = namespace['EC Number'].str.contains(ec_number, regex=False, na=False)
    kegg_hits = namespace['KEGG Reaction'].str.contains(kegg_reaction, regex=False, na=False)
    matches = namespace.loc[ec_hits & kegg_hits]

    # case 1 : no matches
    if matches.empty:
        return row

    # case 2 : exactly one match
    elif len(matches) == 1:
        row['bigg_id'] = matches['id'].values[0]

    # case 3 : multiple matches
    #          often due to reaction being in different compartments
    else:
        row['bigg_id'] = ' '.join(matches['id'].values)

    return row


# @TEST : fitted to refinegems
# @CHECK : connections, e.g. input is now a param short 
def map_BiGG_reactions(table_file):
    """Map the output of map_to_KEGG() to the BiGG reaction namespace.

    The BiGG reaction namespace is loaded from the database table
    'bigg_reactions'. Every row of the input table is then mapped with
    map_BiGG_reactions_row().

    :param table_file: The path to the saved table from running map_to_KEGG().
    :type  table_file: string
    :returns:          The table with an additional column ('bigg_id') for the mapping to BiGG reactions.
    :rtype:            pd.DataFrame
    """

    # BiGG reaction namespace to map against
    bigg_reactions = load_a_table_from_database('bigg_reactions', False)

    # table to be mapped, the new column starts out empty
    mapping = pd.read_csv(table_file)
    mapping['bigg_id'] = pd.Series(dtype='str')

    # map row by row
    return mapping.apply(map_BiGG_reactions_row, axis=1, args=(bigg_reactions,))


#### actual extension

In [73]:
# @TODO
def isreaction_complete(reac, exclude_dna=True, exclude_rna=True):
    """Check, if a reaction is complete and ready to be added to the model.

    A reaction counts as complete, if each of its metabolites has an ID, a name
    and a formula (none of them empty or null). In addition, reactions with
    'DNA' or 'RNA' in their name can be rejected.

    :param reac:        The reaction to be checked.
    :type  reac:        cobra.Reaction
    :param exclude_dna: Tag to include or exclude DNA reactions.
    :type  exclude_dna: bool, default is True.
    :param exclude_rna: Tag to include or exclude RNA reactions.
    :type  exclude_rna: bool, default is True.
    :returns:           True if the check is successful, else false.
    :rtype:             bool
    """

    # ................
    # @TODO
    # extendable
    # ................

    def _is_missing(value):
        # an attribute is missing, if it is an empty string or null
        return value == '' or pd.isnull(value)

    # check reaction: DNA / RNA reactions are identified via the reaction name
    if (exclude_dna and 'DNA' in reac.name) or (exclude_rna and 'RNA' in reac.name):
        return False

    # check metabolites: one incomplete metabolite makes the reaction incomplete
    return not any(
        _is_missing(meta.id) or _is_missing(meta.name) or _is_missing(meta.formula)
        for meta in reac.metabolites
    )


# @TODO
# UNDER CONSTRUCTION
def build_metabolite_mnx(metabolite, model, mnx_chem_prop, mnx_chem_xref, bigg_metabolites, namespace):
    """Create or retrieve (from model) a metabolite based on its MetaNetX ID.

    If a metabolite of the model is annotated with the MetaNetX ID, the first
    such metabolite is returned. Otherwise a new metabolite is built from the
    MetaNetX tables (and, if possible, extended with BiGG information), its ID
    is set according to the namespace and - if this ID already exists in the
    model - the metabolite of the model with that ID is returned instead.

    :param metabolite:        The MetaNetX ID of the metabolite.
    :type  metabolite:        string
    :param model:             The underlying genome-scale metabolic model.
    :type  model:             cobra.Model
    :param mnx_chem_prop:     The chem_prop table from MetaNetX
    :type  mnx_chem_prop:     pd.DataFrame
    :param mnx_chem_xref:     The chem_xref table from MetaNetX
    :type  mnx_chem_xref:     pd.DataFrame
    :param bigg_metabolites:  The BiGG compound namespace table.
    :type  bigg_metabolites:  pd.DataFrame
    :param namespace:         The namespace for the ID of a new metabolite,
                              passed on to match_id_to_namespace().
    :type  namespace:         string
    :returns:                 The retrieved or newly build metabolite.
    :rtype:                   cobra.Metabolite
    """

    # fast check if compound already in model
    # ------------------------------------------
    # @TODO ..........................................
    #   currently no checking for compartments
    #   first match will be taken (most often cytosol one)
    #   regardless of the compartment
    #.............................................
    # done before any table lookup, as it only needs the model
    matches = [x for x in model.metabolites if 'metanetx.chemical' in x.annotation and x.annotation['metanetx.chemical']==metabolite]
    if matches:
        return matches[0]

    # if not, create new metabolite
    # -----------------------------
    metabolite_prop = mnx_chem_prop[mnx_chem_prop['ID']==metabolite]
    metabolite_anno = mnx_chem_xref[mnx_chem_xref['ID']==metabolite]

    # step 1: create a random metabolite ID
    # ...........................
    # @TODO : compartment problem
    # - does it have to be in the name?
    # ...........................
    new_metabolite = cobra.Metabolite(create_random_id(model, 'meta','SPECIMEN'))

    # step 2: add features
    # --------------------
    # @TODO ..........................................
    #   currently no checking for compartments
    #   defaults to c
    #   makes it difficult to add exchange reactions
    #.................................................
    new_metabolite.formula = metabolite_prop['formula'].iloc[0]
    new_metabolite.name = metabolite_prop['name'].iloc[0]
    new_metabolite.charge = metabolite_prop['charge'].iloc[0]
    new_metabolite.compartment = 'c'

    # step 3: add notes
    # -----------------
    new_metabolite.notes['added via'] = 'metanetx.chemical'

    # step 4: add annotations
    # -----------------------
    new_metabolite.annotation['metanetx.chemical'] = metabolite_prop['ID'].iloc[0]
    # NOTE(review): assumes the 'reference' entry is always a ChEBI reference - confirm
    new_metabolite.annotation['chebi'] = metabolite_prop['reference'].iloc[0].upper()
    if not pd.isnull(metabolite_prop['InChIKey'].iloc[0]):
        new_metabolite.annotation['inchikey'] = metabolite_prop['InChIKey'].iloc[0].split('=')[1]
    for db in ['kegg.compound','metacyc.compound','seed.compound','bigg.metabolite']:
        # literal search for the database prefix, entries without a source never match
        db_matches = metabolite_anno[metabolite_anno['source'].str.contains(db, regex=False, na=False)]
        if len(db_matches) == 1:
            new_metabolite.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]
        elif len(db_matches) > 1:
            new_metabolite.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]

    # if no BiGG was found in MetaNetX, try reverse search in BiGG
    # (the column values are compared; `in` on a Series would only test the index)
    if 'bigg.metabolite' not in new_metabolite.annotation:
        reverse_hits = bigg_metabolites[bigg_metabolites['MetaNetX (MNX) Chemical']==metabolite]
        if not reverse_hits.empty:
            # list of BiGG IDs, same form as used in build_metabolite_kegg()
            new_metabolite.annotation['bigg.metabolite'] = reverse_hits['bigg_id'].to_list()

    # add additional information from bigg if possible
    if 'bigg.metabolite' in new_metabolite.annotation.keys():
        bigg_ids = new_metabolite.annotation['bigg.metabolite']
        if isinstance(bigg_ids, str):
            # a single ID must not be joined character by character
            bigg_ids = [bigg_ids]
        # NOTE(review): substring search, short IDs may match unrelated entries - confirm
        bigg_information = bigg_metabolites[bigg_metabolites['bigg_id'].str.contains('|'.join(bigg_ids), na=False)]
        db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','InChI Key':'inchikey'}
        for db in db_id_bigg:
            info = bigg_information[db].dropna().to_list()
            if len(info) > 0:
                info = ','.join(info)
                info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object
                new_metabolite.annotation[db_id_bigg[db]] = info

    # step 5: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite,namespace)

    # step 6: re-check existence of ID in model
    # -----------------------------------------
    # @TODO : check complete annotations?
    #        - or let those be covered by the duplicate check later on?
    if any(new_metabolite.id == known.id for known in model.metabolites):
        return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite

# @TODO
# @DOCS
# UNDER CONSTRUCTION
def build_metabolite_kegg(kegg_id, model, model_kegg_ids, bigg_metabolites, namespace):
    """Create or retrieve (from model) a metabolite based on its KEGG ID.

    If a metabolite of the model is annotated with the KEGG ID, the first such
    metabolite is returned without contacting KEGG. Otherwise the compound
    entry is retrieved from KEGG and a new metabolite is built from it (and, if
    possible, extended with BiGG information). Its ID is set according to the
    namespace and - if this ID already exists in the model - the metabolite of
    the model with that ID is returned instead.

    If the KEGG entry cannot be retrieved, a message is printed and an empty
    cobra.Metabolite (without ID, name and formula) is returned.

    :param kegg_id:           The KEGG.compound ID of the metabolite in question.
    :type  kegg_id:           string
    :param model:             The model.
    :type  model:             cobra.Model
    :param model_kegg_ids:    List of all annotated KEGG Compound IDs in the model.
    :type  model_kegg_ids:    list
    :param bigg_metabolites:  The BiGG compound namespace table.
    :type  bigg_metabolites:  pd.DataFrame
    :param namespace:         The namespace for the ID of a new metabolite,
                              passed on to match_id_to_namespace().
    :type  namespace:         string
    :returns:                 The retrieved or newly build metabolite.
    :rtype:                   cobra.Metabolite
    """

    # ---------------------------------------
    # fast check if compound already in model
    # ---------------------------------------
    # @TODO ..........................................
    #   currently no checking for compartments
    #   first match will be taken (most often cytosol one)
    #   regardless of the compartment
    #.............................................
    # done first, as it does not need a request to KEGG
    if kegg_id in model_kegg_ids:
        matches = [x for x in model.metabolites if ('kegg.compound' in x.annotation and x.annotation['kegg.compound'] == kegg_id)]
        if matches:
            return matches[0]

    # retrieve KEGG entry for compound
    # --------------------------------
    try:
        kegg_handle = REST.kegg_get(kegg_id)
        kegg_record = [r for r in Compound.parse(kegg_handle)][0]
    except urllib.error.HTTPError:
        print(F'HTTPError: {kegg_id}')
        return cobra.Metabolite()
    except ConnectionResetError:
        print(F'ConnectionResetError: {kegg_id}')
        return cobra.Metabolite()
    except urllib.error.URLError:
        print(F'URLError: {kegg_id}')
        return cobra.Metabolite()

    # -----------------------------
    # if not, create new metabolite
    # -----------------------------
    # ...............
    # @TODO
    #     compartment
    # ...............

    # step 1: create a random metabolite ID
    # -------------------------------------
    # ...........................
    # @TODO : compartment problem
    # - does it have to be in the name?
    # ...........................
    new_metabolite = cobra.Metabolite(create_random_id(model, 'meta','SPECIMEN'))

    # step 2: add features
    # --------------------
    # @TODO ..........................................
    #   currently no checking for compartments
    #.............................................
    # set name from KEGG: of several listed names the second one is used
    if isinstance(kegg_record.name, list):
        if len(kegg_record.name) > 1:
            new_metabolite.name = kegg_record.name[1]
        else:
            new_metabolite.name = kegg_record.name[0]
    else:
        new_metabolite.name = kegg_record.name
    # set compartment
    new_metabolite.compartment = 'c'
    # set formula
    new_metabolite.formula = kegg_record.formula

    # step 3: add notes
    # -----------------
    new_metabolite.notes['added via'] = 'KEGG.compound'

    # step 4: add annotations
    # -----------------------
    new_metabolite.annotation['kegg.compound'] = kegg_id
    db_idtf = {'CAS':'cas','PubChem':'pubchem.compound','ChEBI':'chebi'}
    for db,ids in kegg_record.dblinks:
        if db in db_idtf:
            if len(ids) > 1:
                new_metabolite.annotation[db_idtf[db]] = ids
            else:
                new_metabolite.annotation[db_idtf[db]] = ids[0]

    # add additional information from BiGG
    # (the column values are compared; `in` on a Series would only test the index)
    bigg_information = bigg_metabolites[bigg_metabolites['KEGG Compound']==kegg_id]
    if len(bigg_information) > 0:

        new_metabolite.annotation['bigg.metabolite'] = bigg_information['bigg_id'].to_list()

        db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','InChI Key':'inchikey'}
        for db in db_id_bigg:
            info = bigg_information[db].dropna().to_list()
            if len(info) > 0:
                info = ','.join(info)
                info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object
                new_metabolite.annotation[db_id_bigg[db]] = info

    # step 5: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite,namespace)

    # step 6: re-check existence of ID in model
    # -----------------------------------------
    # @TODO : check complete annotations?
    #        - or let those be covered by the duplicate check later on?
    if any(new_metabolite.id == known.id for known in model.metabolites):
        return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite

# @TODO
# @DOCS
# UNDER CONSTRUCTION
def get_metabolites_mnx(model,equation,mnx_chem_xref,mnx_chem_prop,bigg_metabolites, namespace):
    """Based on a given MetaNetX equation and a model, get or
    create metabolite entries in/for the model.

    The equation is read token by token (separated by whitespace): '=' switches
    from reactants to products, '+' is skipped, tokens containing '@'
    (<metabolite>@<compartment>) are metabolites and every other token is read
    as the stoichiometric factor of the following metabolite. A metabolite
    without a factor gets the factor 1. Reactants get a negative, products a
    positive stoichiometric factor.

    :param model:             A GEM.
    :type  model:             cobra.Model
    :param equation:          The equation from MetaNetX
    :type  equation:          string
    :param mnx_chem_xref:     The chem_xref table from MetaNetX
    :type  mnx_chem_xref:     pd.DataFrame
    :param mnx_chem_prop:     The chem_prop table from MetaNetX
    :type  mnx_chem_prop:     pd.DataFrame
    :param bigg_metabolites:  The BiGG compound namespace table.
    :type  bigg_metabolites:  pd.DataFrame
    :param namespace:         The namespace for the IDs of new metabolites,
                              passed on to build_metabolite_mnx().
    :type  namespace:         string
    :raises ValueError:       If a token is neither a metabolite nor a number.
    :returns:                 Dictonary of metabolites and stoichiometric factors.
    :rtype:                   dict
    """

    # @TODO ...................................
    #   currently no checking for compartments
    #..........................................

    metabolites = {}
    produced = -1.0
    factor = 1.0

    # split() without argument also copes with repeated blanks
    for s in equation.split():
        # switch from reactants to products
        if s == '=':
            produced = 1.0
        # skip
        elif s == '+':
            continue
        # found metabolite
        elif '@' in s:
            # get information from MetaNetX (the compartment is not used yet)
            metabolite, compartment = s.split('@')
            # build or identify metabolite
            new_metabolite = build_metabolite_mnx(metabolite, model, mnx_chem_prop, mnx_chem_xref,bigg_metabolites, namespace)
            # add metabolite
            if any(new_metabolite.id == known.id for known in metabolites):
                # ......................................................
                # @TODO:
                #   check if metabolite if both reactant and product
                #   suggests exchange reaction
                #   -> maybe a good place to change compartment for one?
                #   -> what about name and directions???
                # ......................................................
                try:
                    model.metabolites.get_by_id(new_metabolite.id)
                except KeyError:
                    # not part of the model: the object can be renamed directly
                    pass
                else:
                    # part of the model: rename a copy, never the model's metabolite
                    new_metabolite = new_metabolite.copy()
                new_metabolite.id = new_metabolite.id + '_i'

            metabolites[new_metabolite] = factor * produced
            # a factor only belongs to the metabolite directly following it
            factor = 1.0
        # found stoichiometric factor (float() also reads factors like 0.5)
        else:
            factor = float(s)

    return metabolites


# @TODO
# @DOCS
# UNDER CONSTRUCTION
def get_metabolites_kegg(model,equation,chem_xref,chem_prop,bigg_metabolites, namespace):
    """Based on a given KEGG equation and a model, get or
    create metabolite entries in/for the model.

    The equation is read token by token (separated by whitespace): a token
    containing '=' switches from reactants to products, '+' is skipped, a
    numeric token is the stoichiometric factor of the following metabolite and
    every other token is taken as KEGG compound ID. A metabolite without a
    factor gets the factor 1. If exactly one MetaNetX ID is found for a
    compound, the metabolite is built via MetaNetX, otherwise via KEGG.

    The function prints a message and exits via sys.exit(1), if a compound ID
    contains unexpected characters.

    :param model:             A GEM.
    :type  model:             cobra.Model
    :param equation:          The equation from KEGG
    :type  equation:          string
    :param chem_xref:         The chem_xref table from MetaNetX
    :type  chem_xref:         pd.DataFrame
    :param chem_prop:         The chem_prop table from MetaNetX
    :type  chem_prop:         pd.DataFrame
    :param bigg_metabolites:  The BiGG compound namespace table.
    :type  bigg_metabolites:  pd.DataFrame
    :param namespace:         The namespace for the IDs of new metabolites,
                              passed on to the build_metabolite functions.
    :type  namespace:         string
    :returns:                 Dictonary of metabolites and stoichiometric factors.
    :rtype:                   dict
    """

    # @TODO ...................................
    #   currently no checking for compartments
    #..........................................

    model_kegg_ids = [m.annotation['kegg.compound'] for m in model.metabolites if 'kegg.compound' in m.annotation]
    metabolites = {}
    produced = -1.0
    factor = 1.0

    # split() without argument also copes with repeated blanks
    for s in equation.split():
        # switch from reactants to products
        if '=' in s:
            produced = 1.0
        # found stoichiometric factor
        elif s.isnumeric():
            factor = float(s)
        # skip
        elif s == '+':
            continue
        # found metabolite
        else:
            # check if s is a valid ID
            if '(' in s:
                s = s.split('(')[0]
                # ..................................
                # @TODO
                #     known case: DNA(n) --> DNA(n+1)
                #     currently note in brackets gets ignored
                # ..................................
            elif not s.isalnum():
                print('Problem: unknown character in ID inside get_metabolites_kegg() detected.\nPlease contact dev about your problem.')
                sys.exit(1)

            mnx_id = chem_xref[chem_xref['source'] == F'kegg.compound:{s}']
            # case 1:
            #     add metabolite via MetaNetX
            #     -> make sure, only 1 ID match is found (match is unambiguous)
            if len(mnx_id) == 1:
                mnx_id = mnx_id['ID'].item()
                metabolite = build_metabolite_mnx(mnx_id, model, chem_prop, chem_xref, bigg_metabolites, namespace)
            # case 2:
            #     add metabolite via KEGG
            else:
                metabolite = build_metabolite_kegg(s, model, model_kegg_ids, bigg_metabolites, namespace)

            # add new metabolite
            # @TODO : place to check for exchanges?
            if any(metabolite.id == known.id for known in metabolites):
                try:
                    model.metabolites.get_by_id(metabolite.id)
                except KeyError:
                    # not part of the model: the object can be renamed directly
                    pass
                else:
                    # part of the model: rename a copy, never the model's metabolite
                    metabolite = metabolite.copy()
                metabolite.id = metabolite.id + '_i'

            metabolites[metabolite] = factor * produced
            # a factor only belongs to the metabolite directly following it,
            # KEGG omits the factor 1
            factor = 1.0

    return metabolites



def add_gene(model, reaction, row, first=False):
    """Add a new gene to a genome-scale metabolic cobra model.

    The gene is added to the gene reaction rule of the reaction (appended with
    'or', if the rule is not empty and first is False) and the gene object of
    the model is curated afterwards. The information is read from the row by
    position: 0 = gene ID, 1 = gene name, 2 = protein ID (the part before the
    first '.' is kept), 4 = NCBI gene ID (may be null).

    :param model:    The model.
    :type  model:    cobra.Model
    :param reaction: The reaction id to add the gene to.
    :type  reaction: string
    :param row:      A single row of the output table of map_BiGG_reactions().
    :type  row:      pd.Series
    :param first:    Shows, if gene is the first gene to be added to the reaction.
    :type  first:    bool, true if gene is first to be added.
    :returns:       The updated model.
    :rtype: cobra.Model
    """

    # explicit positional access: integer keys on a Series with string labels
    # are deprecated, plain sequences are still indexed directly
    values = row.iloc if hasattr(row, 'iloc') else row
    gene_id = values[0]
    gene_name = values[1]
    protein_id = values[2]
    ncbi_gene = values[4]

    # add gene
    target = model.reactions.get_by_id(reaction)
    if first or target.gene_reaction_rule == '':
        target.gene_reaction_rule = gene_id
    else:
        target.gene_reaction_rule = target.gene_reaction_rule + ' or ' + gene_id

    # the gene is looked up after the rule has been set
    gene = model.genes.get_by_id(gene_id)

    # add name
    gene.name = gene_name

    # add annotations
    if not pd.isnull(ncbi_gene):
        gene.annotation['ncbigene'] = ncbi_gene
    gene.annotation['ncbiprotein'] = protein_id.split('.')[0]
    # note: annotations like sbo, kegg.genes and uniprot missing

    return model


# UNDER CONSTRUCTION
def add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace:str='BiGG', exclude_dna=True, exclude_rna=True):
    """Build the reaction of a mapped gene and add reaction and gene to the model.

    The reaction is curated via MetaNetX, if the KEGG reaction ID of the row is
    found in reac_xref, otherwise via KEGG only. Afterwards its ID is set
    according to the namespace. If a reaction with this ID is already part of
    the model, only the gene is added to that reaction. Otherwise the reaction
    is added, if it passes isreaction_complete(). If it does not, a message is
    printed and the model is returned without changes.

    :param model:             The model.
    :type  model:             cobra.Model
    :param row:               A single row of the output table of map_BiGG_reactions().
                              The entries 'KEGG.reaction', 'KEGG.notes', 'EC number' and
                              'locus_tag' are read here, further entries in add_gene().
    :type  row:               pd.Series
    :param reac_xref:         The reac_xref table from MetaNetX
    :type  reac_xref:         pd.DataFrame
    :param reac_prop:         The reac_prop table from MetaNetX
    :type  reac_prop:         pd.DataFrame
    :param chem_xref:         The chem_xref table from MetaNetX
    :type  chem_xref:         pd.DataFrame
    :param chem_prop:         The chem_prop table from MetaNetX
    :type  chem_prop:         pd.DataFrame
    :param bigg_metabolites:  The BiGG compound namespace table.
    :type  bigg_metabolites:  pd.DataFrame
    :param namespace:         The namespace for the IDs of new entities.
    :type  namespace:         string, default is 'BiGG'.
    :param exclude_dna:       Tag to include or exclude DNA reactions.
    :type  exclude_dna:       bool, default is True.
    :param exclude_rna:       Tag to include or exclude RNA reactions.
    :type  exclude_rna:       bool, default is True.
    :returns:                 The updated model.
    :rtype:                   cobra.Model
    """

    # create reaction object
    reac = cobra.Reaction(create_random_id(model,'reac','SPECIMEN'))

    # ----------------------------
    # curate reaction via MetaNetX
    # ----------------------------
    # try kegg.reaction --> metanetx.reaction
    # (one filter of the table, instead of converting the whole column into a list first)
    met_reac_kegg = reac_xref[reac_xref['source']==F'kegg.reaction:{row["KEGG.reaction"]}']

    if not met_reac_kegg.empty:

        # get MetaNetX ID
        mnx_reaction_id = met_reac_kegg['ID'].iloc[0]
        met_reac = reac_prop[reac_prop['ID']==mnx_reaction_id]

        # make sure exactly one entry is parsed
        # @TODO : parallel parsing
        if len(met_reac) > 1:
            print(F'Warning: multiple matches for kegg.reaction {row["KEGG.reaction"]} found. Only first one will be used.')
            met_reac = met_reac.head(1)

        # add name
        # --------
        #     from MetaNetX KEGG description
        reac.name = met_reac_kegg['description'].iloc[0].split('|')[0]

        # add notes
        # ---------
        reac.notes['creation'] = 'via MetaNetX'
        reac.notes['KEGG.information'] = row['KEGG.notes']

        # add metabolites
        # ----------------
        reac.add_metabolites(get_metabolites_mnx(model,met_reac['mnx equation'].iloc[0],chem_xref,chem_prop,bigg_metabolites, namespace))
        #@TODO .............
        #   direction of reaction
        #   ---> current solution:
        #        use one direction only
        # ..................

        # add annotations
        # ---------------
        reac.annotation['ec-code'] = row['EC number']
        reac.annotation['kegg.reaction'] = row['KEGG.reaction']
        reac.annotation['metanetx.reaction'] = mnx_reaction_id
        met_reac_anno = reac_xref[reac_xref['ID']==mnx_reaction_id]
        for db in ['metacyc.reaction','seed.reaction','rhea','bigg.reaction']:
            # literal search for the database prefix, entries without a source never match
            db_matches = met_reac_anno[met_reac_anno['source'].str.contains(db, regex=False, na=False)]
            if len(db_matches) == 1:
                reac.annotation[db] = db_matches['source'].iloc[0].split(':',1)[1]
            elif len(db_matches) > 1:
                reac.annotation[db] = [r.split(':',1)[1] for r in db_matches['source'].tolist()]

    # if not possible, use information from KEGG only
    # ------------------------
    # curate reaction via KEGG
    # ------------------------
    else:

        # retrieve reaction information from KEGG
        reac_kegg = kegg_reaction_parser(row['KEGG.reaction'])

        # add name
        # --------
        #     from KEGG name
        reac.name = reac_kegg['name']

        # add notes
        # ---------
        reac.notes['creation'] = 'via KEGG'
        reac.notes['KEGG.information'] = row['KEGG.notes']

        # add metabolites
        # ----------------
        reac.add_metabolites(get_metabolites_kegg(model,reac_kegg['equation'],chem_xref,chem_prop,bigg_metabolites, namespace))
        #@TODO .............
        #   direction of reaction
        #   ---> current solution:
        #        use one direction only
        # ..................

        # add annotations
        # ---------------
        reac.annotation['ec-code'] = row['EC number']
        reac.annotation['kegg.reaction'] = row['KEGG.reaction']
        for db, identifiers in reac_kegg['db'].items():
            if len(identifiers) == 1:
                reac.annotation[db] = identifiers[0]
            else:
                reac.annotation[db] = identifiers

    # --------------------------------------
    # re-set ID to fit namespace if possible
    # --------------------------------------
    match_id_to_namespace(reac, namespace)

    # ---------------------
    # add reaction to model
    # ---------------------

    # if the ID change results in an ID already in the model, use that reaction
    if any(reac.id == known.id for known in model.reactions):
        print(f'{reac.id} already in model, not added a second time.')
    else:
        # check if reaction is complete
        # and fullfills the requirements / parameters
        if isreaction_complete(reac, exclude_dna, exclude_rna):
            model.add_reactions([reac])
        else:
            print(F'reaction {reac.name} for gene {row["locus_tag"]} could not be completely reconstructed, not added to model.')
            return model

    # --------
    # add gene
    # --------
    # the reaction of the model (either the new or the already existing one)
    target = model.reactions.get_by_id(reac.id)
    locus_tag = row['locus_tag']

    # check if gene is already in model
    if locus_tag in model.genes:
        # if - for whatever reason - gene already in gpr, skip
        # (gene IDs are compared, as a substring search in the rule would
        #  also find e.g. 'gene1' in 'gene10')
        if any(gene.id == locus_tag for gene in target.genes):
            return model
        # create new gpr, if nonexistent
        elif not target.gene_reaction_rule:
            target.gene_reaction_rule = locus_tag
        # add gene to existing gpr
        else:
            target.gene_reaction_rule = target.gene_reaction_rule + ' or ' + locus_tag
    else:
        # add (to) gene reaction rule and curate new gene object
        model = add_gene(model, reac.id, row, first=not target.gene_reaction_rule)

    return model

# notes
# @CHECK : connections, e.g. input is now a param short 
def extent_model(table, model,chem_prop_file,chem_xref_file,reac_prop_file,reac_xref_file, namespace, exclude_dna=True, exclude_rna=True):
    """Add reactions, metabolites and genes to a model based on a table of mapped genes
    (output of the BiGG mapping step, i.e. map_to_bigg / map_BiGG_reactions).

    Every row of the table is handled in one of three ways:

    1. At least one BiGG ID of the row is already annotated in the model
       -> the gene is added to these reactions.
    2. The KEGG reaction ID of the row is already annotated in the model
       -> the gene is added to all reactions carrying this annotation.
    3. Otherwise, the reaction(s) of the row are handed to add_reaction(),
       one call per KEGG reaction ID.

    :param table:                 The table with the information to be added to the model.
                                  The columns 'bigg_id' (space-separated BiGG IDs or NaN) and
                                  'KEGG.reaction' (space-separated KEGG reaction IDs, must not be empty)
                                  are read here; the complete row is passed on to add_gene / add_reaction.
    :type  table:                 pd.DataFrame, output of map_to_bigg
    :param model:                 The genome-scale metabolic model to be extended
    :type  model:                 cobra.Model
    :param chem_prop_file:        Path to the MetaNetX chem_prop file.
    :type  chem_prop_file:        string
    :param chem_xref_file:        Path to the MetaNetX chem_xref file.
    :type  chem_xref_file:        string
    :param reac_prop_file:        Path to the MetaNetX reac_prop file.
    :type  reac_prop_file:        string
    :param reac_xref_file:        Path to the MetaNetX reac_xref file.
    :type  reac_xref_file:        string
    :param namespace:             Namespace of the model IDs, passed on to add_reaction (e.g. 'BiGG').
    :type  namespace:             string
    :param exclude_dna:           Tag to include or exclude DNA reactions.
    :type  exclude_dna:           bool, default is True.
    :param exclude_rna:           Tag to include or exclude RNA reactions.
    :type  exclude_rna:           bool, default is True.
    :returns:                     The extended model.
    :rtype:                       cobra.Model
    """

    # load MetaNetX database / namespace
    chem_prop = pd.read_csv(chem_prop_file, sep='\t', comment='#', names=['ID','name','reference','formula','charge','mass','InChI','InChIKey','SMILES'])
    chem_xref = pd.read_csv(chem_xref_file, sep='\t', comment='#', names=['source','ID','description'])

    reac_prop = pd.read_csv(reac_prop_file, sep='\t', comment='#', names=['ID','mnx equation','reference','classifs','is_balanced','is_transport'])
    reac_xref = pd.read_csv(reac_xref_file, sep='\t', comment='#', names=['source','ID','description'])

    # load bigg metabolite namespace
    bigg_metabolites = load_a_table_from_database('bigg_metabolites', False)
    bigg_metabolites.rename(columns={'id':'bigg_id'}, inplace=True)
    bigg_metabolites = bigg_metabolites[['bigg_id','universal_bigg_id','name','CHEBI','BioCyc','KEGG Compound','MetaNetX (MNX) Chemical','SEED Compound','InChI Key']]

    # add genes one by one to model
    print('\tAdding genes and if needed reactions and metabolites to model:')
    # iterate over row POSITIONS: the rows are fetched with .iloc, and the
    # index labels of the table are not guaranteed to be 0..n-1
    # (e.g. after filtering without reset_index)
    for row_pos in tqdm(range(len(table))):

        # the dictionaries are rebuilt for every row, since the previous row
        # may have added new reactions to the model
        # generate Name -> KEGG.reaction dictionary
        react_dict = make_reaction_annotation_dict(model,'KEGG')
        # generate Name -> BiGG.reaction dictionary
        react_dict_2 = make_reaction_annotation_dict(model,'BiGG')

        # get row in pandas format
        row = table.iloc[row_pos]

        # BiGG IDs of this row, that are already annotated in the model
        bigg_found = []
        if not pd.isnull(row['bigg_id']):
            bigg_found = [_ for _ in row['bigg_id'].split(' ') if _ in react_dict_2.values()]

        # case 1: BiGG name already in model = reaction in model
        if bigg_found:
            # add genes to all reactions
            # NOTE(review): the BiGG IDs themselves are used as reaction IDs here,
            # while case 2 uses the model's reaction IDs (dictionary keys)
            # - confirm that model reaction IDs equal their BiGG annotation
            for r in bigg_found:
                model = add_gene(model, r, row)

        # case 2: KEGG reaction ID in model = reaction probably in model as well
        elif row['KEGG.reaction'] in react_dict.values():
            # get corresponding reaction
            react_found = [_ for _ in react_dict.keys() if row['KEGG.reaction'] == react_dict[_]]
            # add gene to all reactions found
            for r in react_found:
                model = add_gene(model,r,row)

        # case 3: reaction not in model
        #         -> add reaction(s), gene and metabolites if needed
        else:
            # case 3.1: one reaction
            react = row['KEGG.reaction'].split(' ')
            if len(react) == 1:
                model = add_reaction(model,row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace, exclude_dna, exclude_rna)

            # case 3.2: multiple reactions
            #           add each reaction separately, with the same gene for the gene reaction rule
            # note: zero reactions not possible due to previous filtering
            else:

                for r in react:
                    # work on a copy, so that every call sees exactly one KEGG reaction ID
                    temp_row = row.copy(deep=True)
                    temp_row['KEGG.reaction'] = r
                    model = add_reaction(model,temp_row,reac_xref,reac_prop,chem_xref,chem_prop,bigg_metabolites, namespace, exclude_dna, exclude_rna)

    return model



### Test Area 51

In [28]:
# raw table of mapped genes, as stored on disk
# NOTE(review): `mg` is not used by any of the following test cells - confirm whether it is still needed
mg = pd.read_csv(mapped_genes_path)
# same file, run through the BiGG reaction mapping; this table is the input of extent_model below
mapped_genes_comp = map_BiGG_reactions(mapped_genes_path)

In [74]:
# load the draft model from disk and extend it with the mapped genes
draft_model = load_model_cobra(draft_model_path)
new_model = extent_model(
    table=mapped_genes_comp,
    model=draft_model,
    chem_prop_file=chem_prop_file,
    chem_xref_file=chem_xref_file,
    reac_prop_file=reac_prop_file,
    reac_xref_file=reac_xref_file,
    namespace='BiGG',
    exclude_dna=True,
    exclude_rna=True,
)

	Adding genes and if needed reactions and metabolites to model:


  0%|          | 0/53 [00:00<?, ?it/s]

reaction tRNA-uridine uracilmutase for gene AB-1_S128_00258 could not be completely reconstructed, not added to model.
reaction L-Methionine:tRNAMet ligase (AMP-forming) for gene AB-1_S128_00407 could not be completely reconstructed, not added to model.
reaction Aminoacyl-tRNA aminoacylhydrolase for gene AB-1_S128_00466 could not be completely reconstructed, not added to model.
reaction S-adenosyl-L-methionine:rRNA (adenine-N6-)-methyltransferase for gene AB-1_S128_01553 could not be completely reconstructed, not added to model.
reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_02056 could not be completely reconstructed, not added to model.
reaction L-Cysteine:tRNA(Cys) ligase (AMP-forming) for gene AB-1_S128_02165 could not be completely reconstructed, not added to model.
reaction deoxynucleoside triphosphate:DNA deoxynucleotidyltransferase for gene AB-1_S128_02195 could not be completely reconstructed, not added to model.
reaction tRNA-guanosin

In [76]:
# collect the IDs of all reactions whose 'creation' note is not 'via template',
# print them one per line, followed by their number
non_template_ids = [
    reaction.id
    for reaction in new_model.reactions
    if reaction.notes['creation'] != 'via template'
]
for reaction_id in non_template_ids:
    print(reaction_id)
c = len(non_template_ids)
print(c)

SPECIMENreac2A7RQ7
SPECIMENreac81FWMJ
SPECIMENreacPKXFUQ
AGMT
SPECIMENreac3WTKGW
R_TPRDCOAS
SPECIMENreacCFJ7N8
SPECIMENreacJPD1V9
SPECIMENreacQHC8MB
SPECIMENreacNO6KCQ
SPECIMENreac522PJU
SPECIMENreacJU0MHQ
SPECIMENreac7U4580
SPECIMENreac0OTQ1Q
SPECIMENreac8JUQXD
SPECIMENreacHPK3IW
SPECIMENreacTNN7GZ
MAN1PT
SPECIMENreac44OE2G
SPECIMENreacZ3S4RB
SPECIMENreacQBYJ8V
SPECIMENreacJOSC6B
SPECIMENreac9ZVS32
23


In [85]:
# inspect one of the non-template reactions listed by the previous cell
new_model.reactions.get_by_id('SPECIMENreacJU0MHQ')

0,1
Reaction identifier,SPECIMENreacJU0MHQ
Name,GTP:GTP guanylyltransferase
Memory address,0x1fbd61060
Stoichiometry,"2.0 gtp_c --> SPECIMENmeta0B2QH7 + 2.0 ppi_c  2.0 GTP C10H12N5O14P3 --> cyclic di-3',5'-guanylate + 2.0 Diphosphate"
GPR,AB-1_S128_02424
Lower bound,0.0
Upper bound,1000.0


In [87]:
# inspect the product of reaction SPECIMENreacJU0MHQ that carries a SPECIMENmeta ID
# NOTE(review): the stored output is a plain dict of annotations, presumably from an
# earlier version of this cell (e.g. with `.annotation` appended) - re-run to confirm
new_model.metabolites.get_by_id('SPECIMENmeta0B2QH7')

{'metanetx.chemical': 'MNXM731193',
 'chebi': 'CHEBI:58805',
 'inchikey': 'PKFDLKSEZWEFGL-MHARETSRSA-L',
 'kegg.compound': 'C16463',
 'metacyc.compound': 'C-DI-GMP',
 'seed.compound': 'cpd15167'}

In [160]:
# reload the draft model from disk before testing match_id_to_namespace on it
# NOTE(review): an assignment produces no cell output, so the annotation dict stored
# with this cell looks stale - re-run to confirm
draft_model = load_model_cobra(draft_model_path)

{'sbo': 'SBO:0000185',
 'metanetx.reaction': 'MNXR94675',
 'bigg.reaction': '12DGR120tipp',
 'eco': 'ECO:0007759'}

In [161]:
# ID of the first reaction in the freshly loaded draft model (reference for the test in the next cell)
draft_model.reactions[0].id

'12DGR120tipp'

In [162]:
# overwrite the ID of the first reaction with a dummy value and print the ID
# before and after calling match_id_to_namespace with the 'BiGG' namespace
first_reaction = draft_model.reactions[0]
first_reaction.id = 'dud'
print(first_reaction.id)
match_id_to_namespace(first_reaction, 'BiGG')
print(first_reaction.id)

dud
12DGR120tipp
