In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
from refinegems.classes.gapfill import GeneGapFiller
from refinegems.utility.io import load_model

modelpath = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435.xml'
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

gffpath = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435_RefSeq.gff'

gf2 = GeneGapFiller()
gf2_missing_genes = gf2.get_missing_genes(gffpath,model)
# ncbiprotein | locus_tag | ec-code

tfasta = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435_proteins_genome.fasta'
spdb = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/swissprot.dmnd'
# gf2_missing_genes
spmap = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/uniprot_table.tsv'
kwargs = {'outdir':'/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435',
          'sens':'more-sensitive',
          'cov':90.0,
          't':4,
          'pid':90.0}

mapped_res = gf2.get_missing_reacs(model=cmodel,
                                   missing_genes=gf2_missing_genes,
                                   fasta=tfasta, 
                                   dmnd_db=spdb,
                                   swissprot_map=spmap,
                                   **kwargs)

Running in debugging mode.


## the "Filling" part of Gapfilling

**input**

- the model
- missing genes table
- missing reacs table

**out**

- the extended model

**else**

- logging
- save stats information

In [15]:
import pandas as pd
from itertools import chain
import re
from refinegems.utility.cvterms import add_cv_term_genes
from libsbml import FbcOr, FbcAnd, GeneProductRef
import warnings


# @TODO merge with the function of the same name in entities, if possible
# or just use them separatly 
# @TODO generalise addition of references -> maybe kwargs
# @TODO
# what to do about the name
def create_gp(model, protein_id, 
              name:str=None, locus_tag:str=None,
              uniprot:tuple[str,bool]=None):
    
    # create gene product object
    gp = model.getPlugin(0).createGeneProduct()
    # set basic attributes
    geneid = f'G_{protein_id}'.replace('.','_') # remove problematic signs
    gp.setIdAttribute(geneid)               # ID 
    if name: gp.setName(name)               # Name  
    if locus_tag: gp.setLabel(locus_tag)    # Label
    gp.setSBOTerm('SBO:0000243')            # SBOterm
    gp.setMetaId(f'meta_G_{protein_id}')    # Meta ID
    # test for NCBI/RefSeq
    if re.fullmatch('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|WP|XM|XP|XR|YP|ZP)_\d+)|(NZ_[A-Z]{2,4}\d+))(\.\d+)?$', protein_id, re.IGNORECASE):
        id_db = 'REFSEQ'
    elif re.fullmatch('^(\w+\d+(\.\d+)?)|(NP_\d+)$', protein_id, re.IGNORECASE): id_db = 'NCBI'
    if id_db: add_cv_term_genes(protein_id, id_db, gp)           # NCBI protein
    # add further references
    # @TODO extend or generalise
    if uniprot:
        for uniprotid in uniprot[0]:
            add_cv_term_genes(uniprotid, 'UNIPROT', gp, uniprot[1]) # UniProt
   
   
# probably sort into GapFiller
def add_genes_from_table(model, gene_table:pd.DataFrame):
    
    # ncbiprotein | locus_tag | ec-code | ...
    # work on a copy to ensure input stays the same
    gene_table = gene_table.copy()
    gene_table.drop(columns=['ec-code'],inplace=True)
    
    # create gps from the table and add them to the model
    for idx,x in gene_table.iterrows():
        create_gp(model, x['ncbiprotein'], 
                  locus_tag=x['locus_tag'],
                  uniprot=(x['UniProt'],True))
        

# @TODO : does it cover indeed all cases
# Where to sort it -> entities?
def create_gpr(reaction,gene):

    # Step 1: test, if there is already a gpr
    # ---------------------------------------
    old_association_str = None
    old_association_fbc = None
    if reaction.getPlugin(0).getGeneProductAssociation():
        old_association = reaction.getPlugin(0).getGeneProductAssociation().getListOfAllElements()
        # case 1: only a single association
        if len(old_association) == 1 and isinstance(old_association[0],GeneProductRef):
            old_association_str = old_association[0].getGeneProduct()
        # case 2: nested structure of asociations
        elif isinstance(old_association[0], FbcOr) or isinstance(old_association[0], FbcAnd):
            old_association_fbc = old_association[0].clone()
            # this should get the highest level association (that includes all others)

                    
    # Step 2: create new gene product association 
    # -------------------------------------------
    if old_association_str and isinstance(gene,str):
        gene = [old_association_str,id]
    elif old_association_str  and isinstance(gene,list):
        gene.append(old_association_str)
        
    # add the old association rule as an 'OR' (if needed)
    if not old_association_fbc:
        new_association = reaction.getPlugin(0).createGeneProductAssociation()
    else:
        new_association = reaction.getPlugin(0).createGeneProductAssociation().createOr()
        new_association.addAssociation(old_association_fbc)

    # add the remaining genes 
    # @TODO currently, only connection possible is 'OR'
    if isinstance(gene,str):
        new_association.createGeneProductRef().setGeneProduct(gene)
    elif isinstance(gene,list) and len(gene) == 1:
        new_association.createGeneProductRef().setGeneProduct(gene[0])
    elif isinstance(gene,list) and len(gene) > 1:
        gpa_or =  new_association.createOr()
        for i in gene:
            gpa_or.createGeneProductRef().setGeneProduct(i)
            

# @TODO seems very ridgid, beter ways to find the ids?
# probably sort into GapFiller
def add_gene_reac_associations_from_table(model,reac_table:pd.DataFrame):
    
    model_gene_ids = [_.getId() for _ in model.getPlugin(0).getListOfGeneProducts()]
    
    # get each unique ncbiprotein vs reaction mapping
    reac_table = reac_table[['ncbiprotein','add_to_GPR']]
    reac_table = reac_table.explode('ncbiprotein').explode('add_to_GPR')
    reac_table.drop_duplicates(inplace=True)
    
    # add the genes to the corresponding GPRs
    for idx,row in reac_table.iterrows():
        # check, if G_+ncbiprotein in model
        # if yes, add gpr
        geneid = 'G_'+row['ncbiprotein'].replace('.','_')
        reacid = 'R_'+row['add_to_GPR']
        if geneid in model_gene_ids:
            create_gpr(model.getReaction(reacid),geneid)
         # else, print warning
        else:
            mes = f'Cannot find {geneid} in model. Should be added to {reacid}'
            warnings.warn(mes,UserWarning)
    

def fill_model(model, missing_genes:pd.DataFrame, 
               missing_reacs:pd.DataFrame):
    
    # Step 1: Add genes to model whoose reactions are already in it
    # -------------------------------------------------------------
    # filter the respective genes and reactions
    reacs_in_model = missing_reacs[~(missing_reacs['add_to_GPR'].isnull())]
    ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model['ncbiprotein']))]
    genes_with_reacs_in_model = missing_genes[missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model)]
    
    if len(genes_with_reacs_in_model) > 0:
        # add genes as gene products to model
        add_genes_from_table(model, genes_with_reacs_in_model)
    
        # extend gene production rules 
        add_gene_reac_associations_from_table(model,reacs_in_model)
        
        # what remains:
        missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]
        missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]
    
    
    # Step 2: 

In [16]:
# [*chain(*list(mapped_res[1][~mapped_res[1]['add_to_GPR']]['ncbiprotein']))]
testmodel = model.clone()
# print(testmodel.getReaction('R_12DGR160tipp').getPlugin(0).getGeneProductAssociation().getListOfAllElements())
testcase = mapped_res[1].copy()
testcase.iloc[2,-1] = ['12DGR160tipp']
fill_model(testmodel,mapped_res[0],testcase)
after = testmodel.getPlugin(0).getListOfGeneProducts()

In [17]:
testmodel.getReaction('R_12DGR160tipp').getPlugin(0).getGeneProductAssociation().getListOfAllElements()[0].getGeneProduct()

'G_WP_011274813_1'

In [18]:
testcase.iloc[2,3]

'1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM731466@MNXD1 + 1 MNXM735438@MNXD1 = 1 MNXM3180@MNXD1 + 1 MNXM8@MNXD1 + 1 WATER@MNXD1'

### Further ideas and Code snippets for the filling part

##### how to build the new entities:

- option a) collection all information first, filter and then add them from table
- option b) iteratively collection information and add entities (reaction after reaction)

use libsbml or cobrapy?

available functions:
- libsbml-based create_reaction/create_species (needs all information beforehand + all other entities need to be in the model) -> required for the gene labels
- cobra-based add_reaction/add_metabolite (builds as it goes), also match_id_to_namespace and 
finding possible matches might be easier using COBRApy <- namespace and annotation stuff far easier here

definitly needed:
- parse reaction string of different formats:
    - MetaNetX (can get this somewhat from SPECIMEN)
    - KEGG (also somewhat in SPECIMEN)
    - BiGG (new?)
    - BioCyc (new?)
- retrieve needed information from the required databases (reaction/metabolites)
    - cross referencing, if one db not enough?
- filter for when to include reactions and when not (e.g. missing metabolites, formulas, DNA/RNA etc.) **This means, before adding stuff to the model, it needs to be validated**


#### Reload a libsbml model into a cobra model

In [19]:
from tempfile import NamedTemporaryFile
from refinegems.utility.io import write_model_to_file, load_model

with NamedTemporaryFile(suffix='.xml') as tmp:
    print(tmp)
    write_model_to_file(model,tmp.name)
    cobramodel = load_model(tmp.name,'cobra')

<tempfile._TemporaryFileWrapper object at 0x2c5d7c190>


#### Parse reaction string

In [123]:

equation = '1 MNXM1100221@MNXD1 + 1 MNXM735047@MNXD1 + 1 MNXM9@MNXD1 = 1 MNXM1100222@MNXD1 + 1 MNXM286@MNXD1'
equation2 = 'aspsa_c + nadp_c + pi_c <-> 4pasp_c + h_c + nadph_c'
equation3 = 'C00024 + C00025 <=> C00010 + C00624'

# @TODO: BioCyc missing
def parse_reac_str(equation, type='MetaNetX'):

    products = {}
    reactants = {}
    compartments = list()
    is_product = False
    reversible = True

    match type:
        case 'MetaNetX':
            for s in equation.split(' '):
                # switch from reactants to products
                if s == '=':
                    is_product = True
                # found stoichiometric factor
                elif s.isnumeric():
                    factor = float(s)
                # skip
                elif s == '+':
                    continue
                # found metabolite
                else:
                    # get information from MetaNetX
                    metabolite, compartment = s.split('@')
                    compartments.append(compartment)
                    
                    if is_product:
                        products[metabolite] = factor
                    else:
                        reactants[metabolite] = factor
                        
        case 'BiGG':
            factor = 1.0 # BiGG does not use factor 1 in the quations
            for s in equation.split(' '):
                # found factor
                if s.replace('.','').isdigit():
                    factor = float(s)
                # switch from reactants to products
                elif s == '-->' :
                    is_product = True
                    reversible = False
                elif s == '<->':
                    is_product = True
                # skip
                elif s == '+':
                    continue
                # found metabolite
                else:
                    compartments.append(s.rsplit('_',1)[1])
                    if is_product:
                        products[s] = factor
                    else:
                        reactants[s] = factor
                    factor = 1.0
              
        case 'KEGG':
            compartments = None
            factor = 1.0
            for s in equation.split(' '):
                if s.isnumeric():
                    factor = float(s)
                elif s == '+':
                    continue
                elif s == '<=>': # @TODO are there more options?
                    is_product = True
                else:
                    if is_product:
                        products[s] = factor
                    else:
                        reactants[s] = factor
                    factor = 1.0
        
        case 'BioCyc':
            pass
                  
    return (reactants,products,compartments,reversible)
        
        
        
parse_reac_str(equation3,'KEGG')

({'C00024': 1.0, 'C00025': 1.0}, {'C00010': 1.0, 'C00624': 1.0}, None, True)

#### check, if a reaction should be added or not

In [21]:
# originally from SPECIMEN HQTB
# @TODO
def isreaction_complete(reac:cobra.Reaction, 
                        exclude_dna:bool=True, exclude_rna:bool=True) -> bool:

    # check reaction
    if exclude_dna and 'DNA' in reac.name:
        return False
    if exclude_rna and 'RNA' in reac.name:
        return False

    # check metabolites
    for m in reac.metabolites:
        if m.id == '' or pd.isnull(m.id):
            return False
        if m.name == '' or pd.isnull(m.name):
            return False
        if m.formula == '' or pd.isnull(m.formula):
            return False

    return True



#### creating the reactions

In [23]:
# some decorators
def template(func):
    def wrapper():
        print('This function is a template for developers.\nThis cannot be executed.')
    return wrapper

def implement(func):
    def wrapper():
        print('The current function is just a placeholder and will be implement in the fucture.')
    return wrapper

In [124]:
from refinegems.utility.io import load_a_table_from_database
from refinegems.utility.entities import create_random_id, match_id_to_namespace
import cobra
import pandas as pd
from typing import Literal
from Bio.KEGG import REST, Compound
import urllib
import requests 
import json

# @TODO : name is an issue
def get_BiGG_metabs_annotation_via_dbid(metabolite, id, dbcol, compartment):
    if not 'bigg.metabolite' in metabolite.annotation.keys():
        bigg_search = load_a_table_from_database(
            f'SELECT * FROM bigg_metabolites WHERE \'{dbcol}\' = \'{id}\'',
            query=True)
        if len(bigg_search) > 0:
            metabolite.annotation['bigg.metabolite'] = [_ for _ in bigg_search['id'].tolist() if _.endswith(f'_{compartment}')]
            if len(metabolite.annotation['bigg.metabolite']) == 0:
                metabolite.annotation.pop('bigg.metabolite')


def add_annotations_from_BiGG_metabs(metabolite:cobra.Metabolite):
    if 'bigg.metabolite' in metabolite.annotation.keys():
        bigg_information = load_a_table_from_database(
            'SELECT * FROM bigg_metabolites WHERE id = \'' + f'\' OR id = \''.join(metabolite.annotation['bigg.metabolite']) + '\'',
            query=True)
        db_id_bigg = {'BioCyc':'biocyc', 'MetaNetX (MNX) Chemical':'metanetx.chemical','SEED Compound':'seed.compound','CHEBI':'chebi', 'KEGG Compound':'kegg.compound'}
        for db in db_id_bigg:
            info = list(set(bigg_information[db].dropna().to_list()))
            if len(info) > 0:
                info = ','.join(info)
                info = [x.strip() for x in info.split(',')] # make sure all entries are a separate list object
                if db_id_bigg[db] in metabolite.annotation.keys():
                    metabolite.annotation[db_id_bigg[db]] = list(set(info + metabolite.annotation[db_id_bigg[db]]))
                else:
                    metabolite.annotation[db_id_bigg[db]] = info


@template
def build_metabolite_xxx(id:str, model:cobra.Model, 
                         namespace:str,
                         compartment:str,
                         idprefix:str) -> cobra.Metabolite: 
    # check if id in model
    # get information via id
    # collection formation in a new metabolite object
    # add more annotations from other databases
    # adjust namespace
    # check model again for new namespace
    pass

# originally from SPECIMEN
# @TODO some issues left
# current version works on a couple of examples 
def build_metabolite_mnx(id: str, model:cobra.Model, 
                         namespace:str='BiGG',
                         compartment:str='c',
                         idprefix:str='refineGEMs') -> cobra.Metabolite | None:

    # fast check if compound already in model
    # ------------------------------------------
    # step 1: check if MetaNetX ID in model
    matches = [x.id for x in model.metabolites if 'metanetx.chemical' in x.annotation and x.annotation['metanetx.chemical']==id and x.compartment == compartment]

    # step 2: if yes, retrieve metabolite from model
        # case 1: multiple matches found
    if len(matches) > 0:
        if len(matches) > 1:
            # ................
            # @TODO what to do
            # currently, just the first one is taken
            # ................
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match

    # if not, create new metabolite
    # -----------------------------
    metabolite_prop = load_a_table_from_database(f'SELECT * FROM mnx_chem_prop WHERE id = \'{id}\'')
    metabolite_anno = load_a_table_from_database(f'SELECT * FROM mnx_chem_xref WHERE id = \'{id}\'')
    if len(metabolite_prop) == 0: # cannot construct metabolite
        return None
    else:
        
        # step 1: create a random metabolite ID
        new_metabolite = cobra.Metabolite(create_random_id(model, 'meta', idprefix)) 

        # step 2: add features
        # --------------------
        new_metabolite.formula = metabolite_prop['formula'].iloc[0]
        new_metabolite.name = metabolite_prop['name'].iloc[0]
        new_metabolite.charge = metabolite_prop['charge'].iloc[0]
        new_metabolite.compartment = compartment

        # step 3: add notes
        # -----------------
        new_metabolite.notes['created with'] = 'refineGEMs GapFiller, metanetx.chemical'

        # step 4: add annotations
        # -----------------------
        # add SBOTerm
        new_metabolite.annotation['sbo'] = 'SBO:0000247'
        
        # add information directly available from the mnx_chem_prop table 
        new_metabolite.annotation['metanetx.chemical'] = [metabolite_prop['id'].iloc[0]]
        if not pd.isnull(metabolite_prop['InChIKey'].iloc[0]):
            new_metabolite.annotation['inchikey'] = metabolite_prop['InChIKey'].iloc[0].split('=')[1]
        
        # get more annotation from the mnx_chem_xref table
        for db in ['kegg.compound','metacyc.compound','seed.compound','bigg.metabolite','chebi']:
            db_matches = metabolite_anno[metabolite_anno['source'].str.contains(db)]
            if len(db_matches) > 0:
                new_metabolite.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]

        # Cleanup BiGG annotations (MetaNetX only saves universal)
        # @TODO : there is no guarantee, that the id with the specific compartment actually exists -> still do it? // kepp the universal id?
        new_metabolite.annotation['bigg.metabolite'] = [_+'_'+compartment for _ in new_metabolite.annotation['bigg.metabolite']]
        # if no BiGG was found in MetaNetX, try reverse search in BiGG
        get_BiGG_metabs_annotation_via_dbid(new_metabolite, id, 'MetaNetX (MNX) Chemical', compartment)
                
        # add additional information from BiGG (if ID found)    
        add_annotations_from_BiGG_metabs(new_metabolite)

        # step 5: change ID according to namespace
        # ----------------------------------------
        match_id_to_namespace(new_metabolite,namespace)
       
        # step 6: re-check existence of ID in model
        # -----------------------------------------
        # @TODO : check complete annotations? 
        #        - or let those be covered by the duplicate check later on?
        if new_metabolite.id in [_.id for _ in model.metabolites]:
            return model.metabolites.get_by_id(new_metabolite.id)
           
    return new_metabolite


# originally from SPECIMEN
# @TODO some issues left
# current version works on a couple of examples 
def build_metabolite_kegg(kegg_id:str, model:cobra.Model, 
                          namespace:Literal['BiGG']='BiGG', 
                          compartment:str='c',
                          idprefix='refineGEMs') -> cobra.Metabolite | None:

    
    # ---------------------------------------
    # fast check if compound already in model
    # ---------------------------------------
    # step 1: check via KEGG ID
    matches = [x.id for x in model.metabolites if ('kegg.compound' in x.annotation and x.annotation['kegg.compound'] == kegg_id)]
    if len(matches) > 0:
        # step 2: model id --> metabolite object
        #  case 1: multiple matches found
        if len(matches) > 1:
            # .......
            # @TODO
            # .......
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match

    # -----------------------------
    # if not, create new metabolite
    # -----------------------------
    
    # step 1: retrieve KEGG entry for compound
    # ----------------------------------------
    try:
        kegg_handle = REST.kegg_get(kegg_id)
        kegg_record = [r for r in Compound.parse(kegg_handle)][0]
    except urllib.error.HTTPError:
        warnings.warn(F'HTTPError: {kegg_id}')
        return None
    except ConnectionResetError:
        warnings.warn(F'ConnectionResetError: {kegg_id}')
        return None
    except urllib.error.URLError:
        warnings.warn(F'URLError: {kegg_id}')
        return None

    # step 2: create a random metabolite ID
    # -------------------------------------
    new_metabolite = cobra.Metabolite(create_random_id(model, 'meta',idprefix)) 

    # step 3: add features
    # --------------------
    # set name from KEGG and additionally use it as ID if there is none yet
    if isinstance(kegg_record.name, list):
        # @TODO : better way to choose a name than to just take the first entry???
        new_metabolite.name = kegg_record.name[0]
    else:
        new_metabolite.name = kegg_record.name
    # set compartment
    new_metabolite.compartment = compartment
    # set formula
    new_metabolite.formula = kegg_record.formula

    # step 4: add notes
    # -----------------
    new_metabolite.notes['created with'] = 'refineGEMs GapFiller, KEGG.compound'

    # step 5: add annotations
    # -----------------------
    # add annotation from the KEGG entry
    new_metabolite.annotation['kegg.compound'] = kegg_id
    db_idtf = {'CAS':'cas','PubChem':'pubchem.compound','ChEBI':'chebi'}
    for db,ids in kegg_record.dblinks:
        if db in db_idtf:
            new_metabolite.annotation[db_idtf[db]] = ids
            
    # add SBOTerm
    new_metabolite.annotation['sbo'] = 'SBO:0000247'

    # search for infos in MetaNetX
    # @TODO, since the table are readily available at the database now
    mnx_info = load_a_table_from_database(
        f'SELECT * FROM mnx_chem_xref WHERE source = \'kegg.compound:{kegg_id}\'',
        query=True
    )
    if len(mnx_info) > 0:
        mnx_ids = list(set(mnx_info['id']))
    # mapping is unambiguously
    if len(mnx_ids) == 1:
        mnx_info = load_a_table_from_database(
        f'SELECT * FROM mnx_chem_prop WHERE id = \'{mnx_ids[0]}\'',
        query=True
        )
        # add charge 
        new_metabolite.charge = mnx_info['charge'].iloc[0]
        # add more annotations
        new_metabolite.annotation['metanetx.chemical'] = [mnx_info['id'].iloc[0]]
        if not pd.isnull(mnx_info['InChIKey'].iloc[0]):
            new_metabolite.annotation['inchikey'] = mnx_info['InChIKey'].iloc[0].split('=')[1]
        
        # get more annotation from the mnx_chem_xref table 
        metabolite_anno = load_a_table_from_database(f'SELECT * FROM mnx_chem_xref WHERE id = \'{mnx_info["id"]}\'')
        for db in ['kegg.compound','metacyc.compound','seed.compound','bigg.metabolite','chebi']:
            db_matches = metabolite_anno[metabolite_anno['source'].str.contains(db)]
            if len(db_matches) > 0:
                mnx_tmp = [m.split(':',1)[1] for m in db_matches['source'].tolist()]
                if db in new_metabolite.annotation.keys():
                    new_metabolite.annotation[db] = list(set(mnx_tmp + new_metabolite.annotation[db]))
                else:
                    new_metabolite.annotation[db] = mnx_tmp

    else:
        pass
        # @TODO : how to handle multiple matches, e.g. getting charge will be complicated
        
    # Cleanup BiGG annotations (MetaNetX only saves universal)
    # @TODO : there is no guarantee, that the id with the specific compartment actually exists -> still do it? // kepp the universal id?
    if 'bigg.metabolite' in new_metabolite.annotation.keys():
        new_metabolite.annotation['bigg.metabolite'] = [_+'_'+compartment for _ in new_metabolite.annotation['bigg.metabolite']]
    
    # if no BiGG ID, try reverse search
    get_BiGG_metabs_annotation_via_dbid(new_metabolite, id, 'KEGG Compound', compartment)
    
    # search for annotations in BiGG
    add_annotations_from_BiGG_metabs(new_metabolite)

    # step 6: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite,namespace)
    
    # step 7: re-check existence of ID in model
    # -----------------------------------------
    # @TODO : check complete annotations? 
    #        - or let those be covered by the duplicate check later on?
    if new_metabolite.id in [_.id for _ in model.metabolites]:
        return model.metabolites.get_by_id(new_metabolite.id)

    return new_metabolite


# @TEST some more, somewhat works, but who knows...
# @TODO some comments inside
# @NOTE expects the non-universal BiGG ID (meaning the one with the compartment abbreviation
#       at the end) -> change behaviour or keep it?
def build_metabolite_bigg(id:str, model:cobra.Model, 
                         namespace:Literal['BiGG']='BiGG',
                         idprefix:str='refineGEMs') -> cobra.Metabolite: 
    
    compartment = id.rsplit('_',1)[1]
    # ------------------------------------------
    # fast check if compound already in model
    # ------------------------------------------
    # step 1: check if MetaNetX ID in model
    matches = [x.id for x in model.metabolites if 'bigg.metabolite' in x.annotation and (x.annotation['bigg.metabolite']==id or x.annotation['bigg.metabolite']==id.rsplit('_',1)[0]) and x.compartment == compartment]
    # step 2: if yes, retrieve metabolite from model
        # case 1: multiple matches found
    if len(matches) > 0:
        if len(matches) > 1:
            # ................
            # @TODO what to do
            # currently, just the first one is taken
            # ................
            match = model.metabolites.get_by_id(matches[0])
        #  case 2: only one match found
        else:
            match = model.metabolites.get_by_id(matches[0])

        # step 3: add metabolite
        return match
    
    # -----------------------------
    # if not, create new metabolite
    # -----------------------------
    # get information from the database
    bigg_res = load_a_table_from_database(
            f'SELECT * FROM bigg_metabolites WHERE id = \'{id}\'',
            query=True)
    if len(bigg_res) > 0:
        bigg_res = bigg_res.iloc[0,:]
    else:
        return None # not data = no recontruction
    # get information from MNX if ID available
    mnx_res=None
    if bigg_res['MetaNetX (MNX) Chemical']:
        mnx_res = load_a_table_from_database(
            f'SELECT * FROM mnx_chem_prop WHERE id=\'{bigg_res["MetaNetX (MNX) Chemical"]}\'',
            query=True)
        if len(mnx_res) > 0:
            mnx_res = mnx_res.iloc[0,:]
        else:
            mnx_res=None 
    
    # step 1: create a random metabolite ID
    # -------------------------------------
    new_metabolite = cobra.Metabolite(create_random_id(model, 'meta', idprefix)) 
    
    # step 2: add features
    # --------------------
    new_metabolite.name = bigg_res['name']
    new_metabolite.compartment = compartment
    if mnx_res is not None:
        if mnx_res['charge']:
            new_metabolite.charge = mnx_res['charge']
        if mnx_res['formula']:
            new_metabolite.formula = mnx_res['formula']
            
    if not new_metabolite.formula or not new_metabolite.charge:
        try:
            bigg_fetch = json.loads(requests.get(f'http://bigg.ucsd.edu/api/v2/universal/metabolites/{id.rsplit("_",1)[0]}').text)
            if 'formulae' in bigg_fetch.keys() and not new_metabolite.formula:     
                new_metabolite.formula = bigg_fetch['formulae'][0]
            if 'charges' in bigg_fetch.keys() and new_metabolite.charge:   
                new_metabolite.charge = bigg_fetch['charges'][0]
        except Exception as e:
            # @TODO
            pass
    
    # step 3: add notes
    # -----------------
    new_metabolite.notes['created with'] = 'refineGEMs GapFiller, BiGG'

    # step 4: add annotations
    # -----------------------
    # add SBOTerm
    new_metabolite.annotation['sbo'] = 'SBO:0000247'
    # add infos from BiGG
    new_metabolite.annotation['bigg.metabolite'] = [id]  # @TODO or use the universal id?
    add_annotations_from_BiGG_metabs(new_metabolite)
    # add annotations from MNX
    if mnx_res is not None:
        if mnx_res['InChI'] and 'inchi' not in new_metabolite.annotation.keys():
            new_metabolite.annotation['inchi'] = mnx_res['InChI']
        if mnx_res['InChIKey'] and 'inchikey' not in new_metabolite.annotation.keys():
            new_metabolite.annotation['inchikey'] = mnx_res['InChIKey']
    
    # step 5: change ID according to namespace
    # ----------------------------------------
    match_id_to_namespace(new_metabolite,namespace)
    
    # step 6: re-check existence of ID in model
    # -----------------------------------------
    # @TODO : check complete annotations? 
    #        - or let those be covered by the duplicate check later on?
    if new_metabolite.id in [_.id for _ in model.metabolites]:
        return model.metabolites.get_by_id(new_metabolite.id)
    
    return new_metabolite
    

@implement
def build_metabolite_biocyc(id:str, model:cobra.Model, 
                         namespace:str,
                         compartment:str,
                         idprefix:str) -> cobra.Metabolite: 
    pass

In [129]:
from refinegems.curation.db_access.kegg import kegg_reaction_parser
# general functions
# -----------------

# TODO
#   extend the build function so, that all of them can take either the id or an equation 
#   as input for rebuilding the reaction (would also be beneficial for semi-manual curation)

# NOTE: 
# (final) validation of reaction in all reactions missing
#  --> check reaction for completness etc.
#  --> maybe better to put it outside the build_... functions
#  --> like an extra check for the pipeline, also better for
#      filtering out DNA / RNA reactions if neccessary

def _add_annotations_from_bigg_reac_row(row, reac):
    dbnames = {'RHEA':'rhea','BioCyc':'biocyc','MetaNetX (MNX) Equation':'metanetx.reaction','EC Number':'ec-code'}
    for dbname,dbprefix in dbnames.items():
        if row[dbname]:
            ids_to_add = row[dbname].split(',')
            if dbprefix in reac.annotation.keys():
                reac.annotation[dbprefix] = list(set(reac.annotation[dbprefix]).union(set(ids_to_add)))
            else:
                reac.annotation[dbprefix] = ids_to_add

def _add_annotations_from_dict_cobra(references, entity):
    # add additional references from the parameter
    for db,idlist in references.items():
        if not isinstance(idlist,list):
            idlist = [idlist]
        if db in entity.annotation.keys():
            entity.annotation[db] = list(set(entity.annotation[db] + idlist))
        else:
            entity.annotation[db] = idlist


@template
def build_reaction_xxx():
    pass


# @TEST (more) - tries some cases, in which it seems to work
# @TODO
def build_rection_mnx(model, id,
                      reac_str:str = None,
                      references:dict={},
                      idprefix='refineGEMs',
                      namespace:Literal['BiGG']='BiGG') -> cobra.Reaction | None | list:
    
    # ---------------------
    # check, if ID in model
    # ---------------------
    matches_found = [_.id for _ in model.reactions if 'metanetx.reaction' in _.annotation.keys() and _.annotation['metanetx.reaction']==id]
    if len(matches_found) > 0:
        return matches_found
    
    # -----------------------------
    # otherwise, build new reaction
    # -----------------------------
    
    # get relevant part of table from database
    mnx_reac_refs = load_a_table_from_database(
        f'SELECT * FROM mnx_reac_xref WHERE id = \'{id}\'',
        query=True)
    mnx_reac_refs = mnx_reac_refs[~(mnx_reac_refs['description']=='secondary/obsolete/fantasy identifier')]
    
    # create reaction object
    new_reac = cobra.Reaction(create_random_id(model,'reac',idprefix))
    
    # set name of reaction
    name = ''
    for desc in mnx_reac_refs['description']:
        if '|' in desc: # entry has a name and an equation string
            name = desc.split('|')[0]
            break # one name is enough
    new_reac.name = name 
    
    # get metabolites
    # ---------------
    if not reac_str:
        mnx_reac_prop = load_a_table_from_database(
                f'SELECT * FROM mnx_reac_prop WHERE id = \'{id}\'',
                query=True)
        reac_str = mnx_reac_prop['mnx_equation'][0]
        if mnx_reac_prop['ec-code'][0]:
            references['ec-code'] = mnx_reac_prop['ec-code'][0]
        
    if reac_str:
        reactants,products,comparts,rev = parse_reac_str(reac_str,'MetaNetX')
    else:
        return None
    # ............................................................
    # @TODO / Issue
    #    reac_prop / mnx equation only saves generic compartments 1 and 2 (MNXD1 / MNXD2)
    #    how to get the (correct) compartment?
    #    current solution 1 -> c, 2 -> e
    comparts = ['c' if _ == 'MNXD1' else 'e' for _ in comparts ]
    # ............................................................
    metabolites = {}
    meta_counter = 0
    
    # reconstruct reactants
    for mid,factor in reactants.items():
        tmp_meta = build_metabolite_mnx(mid,model,
                                        namespace,
                                        comparts[meta_counter],idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = -1*factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
        
    # reconstruct products
    for mid,factor in products.items():
        tmp_meta = build_metabolite_mnx(mid,model,
                                        namespace,
                                        comparts[meta_counter],idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
        
    # add metabolites to reaction
    # @TODO: does it need some kind of try and error, if - for some highly unlikely reason - two newly generated ids are the same
    new_reac.add_metabolites(metabolites)
    
    # set reversibility
    if rev:
        new_reac.bounds = (1000.0,1000.0)
    else:
        new_reac.bounds = (0.0,1000.0)
        
    # get annotations
    # ---------------
    new_reac.annotation['sbo'] = 'SBO:0000167'
    # get more annotation from the mnx_reac_xref table
    for db in ['bigg.reaction','kegg.reaction','seed.reaction','metacyc.reaction','rhea']:
        db_matches = mnx_reac_refs[mnx_reac_refs['source'].str.contains(db)]
        if len(db_matches) > 0:
            new_reac.annotation[db] = [m.split(':',1)[1] for m in db_matches['source'].tolist()]
            # update reactions direction, if MetaCyc has better information
            if db == 'metacyc.reaction' and len(db_matches[db_matches['source'].str.contains('-->')]):
                new_reac.bounds = (0.0,1000.0)
    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references,new_reac)
    
    # get annotations
    # ---------------
    new_reac.annotation['sbo'] = 'SBO:0000167'

    # add notes
    # ---------
    new_reac.notes['created with'] = 'refineGEMs GapFiller, MetaNetX'
    
    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac,namespace)
    
    return new_reac


# @TEST (more) - tries some cases, in which it seems to work
# @TODO some things still open (for discussion)
def build_reaction_kegg(model, id:str=None, reac_str:str=None,
                        references:dict={},
                        idprefix='refineGEMs',
                        namespace:Literal['BiGG']='BiGG'):
    
    # either reaction id or a reaction string needed for reconstruction
    if not id and not reac_str:
        return None # reconstruction not possible
    
    # create an empty reaction with random id
    new_reac = cobra.Reaction(create_random_id(model,'reac',idprefix))
    
    # -------------
    # KEGG ID given
    # -------------
    if id:
        # check, if reaction in model
        matches = [_.id for _ in model.reactions if 'kegg.reaction' in _.annotation.keys() and _.annotation['kegg.reaction']==id]
        if len(matches) > 0:
            return matches # return matched reaction ids in list
        
        # retrieve information from KEGG
        kegg_info = kegg_reaction_parser(id)
        if kegg_info:
            if 'name' in kegg_info.keys():
                new_reac.name = kegg_info['name']
            if 'equation' in kegg_info.keys():
                reac_str = kegg_info['equation'] if not reac_str else reac_str
            if 'db' in kegg_info.keys():
                new_reac.annotation = kegg_info['db']
    
    # -------------------------------------
    # Reaction reconstruction from equation
    # -------------------------------------
    
    # skip, if not reaction string is available
    if not reac_str:
        return None # reconstruction not possible
    
    # parse reaction string
    reactants,products,comparts,rev = parse_reac_str(reac_str,'KEGG')
        
    # ..............................................
    # @TODO
    # KEGG has no information about compartments !!!
    # current solution: always use c
    compartment = 'c'
    # ..............................................
    metabolites = {}
    meta_counter = 0
    
    # reconstruct reactants
    for mid,factor in reactants.items():
        tmp_meta = build_metabolite_kegg(mid,model,
                                        namespace,
                                        compartment,idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = -1*factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
        
    # reconstruct products
    for mid,factor in products.items():
        tmp_meta = build_metabolite_kegg(mid,model,
                                        namespace,
                                        compartment,idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
    
    # add metabolites to reaction
    # @TODO: does it need some kind of try and error, if - for some highly unlikely reason - two newly generated ids are the same
    new_reac.add_metabolites(metabolites)
    
    # set reversibility
    if rev:
        new_reac.bounds = (1000.0,1000.0)
    else:
        new_reac.bounds = (0.0,1000.0)
    
    # --------------------
    # add more information
    # --------------------
    new_reac.annotation['sbo'] = 'SBO:0000167'
    # get more information from searching the KEGG ID in BiGG
    bigg_res = load_a_table_from_database(
            f'SELECT * FROM bigg_reactions WHERE \"KEGG Reaction\" = \'{id}\'',
            query=True)
    for idx,row in bigg_res.iterrows():
        r,p,compartments,r = parse_reac_str(row['reaction_string'],'BiGG')
        # .........................................
        # @TODO part 2 of compartment issue
        # find the reaction with 'c' as compartment
        if len(set(compartments)) == 1 and compartments[0] == 'c':
            new_reac.annotation['bigg.reaction'] = row['id']
            # @TODO add more information, exclude None entries
            _add_annotations_from_bigg_reac_row(row, new_reac)
            break
        # .........................................

    # @IDEA / @TODO get more information from MetaNetX
    
    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references,new_reac)
    
    # add notes
    # ---------
    new_reac.notes['created with'] = 'refineGEMs GapFiller, KEGG'
    
    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac,namespace)
    
    return new_reac


# @TEST
# @TODO some things still open (for discussion)
def build_reaction_bigg(model, id, 
                        reac_str:str = None, 
                        references:dict={},
                        idprefix='refineGEMs',
                        namespace:Literal['BiGG']='BiGG'):
    
    # ---------------------
    # check, if ID in model
    # ---------------------
    matches_found = [_.id for _ in model.reactions if 'bigg.reaction' in _.annotation.keys() and _.annotation['bigg.reaction']==id]
    if len(matches_found) > 0:
        return matches_found
    
    # -----------------------------
    # otherwise, build new reaction
    # -----------------------------
    # create reaction object
    new_reac = cobra.Reaction(create_random_id(model,'reac',idprefix))
    
    # get information from the database
    bigg_reac_info = load_a_table_from_database(
            f'SELECT * FROM bigg_reactions WHERE id = \'{id}\'',
            query=True).iloc[0,:]
    new_reac.name = bigg_reac_info['name']
    
    # add metabolites
    # ---------------
    reactants,products,comparts,rev = parse_reac_str(bigg_reac_info['reaction_string'],'BiGG')
    
    metabolites = {}
    meta_counter = 0
    # reconstruct reactants
    for mid,factor in reactants.items():
        tmp_meta = build_metabolite_bigg(mid,model,
                                        namespace,
                                        idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = -1*factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
        
    # reconstruct products
    for mid,factor in products.items():
        tmp_meta = build_metabolite_bigg(mid,model,
                                        namespace,
                                        idprefix)
        if tmp_meta:
            metabolites[tmp_meta] = factor
            meta_counter += 1
        else:
            return None # not able to build reaction successfully
        
    # add metabolites to reaction
    # @TODO: does it need some kind of try and error, if - for some highly unlikely reason - two newly generated ids are the same
    new_reac.add_metabolites(metabolites)
    
    # set reversibility
    if rev:
        new_reac.bounds = (1000.0,1000.0)
    else:
        new_reac.bounds = (0.0,1000.0)
    
    # add annotations
    # ---------------
    # add SBOTerm
    new_reac.annotation['sbo'] = 'SBO:0000167'
    # add infos from BiGG
    new_reac.annotation['bigg.reaction'] = [id]
    _add_annotations_from_bigg_reac_row(bigg_reac_info, new_reac)
    # add additional references from the parameter
    _add_annotations_from_dict_cobra(references,new_reac)
    
    # add notes
    # ---------
    new_reac.notes['created with'] = 'refineGEMs GapFiller, BiGG'
    
    # match ID to namespace
    # ---------------------
    match_id_to_namespace(new_reac,namespace)

    return new_reac
    
    
@implement
def build_reaction_biocyc():
    pass

@implement
def build_reaction():
    pass

# GapFiller functions
# -------------------

@implement
def add_reactions_from_table():
    pass

In [128]:
build_reaction_bigg(cobramodel, '13PPDH')

0,1
Reaction identifier,13PPDH
Name,"1,3-propanediol dehydrogenase"
Memory address,0x2dd49fe50
Stoichiometry,"3hppnl_c + h_c + nadh_c --> 13ppd_c + nad_c  3-Hydroxypropanal + H+ + Nicotinamide adenine dinucleotide - reduced --> 1,3-Propanediol + Nicotinamide adenine dinucleotide"
GPR,
Lower bound,1000.0
Upper bound,1000.0


In [114]:
cobramodel.reactions.get_by_id('13PPDH')

KeyError: '13PPDH'