In [1]:
%load_ext autoreload
%autoreload 2

In [14]:
from refinegems.classes.gapfill import GeneGapFiller
from refinegems.utility.io import load_model

modelpath = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435.xml'
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

gffpath = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435_RefSeq.gff'

gf2 = GeneGapFiller()
gf2_missing_genes = gf2.get_missing_genes(gffpath,model)
# ncbiprotein | locus_tag | ec-code

tfasta = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435/JCSC1435_proteins_genome.fasta'
spdb = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/swissprot.dmnd'
# gf2_missing_genes
spmap = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/uniprot_table.tsv'
kwargs = {'outdir':'/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill/JSC1435',
          'sens':'more-sensitive',
          'cov':90.0,
          't':4,
          'pid':90.0}

mapped_res = gf2.get_missing_reacs(model=cmodel,
                                   missing_genes=gf2_missing_genes,
                                   fasta=tfasta, 
                                   dmnd_db=spdb,
                                   swissprot_map=spmap,
                                   **kwargs)

Running in debugging mode.


In [149]:
mapped_res[0]

Unnamed: 0,ncbiprotein,locus_tag,ec-code,UniProt
303,WP_011274811.1,SH0486,3.2.-.-,[Q4L980]
304,WP_011274812.1,SH0487,2.3.1.-,[Q4L979]
305,WP_011274813.1,SH0488,1.14.99.-,[Q4L978]
306,WP_011274814.1,SH0489,2.4.1.-,[Q4L977]
307,WP_011274815.1,SH0490,2.5.1.96,[Q4L976]
308,WP_080004924.1,SH0491,1.3.8.-,[Q4L975]
312,WP_011274824.1,SH0499,2.3.1.-,[Q4L967]


## the "Filling" part of Gapfilling

**input**

- the model
- missing genes table
- missing reacs table

**out**

- the extended model

**else**

- logging
- save stats information

In [15]:
import pandas as pd
from itertools import chain
import re
from refinegems.utility.cvterms import add_cv_term_genes
from libsbml import FbcOr, FbcAnd, GeneProductRef
from libsbml import Model as libModel
import warnings
import libsbml
from typing import Union,List


def fill_model(model, missing_genes:pd.DataFrame, 
               missing_reacs:pd.DataFrame):
    
    # Step 1: Add genes to model whoose reactions are already in it
    # -------------------------------------------------------------
    # filter the respective genes and reactions
    reacs_in_model = missing_reacs[~(missing_reacs['add_to_GPR'].isnull())]
    ncbiprot_with_reacs_in_model = [*chain(*list(reacs_in_model['ncbiprotein']))]
    genes_with_reacs_in_model = missing_genes[missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model)]
    
    if len(genes_with_reacs_in_model) > 0:
        # add genes as gene products to model
        add_genes_from_table(model, genes_with_reacs_in_model)
    
        # extend gene production rules 
        add_gene_reac_associations_from_table(model,reacs_in_model)
        
        # what remains:
        missing_reacs = missing_reacs[missing_reacs['add_to_GPR'].isnull()]
        missing_genes = missing_genes[~(missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model))]
    
    
    # Step 2: 

In [16]:
# [*chain(*list(mapped_res[1][~mapped_res[1]['add_to_GPR']]['ncbiprotein']))]
testmodel = model.clone()
# print(testmodel.getReaction('R_12DGR160tipp').getPlugin(0).getGeneProductAssociation().getListOfAllElements())
testcase = mapped_res[1].copy()
testcase.iloc[2,-1] = ['12DGR160tipp']
fill_model(testmodel,mapped_res[0],testcase)
after = testmodel.getPlugin(0).getListOfGeneProducts()

In [17]:
testmodel.getReaction('R_12DGR160tipp').getPlugin(0).getGeneProductAssociation().getListOfAllElements()[0].getGeneProduct()

'G_WP_011274813_1'

In [18]:
testcase.iloc[2,3]

'1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM731466@MNXD1 + 1 MNXM735438@MNXD1 = 1 MNXM3180@MNXD1 + 1 MNXM8@MNXD1 + 1 WATER@MNXD1'

### Further ideas and Code snippets for the filling part

##### how to build the new entities:

- option a) collection all information first, filter and then add them from table
- option b) iteratively collection information and add entities (reaction after reaction)

use libsbml or cobrapy?

available functions:
- libsbml-based create_reaction/create_species (needs all information beforehand + all other entities need to be in the model) -> required for the gene labels
- cobra-based add_reaction/add_metabolite (builds as it goes), also match_id_to_namespace and 
finding possible matches might be easier using COBRApy <- namespace and annotation stuff far easier here

definitly needed:
- parse reaction string of different formats:
    - MetaNetX (can get this somewhat from SPECIMEN)
    - KEGG (also somewhat in SPECIMEN)
    - BiGG (new?)
    - BioCyc (new?)
- retrieve needed information from the required databases (reaction/metabolites)
    - cross referencing, if one db not enough?
- filter for when to include reactions and when not (e.g. missing metabolites, formulas, DNA/RNA etc.) **This means, before adding stuff to the model, it needs to be validated**


#### Reload a libsbml model into a cobra model

In [19]:
from tempfile import NamedTemporaryFile
from refinegems.utility.io import write_model_to_file, load_model

with NamedTemporaryFile(suffix='.xml') as tmp:
    print(tmp)
    write_model_to_file(model,tmp.name)
    cobramodel = load_model(tmp.name,'cobra')

<tempfile._TemporaryFileWrapper object at 0x2c5d7c190>


#### creating the reactions

In [147]:
from refinegems.curation.db_access.kegg import kegg_reaction_parser
from tqdm import tqdm

# GapFiller functions
# -------------------

# @TODO BioCyc not implemeneted
# @TODO logging, save stuff for manual curation etc.
# @TEST - somewhat seems to work - for now
def add_reactions_from_table(model,missing_reac_table,
                             idprefix='refineGEMs',
                             namespace:Literal['BiGG']='BiGG'):
    
    # reconstruct reactions
    # ---------------------
    for idx,row in tqdm(missing_reac_table.iterrows(), 
                        desc='Trying to add missing reacs',
                        total=missing_reac_table.shape[0]):
        # build reaction
        reac = None
        match row['via']:
            # MetaNetX
            case 'MetaNetX':
                reac = build_reaction_mnx(model,row['id'],
                                          reac_str=row['equation'],
                                          references={'ec-code':[row['ec-code']]},
                                          idprefix=idprefix,
                                          namespace=namespace)      
            # KEGG
            case 'KEGG':
                refs = row['references']
                refs['ec-code'] = row['ec-code']
                reac = build_reaction_kegg(model,row['id'],
                                           reac_str=row['equation'],
                                           references=refs,
                                           idprefix=idprefix,
                                           namespace=namespace)
            # BiGG
            case 'BiGG':
                reac = build_reaction_bigg(model,row['id'],
                                           references={'ec-code':[row['ec-code']]},
                                           idprefix=idprefix,
                                           namespace=namespace)
            # BioCyc @TODO
            case 'BioCyc':
                reac = build_reaction_biocyc()
            # Unknown database
            case _:
                mes = f'''Unknown database name for reaction reconstruction: {row["via"]}\n
                Reaction will not be reconstructed.'''
                warnings.warn(mes,UserWarning)
        
        # check output of reconstruction
        # ------------------------------
        # case 1: reconstruction was not possible
        if not reac:
            pass # nothing to do here
        # case 2: reaction(s) found in model
        elif isinstance(reac,list):
            # add found names to the add_to_GPR column of the table
            current_gpr = missing_reac_table.loc[idx,'add_to_GPR']
            if not current_gpr:
                missing_reac_table.at[idx,'add_to_GPR'] = reac
            else:
                missing_reac_table.at[idx,'add_to_GPR'] = list(set(reac + current_gpr))
        # case 3: new reaction was generated
        elif isinstance(reac,cobra.Reaction):
            # validate reaction
            if isreaction_complete(reac):
                # add reaction to model (if validation succesful)
                model.add_reactions([reac])
                # add reaction ID to table under add_to_GPR
                current_gpr = missing_reac_table.loc[idx,'add_to_GPR']
                if not current_gpr:
                    missing_reac_table.at[idx,'add_to_GPR'] = reac.id
                else:
                    current_gpr.append(reac.id)
                    missing_reac_table.at[idx,'add_to_GPR'] = list(set(current_gpr))
        # case 4: should never occur
        else:
            mes = f'Unknown return type for reac param. Please contact the developers.'
            raise TypeError(mes)
        
    #@TODO
    # save reactions, that could not be recontructed, for manual curation
    manual_curation_reacs = missing_reac_table[missing_reac_table['add_to_GPR'].isnull()]
    # return the updated table with successfully reconstructed reaction ids 
    # to enable adding the genes
    missing_gprs = missing_reac_table[~missing_reac_table['add_to_GPR'].isnull()]
    return missing_gprs



# -------------------

# since adding genes works better with libsbml, 
# return the tabke with the newly added reaction IDs?
# or still add them here??? -> since this is specific 
# for gapfilling, it does not matter

In [148]:
testcmod = cobramodel.copy()
addtestcase = testcase.sample(5)
add_reactions_from_table(testcmod,addtestcase)

Trying to add missing reacs: 100%|██████████| 5/5 [00:04<00:00,  1.09it/s]


Unnamed: 0,ec-code,ncbiprotein,id,equation,reference,is_transport,via,add_to_GPR
486,2.4.1.-,[WP_011274814.1],MNXR117306,1 MNXM1033@MNXD1 + 1 MNXM1103285@MNXD1 + 1 MNX...,rheaR:31315,,MetaNetX,refineGEMsreacJ7KDH4
169,2.3.1.-,"[WP_011274812.1, WP_011274824.1]",MNXR112303,1 MNXM1102789@MNXD1 + 1 MNXM735433@MNXD1 + 1 W...,keggR:R08809,,MetaNetX,refineGEMsreac55L771
455,2.4.1.-,[WP_011274814.1],MNXR112311,1 MNXM1102128@MNXD1 + 1 MNXM6228@MNXD1 = 1 MNX...,keggR:R08817,,MetaNetX,refineGEMsreac5RBIFT
110,1.14.99.-,[WP_011274813.1],MNXR180140,1 MNXM62080@MNXD1 + 1 WATER@MNXD1 = 1 MNXM9572...,seedR:rxn32331,,MetaNetX,refineGEMsreacY72BV4
