In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from refinegems.classes.gapfill import GeneGapFiller
from refinegems.utility.io import load_model

# All input files live below one test-data directory. Adjust TEST_DIR (and
# STRAIN_DIR) here to run the notebook on another machine instead of editing
# every single path.
TEST_DIR = '/Users/brune/Documents/11_Test_Data/test_refinegems/test_gapfill'
STRAIN_DIR = f'{TEST_DIR}/JSC1435'

# The same SBML file is loaded twice, once with the 'libsbml' and once with
# the 'cobra' loader, because the steps below need both representations.
modelpath = f'{STRAIN_DIR}/JCSC1435.xml'
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

gffpath = f'{STRAIN_DIR}/JCSC1435_RefSeq.gff'

gf2 = GeneGapFiller()
gf2_missing_genes = gf2.get_missing_genes(gffpath,model)
# table columns: ncbiprotein | locus_tag | ec-code

tfasta = f'{STRAIN_DIR}/JCSC1435_proteins_genome.fasta'
spdb = f'{TEST_DIR}/swissprot.dmnd'
spmap = f'{TEST_DIR}/uniprot_table.tsv'
# NOTE(review): presumably DIAMOND search settings (sensitivity mode, coverage
# threshold, thread count, percentage identity threshold) - confirm against
# GeneGapFiller.get_missing_reacs.
kwargs = {'outdir':STRAIN_DIR,
          'sens':'more-sensitive',
          'cov':90.0,
          't':4,
          'pid':90.0}

# Result is indexable; the cells below inspect elements 0 and 1.
mapped_res = gf2.get_missing_reacs(model=cmodel,
                                   missing_genes=gf2_missing_genes,
                                   fasta=tfasta,
                                   dmnd_db=spdb,
                                   swissprot_map=spmap,
                                   **kwargs)

* 'underscore_attrs_are_private' has been removed


Running in debugging mode.


In [3]:
# second element of the result: the missing reactions table (one row per candidate reaction)
mapped_res[1]

Unnamed: 0,ec-code,ncbiprotein,id,equation,reference,is_transport,via,add_to_GPR
0,5.6.2.2,[WP_011274363.1],MNXR115632,1 MNXM12437@MNXD1 + 1 MNXM40333@MNXD1 + 1 MNXM...,metacycR:5.99.1.3-RXN,,MetaNetX,
1,5.6.2.2,[WP_011274363.1],MNXR172894,1 MNXM1100221@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN...,sabiorkR:15120,,MetaNetX,
2,5.6.2.2,[WP_011274363.1],MNXR172895,1 MNXM1100221@MNXD1 + 1 MNXM735047@MNXD1 + 1 M...,sabiorkR:15121,,MetaNetX,
3,5.6.2.2,[WP_011274363.1],MNXR172896,1 MNXM1100223@MNXD1 + 1 MNXM40333@MNXD1 + 1 MN...,sabiorkR:15122,,MetaNetX,


## The "filling" part of gap filling

**input**

- the model
- missing genes table
- missing reacs table

**out**

- the extended model

**additionally**

- logging
- saving statistics

In [4]:
# first element of the result: the missing genes table (includes a UniProt column)
mapped_res[0]

Unnamed: 0,ncbiprotein,locus_tag,ec-code,UniProt
4,WP_011274363.1,SH0005,5.6.2.2,"[Q5HK03, Q8CQK4, Q6GKU0, Q2FKQ1, Q6GD85, P0A0K..."


In [5]:
import pandas as pd
from itertools import chain
import re
from refinegems.utility.cvterms import add_cv_term_genes


# @TODO merge with the function of the same name in entities, if possible,
#       or just use them separately
# @TODO generalise addition of references -> maybe kwargs
# @TODO decide what to do about the name
def create_gp(model, protein_id, 
              name:str=None, locus_tag:str=None,
              uniprot:tuple[list[str],bool]=None):
    """Create a gene product in a libSBML model and annotate it.

    The gene product is created via plugin 0 of the model and gets the ID
    ``G_<protein_id>`` (dots replaced by underscores), the SBO term
    ``SBO:0000243`` and the meta ID ``meta_G_<protein_id>``.

    Args:
        - model: 
            The model to extend; must offer ``getPlugin(0).createGeneProduct()``.
        - protein_id (str): 
            Protein identifier. If it looks like a RefSeq or NCBI protein
            accession, it is additionally added as a CV term; otherwise no
            database reference is written for it.
        - name (str, optional): 
            Name of the gene product. Defaults to None (no name is set).
        - locus_tag (str, optional): 
            Locus tag, stored as the label. Defaults to None (no label is set).
        - uniprot (tuple, optional): 
            Pair of (iterable of UniProt IDs, flag). Every ID is added as a
            CV term; the flag is forwarded unchanged as fourth argument of
            ``add_cv_term_genes`` (its exact meaning is defined there).
            Defaults to None.
    """
    
    # create gene product object
    gp = model.getPlugin(0).createGeneProduct()
    # set basic attributes
    geneid = f'G_{protein_id}'.replace('.','_') # remove problematic signs
    gp.setIdAttribute(geneid)               # ID 
    if name: gp.setName(name)               # Name  
    if locus_tag: gp.setLabel(locus_tag)    # Label
    gp.setSBOTerm('SBO:0000243')            # SBOterm
    gp.setMetaId(f'meta_G_{protein_id}')    # Meta ID
    # test for NCBI/RefSeq
    # id_db has to be initialised: an ID matching neither pattern would
    # otherwise raise an UnboundLocalError in the check below.
    id_db = None
    # raw strings keep the patterns identical while avoiding warnings about
    # invalid escape sequences such as '\d'
    if re.fullmatch(r'^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|WP|XM|XP|XR|YP|ZP)_\d+)|(NZ_[A-Z]{2,4}\d+))(\.\d+)?$', protein_id, re.IGNORECASE):
        id_db = 'REFSEQ'
    elif re.fullmatch(r'^(\w+\d+(\.\d+)?)|(NP_\d+)$', protein_id, re.IGNORECASE):
        id_db = 'NCBI'
    if id_db: add_cv_term_genes(protein_id, id_db, gp)           # NCBI protein
    # add further references
    # @TODO extend or generalise
    if uniprot:
        for uniprotid in uniprot[0]:
            add_cv_term_genes(uniprotid, 'UNIPROT', gp, uniprot[1]) # UniProt
   
   
# probably sort into GapFiller
def add_genes_from_table(model, gene_table:pd.DataFrame):
    """Add every row of a gene table to the model as a gene product.

    Args:
        - model: 
            The model that is handed on to ``create_gp``.
        - gene_table (pd.DataFrame): 
            Table with the columns ncbiprotein | locus_tag | ec-code | UniProt.
            The table itself is not modified.
    """
    
    # drop() without inplace returns a new frame, so the input stays the same
    genes = gene_table.drop(columns=['ec-code'])
    
    # one gene product per row
    for _, row in genes.iterrows():
        protein_id = row['ncbiprotein']
        uniprot_info = (row['UniProt'], True)
        create_gp(model, protein_id,
                  locus_tag=row['locus_tag'],
                  uniprot=uniprot_info)
        

def create_gpr(reaction,gene):
    """Placeholder for connecting a gene to a reaction (not implemented yet).

    Currently does nothing and returns None.
    """
    # Case 1: @TODO
    return None


def fill_model(model, missing_genes:pd.DataFrame, 
               missing_reacs:pd.DataFrame):
    """Extend the model with the missing genes and reactions (work in progress).

    So far only the first part of step 1 is implemented: genes belonging to
    reactions that have an entry in the column 'add_to_GPR' are added to the
    model as gene products.

    Args:
        - model: 
            The model, passed on to ``add_genes_from_table``.
        - missing_genes (pd.DataFrame): 
            Table of missing genes; needs at least the column 'ncbiprotein'.
        - missing_reacs (pd.DataFrame): 
            Table of missing reactions; needs the columns 'add_to_GPR' and
            'ncbiprotein' (the latter holding iterables of protein IDs).
    """
    
    # Step 1: Add genes to model whose reactions are already in it
    # ------------------------------------------------------------
    # reactions with an entry in 'add_to_GPR' ...
    has_gpr_entry = missing_reacs['add_to_GPR'].notna()
    # ... the protein IDs listed for them (flattened) ...
    ncbiprot_in_model = list(
        chain.from_iterable(missing_reacs.loc[has_gpr_entry, 'ncbiprotein'])
    )
    # ... and the genes carrying one of these protein IDs
    gene_is_affected = missing_genes['ncbiprotein'].isin(ncbiprot_in_model)
    genes_to_add = missing_genes[gene_is_affected]
    
    if genes_to_add.shape[0] > 0:
        # add genes as gene products to model
        add_genes_from_table(model, genes_to_add)
    
        # extend gene production rules 
        # @TODO
        # add_gene_reac_associations_from_table(model,....)
   
        # what remains for the next step:
        missing_reacs = missing_reacs[~has_gpr_entry]
        missing_genes = missing_genes[~gene_is_affected]
    
    
    # Step 2: 

In [6]:
# Manual check of fill_model on copies, so `model` and `mapped_res` are not changed.
# [*chain(*list(mapped_res[1][~mapped_res[1]['add_to_GPR']]['ncbiprotein']))]
testmodel = model.clone()
testcase = mapped_res[1].copy()
# Fake a hit: give the row at position 2 an entry in the last column
# ('add_to_GPR' in the table shown above), so step 1 of fill_model has work to do.
# NOTE(review): assigning a list through .iloc may store the element instead of
# the list itself - inspect the cell afterwards if the list type matters.
testcase.iloc[2,-1] = ['12DGR160tipp']
fill_model(testmodel,mapped_res[0],testcase)
# gene products of the extended copy, kept for inspection
after = testmodel.getPlugin(0).getListOfGeneProducts()

In [None]:
from libsbml import FbcOr, FbcAnd

# Experiment (work in progress): extend the gene product association of a
# single reaction by an additional gene product, keeping what is already there.
# Runs on a clone so `model` is not changed.
testmodel = model.clone()
x = 113 # reaction index to test with; 45: no GPR, 46: one gene product, 113: OR association
# NOTE(review): `id` shadows the Python builtin for the rest of the session -
# consider renaming. create_gp builds IDs as 'G_<protein id>'; confirm whether
# the 'G_' prefix is needed here as well.
id = 'WP_011274363_1'
# plugin 0 of the reaction (presumably fbc, as it offers the gene product association)
reac = testmodel.getListOfReactions()[x].getPlugin(0)
# connection = 'or'
# test, if there is already a gpr
old_association_str = None   # ID of the single gene product, if the old GPR has exactly one element
old_association_fbc = None   # first FbcOr/FbcAnd element of the old GPR, if it has several elements
if reac.getGeneProductAssociation():
    # read the old association from a clone of the reaction plugin
    old_association = reac.clone().getGeneProductAssociation().getListOfAllElements()
    if len(old_association) == 1:
        old_association_str = old_association[0].getGeneProduct()
    else:
        for el in old_association:
            if isinstance(el, FbcOr) or isinstance(el, FbcAnd):
                old_association_fbc = el # there should only be one object of this type -> @TODO check
                break
                
# create new gene product association 
# a single old gene product is merged with the new ID(s) into one list
if old_association_str and isinstance(id,str):
    id = [old_association_str,id]
elif old_association_str  and isinstance(id,list):
    id.append(old_association_str)
    
# this does not work!!!!
# @IDEA: create a dummy gp e.g. a copy of the current one and copy it from there
if not old_association_fbc:
    new_association = reac.createGeneProductAssociation()
else:
    # wrap the old OR/AND element into a new OR
    # NOTE(review): createGeneProductAssociation() is called on a reaction that
    # already has an association - confirm what libSBML does in that case.
    new_association = reac.createGeneProductAssociation().createOr()
    new_association.addAssociation(old_association_fbc)
    
# add the gene product reference(s): directly for one ID, inside an OR for several
if isinstance(id,str):
    new_association.createGeneProductRef().setGeneProduct(id)
elif isinstance(id,list) and len(id) == 1:
    new_association.createGeneProductRef().setGeneProduct(id[0])
elif isinstance(id,list) and len(id) > 1:
    gpa_or =  new_association.createOr()
    for i in id:
        gpa_or.createGeneProductRef().setGeneProduct(i)
        
    

# show the resulting association for a visual check
print(reac.getGeneProductAssociation().getListOfAllElements())
    

: 

In [14]:
# Check what cloning a single reaction does to the model it was taken from.
testmodel = model.clone()
# number of reactions before cloning one of them
print(len(testmodel.getListOfReactions()))
testreac = testmodel.getListOfReactions()[113]
testreaccopy = testreac.clone()

1538


In [10]:
# ID of the reaction taken from the model
testreac.getId()

'R_ADPT'

In [11]:
# the clone carries the same ID as the reaction it was cloned from
testreaccopy.getId()

'R_ADPT'

In [12]:
# reaction count after cloning; the recorded output equals the count printed before the clone
len(testmodel.getListOfReactions())

1538

### Further ideas and Code snippets for the filling part