In [4]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell is executed, so edits to imported code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

## KEGGapFiller Example

### Initialisation

In [5]:
from refinegems.classes.gapfill import KEGGapFiller
from refinegems.utility.io import load_model

# Get model file loaded with COBRA (for missing reactions) & libSBML
# The same file is loaded twice: `model` (libSBML) is passed to
# find_missing_genes/fill_model, `cmodel` (COBRA) to find_missing_reactions.
modelpath = './test_files/JCSC1435.xml' # A link to this model will be added as soon as it is published
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

# Initialise GapFiller subclass to be used with required parameters
# NOTE(review): 'sha' is presumably the KEGG organism code - it reappears as
# the prefix of the 'orgid:locus' values in the missing-genes table.
gfk = KEGGapFiller('sha')

### Missing genes

In [6]:
# Find missing genes
# NOTE: slow step - the recorded run took about 43 minutes for 1987 entries.
# NOTE(review): presumably queries KEGG online (a bioservices cache directory
# is created on first use) - confirm before running offline.
gfk.find_missing_genes(model)

# Get/show missing genes
# (last expression of the cell, so the table is rendered as the cell output)
gfk.missing_genes

Creating directory /Users/doebel/Library/Caches/bioservices 


100%|██████████| 1987/1987 [43:06<00:00,  1.30s/it]


Unnamed: 0,orgid:locus,locus_tag,kegg.orthology,ec-code,ncbiprotein,uniprot
0,sha:pSHaeA01,pSHaeA01,[K11210],[2.5.1.-],BAE05988,[Q4L2Y9]
1,sha:pSHaeA02,pSHaeA02,,,BAE05989,[Q4L2Y8]
2,sha:pSHaeA03,pSHaeA03,,,BAE05990,[Q4L2Y7]
3,sha:pSHaeB01,pSHaeB01,[K00561],[2.1.1.184],BAE05991,[Q4L2Y6]
4,sha:pSHaeB02,pSHaeB02,,,BAE05992,[Q4L2Y5]
...,...,...,...,...,...,...
1982,sha:SH2674,SH2674,[K03501],[2.1.1.170],BAE05983,[Q4L2Z4]
1983,sha:SH2675,SH2675,[K03495],,BAE05984,[Q4L2Z3]
1984,sha:SH2676,SH2676,[K03650],[3.6.-.-],BAE05985,[Q4L2Z2]
1985,sha:SH2677,SH2677,[K03536],[3.1.26.5],BAE05986,[Q4L2Z1]


### Missing reactions

In [16]:
# Find missing reactions
# Uses the COBRA version of the model (`cmodel`), not the libSBML one.
gfk.find_missing_reactions(cmodel)

# Get/show missing reactions
# (last expression of the cell, so the table is rendered as the cell output)
gfk.missing_reactions

100%|██████████| 11/11 [00:38<00:00,  3.51s/it]


Unnamed: 0,ec-code,ncbiprotein,id,equation,reference,is_transport,via,add_to_GPR
0,2.3.1.313,[BAE04147],R13186,,{'brenda': ['2.3.1.313']},,KEGG,
1,2.4.99.28,"[BAE04395, BAE04771]",R06178,,"{'kegg.pathway': ['ec00550'], 'brenda': ['2.4....",,KEGG,
2,2.4.99.28,"[BAE04395, BAE04771]",R06179,,"{'kegg.pathway': ['ec00550'], 'brenda': ['2.4....",,KEGG,
3,2.7.10.3,[BAE03698],R02584,,{'brenda': ['2.7.10.3']},,KEGG,
4,2.7.2.18,[BAE04998],R12793,,{'brenda': ['2.7.2.18']},,KEGG,
...,...,...,...,...,...,...,...,...
3289,7.5.2.4,"[BAE04398, BAE05568]",MNXR115570,1 MNXM1@MNXD1 + 1 MNXM21999@MNXD2 + 1 MNXM4033...,seedR:rxn18688,T,MetaNetX,
3290,7.5.2.4,"[BAE04398, BAE05568]",MNXR121346,1 MNXM12982@MNXD1 + 1 MNXM1@MNXD2 + 1 MNXM4033...,seedR:rxn41181,T,MetaNetX,
3291,7.5.2.4,"[BAE04398, BAE05568]",MNXR121347,1 MNXM163615@MNXD1 + 1 MNXM1@MNXD2 + 1 MNXM403...,seedR:rxn43450,T,MetaNetX,
3292,7.5.2.4,"[BAE04398, BAE05568]",MNXR123981,1 MNXM163613@MNXD1 + 1 MNXM1@MNXD2 + 1 MNXM403...,seedR:rxn42597,T,MetaNetX,


In [18]:
# Save the missing reactions as a tab-separated file (index column omitted)
gfk.missing_reactions.to_csv('./test_files/kegg_missing_reactions.tsv', sep='\t', index=False)

### Fill model

In [19]:
# Fill model
# NOTE(review): in the recorded run this call aborted with
# "TypeError: 'NoneType' object is not iterable" after 2 of 3287 reactions;
# the cause is not visible in this notebook - TODO investigate before relying
# on the cells that follow.
model = gfk.fill_model(model)

Trying to add missing reacs:   0%|          | 2/3287 [00:15<7:11:33,  7.88s/it]


TypeError: 'NoneType' object is not iterable

### Statistics

In [None]:
# Get raw statistics
# NOTE(review): `_statistics` is accessed as a private attribute (leading
# underscore). The recorded output (a pandas Index of column names) does not
# look like statistics and is presumably stale - TODO re-run this cell.
gfk._statistics

Index(['orgid:locus', 'locus_tag', 'kegg.orthology', 'ec-code', 'ncbiprotein',
       'uniprot'],
      dtype='object')

## GeneGapFiller Example

### Initialisation
#### Get SwissProt DIAMOND database

In [None]:
from refinegems.utility.set_up import download_url

# Get SwissProt
# (the call is commented out so the download is not repeated on every run;
# uncomment it once to fetch the file - the recorded output shows SwissProt.fasta, 88.2 MB)
# download_url('SwissProt gapfill', directory='./test_files/test_gapfill')

# On the command line:
# (builds the DIAMOND database from the downloaded FASTA file)
# NOTE(review): the './dev/' prefix presumably means this command is run from
# the repository root rather than from the notebook's directory - TODO confirm.
# diamond makedb --in ./dev/test_files/test_gapfill/SwissProt.fasta -d ./dev/test_files/test_gapfill/swissprot

Downloading SwissProt.fasta: 100%|██████████| 88.2M/88.2M [00:05<00:00, 16.8MB/s]


#### Initialise GeneGapFiller

In [None]:
from refinegems.classes.gapfill import GeneGapFiller
from refinegems.utility.io import load_model

# Get model file loaded with COBRA (for missing reactions) & libSBML
modelpath = './test_files/IMITSC147.xml' # A link to this model will be added as soon as it is published
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

# Get files required for the GeneGapFiller
gffpath = './test_files/IMITSC147_genome.gff'

# Inputs for find_missing_reactions: protein FASTA, DIAMOND database and
# SwissProt mapping table.
tfasta = './test_files/IMITSC147_proteins_genome.fasta'
spdb = './test_files/test_gapfill/swissprot.dmnd'
spmap = './test_files/test_gapfill/SwissProt_mapping.tsv'
# Additional keyword arguments forwarded to find_missing_reactions.
# NOTE(review): presumably DIAMOND settings (sensitivity mode, coverage,
# threads, percentage identity) - confirm against the refineGEMs documentation.
# The output directory is given relative to the same './test_files' root as
# every other path in this notebook (it previously pointed to './dev/test_files').
kwargs = {'outdir':'./test_files/test_gapfill/IMITSC147',
          'sens':'more-sensitive',
          'cov':95.0,
          't':4,
          'pid':90.0}

# Initialise GapFiller subclass to be used with required parameters
gfg = GeneGapFiller()

* 'underscore_attrs_are_private' has been removed


### Missing genes

In [None]:
# Find missing genes
# Takes the GFF file path and the libSBML version of the model.
gfg.find_missing_genes(gffpath,model)

# Get/show missing genes
# (last expression of the cell, so the table is rendered as the cell output)
gfg.missing_genes


### Missing reactions

In [None]:
# Search for missing reactions using the COBRA model, the protein FASTA,
# the DIAMOND database and the SwissProt mapping table; the remaining
# settings are passed through from the `kwargs` dictionary.
gfg.find_missing_reactions(
    model=cmodel,
    fasta=tfasta,
    dmnd_db=spdb,
    swissprot_map=spmap,
    **kwargs
)

# Display the resulting table as the cell output
gfg.missing_reactions

Unnamed: 0,ncbiprotein,locus_tag,ec-code
0,WP_011274359.1,SH0001,
1,WP_011274360.1,SH0002,
2,WP_172458781.1,SH0003,
3,WP_011274362.1,SH0004,
4,WP_011274363.1,SH0005,
...,...,...,...
2573,WP_145424740.1,pSHaeC05,
2574,WP_011276926.1,pSHaeC06,
2575,WP_011276927.1,pSHaeC07,
2576,WP_011276928.1,pSHaeC08,


### Fill model

In [None]:
# Fill model
# The libSBML model is passed in and the name `model` is rebound to the result.
# NOTE(review): formula_check='existence' presumably restricts which
# metabolites/reactions are accepted based on their formula - TODO confirm
# the exact meaning in the refineGEMs documentation.
model = gfg.fill_model(model,formula_check='existence')

### Statistics

In [None]:
# Get raw statistics
# The recorded output is a nested dict with the keys 'genes', 'reactions'
# and 'metabolites'.
# NOTE(review): `_statistics` is accessed as a private attribute (leading underscore).
gfg._statistics

{'genes': {'missing (before)': 1800,
  'added': 3,
  'missing (after)': 4,
  'no locus tag': 108},
 'reactions': {'added (total)': 7,
  'failed to build': 3,
  'no NCBI, no EC': 0,
  'NCBI, no EC': 43},
 'metabolites': {}}

## BioCycGapFiller Example

### Initialisation

In [None]:
from refinegems.classes.gapfill import BioCycGapFiller
from refinegems.utility.io import load_model

# Get model file loaded with COBRA (for missing reactions) & libSBML
modelpath = './test_files/ATCC29970.xml' # A link to this model will be added as soon as it is published
model = load_model(modelpath,'libsbml')
cmodel = load_model(modelpath,'cobra')

# Get files required for the BioCycGapFiller
gffpath = './test_files/ATCC29970_RefSeq.gff'
biocyc_gene_tbl_path = './test_files/ATCC29970_Accession-22Reactions.txt'
biocyc_reacs_tbl_path = './test_files/ATCC29970_biocyc_rxns.txt'
# NOTE(review): `fasta` is not referenced by any other cell of this section;
# it is kept so that nothing relying on the name breaks.
fasta = './test_files/ATCC29970_proteins_genome.fasta'

# Initialise GapFiller subclass to be used with required parameters
# This cell only sets things up: the searches for missing genes and missing
# reactions and the model filling are run in their own sections, in that order.
gfbc = BioCycGapFiller(biocyc_gene_tbl_path, biocyc_reacs_tbl_path, gffpath)

* 'underscore_attrs_are_private' has been removed


### Missing genes

In [None]:
# Find missing genes
# Uses the libSBML version of the model.
gfbc.find_missing_genes(model)

# Get/show missing genes
# Show the result of the BioCycGapFiller instance (`gfbc`); `gfg` is the
# GeneGapFiller from the previous example and would show that example's table
# (or raise a NameError on a fresh kernel).
gfbc.missing_genes

### Missing reactions

In [None]:
# Find missing reactions
# Uses the COBRA version of the model (`cmodel`), not the libSBML one.
gfbc.find_missing_reactions(cmodel)

# Get/show missing reactions
# (last expression of the cell, so the table is rendered as the cell output)
gfbc.missing_reactions

Unnamed: 0,add_to_GPR,ec-code,equation,id,ncbiprotein,reference,via
0,[3HAD80],[4.2.1.59],(3R)-3-hydroxyoctanoyl-[acp] -> (2E)-oct-2-e...,4.2.1.59-RXN,[WP_011275247.1],,BioCyc
1,[AMMQLT8],[2.1.1.163],S-adenosyl-L-methionine + demethylmenaquinol-8...,ADOMET-DMK-METHYLTRANSFER-RXN,[WP_011275738.1],,BioCyc
2,"[NTP1, ATPM]","[3.6.1.3, 3.6.1.15]",ATP + H2O -> ADP + phosphate + H+,ATPASE-RXN,[WP_016931380.1],,BioCyc
3,[LDH_D],[1.1.1.28],(R)-lactate + NAD+ <- pyruvate + NADH + H+,DLACTDEHYDROGNAD-RXN,[WP_011274818.1],,BioCyc
4,[DHNAOT4],[2.5.1.74],"all-trans-octaprenyl diphosphate + 1,4-dihydro...",DMK-RXN,[WP_016931372.1],,BioCyc
...,...,...,...,...,...,...,...
233,,,1 MNXM11@MNXD1 + 1 MNXM167418@MNXD1 + 1 MNXM72...,{MNXR124075},[WP_011276231.1],{'metacyc.reaction': 'TRYPTOPHAN--TRNA-LIGASE-...,MetaNetX
234,,,1 MNXM11@MNXD1 + 1 MNXM16655@MNXD1 + 1 MNXM728...,{MNXR124080},[WP_011275493.1],{'metacyc.reaction': 'TYROSINE--TRNA-LIGASE-RX...,MetaNetX
235,,,1 MNXM1104679@MNXD1 + 1 MNXM1446@MNXD1 + 1 MNX...,{MNXR146695},[WP_011275264.1],{'metacyc.reaction': 'UDP-NACMURALGLDAPAALIG-R...,MetaNetX
236,,,1 MNXM1101258@MNXD1 = 1 MNXM1104529@MNXD1,{MNXR151539},[WP_033079611.1],"{'metacyc.reaction': 'UDPGLCNACEPIM-RXN', 'ori...",MetaNetX


### Fill model

In [None]:
# Fill model
# NOTE(review): unlike the KEGG and Gene examples, the return value is not
# assigned back to `model` here. The recorded output is a two-row table of
# BioCyc reactions - TODO confirm what fill_model returns for this class and
# whether `model = ...` is needed.
gfbc.fill_model(model)

Unnamed: 0,add_to_GPR,ec-code,equation,id,ncbiprotein,reference,via
238,,,"(1,4-alpha-D-galacturonosyl)(n+m) -> (1,4-al...",4.2.2.2-RXN,[WP_011275342.1],{'ec-code': ['4.2.2.2']},BioCyc
239,,,a [protein]-L-proline (omega = 180) <--> a [...,PEPTIDYLPROLYL-ISOMERASE-RXN,"[WP_016931161.1, WP_011275422.1, WP_016930839.1]",{'ec-code': ['5.2.1.8']},BioCyc


### Statistics

In [None]:
# Get raw statistics
# The recorded output is a nested dict with the keys 'genes', 'reactions'
# and 'metabolites'.
# NOTE(review): `_statistics` is accessed as a private attribute (leading underscore).
gfbc._statistics

{'genes': {'missing (before)': 235,
  'duplicates': 0,
  'added': 0,
  'missing (after)': 0,
  'missing (unmappable)': 0},
 'reactions': {'added (total)': 0,
  'failed to build': 0,
  'missing (before)': 240,
  'add to GPR (BioCyc)': 11,
  'mapped2MNX': 227,
  'mapped2BiGG': 0,
  'remaining_unmapped': 2},
 'metabolites': {}}