# Reannotation of iJN678 to locus tag in *NC_000911.1*

In [2]:
import cobra
import pandas as pd

In [4]:
# Species folder used under data/ and file name of the model to reannotate.
species = 'synechocystis_sp_pcc_6803'
model_name = 'iJN678_autotrophic.mat'

# Both reconstructions are read from the models/ directory.
model_filepath = '/'.join(('models', model_name))
new_model_filepath = '/'.join(('models', 'iSynCJ816.mat'))

# matlab_model is the model being reannotated; new_model supplies the new locus tags.
matlab_model = cobra.io.load_matlab_model(model_filepath)
new_model = cobra.io.load_matlab_model(new_model_filepath)

No defined compartments in model iJN678. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, p, u
This model seems to have metCharge instead of metCharges field. Will use metCharge for what metCharges represents.
No defined compartments in model iSynCJ816. Compartments will be deduced heuristically using regular expressions.
Using regular expression found the following compartments:c, e, l, p, u, x, y


In the following cell I create a file called *'new_model_annotation.csv'* containing the **GPRs of all reactions whose IDs are shared by both models**. This file is used to **translate the GPRs of *iJN678* directly to the newest version**.

In [7]:
new_annotation_filepath = '/'.join(('data', species, 'new_model_annotation.csv'))

# Reaction IDs that exist in both reconstructions.
old_reaction_ids = {rxn.id for rxn in matlab_model.reactions}
new_reaction_ids = {rxn.id for rxn in new_model.reactions}
shared_reactions = old_reaction_ids.intersection(new_reaction_ids)

# Freeze one ordering of the shared IDs so the three columns stay row-aligned.
shared_ids = list(shared_reactions)
new_annotation = {
    'Reaction': shared_ids,
    'GPR_iJN678': [
        matlab_model.reactions.get_by_id(rid).gene_reaction_rule for rid in shared_ids
    ],
    'GPR_iSynCJ816': [
        new_model.reactions.get_by_id(rid).gene_reaction_rule for rid in shared_ids
    ],
}

# Side-by-side table of old and new GPRs, written to disk and displayed.
annotation_df = pd.DataFrame(new_annotation)
annotation_df.to_csv(new_annotation_filepath)
annotation_df

Unnamed: 0,Reaction,GPR_iJN678,GPR_iSynCJ816
0,HGPHT,slr1736,SGL_RS08065
1,CYSS,cysK or cysM,SGL_RS02310 or SGL_RS06370
2,FMETTRS,fmt,SGL_RS13995
3,NDPK9,ndkR,SGL_RS12365
4,RBFSa,ribH,SGL_RS06985
...,...,...,...
764,DGDG183_9_12_15,slr1508,SGL_RS09310
765,SBP,fbp or glpX,SGL_RS09170
766,EX_akg_e,,
767,ADCYRS,cbiP,SGL_RS15620


In [8]:
# Import only the two helpers this notebook calls instead of `import *`, so the
# origin of each name is explicit and nothing else is pulled into the namespace.
from tools.importExcelModel import dataframe_to_model, model_to_dataframe

# model_df is an indexable collection of tables; element 0 is the reaction table
# shown below (Abbreviation, Reaction, GPR, bounds, ...).
model_df = model_to_dataframe(matlab_model)
model_df[0]

Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
0,EX_ac_e,ac_e -->,,0.0,1000.0,0.0,4,Extracellular exchange,Acetate exchange
1,34DHOXPEGOX,34dhmald_c + h_c + nadh_c <=> 34dhoxpeg_c + nad_c,sll0990,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,"3,4-Dihydroxyphenylethyleneglycol NAD+ oxidore..."
2,EX_photon_e,photon_e <--,,-100.0,0.0,0.0,4,Extracellular exchange,Photon exchange
3,34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,ppd,0.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,4 Hydroxyphenylpyruvateoxygen oxidoreductase
4,EX_ca2_e,ca2_e <=>,,-1000.0,1000.0,0.0,4,Extracellular exchange,Calcium exchange
...,...,...,...,...,...,...,...,...,...
858,CBFCpp,2.0 h_c + 2.0 pcox_p + pqh2_p --> 4.0 h_p + 2....,petA and petB and petC and petC and petC and p...,0.0,1000.0,0.0,4,Oxidative phosphorylation,Cytochrome b6/f complex periplasm
859,CYO1b2_syn,4.0 h_c + 0.5 o2_u + 2.0 pcrd_u --> h2o_u + 2....,(ctaD or ctaD) and (ctaC or ctaC) and (ctaE or...,0.0,1000.0,0.0,4,Photosynthesis,"Cytochrome c oxidase, synechocystis (2 protons)"
860,PHETA1,akg_c + phe__L_c <=> glu__L_c + phpyr_c,aspC or sll0480 or aspC or hisC or hisC,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,Phenylalanine transaminase
861,ORNTAC,acorn_c + glu__L_c <=> acglu_c + orn_c,argJ or ama,-1000.0,1000.0,0.0,4,Urea cycle and metabolism of amino groups,Ornithine transacetylase


In [9]:
# Map each shared reaction ID to its GPR in both models.
gprs_to_rename = {
    reaction_id: {'iJN678': old_gpr, 'iSynCJ816': new_gpr}
    for reaction_id, old_gpr, new_gpr in zip(
        annotation_df['Reaction'],
        annotation_df['GPR_iJN678'],
        annotation_df['GPR_iSynCJ816'],
    )
}

# Index the reaction table by reaction ID. The guard keeps this cell re-runnable:
# once 'Abbreviation' is the index, a second set_index would raise a KeyError.
if model_df[0].index.name != 'Abbreviation':
    model_df[0] = model_df[0].set_index('Abbreviation')

# Overwrite the GPR of every shared reaction with the iSynCJ816 rule.
for reaction in shared_reactions:
    model_df[0].loc[reaction, 'GPR'] = gprs_to_rename[reaction]['iSynCJ816']

model_df[0]

Unnamed: 0_level_0,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
EX_ac_e,ac_e -->,,0.0,1000.0,0.0,4,Extracellular exchange,Acetate exchange
34DHOXPEGOX,34dhmald_c + h_c + nadh_c <=> 34dhoxpeg_c + nad_c,SGL_RS04900,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,"3,4-Dihydroxyphenylethyleneglycol NAD+ oxidore..."
EX_photon_e,photon_e <--,,-100.0,0.0,0.0,4,Extracellular exchange,Photon exchange
34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,SGL_RS15355,0.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,4 Hydroxyphenylpyruvateoxygen oxidoreductase
EX_ca2_e,ca2_e <=>,,-1000.0,1000.0,0.0,4,Extracellular exchange,Calcium exchange
...,...,...,...,...,...,...,...,...
CBFCpp,2.0 h_c + 2.0 pcox_p + pqh2_p --> 4.0 h_p + 2....,petA and petB and petC and petC and petC and p...,0.0,1000.0,0.0,4,Oxidative phosphorylation,Cytochrome b6/f complex periplasm
CYO1b2_syn,4.0 h_c + 0.5 o2_u + 2.0 pcrd_u --> h2o_u + 2....,(ctaD or ctaD) and (ctaC or ctaC) and (ctaE or...,0.0,1000.0,0.0,4,Photosynthesis,"Cytochrome c oxidase, synechocystis (2 protons)"
PHETA1,akg_c + phe__L_c <=> glu__L_c + phpyr_c,SGL_RS13785 or SGL_RS08515 or SGL_RS16530,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,Phenylalanine transaminase
ORNTAC,acorn_c + glu__L_c <=> acglu_c + orn_c,SGL_RS10330,-1000.0,1000.0,0.0,4,Urea cycle and metabolism of amino groups,Ornithine transacetylase


In [10]:
# Reactions that carry a GPR in iJN678 but have no same-ID counterpart in iSynCJ816.
reactions_with_gpr = {
    rxn.id for rxn in matlab_model.reactions if len(rxn.gene_reaction_rule) > 0
}
missing_to_reannotate = reactions_with_gpr.difference(shared_reactions)
len(missing_to_reannotate)

83

In [11]:
# Rows of the reaction table (indexed by Abbreviation) that still need a new GPR.
missing_ids = list(missing_to_reannotate)
missing_reactions_df = model_df[0].loc[missing_ids]
missing_reactions_df

Unnamed: 0_level_0,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ACLS,h_c + 2.0 pyr_c --> alac__S_c + co2_c,ilvB and ilvG and ilvN,0.0,1000.0,0.0,4,C5-Branched dibasic acid metabolism,Acetolactate synthase
CYTBDu,0.5 o2_u + pqh2_u --> h2o_u + pq_u,cydA and cydB,0.0,1000.0,0.0,4,Photosynthesis,Cytochrome oxidase bd (plastocianine-8 2 prot...
ARGDr,arg__L_c + h2o_c --> citr__L_c + nh4_c,sll1336,0.0,1000.0,0.0,4,Arginine and proline metabolism,Arginine deiminase
PYK3,gdp_c + h_c + pep_c --> gtp_c + pyr_c,pykF or pykF,0.0,1000.0,0.0,4,Glycolysis/Gluconeogenesis,Pyruvate kinase(3)
THZPSN,atp_c + cys__L_c + dxyl5p_c + tyr__L_c --> 4hb...,thiG or ycf40,0.0,1000.0,0.0,4,Thiamine metabolism,Thiazole phosphate synthesis
...,...,...,...,...,...,...,...,...
AMID2,h2o_c + pad_c --> nh4_c + pac_c,nylA,0.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,Amidase
TDPGDH,dtdpglu_c --> dtdp4d6dg_c + h2o_c,rfbB or rfbB,0.0,1000.0,0.0,4,Nucleotide sugars metabolism,"DTDPglucose 4,6-dehydratase"
PPTGF_Syn,peptido_syn_c --> peptido_syn_p,slr0488,0.0,1000.0,0.0,4,Peptidoglycan biosynthesis,Flypase
ASNN,asn__L_c + h2o_c --> asp__L_c + nh4_c,sll0422,0.0,1000.0,0.0,4,Nitrogen metabolism,L-asparaginase


In [12]:
# Distinct gene IDs used by the reactions that are still missing a new annotation.
genes_missing = {
    gene.id
    for reaction_id in missing_to_reannotate
    for gene in matlab_model.reactions.get_by_id(reaction_id).genes
}
len(genes_missing)

166

In [13]:
# iSynCJ816 reactions whose name equals the description of a not-yet-reannotated
# iJN678 reaction: candidates for manual GPR transfer.
missing_descriptions = missing_reactions_df.Description.tolist()
reaction_name_matching = [
    rxn for rxn in new_model.reactions if rxn.name in missing_descriptions
]

# NOTE(review): only candidates from position 60 onward are displayed;
# presumably the earlier ones were inspected in a previous run - confirm.
for reaction in reaction_name_matching[60:]:
    same_name_rows = missing_reactions_df.loc[
        missing_reactions_df.Description == reaction.name,
        ['Reaction', 'GPR', 'Description'],
    ]
    # iJN678 side: matching rows, then their untruncated reaction strings.
    display(same_name_rows)
    print(same_name_rows.Reaction.values)
    # iSynCJ816 side: the candidate reaction and its full GPR.
    display(reaction)
    print(reaction.gene_reaction_rule)

Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CBFC2pp,2.0 ficytc6_p + 2.0 h_c + pqh2_p --> 2.0 focyt...,petA and petB and petC and petC and petC and p...,Cytochrome b6/f complex periplasm
CBFCpp,2.0 h_c + 2.0 pcox_p + pqh2_p --> 4.0 h_p + 2....,petA and petB and petC and petC and petC and p...,Cytochrome b6/f complex periplasm


['2.0 ficytc6_p + 2.0 h_c + pqh2_p --> 2.0 focytc6_p + 4.0 h_p + pq_p'
 '2.0 h_c + 2.0 pcox_p + pqh2_p --> 4.0 h_p + 2.0 pcrd_p + pq_p']


0,1
Reaction identifier,CBFCpf
Name,Cytochrome b6/f complex periplasm
Memory address,0x7f1068447b80
Stoichiometry,"h_c + hemeB2p_y + pqhb6s_y --> hemeB3p_y + pqh2_y  H+ + Heme B located in the Cytochrome-b6/f complex, twice protonated + Plastosemiquinone located at the stromal side of the Cytochrome-b6/f complex --> Heme B located in the Cytochrome-b6/f..."
GPR,SGL_RS05875 and SGL_RS07315 and SGL_RS13200 and SGL_RS10620 and SGL_RS07320 and SGL_RS18155 and...
Lower bound,0.0
Upper bound,999999.0


SGL_RS05875 and SGL_RS07315 and SGL_RS13200 and SGL_RS10620 and SGL_RS07320 and SGL_RS18155 and SGL_RS13205 and SGL_RS13610 and SGL_RS10475 and ssl3803 and SGL_RS16365 and sml0004 and SGL_RS05245 and SGL_RS05250 and SGL_RS05240 and SGL_RS05235 and SGL_RS05230 and SGL_RS07605 and SGL_RS17600 and SGL_RS04660 and SGL_RS08580 and SGL_RS08585 and SGL_RS08590 and SGL_RS03465 and SGL_RS12650 and SGL_RS17175


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PSI,2.0 fdxo_2_2_c + 2.0 pcrd_u + 2.0 photon_c -->...,psaA and psaB and psaC and psaD and psaE and p...,Photosystem I (plastocyanin)


['2.0 fdxo_2_2_c + 2.0 pcrd_u + 2.0 photon_c --> 2.0 fdxrd_c + 2.0 pcox_u']


0,1
Reaction identifier,PSI_2a
Name,Photosystem I (plastocyanin)
Memory address,0x7f10684d2ef0
Stoichiometry,focytc6_l + p700p_u --> ficytc6_l + p700_u  Ferrocytochrome c6 + Positive charged reaction centre of the Photosystem I --> Ferricytochrome c6 + PSI reaction center P700
GPR,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) and SGL_RS03975 and SGL_RS09810 and SGL_RS12695 and...
Lower bound,0.0
Upper bound,999999.0


SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) and SGL_RS03975 and SGL_RS09810 and SGL_RS12695 and SGL_RS02435 and SGL_RS09960 and SGL_RS17820 and SGL_RS06305 and SGL_RS06310 and SGL_RS09805 and SGL_RS17825 and SGL_RS04090 and SGL_RS12620 and SGL_RS11240 and SGL_RS06050


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PSII,h2o_u + 2.0 h_c + 2.0 photon_c + pq_u --> 2.0 ...,psbA3 and psbA1 and psbA2 and psbD and psbD2 a...,Photosystem II


['h2o_u + 2.0 h_c + 2.0 photon_c + pq_u --> 2.0 h_u + 0.5 o2_u + pqh2_u']


0,1
Reaction identifier,PSIIa
Name,Photosystem II
Memory address,0x7f10681956f0
Stoichiometry,e680_u + p680_u + qa_u --> p680p_u + qan_u  Photons with 680nm wavelength + PSII reaction center P680 + Internal bound plastoquinone of the Photosystem II --> Positive charged reaction centre of the Photosystem II + Internal bound...
GPR,SGL_RS11935 and SGL_RS11735 and SGL_RS08205 and SGL_RS08200 and SGL_RS03265 and SGL_RS02070 and...
Lower bound,0.0
Upper bound,999999.0


SGL_RS11935 and SGL_RS11735 and SGL_RS08205 and SGL_RS08200 and SGL_RS03265 and SGL_RS02070 and SGL_RS09330 and SGL_RS10440 and SGL_RS14825 and SGL_RS16825 and SGL_RS18130 and SGL_RS01910 and SGL_RS11645 and SGL_RS08090 and SGL_RS04525 and SGL_RS12880 and SGL_RS14050 and sml0003 and SGL_RS04440 and SGL_RS07800 and SGL_RS13125 and SGL_RS04535 and SGL_RS04540 and SGL_RS04545 and smr0009 and SGL_RS07325 and SGL_RS04530 and SGL_RS02645 and SGL_RS02635


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
PPTGF_Syn,peptido_syn_c --> peptido_syn_p,slr0488,Flypase


['peptido_syn_c --> peptido_syn_p']


0,1
Reaction identifier,PPTGF_Syn_1
Name,Flypase
Memory address,0x7f10678a4c40
Stoichiometry,e11_c --> e11_p  Peptidoglycan --> Peptidoglycan
GPR,SGL_RS14090
Lower bound,0.0
Upper bound,999999.0


SGL_RS14090


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
MPOMOR2,nadp_c + omppp9_c --> dvpchlld_c + h_c + nadph_c,bchE,magnesium-protoporphyrin-IX 13-monomethyl este...


['nadp_c + omppp9_c --> dvpchlld_c + h_c + nadph_c']


0,1
Reaction identifier,MPOMOR2_1
Name,"magnesium-protoporphyrin-IX 13-monomethyl ester,NADPH"
Memory address,0x7f106757cd90
Stoichiometry,h2o_c + nadp_c + omppp9_c --> dvpchlld_c + h_c + nadph_c  H2O H2O + Nicotinamide adenine dinucleotide phosphate + 13(1)-Oxo-magnesium-protoporphyrin IX 13-monomethyl ester --> Divinylprotochlorophyllide + H+ + Nicotinamide adenine dinucleotide phosphate -...
GPR,SGL_RS14820
Lower bound,0.0
Upper bound,999999.0


SGL_RS14820


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGPAT183_9_12_15,1odec91215eg3p_c + octe_9_12_15_ACP_c --> ACP_...,sll1848,1-octadec-enoyl-sn-glycerol 3-phosphate O-acyl...


['1odec91215eg3p_c + octe_9_12_15_ACP_c --> ACP_c + pa183_9_12_15_c']


0,1
Reaction identifier,AGPAT183_6_9_12
Name,1-octadec-enoyl-sn-glycerol 3-phosphate O-acyltransferase (n-C18 3)
Memory address,0x7f1066debf10
Stoichiometry,"1odec6912eg3p_c + octe_6_9_12_ACP_c --> ACP_c + pa183_6_9_12_c  1-octadec-6-9-12trienoyl-sn-glycerol 3-phosphate + G-linolenoilACP --> Acyl carrier protein + 1,2-dioctadec-6-9-12-trienoyl-sn-glycerol 3-phosphate"
GPR,SGL_RS12400
Lower bound,0.0
Upper bound,999999.0


SGL_RS12400


Unnamed: 0_level_0,Reaction,GPR,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AGPAT183_9_12_15,1odec91215eg3p_c + octe_9_12_15_ACP_c --> ACP_...,sll1848,1-octadec-enoyl-sn-glycerol 3-phosphate O-acyl...


['1odec91215eg3p_c + octe_9_12_15_ACP_c --> ACP_c + pa183_9_12_15_c']


0,1
Reaction identifier,AGPAT183_9_12_16
Name,1-octadec-enoyl-sn-glycerol 3-phosphate O-acyltransferase (n-C18 3)
Memory address,0x7f1066c5a7d0
Stoichiometry,"1odec91215eg3p_c + octe_9_12_ACP_c --> ACP_c + pa183_9_12_15_c  1-octadec-9-12-15-trienoyl-sn-glycerol 3-phosphate + Linoleoyl-ACP (n-C18 2ACP) --> Acyl carrier protein + 1,2-dioctadec-9-12-15-trienoyl-sn-glycerol 3-phosphate"
GPR,SGL_RS12400
Lower bound,0.0
Upper bound,999999.0


SGL_RS12400


I reviewed the reactions with matching names one by one and, when they represented the same reaction, added them to a file called *'gpr_manual_annotations.csv'*, which consists of 2 columns **representing Reaction -> GPR relationships**.

In [14]:
# Hand-curated Reaction -> GPR table stored under the species data folder.
manual_annotation_filepath = '/'.join(('data', species, 'gpr_manual_annotations.csv'))
manual_annotation_df = pd.read_csv(manual_annotation_filepath)
manual_annotation_df

Unnamed: 0,Reaction,GPR
0,HMBS,SGL_RS07710
1,HISTD,SGL_RS03870 or SGL_RS04465
2,3HAD40,SGL_RS08755
3,3OAR40,SGL_RS07245 or SGL_RS12965 or sll5079
4,PSI,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) a...
5,RBPC,SGL_RS13415 and SGL_RS13425
6,RBCh,SGL_RS13415 and SGL_RS13425
7,CBFC2,SGL_RS05875 and SGL_RS07315 and SGL_RS13200 an...
8,CBFCu,SGL_RS05875 and SGL_RS07315 and SGL_RS13200 an...
9,CYO1b_syn,(SGL_RS09865 or SGL_RS05540) and (SGL_RS09075 ...


In [15]:
# Collect the first GPR listed for each manually curated reaction ...
first_gpr_by_reaction = {}
for reaction_id, curated_gpr in zip(manual_annotation_df.Reaction, manual_annotation_df.GPR):
    first_gpr_by_reaction.setdefault(reaction_id, curated_gpr)

# ... and write it into the reaction table (indexed by Abbreviation at this point).
for reaction, curated_gpr in first_gpr_by_reaction.items():
    model_df[0].loc[reaction, 'GPR'] = curated_gpr

model_df[0]

Unnamed: 0_level_0,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
Abbreviation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
EX_ac_e,ac_e -->,,0.0,1000.0,0.0,4,Extracellular exchange,Acetate exchange
34DHOXPEGOX,34dhmald_c + h_c + nadh_c <=> 34dhoxpeg_c + nad_c,SGL_RS04900,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,"3,4-Dihydroxyphenylethyleneglycol NAD+ oxidore..."
EX_photon_e,photon_e <--,,-100.0,0.0,0.0,4,Extracellular exchange,Photon exchange
34HPPOR,34hpp_c + o2_c --> co2_c + hgentis_c,SGL_RS15355,0.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,4 Hydroxyphenylpyruvateoxygen oxidoreductase
EX_ca2_e,ca2_e <=>,,-1000.0,1000.0,0.0,4,Extracellular exchange,Calcium exchange
...,...,...,...,...,...,...,...,...
CBFCpp,2.0 h_c + 2.0 pcox_p + pqh2_p --> 4.0 h_p + 2....,SGL_RS05875 and SGL_RS07315 and SGL_RS13200 an...,0.0,1000.0,0.0,4,Oxidative phosphorylation,Cytochrome b6/f complex periplasm
CYO1b2_syn,4.0 h_c + 0.5 o2_u + 2.0 pcrd_u --> h2o_u + 2....,(SGL_RS09865 or SGL_RS05540) and (SGL_RS09075 ...,0.0,1000.0,0.0,4,Photosynthesis,"Cytochrome c oxidase, synechocystis (2 protons)"
PHETA1,akg_c + phe__L_c <=> glu__L_c + phpyr_c,SGL_RS13785 or SGL_RS08515 or SGL_RS16530,-1000.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,Phenylalanine transaminase
ORNTAC,acorn_c + glu__L_c <=> acglu_c + orn_c,SGL_RS10330,-1000.0,1000.0,0.0,4,Urea cycle and metabolism of amino groups,Ornithine transacetylase


In [16]:
# Rename the 'Formula' column of the second table to 'Charged formula'
# (presumably the column name dataframe_to_model expects - confirm in tools.importExcelModel).
model_df[1] = model_df[1].rename(columns={'Formula': 'Charged formula'})

# Turn the 'Abbreviation' index back into a regular column. Guarded and done
# without inplace=True so that re-running the cell does not add a spurious
# 'index' column to the reaction table.
if model_df[0].index.name == 'Abbreviation':
    model_df[0] = model_df[0].reset_index()

In [17]:
# Rebuild a model from the edited tables and print its summary.
reactions_table = model_df[0]
second_table = model_df[1]
reannotated_model = dataframe_to_model(reactions_table, second_table)
reannotated_model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
ca2_e,EX_ca2_e,0.0003987,0,0.00%
co2_e,EX_co2_e,3.7,1,99.99%
cobalt2_e,EX_cobalt2_e,0.0002874,0,0.00%
cu2_e,EX_cu2_e,0.0002658,0,0.00%
fe2_e,EX_fe2_e,0.0006614,0,0.00%
fe3_e,EX_fe3_e,0.0006022,0,0.00%
h2o_e,EX_h2o_e,2.42,0,0.00%
h_e,EX_h_e,0.8795,0,0.00%
k_e,EX_k_e,0.01495,0,0.00%
mg2_e,EX_mg2_e,0.002541,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
5mdru1p_c,DM_5mdru1p_c,-0.0005959,6,100.00%
o2_e,EX_o2_e,-5.585,0,0.00%


In [18]:
# Gene IDs still used by the rebuilt model that do not start with the new 'SGL' prefix.
not_in_new_annotation = {
    gene.id
    for rxn in reannotated_model.reactions
    for gene in rxn.genes
    if not gene.id.startswith('SGL')
}
len(not_in_new_annotation)

72

In [22]:
# Old-style gene IDs that also occur as gene IDs in iSynCJ816.
# The ID set is built once instead of rebuilding a list for every gene tested.
new_model_gene_ids = {g.id for g in new_model.genes}
not_annotated_genes_in_new_model = not_in_new_annotation & new_model_gene_ids
len(not_annotated_genes_in_new_model)

7

In [23]:
# Old-style gene IDs that occur as gene IDs in the original iJN678 model.
# The ID set is built once instead of rebuilding a list for every gene tested.
matlab_model_gene_ids = {g.id for g in matlab_model.genes}
not_annotated_genes_in_matlab_model = not_in_new_annotation & matlab_model_gene_ids
len(not_annotated_genes_in_matlab_model)

67

In [24]:
# Old-style gene IDs present in both models.
not_annotated_genes_in_new_model & not_annotated_genes_in_matlab_model

{'sll8031', 'ssl3803'}

In [26]:
# Gene annotation table (locus_tag, gene_name, old_locus_tag, ...) for the species.
annotation_table_filepath = '/'.join(['data', species, 'gene_info.csv'])
annot_table = pd.read_csv(annotation_table_filepath)

# Remaining old-style IDs that appear in the table's gene_name column.
# The name set is built once instead of calling tolist() for every gene tested.
annotated_gene_names = set(annot_table.gene_name)
genes_with_annotated_traditional_names = not_in_new_annotation & annotated_gene_names
len(genes_with_annotated_traditional_names)

26

In [29]:
# Display the gene annotation table loaded from gene_info.csv.
annot_table

Unnamed: 0,locus_tag,gene_name,accession,old_locus_tag,start,end,strand,gene_product,COG,uniprot,operon
0,SGL_RS01370,SGL_RS01370,NC_005232.1,slr6001,243,3053,+,hybrid sensor histidine kinase/response regulator,Signal transduction mechanisms,,Op97
1,SGL_RS00010,SGL_RS00010,NC_005229.1,sll5002,574,1017,-,hypothetical protein,No COG annotation,,Op712
2,SGL_RS00635,SGL_RS00635,NC_005230.1,sll7002,706,1287,-,IS701 family transposase,"Replication, recombination and repair",,Op2588
3,SGL_RS01880,SGL_RS01880,NC_000911.1,slr0612,811,1494,+,pseudouridine synthase,"Translation, ribosomal structure and biogenesis",,Op465
4,SGL_RS00015,SGL_RS00015,NC_005229.1,sll5003,1136,1540,-,DUF5615 family PIN-like protein,No COG annotation,,Op686
...,...,...,...,...,...,...,...,...,...,...,...
3737,SGL_RS18370,SGL_RS18370,NC_000911.1,slr0607,3569134,3569523,+,cyclic nucleotide-binding domain-containing pr...,Signal transduction mechanisms,,Op766
3738,SGL_RS18375,hisIE,NC_000911.1,slr0608,3569672,3570319,+,bifunctional phosphoribosyl-AMP cyclohydrolase...,Nucleotide transport and metabolism,A0A6P1VM37_9SYNC,Op2513
3739,SGL_RS18380,SGL_RS18380,NC_000911.1,slr0609,3570424,3571575,+,GTP-binding protein,Function unknown,,Op489
3740,SGL_RS18385,SGL_RS18385,NC_000911.1,slr0610,3571711,3572403,+,ABC transporter permease,Function unknown,,Op108


After reannotating the model following the modifications indicated in *'gpr_manual_annotations.csv'*, there are still 72 genes missing the new annotation. Of those, 7 are in the newest annotated model and 67 in our model, with 2 old-annotation genes shared between them. This is partly because **the annotation uses traditional gene names** (26 genes with traditional names), so I will translate those names **using the gene annotation obtained during the modulome workflow**.

In [33]:
# For every remaining old-style ID that is a traditional gene name in annot_table,
# collect the set of locus tags carrying that name (a name can map to several).
# The name set is built once instead of calling tolist() for every gene tested.
known_gene_names = set(annot_table.gene_name)
in_gene_name_column_dict = {
    gene: set(annot_table.loc[annot_table.gene_name == gene, 'locus_tag'].tolist())
    for gene in not_in_new_annotation
    if gene in known_gene_names
}

in_gene_name_column_dict

{'ilvC': {'SGL_RS17215'},
 'pyrG': {'SGL_RS10805'},
 'atpA': {'SGL_RS02675'},
 'atpH': {'SGL_RS02680'},
 'ilvA': {'SGL_RS06140'},
 'atpC': {'SGL_RS09710'},
 'ndhI': {'SGL_RS16995'},
 'psaK': {'SGL_RS02600', 'SGL_RS17240'},
 'menB': {'SGL_RS06810'},
 'atpD': {'SGL_RS09705'},
 'gcvT': {'SGL_RS12745'},
 'pdxA': {'SGL_RS16360'},
 'hoxU': {'SGL_RS09745'},
 'ndhK': {'SGL_RS10730'},
 'psaB': {'SGL_RS06310'},
 'atpB': {'SGL_RS02700'},
 'hoxF': {'SGL_RS09755'},
 'cydB': {'SGL_RS05015'},
 'psaA': {'SGL_RS06305'},
 'psaM': {'SGL_RS04090'},
 'trpB': {'SGL_RS16740'},
 'atpE': {'SGL_RS02695'},
 'trpA': {'SGL_RS03395'},
 'ndhC': {'SGL_RS10725'},
 'psaC': {'SGL_RS12620'},
 'psaJ': {'SGL_RS09805'}}

In [34]:
# psaK is the only traditional name mapped to two locus tags; show both rows.
is_psaK = annot_table['gene_name'] == 'psaK'
annot_table.loc[is_psaK]

Unnamed: 0,locus_tag,gene_name,accession,old_locus_tag,start,end,strand,gene_product,COG,uniprot,operon
530,SGL_RS02600,psaK,NC_000911.1,ssr0390,156391,156651,+,photosystem I reaction center subunit PsaK,"Intracellular trafficking, secretion, and vesi...",A0A6P1VFL7_9SYNC,Op765
3520,SGL_RS17240,psaK,NC_000911.1,sll0629,3322377,3322649,-,photosystem I reaction center subunit PsaK,"Intracellular trafficking, secretion, and vesi...",,Op578


In [35]:
# Inspect the psaK gene in iJN678 (lists the reactions that use it).
matlab_model.genes.get_by_id('psaK')

0,1
Gene identifier,psaK
Name,
Memory address,0x7f10a5714190
Functional,True
In 2 reaction(s),"PSI, PSI_2"


In [36]:
# Inspect the PSI reaction of iJN678 and its GPR.
matlab_model.reactions.get_by_id('PSI')

0,1
Reaction identifier,PSI
Name,Photosystem I (plastocyanin)
Memory address,0x7f10957f3a30
Stoichiometry,2.0 fdxo_2_2_c + 2.0 pcrd_u + 2.0 photon_c --> 2.0 fdxrd_c + 2.0 pcox_u  2.0 Oxidized ferredoxin + 2.0 Plastocyanin(Cu+) + 2.0 Light --> 2.0 Reduced ferredoxin + 2.0 Plastocyanin(Cu2+)
GPR,psaA and psaB and psaC and psaD and psaE and psaF and psaI and psaJ and (psaK or psaK) and psaL...
Lower bound,0.0
Upper bound,1000.0


In [37]:
# Inspect the PSI_2 reaction of iJN678 and its GPR.
matlab_model.reactions.get_by_id('PSI_2')

0,1
Reaction identifier,PSI_2
Name,Photosystem I (ferrocytochrome)
Memory address,0x7f108bc3d6c0
Stoichiometry,2.0 fdxo_2_2_c + 2.0 focytc6_u + 2.0 photon_c --> 2.0 fdxrd_c + 2.0 ficytc6_u  2.0 Oxidized ferredoxin + 2.0 Ferrocytochrome c6 + 2.0 Light --> 2.0 Reduced ferredoxin + 2.0 Ferricytochrome c6
GPR,psaA and psaB and psaC and psaD and psaE and psaF and psaI and psaJ and (psaK or psaK) and psaL...
Lower bound,0.0
Upper bound,1000.0


In [38]:
# Check that PSI and PSI_2 share exactly the same GPR in iJN678.
psi_rule = matlab_model.reactions.get_by_id('PSI').gene_reaction_rule
psi_2_rule = matlab_model.reactions.get_by_id('PSI_2').gene_reaction_rule
psi_2_rule == psi_rule

True

In [39]:
# Inspect the PSIa reaction of iSynCJ816 and its GPR.
new_model.reactions.get_by_id('PSIa')

0,1
Reaction identifier,PSIa
Name,Photosystem I
Memory address,0x7f1080c17220
Stoichiometry,e700_u + fdxo_2_2_c + p700_u --> fdxrd_c + p700p_u  Photons with 700nm wavelength + Oxidized ferredoxin + PSI reaction center P700 --> Reduced ferredoxin + Positive charged reaction centre of the Photosystem I
GPR,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) and SGL_RS03975 and SGL_RS09810 and SGL_RS12695 and...
Lower bound,0.0
Upper bound,999999.0


In [40]:
# IDs of the iJN678 reactions that use psaK, and their current rows in the reaction table.
psaK_gene = matlab_model.genes.get_by_id('psaK')
target_reactions = [rxn.id for rxn in psaK_gene.reactions]
uses_psaK = model_df[0]['Abbreviation'].isin(target_reactions)
model_df[0].loc[uses_psaK]

Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
462,PSI,2.0 fdxo_2_2_c + 2.0 pcrd_u + 2.0 photon_c -->...,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) a...,0.0,1000.0,0.0,4,Photosynthesis,Photosystem I (plastocyanin)
828,PSI_2,2.0 fdxo_2_2_c + 2.0 focytc6_u + 2.0 photon_c ...,psaA and psaB and psaC and psaD and psaE and p...,0.0,1000.0,0.0,4,Photosynthesis,Photosystem I (ferrocytochrome)


In [41]:
# PSI and PSI_2 had identical GPRs in iJN678, so PSI_2 takes the GPR already
# assigned to PSI in the rebuilt model.
psi_new_gpr = reannotated_model.reactions.get_by_id('PSI').gene_reaction_rule
is_psi_2 = model_df[0]['Abbreviation'] == 'PSI_2'
model_df[0].loc[is_psi_2, 'GPR'] = psi_new_gpr

# Show both psaK reactions again to confirm the change.
model_df[0].loc[model_df[0]['Abbreviation'].isin(target_reactions)]

Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
462,PSI,2.0 fdxo_2_2_c + 2.0 pcrd_u + 2.0 photon_c -->...,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) a...,0.0,1000.0,0.0,4,Photosynthesis,Photosystem I (plastocyanin)
828,PSI_2,2.0 fdxo_2_2_c + 2.0 focytc6_u + 2.0 photon_c ...,SGL_RS02560 and (SGL_RS17240 or SGL_RS02600) a...,0.0,1000.0,0.0,4,Photosynthesis,Photosystem I (ferrocytochrome)


In [42]:
# Reactions whose GPR still contains any of the traditional gene names
# (the names are joined into one regex alternation).
traditional_name_pattern = '|'.join(in_gene_name_column_dict)
has_traditional_name = model_df[0]['GPR'].str.contains(traditional_name_pattern)
model_df[0].loc[has_traditional_name]

Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
108,CTPS1,atp_c + nh4_c + utp_c --> adp_c + ctp_c + 2.0 ...,pyrG,0.0,1000.0,0.0,4,Pyrimidine metabolism,CTP synthase NH3
178,CYTBDpp_1,0.5 o2_p + pqh2_p --> h2o_p + pq_p,cydA and cydB,0.0,1000.0,0.0,4,Oxidative phosphorylation,Cytochrome oxidase bd (plastocianine-8 2 prot...
180,CYTBDu,0.5 o2_u + pqh2_u --> h2o_u + pq_u,cydA and cydB,0.0,1000.0,0.0,4,Photosynthesis,Cytochrome oxidase bd (plastocianine-8 2 prot...
182,GLYCL_2,co2_c + mlthf_c + nadh_c + nh4_c --> gly_c + n...,gcvT,0.0,1000.0,0.0,4,Nitrogen metabolism,"Glycine cleavage system, cytosol"
334,KARA2,2ahbut_c + h_c + nadph_c --> 23dhmp_c + nadp_c,ilvC,0.0,1000.0,0.0,4,Valine leucine and isoleucine biosynthesis,Ketol-acid reductoisomerase (2-Acetolactate)
390,NPHS,sbzcoa_c --> coa_c + dhna_c,menB,0.0,1000.0,0.0,4,Ubiquinone and other pterpenoids biosynthesis,Naphthoate synthase
423,PDX5PS,dxyl5p_c + nad_c + phthr_c --> co2_c + 2.0 h2o...,pdxA,0.0,1000.0,0.0,4,Vitamin B6 metabolism,Pyridoxine 5'-phosphate synthase
511,SERD_L,ser__L_c --> nh4_c + pyr_c,ilvA,0.0,1000.0,0.0,4,Arginine and proline metabolism,L-serine deaminase
545,TRPS1,3ig3p_c + ser__L_c --> g3p_c + h2o_c + trp__L_c,trpA and trpB,0.0,1000.0,0.0,4,Phenylalanine tyrosine and tryptophan biosynth...,Tryptophan synthase (indoleglycerol phosphate)
796,NDH1_3u,co2_p + h2o_c + 3.0 h_c + nadph_c + pq_u --> 3...,ndhA and ndhB and ndhC and ndhE and ndhG and n...,0.0,1000.0,0.0,4,Transport,Active co2 transporter facilitator (tilacoide)


In [45]:
# psaK maps to two locus tags and was handled by hand above, so it is excluded
# from the automatic substitution. The default makes the pop safe on a re-run.
in_gene_name_column_dict.pop('psaK', None)

# Collapse each {locus_tag} set to its single locus tag string.
single_locus_tag_by_name = {}
for gene_name, locus_tags in in_gene_name_column_dict.items():
    if isinstance(locus_tags, str):
        # Already collapsed by a previous run of this cell; indexing the string
        # again would keep only its first character.
        single_locus_tag_by_name[gene_name] = locus_tags
        continue
    if len(locus_tags) != 1:
        # Picking "the first" element of a multi-element set is arbitrary, so
        # ambiguous names must be resolved manually rather than silently.
        raise ValueError(
            'Gene name %s does not map to exactly one locus tag: %s'
            % (gene_name, sorted(locus_tags))
        )
    single_locus_tag_by_name[gene_name] = next(iter(locus_tags))

in_gene_name_column_dict = single_locus_tag_by_name
in_gene_name_column_dict

{'ilvC': 'SGL_RS17215',
 'pyrG': 'SGL_RS10805',
 'atpA': 'SGL_RS02675',
 'atpH': 'SGL_RS02680',
 'ilvA': 'SGL_RS06140',
 'atpC': 'SGL_RS09710',
 'ndhI': 'SGL_RS16995',
 'menB': 'SGL_RS06810',
 'atpD': 'SGL_RS09705',
 'gcvT': 'SGL_RS12745',
 'pdxA': 'SGL_RS16360',
 'hoxU': 'SGL_RS09745',
 'ndhK': 'SGL_RS10730',
 'psaB': 'SGL_RS06310',
 'atpB': 'SGL_RS02700',
 'hoxF': 'SGL_RS09755',
 'cydB': 'SGL_RS05015',
 'psaA': 'SGL_RS06305',
 'psaM': 'SGL_RS04090',
 'trpB': 'SGL_RS16740',
 'atpE': 'SGL_RS02695',
 'trpA': 'SGL_RS03395',
 'ndhC': 'SGL_RS10725',
 'psaC': 'SGL_RS12620',
 'psaJ': 'SGL_RS09805'}

In [46]:
# Substitute traditional gene names with their locus tags (keys are treated as regexes).
# The result is assigned back to the column: an inplace replace on `df.GPR` is a
# chained operation that may act on a temporary Series and leave the DataFrame
# unchanged when pandas Copy-on-Write is active.
model_df[0]['GPR'] = model_df[0]['GPR'].replace(in_gene_name_column_dict, regex=True)

# Should now be empty: no GPR contains a traditional name from the dictionary.
model_df[0].loc[model_df[0]['GPR'].str.contains('|'.join(list(in_gene_name_column_dict.keys())))]

Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description


In [47]:
# Rebuild the model from the updated tables and print its summary again.
reactions_table = model_df[0]
second_table = model_df[1]
reannotated_model = dataframe_to_model(reactions_table, second_table)
reannotated_model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
ca2_e,EX_ca2_e,0.0003987,0,0.00%
co2_e,EX_co2_e,3.7,1,99.99%
cobalt2_e,EX_cobalt2_e,0.0002874,0,0.00%
cu2_e,EX_cu2_e,0.0002658,0,0.00%
fe2_e,EX_fe2_e,0.0006614,0,0.00%
fe3_e,EX_fe3_e,0.0006022,0,0.00%
h2o_e,EX_h2o_e,2.42,0,0.00%
h_e,EX_h_e,0.8795,0,0.00%
k_e,EX_k_e,0.01495,0,0.00%
mg2_e,EX_mg2_e,0.002541,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
5mdru1p_c,DM_5mdru1p_c,-0.0005959,6,100.00%
o2_e,EX_o2_e,-5.585,0,0.00%


In [48]:
# Recount the gene IDs in the rebuilt model that do not start with 'SGL'.
not_in_new_annotation = {
    gene.id
    for rxn in reannotated_model.reactions
    for gene in rxn.genes
    if not gene.id.startswith('SGL')
}
len(not_in_new_annotation)

41

In [49]:
# Remaining old-style gene IDs that also occur as gene IDs in iSynCJ816.
# The ID set is built once instead of rebuilding a list for every gene tested.
new_model_gene_ids = {g.id for g in new_model.genes}
not_annotated_genes_in_new_model = not_in_new_annotation & new_model_gene_ids
len(not_annotated_genes_in_new_model)

7

In [50]:
# Remaining old-style gene IDs that occur as gene IDs in the original iJN678 model.
# The ID set is built once instead of rebuilding a list for every gene tested.
matlab_model_gene_ids = {g.id for g in matlab_model.genes}
not_annotated_genes_in_matlab_model = not_in_new_annotation & matlab_model_gene_ids
len(not_annotated_genes_in_matlab_model)

36

In [51]:
# Old-style gene IDs still present in both models.
not_annotated_genes_in_new_model & not_annotated_genes_in_matlab_model

{'sll8031', 'ssl3803'}

In [52]:
# Remaining old-style IDs that are traditional gene names in annot_table
# (expected to be none after the substitution above).
# The name set is built once instead of calling tolist() for every gene tested.
annotated_gene_names = set(annot_table.gene_name)
genes_with_annotated_traditional_names = not_in_new_annotation & annotated_gene_names
len(genes_with_annotated_traditional_names)

0

In [53]:
# Reactions whose GPR still contains at least one old-style gene ID
# (the IDs are joined into one regex alternation).
leftover_pattern = '|'.join(not_in_new_annotation)
still_old_style = model_df[0]['GPR'].str.contains(leftover_pattern)
not_reannotated_reactions = model_df[0].loc[still_old_style]

print('Still %s reactions to reannotate' % len(not_reannotated_reactions))
display(not_reannotated_reactions)

Still 38 reactions to reannotate


Unnamed: 0,Abbreviation,Reaction,GPR,Lower bound,Upper bound,Objective,Confidence Score,Subsystem,Description
8,3OAR100,3odecACP_c + h_c + nadph_c <=> 3hdecACP_c + na...,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
10,3OAR120,3oddecACP_c + h_c + nadph_c <=> 3hddecACP_c + ...,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
12,3OAR140,3omrsACP_c + h_c + nadph_c --> 3hmrsACP_c + na...,SGL_RS07245 or SGL_RS12965 or sll5079,0.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
15,ACONT,cit_c <=> icit_c,slr0665,-1000.0,1000.0,0.0,4,Citrate cycle (TCA cycle),Aconitate hydratase
16,3OAR160,3opalmACP_c + h_c + nadph_c <=> 3hpalmACP_c + ...,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
19,3OAR180,3ooctdACP_c + h_c + nadph_c --> 3hoctaACP_c + ...,SGL_RS07245 or SGL_RS12965 or sll5079,0.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
23,FQR,2.0 fdxrd_c + 2.0 h_c + pq_u --> 2.0 fdxo_2_2_...,ssr2016,0.0,1000.0,0.0,4,Photosynthesis,Cyclic Electron Flow
27,3OAR40,actACP_c + h_c + nadph_c <=> 3haACP_c + nadp_c,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
31,3OAR60,3ohexACP_c + h_c + nadph_c <=> 3hhexACP_c + na...,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...
51,3OAR80,3ooctACP_c + h_c + nadph_c <=> 3hoctACP_c + na...,SGL_RS07245 or SGL_RS12965 or sll5079,-1000.0,1000.0,0.0,4,Fatty acid biosynthesis,3-oxoacyl-[acyl-carrier-protein] reductase (n-...


Now we have reannotated all genes with traditional names, but **for various reasons there are still 38 reactions to reannotate**. As there is no common cause for the missed reannotations, **I manually checked all reactions in the previous df and created a dictionary with the final substitutions**. This manual curation was done with the aid of the **BiGG database**, which was used to check for matching reactions and to retrieve old locus tags unavailable in *iJN678*.

In [54]:
# Hand-curated final substitutions for the GPR column. The dictionary is passed to
# Series.replace(..., regex=True) in the next cell, so every key is interpreted as a
# regular expression rather than as a literal string.
# - Keys of the form ' and <gene>' or '<gene> and ' map to '' and therefore remove
#   that gene together with its 'and' connector from the rule.
# - Multi-gene keys (e.g. 'pykF or pykF', the long 'atpI and ...' key) rewrite a
#   whole sub-expression at once.
# - NOTE(review): the parenthesised keys '(SGL_RS10730 or sll8031)' and
#   '(ndhD or ndhD2)' are regex groups, so they match the inner text without the
#   literal parentheses; any parentheses in the GPR string are presumably left in
#   place around the replacement - confirm this is intended.
# - NOTE(review): entry order may matter where one key is a prefix of another
#   (e.g. 'ndhD3' is listed before '(ndhD or ndhD2)') - confirm against how pandas
#   applies dictionary replacements.
final_reannotation_dict = { 'sll5079' : 'SGL_RS00360',
                            'slr0665' : 'SGL_RS17360',
                            'ssr2016' : 'SGL_RS06050',
                            'bhy' : 'SGL_RS06460',
                            ' and ssl3803' : '',
                            ' and sml0004' : '',
                            'nylA' : 'SGL_RS15160',
                            'cydA' : 'SGL_RS05010',
                            'desA' : 'SGL_RS10130',
                            'desB' : 'SGL_RS10825',
                            'desD_des6_' : 'SGL_RS11885',
                            'desC_des9_' : 'SGL_RS15040',
                            'pyrD' : 'SGL_RS17005',
                            'ilvE' : 'SGL_RS16510',
                            ' and sml0003' : '',
                            ' and smr0009' : '',
                            'pykF or pykF' : 'SGL_RS15640 or SGL_RS07035',
                            'slr1829' : 'SGL_RS06265',
                            'phbC' : 'SGL_RS06270',
                            'apqZ' : 'SGL_RS08525',
                            'ndhA' : 'SGL_RS17000',
                            'ndhB' : 'SGL_RS02610',
                            'ndhE' : 'SGL_RS16985',
                            'ndhG' : 'SGL_RS16990',
                            'ndhH' : 'SGL_RS08920',
                            'ndhJ' : 'SGL_RS10735',
                            '(SGL_RS10730 or sll8031)' : 'SGL_RS10730',
                            'slr1623' : 'SGL_RS11490',
                            'sll1262' : 'SGL_RS09990',
                            'sll1220' : 'SGL_RS09760',
                            'ndhD3' : 'SGL_RS06290',
                            'ndhF' : 'SGL_RS16595 and SGL_RS06295 and SGL_RS15190',
                            'sll1734' : 'SGL_RS06285',
                            'sll1735' : 'SGL_RS06280',
                            '(ndhD or ndhD2)' : 'SGL_RS16590 and SGL_RS12625 and SGL_RS03225',
                            'sll8031 and ' : '',
                            'ssl0352 and ' : '',
                            'atpI and atpG and atpF and SGL_RS02675 and SGL_RS09705 and SGL_RS09710 and SGL_RS02700 and SGL_RS02695 and SGL_RS02680 and atp1' : 'SGL_RS02705 and SGL_RS02670 and SGL_RS02690 and SGL_RS02700 and SGL_RS02695 and SGL_RS02685 and SGL_RS02680 and SGL_RS02675 and SGL_RS09705 and SGL_RS09710',
                            }

In [55]:
# Apply the hand-curated substitutions (keys are treated as regexes).
# The result is assigned back to the column: an inplace replace on `df.GPR` is a
# chained operation that may act on a temporary Series and leave the DataFrame
# unchanged when pandas Copy-on-Write is active.
model_df[0]['GPR'] = model_df[0]['GPR'].replace(final_reannotation_dict, regex=True)

# Rebuild the model from the fully reannotated tables and print its summary.
reannotated_model = dataframe_to_model(model_df[0], model_df[1])
reannotated_model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
ca2_e,EX_ca2_e,0.0003987,0,0.00%
co2_e,EX_co2_e,3.7,1,99.99%
cobalt2_e,EX_cobalt2_e,0.0002874,0,0.00%
cu2_e,EX_cu2_e,0.0002658,0,0.00%
fe2_e,EX_fe2_e,0.0006614,0,0.00%
fe3_e,EX_fe3_e,0.0006022,0,0.00%
h2o_e,EX_h2o_e,2.42,0,0.00%
h_e,EX_h_e,0.8795,0,0.00%
k_e,EX_k_e,0.01495,0,0.00%
mg2_e,EX_mg2_e,0.002541,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
5mdru1p_c,DM_5mdru1p_c,-0.0005959,6,100.00%
o2_e,EX_o2_e,-5.585,0,0.00%


In [56]:
# Final audit: gene IDs in the rebuilt model that do not start with 'SGL'.
not_in_new_annotation = {
    gene.id
    for rxn in reannotated_model.reactions
    for gene in rxn.genes
    if not gene.id.startswith('SGL')
}
len(not_in_new_annotation)

0

Finally, all the GPRs are reannotated, so now I will **save the model**.

In [57]:
# Write the reannotated model next to the input models, in MATLAB format.
final_model_filepath = '/'.join(('models', 'iJN678_new_annotation.mat'))
cobra.io.save_matlab_model(reannotated_model, final_model_filepath)