In [1]:
import pandas as pd
import cobra

In [2]:
features = pd.read_csv('../data/pst_kegg.tsv', sep = '^')
features['Kegg Ontology'], features['RefSeq'] = features['KO'].str.split('|').str

# Separate the KO number and text description into separate columns.
features['KO'] = features['KO'].str[:6]
features['Kegg Ontology'] = features['Kegg Ontology'].str[6:]

# Replace missing annotations with consistent nomenclature
features['KO'] = features['KO'].str.replace('no KO ','none')
features.loc[features['KO'] == 'none','Kegg Ontology'] = 'none'

# Remove the RefSeq tag in the RefSeq column
features['RefSeq'] = features['RefSeq'].str[10:]

# extract EC numbers and then transfer to reactions

features

Unnamed: 0,kegg id,KO,Kegg Ontology,RefSeq
0,PSPTO_0001,K02313,chromosomal replication initiator protein,dnaA; chromosomal replication initiation protein
1,PSPTO_0002,K02338,DNA polymerase III subunit beta [EC:2.7.7.7],dnaN; DNA polymerase III subunit beta
2,PSPTO_0003,K03629,DNA replication and repair protein RecF,recF; DNA replication and repair protein RecF
3,PSPTO_0004,K02470,DNA gyrase subunit B [EC:5.99.1.3],gyrB; DNA gyrase subunit B
4,PSPTO_0005,K03427,type I restriction enzyme M protein [EC:2.1.1...,"type I restriction-modification system, M subunit"
5,PSPTO_0006,K01154,"type I restriction enzyme, S subunit [EC:3.1....","type I restriction-modification system, S subu..."
6,PSPTO_0007,none,none,hypothetical protein
7,PSPTO_0008,K01153,"type I restriction enzyme, R subunit [EC:3.1....",HsdR family type I site-specific deoxyribonucl...
8,PSPTO_0009,none,none,"ISPsy3, transposase"
9,PSPTO_5633,none,none,hypothetical protein


In [3]:
#annotate the genes with specific kegg information from the all_features_kegg df

#import working gapfilled model
psy = cobra.io.load_json_model('../results/PST_pao1_temp_biomass_v4.json')

#create an empty dictionary for Kegg IDs with KO and RefSeq
kegg_ontology = {}
refseq_functions = {}

#this properly calls all genes that are in the model currently and will call the annotated kegg function and refseq if desired
for index, row in features.iterrows():
    if row['kegg id'] in psy.genes:
        #print (row['kegg id'], row['Kegg Ontology'], row['RefSeq'])
        kegg_ontology[row['kegg id']]= row['Kegg Ontology']
        refseq_functions[row['kegg id']] = row['RefSeq'].split(';')[0]
    else:
        pass
print (refseq_functions)


{'PSPTO_4807': 'metal ion transporter', 'PSPTO_0690': 'ribD', 'PSPTO_3280': 'glxK', 'PSPTO_1005': 'gmd', 'PSPTO_0103': 'dadX', 'PSPTO_1843': 'aspartate kinase', 'PSPTO_1196': 'UDP-N-acetylglucosamine 2-epimerase', 'PSPTO_4441': 'murA', 'PSPTO_0085': 'coaBC', 'PSPTO_0592': 'trpG', 'PSPTO_1748': 'prephenate dehydrogenase/3-phosphoshikimate 1-carboxyvinyltransferase family protein', 'PSPTO_5126': 'aroB', 'PSPTO_4438': 'hisD', 'PSPTO_0170': 'hemF', 'PSPTO_0774': 'beta-alanine--pyruvate aminotransferase', 'PSPTO_1332': 'ilvE', 'PSPTO_0186': 'D,D-heptose 1,7-bisphosphate phosphatase', 'PSPTO_0961': 'panB', 'PSPTO_4501': 'carB', 'PSPTO_4916': 'high affinity branched-chain amino acid ABC transporter ATP-binding protein', 'PSPTO_0693': 'ribH-1', 'PSPTO_4918': 'braD', 'PSPTO_3860': 'aceE-1', 'PSPTO_5359': 'amino acid ABC transporter permease', 'PSPTO_2210': 'fabB', 'PSPTO_4337': 'pyk', 'PSPTO_3344': 'cysG', 'PSPTO_2788': 'ABC transporter substrate-binding protein', 'PSPTO_3375': 'nuoL', 'PSPTO_0

In [4]:
#add the refseq functions from dictionary refseq_functions as the name of the gene objects
#refseq functions come as annotated from KEGG
for gene, function in refseq_functions.items():
    if gene in psy.genes:
        gene_obj = psy.genes.get_by_id(gene)
        gene_obj.annotation = {'kegg.genes':'pst:'+gene}#,'refseq':function}
        
        # Assign the refseq function as the name for the gene
        gene_obj.name = function
        psy.genes.get_by_id(gene).name = function


In [5]:
# Next, load the modelseed reaction and compound aliases to annotate reactions and metabolites.
seed_rxn_aliases = pd.read_csv('../data/Reactions_Aliases.tsv', sep = '\t')
seed_cpd_aliases = pd.read_csv('../data/Compounds_Aliases.tsv', sep = '\t')

In [6]:
seed_rxn_aliases

Unnamed: 0,MS ID,Old MS ID,External ID,Source
0,rxn07912,rxn07912,12DGR120tipp,BiGG
1,rxn07913,rxn07913,12DGR140tipp,BiGG
2,rxn07914,rxn07914,12DGR141tipp,BiGG
3,rxn07915,rxn07915,12DGR160tipp,BiGG
4,rxn07916,rxn07916,12DGR161tipp,BiGG
5,rxn07917,rxn07917,12DGR180tipp,BiGG
6,rxn07918,rxn07918,12DGR181tipp,BiGG
7,rxn07919,rxn07919,12PPDRtex,BiGG
8,rxn07920,rxn07920,12PPDRtpp,BiGG
9,rxn07921,rxn07921,12PPDStex,BiGG


In [7]:
# Let's check out the annotation sources to see which ones are generalizable
# enough for memote.
seed_rxn_aliases['Source'].unique()

array(['BiGG', 'KEGG', 'KEGGaly', 'KEGGath', 'KEGGbdi', 'KEGGcre',
       'KEGGeco', 'KEGGgmx', 'KEGGosa', 'KEGGpop', 'KEGGsbi', 'KEGGvvi',
       'KEGGzma', 'MetaCyc', 'PlantCyc', 'AraCyc', 'BrachyCyc',
       'ChlamyCyc', 'CornCyc', 'EcoCyc', 'MaizeCyc', 'PoplarCyc',
       'RiceCyc', 'SorghumCyc', 'SoyCyc', 'AlgaGEM', 'AraGEM',
       'DF_Athaliana', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB',
       'JP_Creinhardtii_NMeth', 'Maize_C4GEM', 'TS_Athaliana', 'iAF1260',
       'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800',
       'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904',
       'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083',
       'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844'], dtype=object)

In [8]:
seed_cpd_aliases['Source'].unique()

array(['BiGG1', 'KEGG', 'MetaCyc', 'PlantCyc', 'BiGG', 'AraCyc',
       'BrachyCyc', 'ChlamyCyc', 'CornCyc', 'EcoCyc', 'MaizeCyc',
       'PoplarCyc', 'RiceCyc', 'SorghumCyc', 'SoyCyc', 'AlgaGEM',
       'AraGEM', 'DF_Athaliana', 'JM_Creinhardtii', 'JP_Creinhardtii_MSB',
       'JP_Creinhardtii_NMeth', 'Maize_C4GEM', 'TS_Athaliana', 'iAF1260',
       'iAF692', 'iAG612', 'iAO358', 'iAbaylyiv4', 'iGT196', 'iIN800',
       'iIT341', 'iJN746', 'iJR904', 'iMA945', 'iMEO21', 'iMM904',
       'iMO1053-PAO1', 'iMO1056', 'iND750', 'iNJ661', 'iPS189', 'iRR1083',
       'iRS1563', 'iRS1597', 'iSB619', 'iSO783', 'iYO844'], dtype=object)

In [9]:
# Replace the source IDs to be consistent with the identifiers memote is looking for
# These are the MIRIAM compliant versions of the resources, available at identifiers.org
# see the memote annotations.py file for the regular expressions expected for 
# identifiers from each resource.
seed_rxn_aliases.loc[seed_rxn_aliases['Source'] == 'KEGG', 'Source'] = 'kegg.reaction'
seed_rxn_aliases.loc[seed_rxn_aliases['Source'] == 'BiGG', 'Source'] = 'bigg.reaction'
seed_rxn_aliases.loc[seed_rxn_aliases['Source'] == 'MetaCyc', 'Source'] = 'biocyc'

seed_cpd_aliases.loc[seed_cpd_aliases['Source'] == 'KEGG', 'Source'] = 'kegg.compound'
seed_cpd_aliases.loc[seed_cpd_aliases['Source'] == 'BiGG', 'Source'] = 'bigg.compound'
seed_cpd_aliases.loc[seed_cpd_aliases['Source'] == 'MetaCyc', 'Source'] = 'biocyc'

In [10]:
# Get and add all of the reaction annotations
for reaction in psy.reactions:
    if not reaction.id.startswith('EX_'): # don't look for exchange reactions
        # get the reaction ID without compartment suffix e.g. '_c'
        reaction_baseid = reaction.id.split('_')[0]
        
        annotation_dict = {}
        annotation = seed_rxn_aliases.loc[seed_rxn_aliases['MS ID'] == reaction_baseid]
        for source in annotation['Source'].unique():
            db_annotation = annotation.loc[annotation['Source'] == source]
            annotation_id = db_annotation['External ID'].values[0]
            annotation_dict[source] = annotation_id
        
        # if there were no annotations, this reaction either has no alias
        # or is only in modelseed.
        if (reaction.id == (reaction_baseid + '_c0')) or (reaction.id == (reaction_baseid + '_e0')):
                # if the ID is a standard modelseed format, add a seed annotation.
                # otherwise, this might be a custom object (which should not have an annotation)
                annotation_dict['seed.reaction'] = reaction_baseid
        
        reaction.annotation = annotation_dict
        
        

In [11]:
# Get and add all of the metabolite annotations
for metabolite in psy.metabolites:
    # get the reaction ID without compartment suffix e.g. '_c'
    metabolite_baseid = metabolite.id.split('_')[0]

    annotation_dict = {}
    annotation = seed_cpd_aliases.loc[seed_cpd_aliases['MS ID'] == metabolite_baseid]
    for source in annotation['Source'].unique():
        db_annotation = annotation.loc[annotation['Source'] == source]
        annotation_id = db_annotation['External ID'].values[0]
        annotation_dict[source] = annotation_id

    if (metabolite.id == (metabolite_baseid + '_c0')) or (metabolite.id == (metabolite_baseid + '_e0')):
                # if the ID is a standard modelseed format, add a seed annotation.
                # otherwise, this might be a custom object (which should not have an annotation)
        annotation_dict['seed.compound'] = metabolite_baseid
    
    metabolite.annotation = annotation_dict

In [12]:
# Add inchi keys for all metabolites using the ModelSEED biochemistry files.
seed_cpd_structures = pd.read_csv('../data/ModelSEED_Structures.txt', sep = '\t')
seed_cpd_structures

Unnamed: 0,ID,Type,Aliases,Structure
0,cpd00001,InChIKey,C00001;C01328;OH;OXONIUM;WATER,XLYOFNOQVPJJNP-UHFFFAOYSA-N
1,cpd00001,InChI,C00001;C01328;OH;OXONIUM;WATER,InChI=1S/H2O/h1H2
2,cpd00001,SMILE,C00001;C01328;OH;OXONIUM;WATER,O
3,cpd00002,InChIKey,ATP;C00002,ZKHQWZAMYRWXGA-KQYNXXCUSA-K
4,cpd00002,InChI,ATP;C00002,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...
5,cpd00002,SMILE,ATP;C00002,O[C@@H]1[C@@H](COP(=O)(OP(=O)(OP(=O)(O)[O-])[O...
6,cpd00003,InChIKey,C00003;NAD,BAWFJGJZGIEFAR-NNYOXOHSSA-M
7,cpd00003,InChI,C00003;NAD,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...
8,cpd00003,SMILE,C00003;NAD,O[C@@H]1[C@H](O)[C@H](O[C@H]1[n+]1cccc(c1)C(=O...
9,cpd00004,InChIKey,C00004;NADH,BOPGDPNILDQYTO-NNYOXOHSSA-L


In [13]:
# Add the inchi and inchikey annotations for metabolites
for metabolite in psy.metabolites:
    metabolite_baseid = metabolite.id.split('_')[0]
    if metabolite_baseid in seed_cpd_structures['ID'].tolist():
        annotations = seed_cpd_structures.loc[seed_cpd_structures['ID'] == metabolite_baseid]
        if 'InChI' in annotations['Type'].tolist():
            inchi = seed_cpd_structures.loc[(seed_cpd_structures['ID'] == metabolite_baseid) &
                                        (seed_cpd_structures['Type'] == 'InChI'),'Structure'].values[0]
            metabolite.annotation['inchi'] = inchi
        
        if 'InChIKey' in annotations['Type'].tolist():
            inchikey = seed_cpd_structures.loc[(seed_cpd_structures['ID'] == metabolite_baseid) &
                                        (seed_cpd_structures['Type'] == 'InChIKey'),'Structure'].values[0]
            metabolite.annotation['inchikey'] = inchikey

In [14]:
# Add SBO terms

# The expected SBO terms are as follows:
# Metabolite SBO:0000247
# Metabolic Reaction SBO:0000176
# Transport Reaction SBO:0000185
# Exchange Reaction SBO:0000627
# Demand Reaction SBO:0000628
# Sink Reactions SBO:0000632
# Gene SBO:0000243
# Biomass Reactions SBO:0000629

# For all metabolites, add the metabolite annotation
for metabolite in psy.metabolites:
    metabolite.annotation['sbo'] = 'SBO:0000247'
    
# for all reactions, add the reaction annotation.
# where applicable, add biomass, transport, and exchange
# terms as well.
for reaction in psy.reactions:
    annotations = ['SBO:0000176']
    if len(set([met.compartment for met in reaction.metabolites])) > 1:
        annotations.append('SBO:0000185')
    if reaction.id.startswith('EX_'):
        annotations.append('SBO:0000627')
    if reaction.id.lower().find('bio') > -1:
        annotations.append('SBO:0000629')
    reaction.annotation['sbo'] = annotations

    # add gene annotations
for gene in psy.genes:
    gene.annotation['sbo'] = 'SBO:0000243'

In [15]:
# save the model with added annotations
cobra.io.write_sbml_model(psy,'../results/v4_with_all_annotations.xml')

In [16]:
for reaction in psy.reactions:
    if len(set([met.compartment for met in reaction.metabolites])) > 1:
        print(reaction,reaction.annotation)

rxn02005_c0: cpd00061_c0 + cpd00794_e0 <=> cpd00020_c0 + cpd00523_c0 {'iPS189': 'TREpts', 'seed.reaction': 'rxn02005', 'iJR904': 'TREpts', 'iSB619': 'TREpts', 'iYO844': 'TREpts', 'bigg.reaction': 'TREpts', 'iAO358': 'rll_480', 'iMO1056': 'TREpts', 'sbo': ['SBO:0000176', 'SBO:0000185']}
rxn05146_c0: cpd00001_c0 + cpd00002_c0 + cpd00023_e0 --> cpd00008_c0 + cpd00009_c0 + cpd00023_c0 + cpd00067_c0 {'iAbaylyiv4': 'ABC-13-RXN', 'seed.reaction': 'rxn05146', 'iJR904': 'GLUabc', 'iSB619': 'GLUabc', 'bigg.reaction': 'GLUabc', 'biocyc': '3.6.3.21-RXN.ce.metaexp.GLT_GLT', 'iMO1056': 'GLUabc', 'sbo': ['SBO:0000176', 'SBO:0000185'], 'iRR1083': 'GLUabc'}
rxn05150_c0: cpd00001_c0 + cpd00002_c0 + cpd00034_e0 --> cpd00008_c0 + cpd00009_c0 + cpd00034_c0 + cpd00067_c0 {'seed.reaction': 'rxn05150', 'sbo': ['SBO:0000176', 'SBO:0000185'], 'iAF692': 'ZNabc', 'iSB619': 'ZNabc', 'iYO844': 'ZN2abc2', 'bigg.reaction': 'ZN2abc2', 'iMO1056': 'ZN2abc2'}
rxn05152_c0: cpd00001_c0 + cpd00002_c0 + cpd00041_e0 --> cpd00

In [17]:
psy.reactions.get_by_id('rxn05605_c0').annotation['sbo'] in ['SBO:0000176']

False