#  Script to update iGEL604 and prepare it for ec model generation using GECKO

# To do:
    
    # model = addReaction(model, 'FA_composition', 'reactionName', 'Cell membrane fatty acid composition pseudoreaction', 'reactionFormula', '2.78 C00249 + 1.02 C01530 + 0.06 C06424 -> 1 Membrane_FAs');
    # model = addReaction(model, 'CellMembrane', 'reactionName', 'Cell membrane pseudoreaction', 'reactionFormula', '1.93 C00093 + 1 Membrane_FAs -> 1.272 Cellmembrane');


In [1]:
import cobra
import os
import pandas as pd
import math

In [2]:
model = cobra.io.load_json_model('../../Models/iGEL604.json')  # Reads iGEL604

In [3]:
model  # Starting stats of iGEL604

0,1
Name,model
Memory address,0x010b6a7a30
Number of metabolites,1311
Number of reactions,1249
Number of groups,0
Objective expression,1.0*Biomass - 1.0*Biomass_reverse_57a34
Compartments,"c, e"


In [4]:
model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
C00014[c],EX_Ammonia,9.749,0,0.00%
C00120_e[e],EX_Biotin,0.01116,10,0.16%
C14818_e[e],EX_Fe,0.004466,0,0.00%
C00504_e[e],EX_Folate,0.00614,19,0.17%
C00080[c],EX_H,13.57,0,0.00%
C00253_e[e],EX_Nicotinicacid,0.02718,6,0.24%
C00009_e[e],EX_Orthophosphate,1.138,0,0.00%
C00007[c],EX_Oxygen,32.66,0,0.00%
C00059[c],EX_Sulfate,0.1521,0,0.00%
C00378_e[e],EX_Thiamine,0.0006698,12,0.01%

Metabolite,Reaction,Flux,C-Number,C-Flux
Biomass[c],EX_Biomass,-0.9968,0,0.00%
C00011[c],EX_CO2,-32.33,1,100.00%
C00001[c],EX_H2O,-57.37,0,0.00%


In [5]:
#  Many reactions, e.g. exchange reactions, needs naming
for reaction in model.exchanges:
    print("%s  %s" %(reaction.id, reaction.name))

EX_Galactose  
EX_Mannose  
EX_Xylose  
EX_Glucose  
EX_Orthophosphate  
EX_Molybdate  
EX_VitB12  
EX_PyridoxinHydrochloride  
EX_Thiamine  
EX_Nicotinicacid  
EX_paminobenzoate  
EX_Biotin  
EX_Folate  
EX_Fe  
EX_C00116[e]  EX_C00116[e]
EX_C00089[e]  EX_C00089[e]
EX_C00185[e]  EX_C00185[e]
EX_C00259[e]  EX_C00259[e]
EX_C00249[e]  EX_C00249[e]
EX_C00246[e]  EX_C00246[e]


In [6]:
# Adds some missing reaction names. Needs updating "L" and "D" for some of the sugars. 
# More names can be added to csv file later.

reactionNamesToUpdate = pd.read_csv('Reaction_names.csv')

rxnIDs = reactionNamesToUpdate['Reaction'].tolist()  # List of reaction IDs in iGEL604 that lacks names
rxnNames = reactionNamesToUpdate['Reaction name'].tolist()  # List of reaction names to add to model

for i, rxnID in enumerate(rxnIDs):  # Loops over rxnIDs and updates model using the corresponding rxn names
    model.reactions.get_by_id(rxnID).name = rxnNames[i]
    
for reaction in model.exchanges:
    print("%s  %s %s" %(reaction.id, reaction.name, reaction.bounds))

EX_Galactose  Galactose exchange (0.0, 0.0)
EX_Mannose  Mannose exchange (0.0, 0.0)
EX_Xylose  Xylose exchange (-13.61, 0.0)
EX_Glucose  D-Glucose exchange (0.0, 0.0)
EX_Orthophosphate  Orthophosphate exchange (-1000.0, 1000.0)
EX_Molybdate  Molybdate exchange (-1000.0, 0.0)
EX_VitB12  Vitamin B12 exchange (-1000.0, 0.0)
EX_PyridoxinHydrochloride  Pyridoxin exchange (-1000.0, 0.0)
EX_Thiamine  Thiamine exchange (-1000.0, 0.0)
EX_Nicotinicacid  Nicotinic acid exchange (-1000.0, 0.0)
EX_paminobenzoate  p-Aminobenzoic acid exchange (-1000.0, 0.0)
EX_Biotin  Biotin exchange (-1000.0, 0.0)
EX_Folate  Folate exchange (-1000.0, 0.0)
EX_Fe  Iron exchange (-1000.0, 0.0)
EX_C00116[e]  Glycerol exchange (0.0, 0.0)
EX_C00089[e]  Sucrose exchange (0.0, 0.0)
EX_C00185[e]  Cellobiose exchange (0.0, 0.0)
EX_C00259[e]  L-Arabinose exchange (0.0, 0.0)
EX_C00249[e]  Palmitic acid exchange (0.0, 0.0)
EX_C00246[e]  Butanoic acid exchange (0.0, 0.0)


In [7]:
# Update reaction bounds for uptake reactions to avoid losing them during GECKO preprocessing
model.reactions.get_by_id('EX_Galactose').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_Mannose').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_Glucose').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00116[e]').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00089[e]').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00185[e]').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00259[e]').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00249[e]').bounds = (-1000, 0.0)
model.reactions.get_by_id('EX_C00246[e]').bounds = (-1000, 0.0)

for reaction in model.exchanges:
    print("%s  %s %s" %(reaction.id, reaction.name, reaction.bounds))

EX_Galactose  Galactose exchange (-1000, 0.0)
EX_Mannose  Mannose exchange (-1000, 0.0)
EX_Xylose  Xylose exchange (-13.61, 0.0)
EX_Glucose  D-Glucose exchange (-1000, 0.0)
EX_Orthophosphate  Orthophosphate exchange (-1000.0, 1000.0)
EX_Molybdate  Molybdate exchange (-1000.0, 0.0)
EX_VitB12  Vitamin B12 exchange (-1000.0, 0.0)
EX_PyridoxinHydrochloride  Pyridoxin exchange (-1000.0, 0.0)
EX_Thiamine  Thiamine exchange (-1000.0, 0.0)
EX_Nicotinicacid  Nicotinic acid exchange (-1000.0, 0.0)
EX_paminobenzoate  p-Aminobenzoic acid exchange (-1000.0, 0.0)
EX_Biotin  Biotin exchange (-1000.0, 0.0)
EX_Folate  Folate exchange (-1000.0, 0.0)
EX_Fe  Iron exchange (-1000.0, 0.0)
EX_C00116[e]  Glycerol exchange (-1000, 0.0)
EX_C00089[e]  Sucrose exchange (-1000, 0.0)
EX_C00185[e]  Cellobiose exchange (-1000, 0.0)
EX_C00259[e]  L-Arabinose exchange (-1000, 0.0)
EX_C00249[e]  Palmitic acid exchange (-1000, 0.0)
EX_C00246[e]  Butanoic acid exchange (-1000, 0.0)


In [8]:
# Remove some reactions that were added during iGEL604 testing
rxnIDs_to_remove = ['EX_3HB', '3HB_thioesterase','EX_Ethanol']

model.remove_reactions(rxnIDs_to_remove, remove_orphans=True)

In [9]:
# Model now strongly oversetimates growth since all exchanges are open. Also makes tons of lactate. 
#No longer valid for modelling, but will be fixed during the Gecko workflow.
model.summary()

Metabolite,Reaction,Flux,C-Number,C-Flux
C00014[c],EX_Ammonia,119.8,0,0.00%
C00120_e[e],EX_Biotin,0.1372,10,0.05%
C00116[e],EX_C00116[e],60.77,0,0.00%
C14818_e[e],EX_Fe,0.05486,0,0.00%
C00504_e[e],EX_Folate,0.07544,19,0.05%
C00031_e[e],EX_Glucose,503.8,6,99.84%
C00253_e[e],EX_Nicotinicacid,0.334,6,0.07%
C00009_e[e],EX_Orthophosphate,13.99,0,0.00%
C00007[c],EX_Oxygen,34.41,0,0.00%
C00059[c],EX_Sulfate,1.868,0,0.00%

Metabolite,Reaction,Flux,C-Number,C-Flux
Biomass[c],EX_Biomass,-12.25,0,0.00%
C00080[c],EX_H,-755.3,0,0.00%
C00001[c],EX_H2O,-368.4,0,0.00%
C00186[c],EX_Lactate,-922.0,3,100.00%


# Fix substrate SMILES

In [148]:
import requests
from bioservices import *

class metab_data_container:
    # Class that holds data for a metabolite from a cobry.core.model.metabolites
    

    
    def __init__(self, metabolite):
        self.id_in_model = metabolite.id
        
        self.kegg_id = self.format_id()
        
        self.metname = self.retreive_name()
        
        self.smiles = self.retrieve_smiles()
    
    
    
    def format_id(self):
        # Formats metabolite ID from iGEL604 to kegg compound id format
        kegg_id = self.id_in_model
        if '_e' in kegg_id:
            kegg_id = kegg_id.replace('_e', '')
        
        if  kegg_id[0:2] in ['C0','C1','C2']:
            kegg_id = kegg_id.rstrip('[c]').rstrip('[e]')  # Removes compartment identifiers
        return kegg_id
    
    def retreive_name(self):
        metabolite = self.kegg_id
        kegg_con = KEGG()

        try:
            kegg_con = KEGG()
            metname = kegg_con.parse(kegg_con.get(metabolite))['NAME']
        except: 
            metname = None
            
        return metname
        
    
    def retrieve_smiles(self):
        smiles = ''
        if self.metname is not None:
            for n in self.metname:  # Checks all names for smiles
                try:
                    url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/property/CanonicalSMILES/TXT' % n
                    req = requests.get(url)
                    if req.status_code == 200:
                        smiles = req.content.splitlines()[0].decode()
                        break  # If smiles found, set smiles for metabolite and end here
                except :
                        pass
        return smiles  # Return smiles
    
    def get_id_and_smiles(self):
        return [self.id_in_model, self.smiles]
            
    
    

In [121]:
# Testing that the class works. Seems to work overall
testmetab = metab_data_container(model.metabolites[13])
print(testmetab.id_in_model)
print(testmetab.kegg_id)
print(testmetab.metname)
print(testmetab.smiles)
print(testmetab.get_id_and_smiles())

https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/UDP;/property/CanonicalSMILES/TXT
https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/Uridine 5'-diphosphate/property/CanonicalSMILES/TXT
C00015[c]
C00015
['UDP;', "Uridine 5'-diphosphate"]
C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)O)O)O
['C00015[c]', 'C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)O)O)O']


In [130]:
# Test looping through some metabolites

metabolite_data_containers = []
metabolite_id_list = []
smiles_list = []

for metabolite in model.metabolites[0:10]:
    metab_data = metab_data_container(metabolite)
    metabolite_data_containers.append(metab_data)
    
for item in metabolite_data_containers:
    data = item.get_id_and_smiles()
    metabolite_id_list.append(data[0])
    smiles_list.append(data[1])
    
print(metabolite_id_list)
print(smiles_list)

['C00001[c]', 'C00002[c]', 'C00003[c]', 'C00004[c]', 'C00005[c]', 'C00006[c]', 'C00007[c]', 'C00008[c]', 'C00009[c]', 'C00010[c]']
['O', 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N', 'C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)N', 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O', 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O)C(=O)N', 'O=O', 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)O)O)O)N', 'OP(=O)(O)O', 'CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCS)O']


In [None]:
# Loop trhough all metabolites
metabolite_data_containers = []
metabolite_id_list = []
smiles_list = []

for metabolite in model.metabolites[0:10]:
    metab_data = metab_data_container(metabolite)
    metabolite_data_containers.append(metab_data)
    
for item in metabolite_data_containers:
    data = item.get_id_and_smiles()
    metabolite_id_list.append(data[0])
    smiles_list.append(data[1])
    
print(metabolite_id_list)
print(smiles_list)

In [142]:
print(model.metabolites[1204:1210])
print(len(metabolite_data_containers))  # Crashed after 1205

[<Metabolite C20781[c] at 0x125d78eb0>, <Metabolite C20797[c] at 0x125d78ee0>, <Metabolite C20798[c] at 0x125d78f10>, <Metabolite C20830[c] at 0x125d78f40>, <Metabolite C20831[c] at 0x125d78f70>, <Metabolite C20838[c] at 0x125d78fa0>]
1205


In [144]:
templist = []
for item in metabolite_data_containers:
    templist.append(item.id_in_model)

for item in templist:
    print(item)
    

C00001[c]
C00002[c]
C00003[c]
C00004[c]
C00005[c]
C00006[c]
C00007[c]
C00008[c]
C00009[c]
C00010[c]
C00011[c]
C00013[c]
C00014[c]
C00015[c]
C00016[c]
C00018[c]
C00019[c]
C00020[c]
C00021[c]
C00022[c]
C00024[c]
C00025[c]
C00026[c]
C00027[c]
C00029[c]
C00031[c]
C00032[c]
C00033[c]
C00035[c]
C00036[c]
C00037[c]
C00041[c]
C00042[c]
C00043[c]
C00044[c]
C00047[c]
C00048[c]
C00049[c]
C00051[c]
C00052[c]
C00053[c]
C00054[c]
C00055[c]
C00058[c]
C00059[c]
C00061[c]
C00062[c]
C00063[c]
C00064[c]
C00065[c]
C00067[c]
C00068[c]
C00072[c]
C00073[c]
C00074[c]
C00075[c]
C00077[c]
C00078[c]
C00079[c]
C00080[c]
C00081[c]
C00082[c]
C00083[c]
C00084[c]
C00085[c]
C00086[c]
C00088[c]
C00089[c]
C00091[c]
C00092[c]
C00093[c]
C00094[c]
C00095[c]
C00096[c]
C00097[c]
C00099[c]
C00100[c]
C00101[c]
C00103[c]
C00104[c]
C00105[c]
C00106[c]
C00108[c]
C00109[c]
C00111[c]
C00112[c]
C00114[c]
C00116[c]
C00117[c]
C00118[c]
C00119[c]
C00120[c]
C00121[c]
C00122[c]
C00123[c]
C00124[c]
C00127[c]
C00129[c]
C00130[c]
C00131[c]


In [146]:
model.metabolites.index('C20797[c]')  # Index of first missing metabolite is 1205

1205

In [149]:
# Find the last missing metabolites after updating code for the class, Rerun the whole search later for the publication
for metabolite in model.metabolites[1205:]:
    metab_data = metab_data_container(metabolite)
    metabolite_data_containers.append(metab_data)
    
for item in metabolite_data_containers:
    data = item.get_id_and_smiles()
    metabolite_id_list.append(data[0])
    smiles_list.append(data[1])
    
print(metabolite_id_list)
print(smiles_list)



['C00001[c]', 'C00002[c]', 'C00003[c]', 'C00004[c]', 'C00005[c]', 'C00006[c]', 'C00007[c]', 'C00008[c]', 'C00009[c]', 'C00010[c]', 'C00011[c]', 'C00013[c]', 'C00014[c]', 'C00015[c]', 'C00016[c]', 'C00018[c]', 'C00019[c]', 'C00020[c]', 'C00021[c]', 'C00022[c]', 'C00024[c]', 'C00025[c]', 'C00026[c]', 'C00027[c]', 'C00029[c]', 'C00031[c]', 'C00032[c]', 'C00033[c]', 'C00035[c]', 'C00036[c]', 'C00037[c]', 'C00041[c]', 'C00042[c]', 'C00043[c]', 'C00044[c]', 'C00047[c]', 'C00048[c]', 'C00049[c]', 'C00051[c]', 'C00052[c]', 'C00053[c]', 'C00054[c]', 'C00055[c]', 'C00058[c]', 'C00059[c]', 'C00061[c]', 'C00062[c]', 'C00063[c]', 'C00064[c]', 'C00065[c]', 'C00067[c]', 'C00068[c]', 'C00072[c]', 'C00073[c]', 'C00074[c]', 'C00075[c]', 'C00077[c]', 'C00078[c]', 'C00079[c]', 'C00080[c]', 'C00081[c]', 'C00082[c]', 'C00083[c]', 'C00084[c]', 'C00085[c]', 'C00086[c]', 'C00088[c]', 'C00089[c]', 'C00091[c]', 'C00092[c]', 'C00093[c]', 'C00094[c]', 'C00095[c]', 'C00096[c]', 'C00097[c]', 'C00099[c]', 'C00100[c]'

In [150]:
print(metabolite_id_list)

['C00001[c]', 'C00002[c]', 'C00003[c]', 'C00004[c]', 'C00005[c]', 'C00006[c]', 'C00007[c]', 'C00008[c]', 'C00009[c]', 'C00010[c]', 'C00011[c]', 'C00013[c]', 'C00014[c]', 'C00015[c]', 'C00016[c]', 'C00018[c]', 'C00019[c]', 'C00020[c]', 'C00021[c]', 'C00022[c]', 'C00024[c]', 'C00025[c]', 'C00026[c]', 'C00027[c]', 'C00029[c]', 'C00031[c]', 'C00032[c]', 'C00033[c]', 'C00035[c]', 'C00036[c]', 'C00037[c]', 'C00041[c]', 'C00042[c]', 'C00043[c]', 'C00044[c]', 'C00047[c]', 'C00048[c]', 'C00049[c]', 'C00051[c]', 'C00052[c]', 'C00053[c]', 'C00054[c]', 'C00055[c]', 'C00058[c]', 'C00059[c]', 'C00061[c]', 'C00062[c]', 'C00063[c]', 'C00064[c]', 'C00065[c]', 'C00067[c]', 'C00068[c]', 'C00072[c]', 'C00073[c]', 'C00074[c]', 'C00075[c]', 'C00077[c]', 'C00078[c]', 'C00079[c]', 'C00080[c]', 'C00081[c]', 'C00082[c]', 'C00083[c]', 'C00084[c]', 'C00085[c]', 'C00086[c]', 'C00088[c]', 'C00089[c]', 'C00091[c]', 'C00092[c]', 'C00093[c]', 'C00094[c]', 'C00095[c]', 'C00096[c]', 'C00097[c]', 'C00099[c]', 'C00100[c]'

In [151]:
print(smiles_list)

['O', 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N', 'C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)N', 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O', 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O', 'C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O)C(=O)N', 'O=O', 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)O)O)O)N', 'OP(=O)(O)O', 'CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCS)O', 'C(=O)=O', '', 'N', 'C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)O)O)O', 'CC1=CC2=C(C=C1C)N(C3=NC(=O)NC(=O)C3=N2)CC(C(C(COP(=O)(O)OP(=O)(O)OCC4C(C(C(O4)N5C=NC6=C(N=CN=C65)N)O)O)O)O)O', 'CC1=NC=C(C(=C1O)C=O)COP(=O)(O)O', 'C[S+](CCC(C(=O)[O-])N)CC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)O', 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)O)O)O)N

In [152]:
print(len(smiles_list))
print(len(metabolite_id_list))
print(len(model.metabolites))

1310
1310
1310


In [154]:
# Save data
import csv
with open('smiles_list.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(smiles_list)
    outfile.close()
    
with open('metabolite_id_list.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(metabolite_id_list)
    outfile.close()


In [159]:
import pickle

def load_data():
    try:
        with open("metabolite_data.dat") as f:
            data = pickle.load(f)
            return data
    except:
        metabolite_data_containers = []

def save_data(data):
    with open("metabolite_data.dat", "wb") as f:
        pickle.dump(data, f)

In [160]:
save_data(metabolite_data_containers)

PicklingError: Can't pickle <class '__main__.metab_data_container'>: it's not the same object as __main__.metab_data_container

In [162]:
testdata = load_data()
print(testdata)

None


In [None]:
# Fixes some metabolite names
metabolites = []
for metabolite in model.metabolites:
    if '_e' in metabolite.id:
        metabolite.id = metabolite.id.replace('_e', '')
    metabolites.append(metabolite.id)

curated_metabolites = []  # List of KEGG ids for metabolites in the model
for metabolite in metabolites:
    if  metabolite[0:2] in ['C0','C1','C2']:
        metabolite = metabolite.rstrip('[c]').rstrip('[e]')  # Removes compartment identifiers
        curated_metabolites.append(metabolite)

In [47]:
import requests

# One method to obtain SMILES by PubChem API using the website
def get_smiles(name):
    # smiles = redis_cli.get(name)
    # if smiles is None:
    try :
        url = 'https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/name/%s/property/CanonicalSMILES/TXT' % name
        req = requests.get(url)
        if req.status_code != 200:
            smiles = None
            return False
        else:
            smiles = req.content.splitlines()[0].decode()
            name_smiles[name] = smiles
            print('SMILES for %s: %s' %(name, smiles))
            return True
        # redis_cli.set(name, smiles, ex=None)

        # print smiles
    except :
        smiles = None
        return False


    

    
testnames = ['glucose', 'oxygen', 'water', 'palmitic acid']    
for name in testnames:
    get_smiles(name)
    
# Working!

SMILES for glucose: C(C1C(C(C(C(O1)O)O)O)O)O
SMILES for oxygen: O=O
SMILES for water: O
SMILES for palmitic acid: CCCCCCCCCCCCCCCC(=O)O


In [49]:
print(name_smiles)

{'glucose': 'C(C1C(C(C(C(O1)O)O)O)O)O', 'oxygen': 'O=O', 'water': 'O', 'palmitic acid': 'CCCCCCCCCCCCCCCC(=O)O'}


In [12]:
# Get metabolite names from KEGG
from bioservices import *

kegg_con = KEGG()
metnames = []
missing_metnames = []
for metabolite in curated_metabolites:
    try:
        kegg_con = KEGG()
        metname = kegg_con.parse(kegg_con.get(metabolite))['NAME']
        metnames.append(metname)
    except: missing_metnames.append(metabolite)
        
print(len(curated_metabolites))
print(len(metnames))
print(len(missing_metnames))



1298
1296
2


In [20]:
curated_metabolites

['C00001',
 'C00002',
 'C00003',
 'C00004',
 'C00005',
 'C00006',
 'C00007',
 'C00008',
 'C00009',
 'C00010',
 'C00011',
 'C00013',
 'C00014',
 'C00015',
 'C00016',
 'C00018',
 'C00019',
 'C00020',
 'C00021',
 'C00022',
 'C00024',
 'C00025',
 'C00026',
 'C00027',
 'C00029',
 'C00031',
 'C00032',
 'C00033',
 'C00035',
 'C00036',
 'C00037',
 'C00041',
 'C00042',
 'C00043',
 'C00044',
 'C00047',
 'C00048',
 'C00049',
 'C00051',
 'C00052',
 'C00053',
 'C00054',
 'C00055',
 'C00058',
 'C00059',
 'C00061',
 'C00062',
 'C00063',
 'C00064',
 'C00065',
 'C00067',
 'C00068',
 'C00072',
 'C00073',
 'C00074',
 'C00075',
 'C00077',
 'C00078',
 'C00079',
 'C00080',
 'C00081',
 'C00082',
 'C00083',
 'C00084',
 'C00085',
 'C00086',
 'C00088',
 'C00089',
 'C00091',
 'C00092',
 'C00093',
 'C00094',
 'C00095',
 'C00096',
 'C00097',
 'C00099',
 'C00100',
 'C00101',
 'C00103',
 'C00104',
 'C00105',
 'C00106',
 'C00108',
 'C00109',
 'C00111',
 'C00112',
 'C00114',
 'C00116',
 'C00117',
 'C00118',
 'C00119',

In [21]:
# Save downloaded data
import csv
with open('curated_met_ids.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(curated_metabolites)
    outfile.close()
    
with open('missing_metnames.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(missing_metnames)
    outfile.close()

with open ('metnames.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(metnames)
    outfile.close()

In [52]:
name_smiles = dict()

for metabolite in metnames:
    for name in metabolite:
        if get_smiles(name):
            break      

SMILES for Water: O
SMILES for Adenosine 5'-triphosphate: C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N
SMILES for beta-NAD+: C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)N
SMILES for Reduced nicotinamide adenine dinucleotide: C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O
SMILES for Reduced nicotinamide adenine dinucleotide phosphate: C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O
SMILES for beta-NADP+: C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O)C(=O)N
SMILES for O2: O=O
SMILES for Adenosine 5'-diphosphate: C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)O)O)O)N
SMILES for Orthophosphoric acid: OP(=O)(O)O
SMILES for CoA-SH: CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCS)O
SMILES for Carbon dioxide: C(=O)=O
SMILES fo

SMILES for 2-Oxo-3-phenylpropanoate: C1=CC=C(C=C1)CC(=O)C(=O)[O-]
SMILES for UDP-alpha-D-glucuronate: C1=CN(C(=O)NC1=O)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OC3C(C(C(C(O3)C(=O)O)O)O)O)O)O
SMILES for 3-Hydroxypyruvic acid: C(C(=O)C(=O)O)O
SMILES for Carbamoyl phosphate: C(=O)(N)OP(=O)(O)O
SMILES for 5'-Deoxy-5'-(methylsulfanyl)adenosine: CSCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)O
SMILES for 5-Methyluracil: CC1=CNC(=O)NC1=O
SMILES for (4-Aminobutyl) guanidine: C(CCN=C(N)N)CN
SMILES for Dracylic acid: C1=CC=C(C=C1)C(=O)O
SMILES for Wood sugar: C1C(C(C(C(O1)O)O)O)O
SMILES for L-Valine;: CC(C)C(C(=O)O)N
SMILES for 1,3-Dihydroxypropan-2-one: C(C(=O)CO)O
SMILES for 1-beta-D-Glucopyranosyl-4-D-glucopyranose: C(C1C(C(C(C(O1)OC2C(OC(C(C2O)O)O)CO)O)O)O)O
SMILES for L-Lactic acid: CC(C(=O)O)O
SMILES for L-Threonine;: CC(C(C(=O)O)N)O
SMILES for 2-Hydroxyethylamine: C(CO)N
SMILES for UDP-alpha-D-xylose: C1C(C(C(C(O1)OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=O)NC3=O)O)O)O)O)O
SMILES for Glucuronate: C1(C(C(OC(C1O)O

SMILES for cis-Aconitic acid: C(C(=CC(=O)O)C(=O)O)C(=O)O
SMILES for (2E)-3-Phenylprop-2-enoate: C1=CC=C(C=C1)C=CC(=O)[O-]
SMILES for L-2-Hydroxypropionaldehyde: CC(C=O)O
SMILES for 5-Amino-4-oxopentanoic acid: C(CC(=O)O)C(=O)CN
SMILES for 5-Aminovaleric acid: C(CCN)CC(=O)O
SMILES for 2-Oxoglutarate semialdehyde: C(CC(=O)C(=O)O)C=O
SMILES for N-Carbamoylputrescine: C(CCNC(=O)N)CN
SMILES for N2-Acetyl-L-ornithine: CC(=O)NC(CCCN)C(=O)O
SMILES for N-Carbamoyl-L-aspartate: C(C(C(=O)O)NC(=O)N)C(=O)O
SMILES for 5-Methyltetrahydrofolate: CN1C(CNC2=C1C(=O)NC(=N2)N)CNC3=CC=C(C=C3)C(=O)NC(CCC(=O)O)C(=O)O
SMILES for L-Aspartic 4-semialdehyde: C(C=O)C(C(=O)O)N
SMILES for 5,10-Methenyltetrahydrofolate: C1C2CN(C=[N+]2C3=C(N1)N=C(NC3=O)N)C4=CC=C(C=C4)C(=O)NC(CCC(=O)O)C(=O)O
SMILES for alpha-D-Galactopyranose 1-phosphate: C(C1C(C(C(C(O1)OP(=O)(O)O)O)O)O)O
SMILES for D-altro-Heptulose 1,7-biphosphate: C(C(C(C(C(C(=O)COP(=O)(O)O)O)O)O)O)OP(=O)(O)O
SMILES for (2E,6E)-Farnesyl diphosphate: CC(=CCCC(=CCCC(=

SMILES for (R)-Prunasin: C1=CC=C(C=C1)C(C#N)OC2C(C(C(C(O2)CO)O)O)O
SMILES for 2-Furoyl-CoA: CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCSC(=O)C4=CC=CO4)O
SMILES for 3-Oxohexanedioic acid: C(CC(=O)O)C(=O)CC(=O)O
SMILES for Vitamin B12s: CC1=CC2=C(C=C1C)N(C=N2)C3C(C(C(O3)CO)OP(=O)(O)OC(C)CNC(=O)CCC4(C(C5C6(C(C(C(=N6)C(=C7C(C(C(=N7)C=C8C(C(C(=N8)C(=C4[N-]5)C)CCC(=O)N)(C)C)CCC(=O)N)(C)CC(=O)N)C)CCC(=O)N)(C)CC(=O)N)C)CC(=O)N)C)O.[Co]
SMILES for Cyclohexan-1-ol: C1CCC(CC1)O
SMILES for Nicotinic acid adenine dinucleotide: C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)O
SMILES for L-Histidinol: C1=C(NC=N1)CC(CO)N
SMILES for (R)-Pantothenate: CC(C)(CO)C(C(=O)NCCC(=O)O)O
SMILES for 4-Hydroxynitrobenzene: C1=CC(=CC=C1[N+](=O)[O-])O
SMILES for Coenzyme gamma-F420-2: CC(C(=O)NC(CCC(=O)NC(CCC(=O)O)C(=O)O)C(=O)O)OP(=O)(O)OCC(C(C(CN1C2=CC(=O)C=CC2=CC3=C1NC(=O)NC3=O)O)O)O
SMILES for (2E)-But-2-enoyl-CoA: CC=CC

SMILES for 2'-Deoxyinosine 5'-diphosphate: C1C(C(OC1N2C=NC3=C2N=CNC3=O)COP(=O)(O)OP(=O)(O)O)O
SMILES for 2'-Deoxyinosine 5'-triphosphate: C1C(C(OC1N2C=NC3=C2N=CNC3=O)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O
SMILES for 2'-Deoxyuridine 5'-diphosphate: C1C(C(OC1N2C=CC(=O)NC2=O)COP(=O)(O)OP(=O)(O)O)O
SMILES for FADH2: CC1=CC2=C(C=C1C)N(C3=C(N2)C(=O)NC(=O)N3)CC(C(C(COP(=O)(O)OP(=O)(O)OCC4C(C(C(O4)N5C=NC6=C(N=CN=C65)N)O)O)O)O)O
SMILES for AMP 3'-phosphate: C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)CO)OP(=O)(O)O)O)N
SMILES for Uridine 3'-phosphate: C1=CN(C(=O)NC1=O)C2C(C(C(O2)CO)OP(=O)(O)O)O
SMILES for Ethane-1,2-diol: C(CO)O
SMILES for Butyraldehyde: CCCC=O
SMILES for L-Cysteinylglycine: C(C(C(=O)NCC(=O)O)N)S
SMILES for Crithminic acid: CC1=CC=C(C=C1)C(=O)O
SMILES for 4-Hydroxy-3-methoxycinnamate: COC1=C(C=CC(=C1)C=CC(=O)O)O
SMILES for trans-Citral: CC(=CCCC(=CC=O)C)C
SMILES for Selenide: [Se-2]
SMILES for Pyromucic acid: C1=COC(=C1)C(=O)O
SMILES for Aminoformic acid: C(=O)(N)O
SMILES for Benzoylaminoacetic a

SMILES for (E)-3-Methylglutaconyl-1-CoA: CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)CC(=O)O
SMILES for 3-Phosphooxypyruvate: C(C(=O)C(=O)[O-])OP(=O)([O-])[O-]
SMILES for Coproporphyrinogen III: CC1=C2CC3=C(C(=C(N3)CC4=C(C(=C(N4)CC5=C(C(=C(N5)CC(=C1CCC(=O)O)N2)C)CCC(=O)O)C)CCC(=O)O)CCC(=O)O)C
SMILES for 5-Oxopentanoic acid: C(CC=O)CC(=O)O
SMILES for L-2,4-Diaminobutyrate: C(CN)C(C(=O)O)N
SMILES for (S)-3-Aminoisobutyric acid: CC(CN)C(=O)O
SMILES for L-Glutamate 5-phosphate: C(CC(=O)OP(=O)(O)O)C(C(=O)O)N
SMILES for L-Xylulose 5-phosphate: C(C(C(C(=O)CO)O)O)OP(=O)(O)O
SMILES for dTDP-beta-L-rhamnose: CC1C(C(C(C(O1)OP(=O)(O)OP(=O)(O)OCC2C(CC(O2)N3C=C(C(=O)NC3=O)C)O)O)O)O
SMILES for 2-Methyl-3-acetoacetyl-CoA: CC(C(=O)C)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O
SMILES for 2-Methylcrotanoyl-CoA: CC=C(C)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N

SMILES for 4-Carboxymethyl-3-methylbut-2-en-1,4-olide: CC1=CC(=O)OC1CC(=O)O
SMILES for 4-Carboxymethyl-4-methylbut-2-en-1,4-olide: CC1(C=CC(=O)O1)CC(=O)O
SMILES for UDP-N-acetyl-2-amino-2-deoxy-alpha-D-glucuronate: CC(=O)NC1C(C(C(OC1OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=O)NC3=O)O)O)C(=O)O)O)O
SMILES for ditrans,octacis-Undecaprenyl diphosphate: CC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCOP(=O)(O)OP(=O)(O)O)C)C)C)C)C)C)C)C)C)C)C
SMILES for 5-(Methylsulfanyl)-D-ribulose 1-phosphate: CSCC(C(C(=O)COP(=O)(O)O)O)O
SMILES for 3-(O-Geranylgeranyl)-sn-glycerol 1-phosphate: CC(=CCCC(=CCCC(=CCCC(=CCOCC(COP(=O)(O)O)O)C)C)C)C
SMILES for UDP-N-acetyl-3-O-(1-carboxyvinyl)-alpha-D-glucosamine: CC(=O)NC1C(C(C(OC1OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=O)NC3=O)O)O)CO)O)OC(=C)C(=O)O
SMILES for 2-(Formamido)-N1-(5-phospho-D-ribosyl)acetamidine: C(C1C(C(C(O1)N=C(CNC=O)N)O)O)OP(=O)(O)O
SMILES for 5-Carboxymethyl-2-hydroxymuconic semialdehyde: C(C(=CO)C=CC(=O)C(=O)O)C(=O)O
SMILES for 3

SMILES for altro-Heptulose 7-phosphate: C(C(C(C(C(C(=O)CO)O)O)O)O)OP(=O)(O)O
SMILES for Lactose 6'-phosphate: C(C1C(C(C(C(O1)O)O)O)OC2C(C(C(C(O2)COP(=O)(O)O)O)O)O)O
SMILES for D-Gal-alpha1->6D-Glucose: C(C1C(C(C(C(O1)OCC2C(C(C(C(O2)O)O)O)O)O)O)O)O
SMILES for Manninotriose: C(C1C(C(C(C(O1)OCC2C(C(C(C(O2)OCC(C(C(C(C=O)O)O)O)O)O)O)O)O)O)O)O
SMILES for L-Dehydroascorbic acid: C(C(C1C(=O)C(=O)C(=O)O1)O)O
SMILES for 5beta-Cholestane-3alpha,7alpha,26-triol: CC(CCCC(C)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C)O)C)CO
SMILES for 3alpha,7alpha-Dihydroxy-5beta-cholestan-26-al: CC(CCCC(C)C1CCC2C1(CCC3C2C(CC4C3(CCC(C4)O)C)O)C)C=O
SMILES for (24E)-3alpha,7alpha-Dihydroxy-5beta-cholest-24-en-26-oyl-CoA: CC(CCC=C(C)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)C4CCC5C4(CCC6C5C(CC7C6(CCC(C7)O)C)O)C
SMILES for 3alpha,7alpha,12alpha-Trihydroxy-5beta-cholest-24-enoyl-CoA: CC(CCC=C(C)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C3

SMILES for (S)-3-Hydroxy-2-methylpropanoyl-CoA: CC(CO)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)([O-])OP(=O)([O-])OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)([O-])[O-])O
SMILES for (S)-3-Hydroxyisobutyrate: CC(CO)C(=O)[O-]
SMILES for (S)-Methylmalonate semialdehyde: CC(C=O)C(=O)O
SMILES for (2S)-2-Ethyl-2-hydroxy-3-oxobutanoic acid: CCC(C(=O)C)(C(=O)O)O
SMILES for (2R,3R)-2,3-Dihydroxy-3-methylpentanoic acid: CCC(C)(C(C(=O)O)O)O
SMILES for (2S)-2-Hydroxy-2-methyl-3-oxobutanoic acid: CC(=O)C(C)(C(=O)O)O
SMILES for dTDP-D-glucuronate: CC1=CN(C(=O)NC1=O)C2CC(C(O2)COP(=O)([O-])OP(=O)([O-])OC3C(C(C(C(O3)C(=O)[O-])O)O)O)O
SMILES for D-arabino-6-Phospho-hex-3-ulose: C(C(C(=O)C(C(COP(=O)(O)O)O)O)O)O
SMILES for UDP-3-O-[(3R)-3-hydroxymyristoyl]-alpha-D-glucosamine: CCCCCCCCCCCC(CC(=O)OC1C(C(OC(C1O)CO)OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=O)NC3=O)O)O)N)O
SMILES for (E)-3-Carboxy-2-methylprop-2-enoyl-CoA: CC(=CC(=O)O)C(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)

SMILES for (3R,4R)-3,4-Bis[(4-hydroxy-3-methoxyphenyl)methyl]oxolan-2-one: COC1=C(C=CC(=C1)CC2COC(=O)C2CC3=CC(=C(C=C3)O)OC)O
SMILES for 2'-Deoxy-5-hydroxymethylcytidine-5'-diphosphate: C1C(C(OC1N2C=C(C(=NC2=O)N)CO)COP(=O)(O)OP(=O)(O)O)O
SMILES for 2'-Deoxy-5-hydroxymethylcytidine-5'-triphosphate: C1C(C(OC1N2C=C(C(=NC2=O)N)CO)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O
SMILES for 4-Hydroxybutanoyl-CoA: CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCSC(=O)CCCO)O
SMILES for Sulfonylbismethane: CS(=O)(=O)C
SMILES for DMSO: CS(=O)C
SMILES for Methanesulfonate: CS(=O)(=O)[O-]
SMILES for TCA: C(=O)(C(Cl)(Cl)Cl)O
SMILES for (S)-3,7-Dimethyloct-6-enal: CC(CCC=C(C)C)CC=O
SMILES for (-)-Citronellol: CC(CCC=C(C)C)CCO
SMILES for 2-C-Methyl-D-erythritol 4-phosphate: CC(CO)(C(COP(=O)(O)O)O)O
SMILES for 4-(Cytidine 5'-diphospho)-2-C-methyl-D-erythritol: CC(CO)(C(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=CC(=NC2=O)N)O)O)O)O
SMILES for 2-Phospho-4-(cytidine 5'-diphospho)-2-C-met

SMILES for 3-Oxo-OPC8-CoA: CCC=CCC1C(CCC1=O)CCCCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for OPC6-CoA: CCC=CCC1C(CCC1=O)CCCCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for 3-Oxo-OPC6-CoA: CCC=CCC1C(CCC1=O)CCCC(=O)CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for OPC4-CoA: CCC=CCC1C(CCC1=O)CCCC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for 3-Oxo-OPC4-CoA: CCC=CCC1C(CCC1=O)CC(=O)CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for (+)-7-Isojasmonic acid CoA: CCC=CCC1C(CCC1=O)CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)OP(=O)(O)O)O
SMILES for cis-3-Chloro-2-propenal: C(=CCl)C=O
SMILES for 1,3,7-Trimethylurate: CN1C2=C(NC1=O)N(C(=O)N(C2=

SMILES for (4R)-4-Hydroxy-2,6,6-trimethylcyclohex-1-en-1-carboxaldehyde: CC1=C(C(CC(C1)O)(C)C)C=O
SMILES for UDP-2,3-diacetamido-2,3-dideoxy-alpha-D-mannuronic acid: CC(=O)NC1C(C(OC(C1O)C(=O)O)OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=CC(=O)NC3=O)O)O)NC(=O)C
SMILES for Demethylmenaquinol: CC(=CCCC(=CCC1=C(C2=CC=CC=C2C(=C1)O)O)C)C
SMILES for Adenylyl-molybdopterin: C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OCC4C(=C(C5C(O4)NC6=C(N5)C(=O)NC(=N6)N)S)S)O)O)N
SMILES for tritrans,heptacis-Undecaprenyl diphosphate: CC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCOP(=O)(O)OP(=O)(O)O)C)C)C)C)C)C)C)C)C)C)C
SMILES for Molybdopterin guanine dinucleotide cofactor: C1=NC2=C(N1C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OCC4C(=C(C5C(O4)NC6=C(N5)C(=O)NC(=N6)N)[S-])[S-])O)O)N=C(NC2=O)N.O=[Mo+2]=O
SMILES for (+)-Inositol: C1(C(C(C(C(C1O)O)O)O)O)O
SMILES for Secophenol: CC1=C(C=C(C=C1)O)CCC2C3CCC(=O)C3(CCC2=O)C
SMILES for 3-Oxo-5,6-didehydrosuberoyl-CoA: CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC

SMILES for GDP-valienol: C1=C(C(C(C(C1OP(=O)(O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C3N=C(NC4=O)N)O)O)O)O)O)CO
SMILES for Molybdopterin cytosine dinucleotide cofactor: C1=CN(C(=O)N=C1N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(=C(C4C(O3)NC5=C(N4)C(=O)NC(=N5)N)[S-])[S-])O)O.O=[Mo+2]=O
SMILES for alpha-D-Gal-(1->3)-alpha-D-GlcNAc-diphospho-ditrans,octacis-undecaprenol: CC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCOP(=O)(O)OP(=O)(O)OC1C(C(C(C(O1)CO)O)OC2C(C(C(C(O2)CO)O)O)O)NC(=O)C)C)C)C)C)C)C)C)C)C)C)C
SMILES for 4-Hydroxybenzoyl-AMP: C1=CC(=CC=C1C(=O)OP(=O)(O)OCC2C(C(C(O2)N3C=NC4=C(N=CN=C43)N)O)O)O
SMILES for Ni-sirohydrochlorin: CC1(C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(C(C(=N5)C=C1[N-]2)CCC(=O)[O-])(C)CC(=O)[O-])CC(=O)[O-])CCC(=O)[O-])C(=C3CC(=O)[O-])CCC(=O)[O-])CCC(=O)[O-])CC(=O)[O-].[Ni]
SMILES for Ni-sirohydrochlorin a,c-diamide: CC1(C(C2=CC3=NC(=CC4=C(C(=C([N-]4)C=C5C(C(C(=N5)C=C1[N-]2)CCC(=O)[O-])(C)CC(=O)N)CC(=O)[O-])CCC(=O)[O-])C(=C3CC(=O)[O-])CCC(=O)[O-])CCC(=O)[O-])CC(=O

In [53]:
print(name_smiles)

{'H2O;': None, 'Water': 'O', 'ATP;': None, "Adenosine 5'-triphosphate": 'C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(=O)(O)OP(=O)(O)O)O)O)N', 'NAD+;': None, 'NAD;': None, 'Nicotinamide adenine dinucleotide;': None, 'DPN;': None, 'Diphosphopyridine nucleotide;': None, 'Nadide;': None, 'beta-NAD+': 'C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)([O-])OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O)C(=O)N', 'NADH;': None, 'DPNH;': None, 'Reduced nicotinamide adenine dinucleotide': 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)O)O)O)O', 'NADPH;': None, 'TPNH;': None, 'Reduced nicotinamide adenine dinucleotide phosphate': 'C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(O)OCC3C(C(C(O3)N4C=NC5=C(N=CN=C54)N)OP(=O)(O)O)O)O)O', 'NADP+;': None, 'NADP;': None, 'Nicotinamide adenine dinucleotide phosphate;': None, 'beta-Nicotinamide adenine dinucleotide phosphate;': None, 'TPN;': None, 'Triphosphopyridine nucleotide;': None, 'beta-NADP+': 'C1=CC(=C[N+](=C1)C2C(C(C(O2)C

In [None]:
# come back to this later maybe?
from bioservices import *

kegg_con = KEGG()
chebi_con = ChEBI()

metabs_with_smiles = []
smiles_list = []
missing_metabs = []

for metabolite in curated_metabolites: 
    kegg_entry = kegg_con.parse(kegg_con.get(metabolite))
    try:
        chebi_entry = chebi_con.getCompleteEntity('CHEBI:' + kegg_entry['DBLINKS']['ChEBI'])
    except:
        missing_metabs.append(metabolite)
    else:
        try:
            smiles_list.append(chebi_entry.smiles)
        except: 
            missing_metabs.append(metabolite)
        else:
            metabs_with_smiles.append(metabolite)
            

print(len(metabolites))
print(len(metabs_with_smiles))
print(len(missing_metabs))




In [None]:
print(len(metabolites))
print(len(metabs_with_smiles))
print(len(missing_metabs))
print(missing_metabs[0:5])


In [None]:
metnames = []
missing_metabs2 = []
for metabolite in missing_metabs:
    try:
        kegg_con = KEGG()
        metname = kegg_con.parse(kegg_con.get(metabolite))['NAME']
        metnames.append(metname)
    except: missing_metabs2.append(metabolite)
print(len(metnames))
print(len(missing_metabs2))

In [None]:
print(metnames)  # Names of metabs missing SMILES right now

In [None]:
chebi_entry = chebi_con.getCompleteEntity('CHEBI:' + kegg_entry['DBLINKS']['ChEBI'])

In [None]:
import csv
with open('metabs_with_smiles.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(metabs_with_smiles)
    outfile.close()
    
with open('missing_metabs.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(missing_metabs)
    outfile.close()

with open ('smiles.csv', 'w') as outfile:
    wr = csv.writer(outfile)
    wr.writerow(smiles_list)
    outfile.close()

In [None]:
# Adds compartment names to the model.
model.compartments = {'c': 'Cytoplasm'}
model.compartments = {'e': 'Extracellular'}
model.compartments

# Save model

In [None]:
# Output model for use with Gecko in Matlab
cobra.io.save_matlab_model(model, '../../Models/iGEL604_update.mat')

# To do later

In [None]:
# Finds 125 reactions still missing names in the model. Fix later given time.
counter = 0
for reaction in model.reactions:
    if reaction.name == '':
        print("%s  %s %s" %(reaction.id, reaction.name, reaction.bounds))
        counter += 1
print('Number of reactions missing name: %i' %(counter))

# Playing with KEGG API (REST)
Would love to be able to fill in missing reaction names using the rXYZ... KEGG identifiers in the model.

Tutorial: https://widdowquinn.github.io/2018-03-06-ibioic/02-sequence_databases/09-KEGG_programming.html#imports

In [None]:
from Bio.KEGG import REST
import io

In [None]:
def to_df(result):
    return pd.read_table(io.StringIO(result), header=None)

In [None]:
test = REST.kegg_info("reaction").read()  # Gets database info
print(test)

In [None]:
test = REST.kegg_list("reaction").read()
print(test)

In [None]:
# Get all G. LC300 pathways in dataframe
test = REST.kegg_list("pathway", "gel").read()
lc300_pathways = to_df(test)
lc300_pathways

In [None]:
lc300_info = REST.kegg_info("gel").read()
print(lc300_info)

In [None]:
REST.kegg_get("gel:IB49_00015").read()

In [None]:
help(REST.kegg_get)