# Export Pathway Info from the KEGG Database in PiMP

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import OrderedDict

In [3]:
import xmltodict
import json
import os

from rpy2 import robjects
from rpy2.robjects import pandas2ri
from bioservices.kegg import KEGG

# Switch on rpy2's pandas conversion layer for the rest of the notebook.
# get_pw_dict below indexes the loaded R object with pandas-style
# ['id'].values / .name.values, so it presumably relies on this conversion —
# confirm before removing or reordering this call.
pandas2ri.activate()

### Define helper functions for loading the PiMP KEGG data

In [4]:
def get_pw_cmpd_dict(path):
    """Load the pathway -> compounds mapping from PiMP's R data.

    Runs R's ``load`` on ``<path>/PiMP/data/pathways2Compounds.RData`` and
    reads the resulting ``pathways2Compounds`` R object.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        An OrderedDict whose keys are the R object's names with every
        ``"path:"`` substring removed, and whose values are the matching
        elements converted to Python lists.
    """
    robjects.r['load'](path + '/PiMP/data/pathways2Compounds.RData')
    r_mapping = robjects.r['pathways2Compounds']

    # Stage 1: pair each R name with its element, keyed exactly as stored in R.
    members_by_raw_id = OrderedDict(zip(r_mapping.names, map(list, list(r_mapping))))

    # Stage 2: re-key the mapping with the "path:" marker removed.
    return OrderedDict(
        (raw_id.replace("path:", ""), members)
        for raw_id, members in members_by_raw_id.items()
    )

def get_pw_dict(path):
    """Load the pathway id -> pathway name lookup from PiMP's R data.

    Runs R's ``load`` on ``<path>/PiMP/data/pathways.RData`` and reads the
    resulting ``pathways`` R object, which is accessed with pandas-style
    column lookups (``['id'].values`` and ``.name.values``).

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        A plain dict mapping each value of the ``id`` column (with
        ``'path:'`` removed) to the value of the ``name`` column at the same
        position.
    """
    robjects.r['load'](path + '/PiMP/data/pathways.RData')
    pathways = robjects.r['pathways']
    pathway_ids = _filter_pathway_name(pathways['id'].values)
    pathway_names = pathways.name.values
    return dict(zip(pathway_ids, pathway_names))

def _filter_pathway_name(p):
    return [x.replace('path:', '') for x in p]

def get_cmpid_pathway_dict(path):
    """Load the compound -> pathways mapping from PiMP's R data.

    Runs R's ``load`` on ``<path>/PiMP/data/compounds2Pathways.RData`` and
    reads the resulting ``compounds2Pathways`` R object.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        An OrderedDict keyed by the R object's names. Each value is the
        matching element passed through ``_filter_pathway_name``, i.e. a list
        of its entries with ``'path:'`` removed.
    """
    robjects.r['load'](path + '/PiMP/data/compounds2Pathways.RData')
    r_mapping = robjects.r['compounds2Pathways']
    compound_ids = r_mapping.names
    pathways_per_compound = [_filter_pathway_name(entry) for entry in list(r_mapping)]
    return OrderedDict(zip(compound_ids, pathways_per_compound))

def _produce_kegg_dict(path, param):
    """Build a lookup from one compound field to the compound's formula.

    Parses ``<path>/PiMP/inst/dbs/kegg.xml`` with xmltodict and walks the
    entries found under ``compounds -> compound``.

    Args:
        path: Directory that contains the ``PiMP`` folder.
        param: Key of each compound entry to use as the dictionary key
            (this notebook passes ``'name'`` and ``'id'``).

    Returns:
        A dict mapping ``compound[param]`` to ``compound['formula']``. If
        several compounds share the same ``param`` value, the one that comes
        last in the file wins.

    Raises:
        KeyError: If a compound entry has no ``param`` or ``formula`` key.
    """
    kegg_location = path + '/PiMP/inst/dbs/kegg.xml'
    with open(kegg_location) as kegg_cmpd_file:
        # xmltodict turns a child element that occurs only once into a single
        # dict rather than a one-element list. force_list keeps 'compound' a
        # list, so the comprehension below always iterates over compound
        # entries and never over the keys of a lone entry.
        cmpd_dict = xmltodict.parse(kegg_cmpd_file.read(), force_list=('compound',))

    return {
        compound[param]: compound['formula']
        for compound in cmpd_dict['compounds']['compound']
    }

def get_cmpd_name_formula_dict(path):
    """Return the kegg.xml lookup keyed by each compound's ``'name'`` field.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        The dict produced by ``_produce_kegg_dict`` for the ``'name'`` key,
        mapping compound name to formula.
    """
    return _produce_kegg_dict(path, param='name')

def get_cmpd_id_formula_dict(path):
    """Return the kegg.xml lookup keyed by each compound's ``'id'`` field.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        The dict produced by ``_produce_kegg_dict`` for the ``'id'`` key,
        mapping compound id to formula.
    """
    return _produce_kegg_dict(path, param='id')

def load_json(json_file):
    """Read a JSON document from disk.

    Args:
        json_file: Path of the file to read.

    Returns:
        The deserialised content of the file.
    """
    with open(json_file, 'r') as handle:
        return json.load(handle)

def save_json(data, json_file):
    """Serialise ``data`` as JSON into ``json_file``.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        data: JSON-serialisable object to store.
        json_file: Path of the file to write.
    """
    with open(json_file, 'w') as handle:
        json.dump(data, fp=handle)

In [5]:
# Root of the local PiMP checkout; the loader functions append '/PiMP/...' to it.
# Set the PIMP_PATH environment variable to point at another checkout instead
# of editing this cell. The original author's location remains the fallback.
path = os.environ.get('PIMP_PATH', '/home/joewandy/git/pimp')

In [6]:
# Load every lookup table from the PiMP checkout at `path`.
# The two loaders that return an OrderedDict are converted to plain dicts here.
pw_dict = get_pw_dict(path)  # pathway id -> pathway name
pathway_cmpd_dict = dict(get_pw_cmpd_dict(path))  # pathway id -> list of compounds
cmpd_formula_dict = get_cmpd_name_formula_dict(path)  # compound name -> formula
cmpd_id_formula_dict = get_cmpd_id_formula_dict(path)  # compound id -> formula
cmpd_id_pw_dict = dict(get_cmpid_pathway_dict(path))  # compound id -> list of pathway ids

In [7]:
# Inspect the pathway id -> pathway name lookup.
pw_dict

{'map00010': 'Glycolysis / Gluconeogenesis',
 'map00020': 'Citrate cycle (TCA cycle)',
 'map00030': 'Pentose phosphate pathway',
 'map00040': 'Pentose and glucuronate interconversions',
 'map00051': 'Fructose and mannose metabolism',
 'map00052': 'Galactose metabolism',
 'map00053': 'Ascorbate and aldarate metabolism',
 'map00061': 'Fatty acid biosynthesis',
 'map00062': 'Fatty acid elongation',
 'map00071': 'Fatty acid degradation',
 'map00072': 'Synthesis and degradation of ketone bodies',
 'map00073': 'Cutin, suberine and wax biosynthesis',
 'map00100': 'Steroid biosynthesis',
 'map00120': 'Primary bile acid biosynthesis',
 'map00121': 'Secondary bile acid biosynthesis',
 'map00130': 'Ubiquinone and other terpenoid-quinone biosynthesis',
 'map00140': 'Steroid hormone biosynthesis',
 'map00190': 'Oxidative phosphorylation',
 'map00195': 'Photosynthesis',
 'map00230': 'Purine metabolism',
 'map00231': 'Puromycin biosynthesis',
 'map00232': 'Caffeine metabolism',
 'map00240': 'Pyrimidi

In [8]:
# Inspect the pathway id -> compounds lookup.
pathway_cmpd_dict

{'map00010': ['Pyruvate',
  'Acetyl-CoA',
  'D-Glucose',
  'Acetate',
  'Oxaloacetate',
  'Thiamin diphosphate',
  'Phosphoenolpyruvate',
  'Acetaldehyde',
  'D-Glucose 1-phosphate',
  'Glycerone phosphate',
  'D-Glyceraldehyde 3-phosphate',
  '(S)-Lactate',
  '3-Phospho-D-glycerate',
  'beta-D-Glucose',
  '3-Phospho-D-glyceroyl phosphate',
  'alpha-D-Glucose',
  'Ethanol',
  '2-Phospho-D-glycerate',
  'alpha-D-Glucose 6-phosphate',
  '2,3-Bisphospho-D-glycerate',
  'beta-D-Glucose 6-phosphate',
  'Salicin',
  '2-(alpha-Hydroxyethyl)thiamine diphosphate',
  'beta-D-Fructose 6-phosphate',
  'beta-D-Fructose 1,6-bisphosphate',
  'Arbutin',
  'Arbutin 6-phosphate',
  'Salicin 6-phosphate',
  'Enzyme N6-(lipoyl)lysine',
  'Enzyme N6-(dihydrolipoyl)lysine',
  '[Dihydrolipoyllysine-residue acetyltransferase] S-acetyldihydrolipoyllysine'],
 'map00020': ['Pyruvate',
  'Acetyl-CoA',
  '2-Oxoglutarate',
  'Oxaloacetate',
  'Succinate',
  'Thiamin diphosphate',
  'Phosphoenolpyruvate',
  'Succiny

In [9]:
# Inspect the compound name -> formula lookup built from kegg.xml.
cmpd_formula_dict

{'H2O': 'H2O',
 'ATP': 'C10H16N5O13P3',
 'NAD+': 'C21H28N7O14P2',
 'NADH': 'C21H29N7O14P2',
 'NADPH': 'C21H30N7O17P3',
 'NADP+': 'C21H29N7O17P3',
 'Oxygen': 'O2',
 'ADP': 'C10H15N5O10P2',
 'Orthophosphate': 'H3PO4',
 'CoA': 'C21H36N7O16P3S',
 'CO2': 'CO2',
 'Diphosphate': 'H4P2O7',
 'Ammonia': 'NH3',
 'UDP': 'C9H14N2O12P2',
 'FAD': 'C27H33N9O15P2',
 'Pyridoxal phosphate': 'C8H10NO6P',
 'S-Adenosyl-L-methionine': 'C15H22N6O5S',
 'AMP': 'C10H14N5O7P',
 'S-Adenosyl-L-homocysteine': 'C14H20N6O5S',
 'Pyruvate': 'C3H4O3',
 'Iron': 'Fe',
 'Acetyl-CoA': 'C23H38N7O17P3S',
 'L-Glutamate': 'C5H9NO4',
 '2-Oxoglutarate': 'C5H6O5',
 'Hydrogen peroxide': 'H2O2',
 'UDP-glucose': 'C15H24N2O17P2',
 'D-Glucose': 'C6H12O6',
 'Heme': 'C34H32FeN4O4',
 'Acetate': 'C2H4O2',
 'Manganese': 'Mn',
 'GDP': 'C10H15N5O11P2',
 'Oxaloacetate': 'C4H4O5',
 'Glycine': 'C2H5NO2',
 'Zinc cation': 'Zn',
 'L-Alanine': 'C3H7NO2',
 'Succinate': 'C4H6O4',
 'UDP-N-acetyl-alpha-D-glucosamine': 'C17H27N3O17P2',
 'GTP': 'C10H16N5O1

In [10]:
# Inspect the compound id -> formula lookup built from kegg.xml.
cmpd_id_formula_dict

{'C00001': 'H2O',
 'C00002': 'C10H16N5O13P3',
 'C00003': 'C21H28N7O14P2',
 'C00004': 'C21H29N7O14P2',
 'C00005': 'C21H30N7O17P3',
 'C00006': 'C21H29N7O17P3',
 'C00007': 'O2',
 'C00008': 'C10H15N5O10P2',
 'C00009': 'H3PO4',
 'C00010': 'C21H36N7O16P3S',
 'C00011': 'CO2',
 'C00013': 'H4P2O7',
 'C00014': 'NH3',
 'C00015': 'C9H14N2O12P2',
 'C00016': 'C27H33N9O15P2',
 'C00018': 'C8H10NO6P',
 'C00019': 'C15H22N6O5S',
 'C00020': 'C10H14N5O7P',
 'C00021': 'C14H20N6O5S',
 'C00022': 'C3H4O3',
 'C00023': 'Fe',
 'C00024': 'C23H38N7O17P3S',
 'C00025': 'C5H9NO4',
 'C00026': 'C5H6O5',
 'C00027': 'H2O2',
 'C00029': 'C15H24N2O17P2',
 'C00031': 'C6H12O6',
 'C00032': 'C34H32FeN4O4',
 'C00033': 'C2H4O2',
 'C00034': 'Mn',
 'C00035': 'C10H15N5O11P2',
 'C00036': 'C4H4O5',
 'C00037': 'C2H5NO2',
 'C00038': 'Zn',
 'C00041': 'C3H7NO2',
 'C00042': 'C4H6O4',
 'C00043': 'C17H27N3O17P2',
 'C00044': 'C10H16N5O14P3',
 'C00047': 'C6H14N2O2',
 'C00048': 'C2H2O3',
 'C00049': 'C4H7NO4',
 'C00051': 'C10H17N3O6S',
 'C00052':

In [11]:
# Inspect the compound id -> pathway ids lookup.
cmpd_id_pw_dict

{'C00022': ['map00010',
  'map00020',
  'map00030',
  'map00040',
  'map00053',
  'map00250',
  'map00260',
  'map00270',
  'map00290',
  'map00330',
  'map00350',
  'map00360',
  'map00362',
  'map00430',
  'map00440',
  'map00473',
  'map00620',
  'map00621',
  'map00622',
  'map00630',
  'map00650',
  'map00660',
  'map00680',
  'map00710',
  'map00720',
  'map00730',
  'map00750',
  'map00760',
  'map00770',
  'map00900',
  'map02060',
  'map04066',
  'map04911',
  'map04930',
  'ingenza00003'],
 'C00024': ['map00010',
  'map00020',
  'map00061',
  'map00062',
  'map00071',
  'map00072',
  'map00253',
  'map00254',
  'map00280',
  'map00290',
  'map00300',
  'map00310',
  'map00360',
  'map00362',
  'map00380',
  'map00410',
  'map00430',
  'map00480',
  'map00562',
  'map00620',
  'map00621',
  'map00622',
  'map00625',
  'map00630',
  'map00640',
  'map00642',
  'map00650',
  'map00680',
  'map00720',
  'map00900',
  'map01056',
  'map04066',
  'map04725',
  'map04976',
  'ingenz

In [12]:
# Number of distinct compound ids that kegg.xml provides a formula for.
len(cmpd_id_formula_dict)

14761

In [13]:
# Number of compound ids that are keys of the compound -> pathways lookup.
len(cmpd_id_pw_dict)

7843

In [14]:
# Bundle all lookup tables under one dict so they can be exported together.
data = {
    'pw_dict': pw_dict,  # pathway id -> pathway name
    'pathway_cmpd_dict': pathway_cmpd_dict,  # pathway id -> list of compounds
    'cmpd_formula_dict': cmpd_formula_dict,  # compound name -> formula
    'cmpd_id_formula_dict': cmpd_id_formula_dict,  # compound id -> formula
    'cmpd_id_pw_dict': cmpd_id_pw_dict  # compound id -> list of pathway ids
}

In [15]:
# Write the bundled lookup tables to the PALS data directory as one JSON file.
# NOTE(review): absolute path on the original author's machine — adjust it
# before re-running this notebook elsewhere.
json_file = '/home/joewandy/git/PALS/data/kegg.json'
save_json(data, json_file)

### Regenerate PiMP compound XML file

In [None]:
def get_kegg_info(compound_id, kegg=None):
    """Fetch one KEGG entry and return it in parsed form.

    Args:
        compound_id: Identifier passed to the client's ``get`` call.
        kegg: Optional bioservices ``KEGG`` client to reuse. When omitted, a
            new client is created for this call.

    Returns:
        Whatever the client's ``parse`` returns for the fetched entry.
    """
    client = KEGG() if kegg is None else kegg
    return client.parse(client.get(compound_id))

def get_all_kegg_compound_ids(kegg=None):
    """List the identifiers of all compounds reported by KEGG.

    Reads ``compoundIds`` from a bioservices ``KEGG`` client and drops the
    database prefix from each entry (a ``'prefix:id'`` string becomes
    ``'id'``).

    Args:
        kegg: Optional bioservices ``KEGG`` client to reuse. When omitted, a
            new client is created for this call.

    Returns:
        A list of compound id strings without a database prefix.
    """
    client = KEGG() if kegg is None else kegg
    compound_ids = []
    for compound_id in client.compoundIds:
        # Take the segment after the first ':' when a prefix is present. An id
        # that arrives without any prefix is kept unchanged instead of
        # raising IndexError.
        parts = compound_id.split(':')
        compound_ids.append(parts[1] if len(parts) > 1 else parts[0])
    return compound_ids

def get_all_kegg_compound_info():
    """Download and parse the KEGG entry of every compound.

    Issues one ``get`` request per compound id and prints a
    ``position/total ENTRY`` progress line after each one.

    Returns:
        A dict mapping each id from ``get_all_kegg_compound_ids`` to its
        parsed KEGG entry.
    """
    # One client for the whole run, so the client set-up is not repeated for
    # each of the many per-compound requests.
    client = KEGG()
    compound_ids = get_all_kegg_compound_ids(client)
    total = len(compound_ids)
    compound_info = {}
    for position, cid in enumerate(compound_ids, start=1):
        cinfo = get_kegg_info(cid, client)
        print('%d/%d %s' % (position, total, cinfo['ENTRY']))
        compound_info[cid] = cinfo
    return compound_info

In [None]:
# Cache file for the downloaded KEGG compound records.
# NOTE: this rebinds `json_file`, which an earlier cell used for the
# kegg.json export.
json_file = '/home/joewandy/git/PALS/data/all_kegg_compounds.json'

In [None]:
# Reuse the cached download when it exists; otherwise fetch every compound
# from KEGG (one request per compound) and cache the result for the next run.
# NOTE: this rebinds `data`, which an earlier cell used for the kegg.json
# bundle of lookup tables.
if os.path.exists(json_file):
    data = load_json(json_file)
else:
    all_kegg_compound_info = get_all_kegg_compound_info()
    data = {
        'cmpd_info': all_kegg_compound_info
    }
    save_json(data, json_file)

In [None]:
# Spot-check a single downloaded record by its compound id.
data['cmpd_info']['C00135']