# Export Pathway Info from the KEGG Database in PiMP

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from collections import OrderedDict

In [None]:
import xmltodict
import json

from rpy2 import robjects
from rpy2.robjects import pandas2ri
from bioservices.kegg import KEGG

pandas2ri.activate()

Define some useful methods

In [None]:
def get_pw_cmpd_dict(path):
    """Load PiMP's pathway-to-compounds mapping from its RData file.

    Loads ``<path>/PiMP/data/pathways2Compounds.RData`` through rpy2 and
    converts the R object named ``pathways2Compounds`` into an ordered
    mapping.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        An ``OrderedDict`` keyed by the R element names with every
        ``"path:"`` substring removed; each value is the list built from
        the corresponding element of the R object.
    """
    rdata_file = path + '/PiMP/data/pathways2Compounds.RData'
    robjects.r['load'](rdata_file)
    r_mapping = robjects.r['pathways2Compounds']

    # First pass: pair each R name with its element converted to a list.
    named_lists = OrderedDict()
    for r_name, r_element in zip(r_mapping.names, list(r_mapping)):
        named_lists[r_name] = list(r_element)

    # Second pass: re-key the mapping without the "path:" marker.
    return OrderedDict(
        (r_name.replace("path:", ""), members)
        for r_name, members in named_lists.items()
    )

def _filter_pathway_name(p):
    return [x.replace('path:', '') for x in p]

def get_cmpid_pathway_dict(path):
    """Load PiMP's compound-to-pathways mapping from its RData file.

    Loads ``<path>/PiMP/data/compounds2Pathways.RData`` through rpy2 and
    converts the R object named ``compounds2Pathways`` into an ordered
    mapping.

    Args:
        path: Directory that contains the ``PiMP`` folder.

    Returns:
        An ``OrderedDict`` keyed by the R element names; each value is the
        element's pathway names with ``'path:'`` removed.
    """
    rdata_file = path + '/PiMP/data/compounds2Pathways.RData'
    robjects.r['load'](rdata_file)
    r_mapping = robjects.r['compounds2Pathways']

    keys = r_mapping.names
    # Clean every element up front, before pairing values with their names.
    cleaned_pathways = [_filter_pathway_name(entry) for entry in r_mapping]
    return OrderedDict(zip(keys, cleaned_pathways))

def _produce_kegg_dict(path, param):
    """Map a chosen field of every compound in PiMP's kegg.xml to its formula.

    Args:
        path: Directory that contains the ``PiMP`` folder; the file read is
            ``<path>/PiMP/inst/dbs/kegg.xml``.
        param: Key of each compound record to use as the dictionary key
            (the wrappers in this file pass ``'name'`` or ``'id'``).

    Returns:
        dict mapping ``compound[param]`` to ``compound['formula']``. When
        several compounds share the same key, the last one wins.

    Raises:
        KeyError: If a compound record lacks ``param`` or ``'formula'``.
    """
    kegg_location = path + '/PiMP/inst/dbs/kegg.xml'
    with open(kegg_location) as kegg_cmpd_file:
        cmpd_dict = xmltodict.parse(kegg_cmpd_file.read())

    compounds = cmpd_dict['compounds']['compound']
    # xmltodict only produces a list when an element is repeated; a file
    # with a single <compound> yields one mapping, which must be wrapped so
    # the loop below sees a record rather than that mapping's keys.
    if not isinstance(compounds, list):
        compounds = [compounds]

    return {compound[param]: compound['formula'] for compound in compounds}

def get_cmpd_name_formula_dict(path):
    """Return the compound ``name`` -> ``formula`` mapping from PiMP's kegg.xml."""
    key_field = 'name'
    return _produce_kegg_dict(path, key_field)

def get_cmpd_id_formula_dict(path):
    """Return the compound ``id`` -> ``formula`` mapping from PiMP's kegg.xml."""
    key_field = 'id'
    return _produce_kegg_dict(path, key_field)

def get_kegg_info(compound_id):
    """Fetch one entry from KEGG and return it in parsed form.

    Args:
        compound_id: Identifier passed unchanged to ``KEGG.get``.

    Returns:
        Whatever ``KEGG.parse`` produces for the retrieved entry.
    """
    kegg_service = KEGG()
    raw_entry = kegg_service.get(compound_id)
    return kegg_service.parse(raw_entry)
    
def get_all_kegg_compound_ids():
    """List every KEGG compound id with the part before the colon removed.

    Each item of ``KEGG().compoundIds`` is split on ``':'`` and the second
    piece is kept.

    Returns:
        list of the resulting id strings, in the order KEGG reports them.
    """
    kegg_service = KEGG()
    return [
        prefixed_id.split(':')[1]
        for prefixed_id in kegg_service.compoundIds
    ]

def get_all_kegg_compound_info():
    """Retrieve the parsed KEGG entry for every known compound id.

    Calls ``get_kegg_info`` once per id returned by
    ``get_all_kegg_compound_ids`` and prints a ``done/total ENTRY`` progress
    line after each one.

    Returns:
        dict mapping each compound id to its parsed entry.
    """
    compound_ids = get_all_kegg_compound_ids()
    total = len(compound_ids)
    compound_info = {}
    for position, cid in enumerate(compound_ids, start=1):
        cinfo = get_kegg_info(cid)
        print('%d/%d %s' % (position, total, cinfo['ENTRY']))
        compound_info[cid] = cinfo
    return compound_info

In [None]:
# Base directory handed to the loader functions, which append '/PiMP/...' to
# it. Machine-specific absolute path: adjust before running elsewhere.
path = '/home/joewandy/git/pimp'

In [None]:
# Pathway name -> compound list, from pathways2Compounds.RData (plain dict).
pathway_cmpd_dict = dict(get_pw_cmpd_dict(path))
# Compound name -> formula, from kegg.xml.
cmpd_formula_dict = get_cmpd_name_formula_dict(path)
# Compound id -> formula, from kegg.xml.
cmpd_id_formula_dict = get_cmpd_id_formula_dict(path)
# Element name of compounds2Pathways.RData (presumably a KEGG compound id --
# TODO confirm) -> pathway names, converted to a plain dict.
cmpd_id_pw_dict = dict(get_cmpid_pathway_dict(path))
# Parsed KEGG entry for every compound id; get_kegg_info runs once per id.
all_kegg_compound_info = get_all_kegg_compound_info()

In [None]:
# Display the pathway -> compound-list mapping for inspection.
pathway_cmpd_dict

In [None]:
# Display the compound-name -> formula mapping for inspection.
cmpd_formula_dict

In [None]:
# Display the compound-id -> formula mapping for inspection.
cmpd_id_formula_dict

In [None]:
# Display the compound -> pathway-names mapping for inspection.
cmpd_id_pw_dict

In [None]:
# Number of compound ids that have a formula entry from kegg.xml.
len(cmpd_id_formula_dict)

In [None]:
# Number of compounds that have a pathway list from compounds2Pathways.RData.
len(cmpd_id_pw_dict)

In [None]:
# Bundle every mapping built above into one dict under fixed string keys.
data = {
    # pathway name -> compound list
    'pathway_cmpd_dict': pathway_cmpd_dict,
    # compound name -> formula
    'cmpd_formula_dict': cmpd_formula_dict,
    # compound id -> formula
    'cmpd_id_formula_dict': cmpd_id_formula_dict,
    # compound -> pathway names
    'cmpd_id_pw_dict': cmpd_id_pw_dict,
    # compound id -> parsed KEGG entry
    'cmpd_info': all_kegg_compound_info
}

In [None]:
# NOTE: json is also imported in an earlier cell of this notebook.
import json
# Serialise the bundled mappings to a single JSON file. The destination is a
# machine-specific absolute path and is overwritten if it already exists.
with open('/home/joewandy/git/PALS/data/kegg.json', 'w') as f:
    json.dump(data, f)