# Export Pathway Info from the KEGG Database in PiMP

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from collections import OrderedDict

In [3]:
import xmltodict
import json
import os

from rpy2 import robjects
from rpy2.robjects import pandas2ri

pandas2ri.activate()

### Define helper functions

In [4]:
def get_pathway_dict(path):
    """Load PiMP's pathway table and index it by pathway id.

    Loads ``<path>/PiMP/data/pathways.RData`` into the embedded R session and
    reads the ``pathways`` object it defines, using that object's ``id`` and
    ``name`` columns.

    Args:
        path: Directory prefix; '/PiMP/data/pathways.RData' is appended to it.

    Returns:
        dict mapping each pathway id (with the 'path:' text removed) to
        ``{'display_name': <pathway name>}``.
    """
    robjects.r['load'](path + '/PiMP/data/pathways.RData')
    pathways = robjects.r['pathways']
    pathway_ids = _filter_pathway_name(pathways['id'].values)
    # ids and names are paired positionally, row by row.
    return {
        pathway_id: {'display_name': display_name}
        for pathway_id, display_name in zip(pathway_ids, pathways.name.values)
    }

def _filter_pathway_name(p):
    """Return a new list with the text 'path:' removed from every id in ``p``.

    Every occurrence of 'path:' is removed, not only a leading one. Ids that
    do not contain it are returned unchanged.
    """
    cleaned = []
    for pathway_id in p:
        cleaned.append(pathway_id.replace('path:', ''))
    return cleaned

def get_mapping_dict(path):
    """Load PiMP's compound-to-pathway mapping.

    Loads ``<path>/PiMP/data/compounds2Pathways.RData`` into the embedded R
    session and reads the ``compounds2Pathways`` object it defines. The
    object's ``names`` supply the keys and its elements supply the values.

    Args:
        path: Directory prefix; '/PiMP/data/compounds2Pathways.RData' is
            appended to it.

    Returns:
        A plain dict mapping each name to a list of pathway ids with the
        'path:' text removed.
    """
    robjects.r['load'](path + '/PiMP/data/compounds2Pathways.RData')
    r_mapping = robjects.r['compounds2Pathways']
    compound_names = r_mapping.names
    # Names and elements are paired positionally.
    return {
        compound_id: _filter_pathway_name(pathway_ids)
        for compound_id, pathway_ids in zip(compound_names, list(r_mapping))
    }

def get_entity_dict(path):
    """Build a KEGG compound lookup from PiMP's bundled ``kegg.xml``.

    Reads ``<path>/PiMP/inst/dbs/kegg.xml``, parses it with ``xmltodict`` and
    walks the ``<compound>`` children of the root ``<compounds>`` element.

    Args:
        path: Directory prefix; '/PiMP/inst/dbs/kegg.xml' is appended to it.

    Returns:
        dict keyed by each compound's ``id``. Every value is a dict with
        'unique_id' (the compound's ``formula``) and 'display_name' (the
        compound's ``name``). Empty when the file lists no compounds.

    Raises:
        OSError: if the XML file cannot be opened.
        KeyError: if the root element is not ``compounds``, or a compound
            lacks ``id``, ``formula`` or ``name``.
    """
    kegg_location = path + '/PiMP/inst/dbs/kegg.xml'
    with open(kegg_location) as kegg_cmpd_file:
        cmpd_dict = xmltodict.parse(kegg_cmpd_file.read())

    # xmltodict's result shape depends on the number of children: an element
    # with no children parses to None, a single <compound> to one dict, and
    # repeated <compound> elements to a list. Normalise to a list so that one
    # or zero compounds do not break the loop below.
    root = cmpd_dict['compounds']
    compounds = root.get('compound') if isinstance(root, dict) else None
    if compounds is None:
        compounds = []
    elif not isinstance(compounds, list):
        compounds = [compounds]

    # NOTE(review): 'unique_id' is filled with the formula, which different
    # compounds may share - confirm downstream code does not rely on it
    # being unique.
    return {
        compound['id']: {
            'unique_id': compound['formula'],
            'display_name': compound['name'],
        }
        for compound in compounds
    }

def load_json(json_file):
    """Read the JSON document stored at ``json_file`` and return the decoded object."""
    with open(json_file, 'r') as handle:
        return json.load(handle)

def save_json(data, json_file):
    """Serialise ``data`` as JSON into ``json_file``, replacing any existing content.

    Args:
        data: Object to serialise with ``json.dump``.
        json_file: Path of the file to write; it is opened in 'w' mode.
    """
    with open(json_file, 'w') as handle:
        json.dump(data, handle)

In [5]:
# Root directory of the local PiMP checkout; the loader functions append
# '/PiMP/data/...' and '/PiMP/inst/dbs/...' to it.
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
path = '/home/joewandy/git/pimp'

In [6]:
# Build the three lookups from the PiMP checkout at `path`:
#   pathway_dict - key: pathway id, value: {'display_name': ...}
#   entity_dict  - key: compound id, value: {'unique_id': formula, 'display_name': name}
pathway_dict = get_pathway_dict(path)
entity_dict = get_entity_dict(path)
mapping_dict = get_mapping_dict(path) # key: entity id, value: pathway ids

In [7]:
# Inspect the pathway lookup (pathway id -> {'display_name': ...}).
pathway_dict

{'map00010': {'display_name': 'Glycolysis / Gluconeogenesis'},
 'map00020': {'display_name': 'Citrate cycle (TCA cycle)'},
 'map00030': {'display_name': 'Pentose phosphate pathway'},
 'map00040': {'display_name': 'Pentose and glucuronate interconversions'},
 'map00051': {'display_name': 'Fructose and mannose metabolism'},
 'map00052': {'display_name': 'Galactose metabolism'},
 'map00053': {'display_name': 'Ascorbate and aldarate metabolism'},
 'map00061': {'display_name': 'Fatty acid biosynthesis'},
 'map00062': {'display_name': 'Fatty acid elongation'},
 'map00071': {'display_name': 'Fatty acid degradation'},
 'map00072': {'display_name': 'Synthesis and degradation of ketone bodies'},
 'map00073': {'display_name': 'Cutin, suberine and wax biosynthesis'},
 'map00100': {'display_name': 'Steroid biosynthesis'},
 'map00120': {'display_name': 'Primary bile acid biosynthesis'},
 'map00121': {'display_name': 'Secondary bile acid biosynthesis'},
 'map00130': {'display_name': 'Ubiquinone and o

In [8]:
# Inspect the compound lookup (compound id -> formula and display name).
entity_dict

{'C00001': {'unique_id': 'H2O', 'display_name': 'H2O'},
 'C00002': {'unique_id': 'C10H16N5O13P3', 'display_name': 'ATP'},
 'C00003': {'unique_id': 'C21H28N7O14P2', 'display_name': 'NAD+'},
 'C00004': {'unique_id': 'C21H29N7O14P2', 'display_name': 'NADH'},
 'C00005': {'unique_id': 'C21H30N7O17P3', 'display_name': 'NADPH'},
 'C00006': {'unique_id': 'C21H29N7O17P3', 'display_name': 'NADP+'},
 'C00007': {'unique_id': 'O2', 'display_name': 'Oxygen'},
 'C00008': {'unique_id': 'C10H15N5O10P2', 'display_name': 'ADP'},
 'C00009': {'unique_id': 'H3PO4', 'display_name': 'Orthophosphate'},
 'C00010': {'unique_id': 'C21H36N7O16P3S', 'display_name': 'CoA'},
 'C00011': {'unique_id': 'CO2', 'display_name': 'CO2'},
 'C00013': {'unique_id': 'H4P2O7', 'display_name': 'Diphosphate'},
 'C00014': {'unique_id': 'NH3', 'display_name': 'Ammonia'},
 'C00015': {'unique_id': 'C9H14N2O12P2', 'display_name': 'UDP'},
 'C00016': {'unique_id': 'C27H33N9O15P2', 'display_name': 'FAD'},
 'C00018': {'unique_id': 'C8H10NO6

In [9]:
# Inspect the compound-to-pathway mapping (entity id -> list of pathway ids).
mapping_dict

{'C00022': ['map00010',
  'map00020',
  'map00030',
  'map00040',
  'map00053',
  'map00250',
  'map00260',
  'map00270',
  'map00290',
  'map00330',
  'map00350',
  'map00360',
  'map00362',
  'map00430',
  'map00440',
  'map00473',
  'map00620',
  'map00621',
  'map00622',
  'map00630',
  'map00650',
  'map00660',
  'map00680',
  'map00710',
  'map00720',
  'map00730',
  'map00750',
  'map00760',
  'map00770',
  'map00900',
  'map02060',
  'map04066',
  'map04911',
  'map04930',
  'ingenza00003'],
 'C00024': ['map00010',
  'map00020',
  'map00061',
  'map00062',
  'map00071',
  'map00072',
  'map00253',
  'map00254',
  'map00280',
  'map00290',
  'map00300',
  'map00310',
  'map00360',
  'map00362',
  'map00380',
  'map00410',
  'map00430',
  'map00480',
  'map00562',
  'map00620',
  'map00621',
  'map00622',
  'map00625',
  'map00630',
  'map00640',
  'map00642',
  'map00650',
  'map00680',
  'map00720',
  'map00900',
  'map01056',
  'map04066',
  'map04725',
  'map04976',
  'ingenz

In [10]:
# Number of compounds read from kegg.xml.
len(entity_dict)

14761

In [11]:
# Number of entries in the compound-to-pathway mapping.
len(mapping_dict)

7843

In [12]:
# Bundle the three lookups under the top-level keys used in the JSON export.
data = {
    'pathway_dict': pathway_dict,
    'entity_dict': entity_dict,
    'mapping_dict': mapping_dict
}

In [13]:
# Write the bundled lookups to the KEGG data file inside the PALS checkout.
# NOTE(review): machine-specific absolute path; save_json opens the file in
# 'w' mode, so running this cell overwrites any existing kegg.json.
json_file = '/home/joewandy/git/PALS/pals/data/kegg.json'
save_json(data, json_file)

### Regenerate PiMP compound XML file

In [14]:
# from bioservices.kegg import KEGG

In [15]:
# def get_kegg_info(compound_id):
#         s = KEGG()
#         res = s.get(compound_id)
#         return s.parse(res)
    
# def get_all_kegg_compound_ids():
#     s = KEGG()
#     compound_ids = []
#     for compound_id in s.compoundIds:
#         compound_ids.append(compound_id.split(':')[1])
#     return compound_ids

# def get_all_kegg_compound_info():
#     compound_ids = get_all_kegg_compound_ids()
#     compound_info = {}
#     for i in range(len(compound_ids)):
#         cid = compound_ids[i]
#         cinfo = get_kegg_info(cid)
#         print('%d/%d %s' % (i+1, len(compound_ids), cinfo['ENTRY']))        
#         compound_info[cid] = cinfo
#     return compound_info

In [16]:
# json_file = '/home/joewandy/git/PALS/data/all_kegg_compounds.json'

In [17]:
# if os.path.exists(json_file):
#     data = load_json(json_file)
# else:
#     all_kegg_compound_info = get_all_kegg_compound_info()
#     data = {
#         'cmpd_info': all_kegg_compound_info
#     }
#     save_json(data, json_file)

In [18]:
# data['cmpd_info']['C00135']