In [51]:
import requests
import json
import time
import regex as re
import numpy as np
import pandas as pd

In [2]:
def get_kegg_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        return response
    else:
        return f"Error: {response.status_code}"

# Step 1 : Obtain human metabolic pathways

In [3]:
pathways_url = "http://rest.kegg.jp/list/pathway/hsa"
response = get_kegg_url(pathways_url)

In [4]:
pathways = {}
for line in response.text.split('\n')[:-1]:
    code, name = line[:-23].split('\t')
    pathways[code] = name
pathways

{'hsa01100': 'Metabolic pathways',
 'hsa01200': 'Carbon metabolism',
 'hsa01210': '2-Oxocarboxylic acid metabolism',
 'hsa01212': 'Fatty acid metabolism',
 'hsa01230': 'Biosynthesis of amino acids',
 'hsa01232': 'Nucleotide metabolism',
 'hsa01250': 'Biosynthesis of nucleotide sugars',
 'hsa01240': 'Biosynthesis of cofactors',
 'hsa00010': 'Glycolysis / Gluconeogenesis',
 'hsa00020': 'Citrate cycle (TCA cycle)',
 'hsa00030': 'Pentose phosphate pathway',
 'hsa00040': 'Pentose and glucuronate interconversions',
 'hsa00051': 'Fructose and mannose metabolism',
 'hsa00052': 'Galactose metabolism',
 'hsa00053': 'Ascorbate and aldarate metabolism',
 'hsa00500': 'Starch and sucrose metabolism',
 'hsa00520': 'Amino sugar and nucleotide sugar metabolism',
 'hsa00620': 'Pyruvate metabolism',
 'hsa00630': 'Glyoxylate and dicarboxylate metabolism',
 'hsa00640': 'Propanoate metabolism',
 'hsa00650': 'Butanoate metabolism',
 'hsa00562': 'Inositol phosphate metabolism',
 'hsa00190': 'Oxidative phospho

In [5]:
# Keeping only metabolic pathways, not disease or signaling
def filter_pathways(pair):
    key, value = pair
    pid = key[3:]
    keeper = np.arange(0000, 1300, 1)
    if int(pid) in keeper:
        return True
    else:
        return False

pathways_m = dict(filter(filter_pathways, pathways.items()))

In [7]:
# Popping out the general pathway containing all metabolic genes
pathways_m.pop('hsa01100')

KeyError: 'hsa01100'

In [8]:
pathways_m

{'hsa01200': 'Carbon metabolism',
 'hsa01210': '2-Oxocarboxylic acid metabolism',
 'hsa01212': 'Fatty acid metabolism',
 'hsa01230': 'Biosynthesis of amino acids',
 'hsa01232': 'Nucleotide metabolism',
 'hsa01250': 'Biosynthesis of nucleotide sugars',
 'hsa01240': 'Biosynthesis of cofactors',
 'hsa00010': 'Glycolysis / Gluconeogenesis',
 'hsa00020': 'Citrate cycle (TCA cycle)',
 'hsa00030': 'Pentose phosphate pathway',
 'hsa00040': 'Pentose and glucuronate interconversions',
 'hsa00051': 'Fructose and mannose metabolism',
 'hsa00052': 'Galactose metabolism',
 'hsa00053': 'Ascorbate and aldarate metabolism',
 'hsa00500': 'Starch and sucrose metabolism',
 'hsa00520': 'Amino sugar and nucleotide sugar metabolism',
 'hsa00620': 'Pyruvate metabolism',
 'hsa00630': 'Glyoxylate and dicarboxylate metabolism',
 'hsa00640': 'Propanoate metabolism',
 'hsa00650': 'Butanoate metabolism',
 'hsa00562': 'Inositol phosphate metabolism',
 'hsa00190': 'Oxidative phosphorylation',
 'hsa00910': 'Nitrogen m

In [9]:
with open("kegg_pathways_m.json", "w") as outfile: 
    json.dump(pathways_m, outfile)

# Step 2 : Obtain the list of gene associated with each pathway

## Get ncbi IDs from KEGG API

In [84]:
# Obtain NCBI gene IDs
genes_url = f"http://rest.kegg.jp/link/hsa/pathway"
genesh = get_kegg_url(genes_url)

In [85]:
genes_data = genesh.text.split('\n')

In [87]:
ncbi_ids = {}
for line in genes_data[:-1]:
    pathway, gene = line.split('\t')
    pathway = pathway[5:]
    if pathway in ncbi_ids:
        ncbi_ids[pathway].append(gene[4:])
    else :
        ncbi_ids[pathway] = [gene[4:]]

In [90]:
len(ncbi_ids)

359

In [155]:
with open("ncbi_ids.json", "w") as outfile: 
    json.dump(ncbi_ids, outfile)

## Get Gene symbol (names, HGNC) from NCBI Entrez API

API requests are made with a 1 second delay as Entrez guideline advise not to make more than 3 requests per second.

In [96]:
def fetch_gene_symbols(genelist):
    api_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    geneID_list = ','.join(genelist)
    url = f'{api_url}?db=gene&id={geneID_list}&retmode=json'
    response = requests.get(url)
    if response.status_code != 200:
        return f"Error: {response.status_code}"
    else:
        return response

In [97]:
def extract_symbols(json_data):
    symbol_list = []
    for uid in json_data['result']['uids']:
            gene_info = json_data['result'][uid]
            symbol_list.append(gene_info['nomenclaturesymbol'])
    return symbol_list

In [94]:
def get_official_symbols(ncbi_ids):
    genes_pathways = {}
    for pathway in ncbi_ids.keys() :
        time.sleep(1)
        ncbi_rep = fetch_gene_symbols(ncbi_ids[pathway])
        json_rep = ncbi_rep.json()
        symbols = extract_symbols(json_rep)
        genes_pathways[pathway] = symbols
        print(f'Pathway {pathway} : {ncbi_rep}')
    return genes_pathways

In [92]:
# Had to run this because initially the global pathway 'hsa01100' was not excluded from the list
# and it contains > 1500 genes which exceeds the NCBI limit of 200 per request and causes an error

missing_pathways = {}
for item in pathways_m.items():
    key, value = item
    if key not in gene_pathway:
        missing_pathways[key] = value

ncbi_ids_missing = {key:value for key, value in ncbi_ids.items() if key in missing_pathways}
for key, value in ncbi_ids_missing.items():
    print(f'Pathway {key} : {len(value)} genes')


Pathway hsa01200 : 116 genes
Pathway hsa01210 : 33 genes
Pathway hsa01212 : 57 genes
Pathway hsa01230 : 75 genes
Pathway hsa01232 : 85 genes
Pathway hsa01240 : 153 genes
Pathway hsa01250 : 37 genes


In [98]:
genes_pathway_missing = get_official_symbols(ncbi_ids_missing)
len(genes_pathway_missing)

Pathway hsa01200 : <Response [200]>
Pathway hsa01210 : <Response [200]>
Pathway hsa01212 : <Response [200]>
Pathway hsa01230 : <Response [200]>
Pathway hsa01232 : <Response [200]>
Pathway hsa01240 : <Response [200]>
Pathway hsa01250 : <Response [200]>


7

In [100]:
with open("gene_symbols_2.json", "w") as outfile: 
    json.dump(genes_pathway_missing, outfile)

# Step 3 : Get metabolite list per pathway

In [23]:
# Get information about compound database
url = "https://rest.kegg.jp/info/cpd"
info = get_kegg_url(url)
print(info.text)

compound         KEGG Compound Database
cpd              Release 110.0+/06-27, Jun 24
                 Kanehisa Laboratories
                 19,357 entries

linked db        pathway
                 brite
                 module
                 genome
                 glycan
                 reaction
                 enzyme
                 network
                 drug
                 pubchem
                 chebi



In [60]:
url = "https://rest.kegg.jp/get/hsa01200/conf"
entry_1200 = get_kegg_url(url)
entry_1200

<Response [200]>

In [63]:
entry_1200.text.split('\n')

['circ (711,307) 10\t/dbget-bin/www_bget?C00085\tC00085 (D-Fructose 6-phosphate)',
 'circ (1041,672) 10\t/dbget-bin/www_bget?C01182\tC01182 (D-Ribulose 1,5-bisphosphate)',
 'circ (711,672) 10\t/dbget-bin/www_bget?C00197\tC00197 (3-Phospho-D-glycerate)',
 'circ (413,535) 4\t/dbget-bin/www_bget?C00101\tC00101 (Tetrahydrofolate)',
 'circ (711,133) 10\t/dbget-bin/www_bget?C00267+C00221\tC00267 (alpha-D-Glucose), C00221 (beta-D-Glucose)',
 'circ (711,202) 10\t/dbget-bin/www_bget?C00668+C01172\tC00668 (alpha-D-Glucose 6-phosphate), C01172 (beta-D-Glucose 6-phosphate)',
 'circ (1041,202) 10\t/dbget-bin/www_bget?C00345\tC00345 (6-Phospho-D-gluconate)',
 'circ (711,412) 10\t/dbget-bin/www_bget?C00354\tC00354 (D-Fructose 1,6-bisphosphate)',
 'circ (876,202) 10\t/dbget-bin/www_bget?C01236\tC01236 (D-Glucono-1,5-lactone 6-phosphate)',
 'circ (711,602) 10\t/dbget-bin/www_bget?C00236\tC00236 (3-Phospho-D-glyceroyl phosphate)',
 'circ (711,517) 10\t/dbget-bin/www_bget?C00118\tC00118 (D-Glyceraldehyde

In [72]:
def get_extract_compound(pathway_list):
    cpd_pathway = {}
    pattern = r'\(((?>[^()]+|(?R))*)\)'
    for pathway in pathway_list:
        time.sleep(1)
        url = f"https://rest.kegg.jp/get/{pathway}/conf"
        entry = get_kegg_url(url)
        print(f'Pathway {pathway} : {entry}')
        entry_data = entry.text.split('\n')
        cpd_pathway[pathway] = []
        for line in entry_data :
            if line.startswith('circ'):
                _, _, name = line.split('\t')
                matches = re.findall(pattern, name)
                for match in matches :
                    cpd_pathway[pathway].append(match)
    return cpd_pathway

In [73]:
compound_pathway = get_extract_compound(pathways_m.keys())

Pathway hsa01200 : <Response [200]>
Pathway hsa01210 : <Response [200]>
Pathway hsa01212 : <Response [200]>
Pathway hsa01230 : <Response [200]>
Pathway hsa01232 : <Response [200]>
Pathway hsa01250 : <Response [200]>
Pathway hsa01240 : <Response [200]>
Pathway hsa00010 : <Response [200]>
Pathway hsa00020 : <Response [200]>
Pathway hsa00030 : <Response [200]>
Pathway hsa00040 : <Response [200]>
Pathway hsa00051 : <Response [200]>
Pathway hsa00052 : <Response [200]>
Pathway hsa00053 : <Response [200]>
Pathway hsa00500 : <Response [200]>
Pathway hsa00520 : <Response [200]>
Pathway hsa00620 : <Response [200]>
Pathway hsa00630 : <Response [200]>
Pathway hsa00640 : <Response [200]>
Pathway hsa00650 : <Response [200]>
Pathway hsa00562 : <Response [200]>
Pathway hsa00190 : <Response [200]>
Pathway hsa00910 : <Response [200]>
Pathway hsa00920 : <Response [200]>
Pathway hsa00061 : <Response [200]>
Pathway hsa00062 : <Response [200]>
Pathway hsa00071 : <Response [200]>
Pathway hsa00100 : <Response

In [76]:
with open("compounds.json", "w") as outfile: 
    json.dump(compound_pathway, outfile)

# Step 4 : pool all the results in one JSON file

In [79]:
with open("gene_symbols.json", "r") as outfile:
    gene_pathway = json.load(outfile)

In [79]:
with open("gene_symbols.json", "r") as outfile:
    gene_pathway = json.load(outfile)

In [101]:
print(len(gene_pathway))
gene_pathway_all = {**gene_pathway, **genes_pathway_missing}
print(len(gene_pathway_all))

85
92


In [102]:
print(len(compound_pathway))
print(len(pathways_m))
print(len(gene_pathway_all))

92
92
92


In [104]:
pool = {}
for p_code, p_name in pathways_m.items() :
    pool[p_code] = {"name" : p_name,
                    "genes" : gene_pathway_all[p_code],
                    "compounds" : compound_pathway[p_code]
                   }

In [106]:
with open("pathway_genes_compounds.json", "w") as outfile: 
    json.dump(pool, outfile)