In [16]:
# Ingest from KEGG API
# KEGG API documentation: https://www.kegg.jp/kegg/rest/keggapi.html
import re
import requests
from typing import Optional

BASE_URL = "https://rest.kegg.jp"


def fetch_kegg_data(
    endpoint: str,
    entries: str,
    *,
    timeout: int = 20,
    retries: int = 3,
    session: Optional[requests.Session] = None,
) -> str:
    """
    Generic KEGG REST API fetcher.

    Builds ``{BASE_URL}/{endpoint}/{entries}`` and issues a GET request,
    retrying immediately (no back-off) when a request fails.

    Example calls:
        fetch_kegg_data("get", "hsa00010")
        fetch_kegg_data("link", "reaction/hsa00010")
        fetch_kegg_data("get", "R00710")
        fetch_kegg_data("list", "pathway/hsa")

    Args:
        endpoint: KEGG REST operation (e.g. "get", "list", "link").
        entries: Entry identifier or query string appended after the endpoint.
        timeout: HTTP timeout in seconds, passed to ``Session.get``.
        retries: Total number of attempts. With ``retries <= 0`` no request
            is made and an empty string is returned.
        session: Optional shared requests session. A caller-supplied session
            is left open; a session created here is closed before returning.

    Returns:
        Raw response text, or an empty string if every attempt failed.
        Failures are reported with ``print`` rather than raised.
    """

    url = f"{BASE_URL}/{endpoint}/{entries}"

    # Only close what we opened: a caller-supplied session may be reused later.
    owns_session = session is None
    sess = requests.Session() if owns_session else session

    try:
        for attempt in range(1, retries + 1):
            try:
                response = sess.get(url, timeout=timeout)
                # Treat 4xx/5xx as failures so they go through the retry path.
                response.raise_for_status()
                return response.text

            except requests.exceptions.RequestException as e:
                if attempt == retries:
                    print(f"[KEGG ERROR] {url} -> {e}")
                else:
                    print(f"[KEGG RETRY {attempt}/{retries}] {url}")

        return ""
    finally:
        if owns_session:
            # Release the pooled connections held by the throwaway session.
            sess.close()

import re


def extract_kegg_modules(text: str) -> list[str]:
    """
    Collect the distinct KEGG module IDs (Mxxxxx) mentioned in a text.

    Args:
        text: Raw KEGG entry text to scan (e.g. a pathway entry).

    Returns:
        The module IDs found, de-duplicated and sorted in ascending order.
        An empty list when nothing matches.
    """
    # A set comprehension drops repeated mentions of the same module.
    unique_ids = {match.group(0) for match in re.finditer(r'M\d{5}', text)}
    return sorted(unique_ids)


def extract_kegg_reactions(text: str) -> list[str]:
    """
    Collect the distinct KEGG reaction IDs (Rxxxxx) mentioned in a text.

    Args:
        text: Raw KEGG text to scan (e.g. a module entry).

    Returns:
        The reaction IDs found, de-duplicated and sorted in ascending order.
        An empty list when nothing matches.
    """
    reaction_pattern = re.compile(r'R\d{5}')
    # Going through a set removes IDs that are listed more than once.
    ordered_ids = list(set(reaction_pattern.findall(text)))
    ordered_ids.sort()
    return ordered_ids


def extract_kegg_compounds(text: str) -> list[str]:
    """
    Extract KEGG compound IDs (Cxxxxx) from text.

    Only stand-alone identifiers are returned: a ``C`` followed by exactly
    five digits that is not the tail of a longer alphanumeric token. This
    keeps reaction-class IDs such as ``RC00349`` from being reported as the
    compound ``C00349``, while underscore-joined pairs such as
    ``C00074_C00631`` still yield both compounds.

    Args:
        text: Raw KEGG text containing compounds.

    Returns:
        List of unique compound IDs, sorted in ascending order.
    """
    # (?<![A-Za-z0-9]) -> not preceded by a letter/digit (rejects "RC00349").
    # (?!\d)           -> not followed by a sixth digit (rejects "C000221").
    # "_" is deliberately allowed before the ID, so \b is not usable here.
    compounds = re.findall(r'(?<![A-Za-z0-9])C\d{5}(?!\d)', text)
    return sorted(set(compounds))


In [9]:
# Download the KEGG flat-file entry for pathway hsa00010
# (human glycolysis / gluconeogenesis) and show the raw text.
response = fetch_kegg_data("get", "hsa00010")
print(response)

ENTRY       hsa00010                    Pathway
NAME        Glycolysis / Gluconeogenesis - Homo sapiens (human)
DESCRIPTION Glycolysis is the process of converting glucose into pyruvate and generating small amounts of ATP (energy) and NADH (reducing power). It is a central pathway that produces important precursor metabolites: six-carbon compounds of glucose-6P and fructose-6P and three-carbon compounds of glycerone-P, glyceraldehyde-3P, glycerate-3P, phosphoenolpyruvate, and pyruvate [MD:M00001]. Acetyl-CoA, another important precursor metabolite, is produced by oxidative decarboxylation of pyruvate [MD:M00307]. When the enzyme genes of this pathway are examined in completely sequenced genomes, the reaction steps of three-carbon compounds from glycerone-P to pyruvate form a conserved core module [MD:M00002], which is found in almost all organisms and which sometimes contains operon structures in bacterial genomes. Gluconeogenesis is a synthesis pathway of glucose from noncarbohydrate 

In [12]:
# Collect the module IDs (Mxxxxx) mentioned in the text held by `response`.
# NOTE(review): assumes `response` still holds the hsa00010 pathway entry —
# the name is rebound by later cells, so run order matters.
modules=extract_kegg_modules(response)
modules

['M00001', 'M00002', 'M00003', 'M00307']

In [13]:
# Fetch the entry for the first module in the sorted `modules` list.
# NOTE(review): this rebinds `response`; from here on it holds the module
# text rather than the pathway text.
response = fetch_kegg_data("get", modules[0])
print(response)

ENTRY       M00001            Pathway   Module
NAME        Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
DEFINITION  (K00844,K12407,K00845,K25026,K00886,K08074,K00918) (K01810,K06859,K13810,K15916) (K00850,K16370,K21071,K24182,K00918) (K01623,K01624,K11645,K16305,K16306) K01803 ((K00134,K00150) K00927,K11389) (K01834,K15633,K15634,K15635) (K01689,K27394) (K00873,K12406)
DIAGRAM     M00001  Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
CLASS       Pathway modules; Carbohydrate metabolism; Central carbohydrate metabolism
PATHWAY     map00010  Glycolysis / Gluconeogenesis
            map01200  Carbon metabolism
            map01100  Metabolic pathways
ORTHOLOGY   K00844,K12407,K00845,K25026  hexokinase/glucokinase [EC:2.7.1.1 2.7.1.2] [RN:R01786]
            K00886  polyphosphate glucokinase [EC:2.7.1.63] [RN:R02189]
            K08074,K00918  ADP-dependent glucokinase [EC:2.7.1.147] [RN:R09085]
            K01810,K06859,K13810,K15916  glucose-6-phosphate isomerase

In [19]:
# Collect the reaction IDs (Rxxxxx) mentioned in the text held by `response`.
# NOTE(review): assumes `response` currently holds a module entry — confirm
# the preceding fetch cell was the last one to assign it.
reactions = extract_kegg_reactions(response)
reactions

['R00200',
 'R00658',
 'R00756',
 'R01015',
 'R01061',
 'R01063',
 'R01068',
 'R01512',
 'R01518',
 'R01786',
 'R02145',
 'R02189',
 'R05805',
 'R07159',
 'R09085',
 'R13199',
 'R20291']

In [25]:
# Fetch the entry for the second reaction in the sorted `reactions` list.
# NOTE(review): `response` is rebound again and now holds reaction text.
response = fetch_kegg_data("get", reactions[1])
print(response)

ENTRY       R00658                      Reaction
NAME        2-phospho-D-glycerate hydro-lyase (phosphoenolpyruvate-forming)
DEFINITION  2-Phospho-D-glycerate <=> Phosphoenolpyruvate + H2O
EQUATION    C00631 <=> C00074 + C00001
RCLASS      RC00349  C00074_C00631
ENZYME      4.2.1.11
PATHWAY     rn00010  Glycolysis / Gluconeogenesis
            rn00680  Methane metabolism
            rn01100  Metabolic pathways
            rn01110  Biosynthesis of secondary metabolites
            rn01120  Microbial metabolism in diverse environments
            rn01200  Carbon metabolism
            rn01230  Biosynthesis of amino acids
MODULE      M00001  Glycolysis (Embden-Meyerhof pathway), glucose => pyruvate
            M00002  Glycolysis, core module involving three-carbon compounds
            M00003  Gluconeogenesis, oxaloacetate => fructose-6P
            M00346  Formaldehyde assimilation, serine pathway
BRITE       Enzymatic reactions [BR:br08201]
             4. Lyase reactions
              

In [26]:
def parse_reaction_entry(text: str) -> dict:
    """
    Parse a KEGG reaction entry.

    Reads the EQUATION and ENZYME fields of a flat-file entry, including
    continuation lines (lines that start with a space directly after the
    field's first line). Compound IDs (Cxxxxx) before the first "=" of the
    equation are reported as substrates, those after it as products.

    Args:
        text: Raw text of a KEGG reaction entry.

    Returns:
        {
            "substrates": [...],   # unique compound IDs, sorted
            "products": [...],     # unique compound IDs, sorted
            "enzymes": [...]       # unique ENZYME tokens, sorted
        }
        When the entry has no equation (or the equation has no "="),
        "substrates" and "products" are empty lists; "enzymes" is still
        de-duplicated and sorted.
    """

    equation = ""
    enzymes = []
    # Name of the field whose continuation lines are being collected, if any.
    active_field = None

    for line in text.splitlines():
        if line.startswith("EQUATION"):
            active_field = "EQUATION"
            # Slice off only the leading keyword instead of replacing every
            # occurrence of it in the line.
            equation = line[len("EQUATION"):].strip()
        elif line.startswith("ENZYME"):
            active_field = "ENZYME"
            enzymes.extend(line[len("ENZYME"):].split())
        elif not line.startswith(" "):
            # Any other keyword line (or a blank line) ends the current field.
            active_field = None
        elif active_field == "EQUATION":
            equation += " " + line.strip()
        elif active_field == "ENZYME":
            enzymes.extend(line.strip().split())

    # Normalise once so every return path yields the same enzyme format.
    unique_enzymes = sorted(set(enzymes))

    if "=" not in equation:
        return {"substrates": [], "products": [], "enzymes": unique_enzymes}

    # partition() splits at the first "=" only, so an unexpected second "="
    # cannot raise on unpacking. The "<" / ">" left over from "<=>" are
    # ignored by the compound regex.
    left, _, right = equation.partition("=")

    substrates = re.findall(r'C\d{5}', left)
    products = re.findall(r'C\d{5}', right)

    return {
        "substrates": sorted(set(substrates)),
        "products": sorted(set(products)),
        "enzymes": unique_enzymes,
    }


In [27]:
# Split the reaction entry held by `response` into substrates, products and enzymes.
# NOTE(review): assumes `response` holds a reaction entry — it must have been
# assigned by the reaction fetch cell most recently.
parse_reaction_entry(response)

{'substrates': ['C00631'],
 'products': ['C00001', 'C00074'],
 'enzymes': ['4.2.1.11']}