In [3]:
import os
import glob
import requests
from urllib.parse import quote
from typing import List, Tuple, Iterable, Dict, Optional

Pathway libraries (gene sets) were downloaded from https://maayanlab.cloud/Enrichr/#libraries as GMT-format text files and are converted below into a single .tsv file with the header `MEMBER_OF	gene	pathway	source`. The `pathway` column contains only the term name (without the library prefix), while `source` holds the base name of the input file (e.g., Reactome_Pathways_2024).

In [4]:
def iter_memberof_triples_from_gmt_text(gmt_text: str, library_name: str, namespace: bool = True) -> Iterable[Tuple[str, str, str]]:
    """Lazily yield ``("MEMBER_OF", gene, pathway)`` triples from GMT-style text.

    Each usable line is tab-separated: the first field is the term, the second
    field is ignored, and every remaining non-blank field is a gene.

    Args:
        gmt_text: Full text of a gene-set library, one gene set per line.
        library_name: Prefix used for the pathway label when ``namespace`` is true.
        namespace: If true, the pathway label is ``"<library_name>:<term>"``;
            otherwise it is the bare term.

    Yields:
        One ``("MEMBER_OF", gene, pathway)`` tuple per gene of each gene set,
        in file order. Empty lines, lines starting with ``#``, lines with
        fewer than three fields and gene sets without genes produce nothing.
    """
    for record in gmt_text.splitlines():
        # Empty lines and '#' comment lines carry no gene set.
        if record == "" or record.startswith("#"):
            continue
        fields = record.split("\t")
        if len(fields) >= 3:
            # Fields after the first two are genes; drop blank padding fields.
            members = list(filter(None, (field.strip() for field in fields[2:])))
            if members:
                label = fields[0].strip()
                if namespace:
                    label = f"{library_name}:{label}"
                for member in members:
                    yield ("MEMBER_OF", member, label)


def parse_gmt_file_to_triples(file_path: str, library_name: Optional[str] = None, namespace: bool = True) -> Iterable[Tuple[str, str, str]]:
    """Read a GMT-style file and return an iterator over its MEMBER_OF triples.

    Args:
        file_path: Path of the gene-set library file to read.
        library_name: Name used to namespace pathway labels. When ``None`` it
            defaults to the file name without its directory and extension.
        namespace: Forwarded to ``iter_memberof_triples_from_gmt_text``; when
            true, pathway labels are prefixed with ``library_name``.

    Returns:
        The iterator produced by ``iter_memberof_triples_from_gmt_text`` for
        the file's content. The file is read completely and closed before the
        iterator is returned.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if library_name is None:
        base = os.path.basename(file_path)
        library_name = os.path.splitext(base)[0]
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark. With plain "utf-8" the BOM stays in the text, is not removed by
    # str.strip(), and ends up glued to the first term (or hides a leading
    # '#' comment line). Undecodable bytes are still ignored, as before.
    with open(file_path, "r", encoding="utf-8-sig", errors="ignore") as f:
        text = f.read()
    return iter_memberof_triples_from_gmt_text(text, library_name, namespace=namespace)


def build_memberof_triples_from_folder(
    input_dir: str,
    output_tsv_path: str,
    patterns: Optional[List[str]] = None,
    namespace_pathways: bool = False,
) -> Dict[str, int]:
    """Convert every gene-set library in a folder into one MEMBER_OF TSV file.

    The output starts with the header ``MEMBER_OF<TAB>gene<TAB>pathway<TAB>source``
    followed by one row per unique (relation, gene, pathway, source) tuple,
    where ``source`` is the input file name without directory and extension.

    Args:
        input_dir: Folder that is searched (non-recursively) for library files.
        output_tsv_path: Path of the TSV file to create or overwrite. Its
            parent directory is created if missing.
        patterns: Glob patterns, relative to ``input_dir``, selecting the
            input files. Defaults to ``["*.gmt", "*.txt"]``.
        namespace_pathways: If true, pathway labels are prefixed with the
            library (file) name; otherwise the bare term is written.

    Returns:
        A dict with ``files_processed`` (files that contributed at least one
        new row), ``rows`` (rows written, header excluded) and ``unique_rows``
        (number of distinct rows; equal to ``rows`` because duplicates are
        never written).

    Raises:
        OSError: If the output file cannot be written or an input file cannot
            be read.
    """
    if patterns is None:
        patterns = ["*.gmt", "*.txt"]
    os.makedirs(os.path.dirname(output_tsv_path) or ".", exist_ok=True)

    # Resolve the output location once so it can be recognised among the inputs.
    output_real = os.path.realpath(output_tsv_path)

    files: List[str] = []
    queued = set()
    for pat in patterns:
        for fp in sorted(glob.glob(os.path.join(input_dir, pat))):
            # A path matched by several patterns only needs to be parsed once.
            if fp in queued:
                continue
            # Directories can match a pattern but cannot be opened as files.
            if os.path.isdir(fp):
                continue
            # Never ingest the output file: it is truncated below and would be
            # read back while being written if it matches one of the patterns.
            if os.path.realpath(fp) == output_real:
                continue
            queued.add(fp)
            files.append(fp)

    seen = set()
    triples_written = 0
    files_processed = 0
    with open(output_tsv_path, "w", encoding="utf-8") as out:
        out.write("MEMBER_OF\tgene\tpathway\tsource\n")
        for fp in files:
            wrote_any = False
            source_name = os.path.splitext(os.path.basename(fp))[0]
            for relation, gene, pathway in parse_gmt_file_to_triples(
                fp, library_name=None, namespace=namespace_pathways
            ):
                row = (relation, gene, pathway, source_name)
                if row in seen:
                    continue
                seen.add(row)
                out.write("\t".join(row) + "\n")
                triples_written += 1
                wrote_any = True
            if wrote_any:
                files_processed += 1
    return {"files_processed": files_processed, "rows": triples_written, "unique_rows": len(seen)}


In [5]:
# Folder holding the downloaded library files, and the TSV that receives the triples.
INPUT_DIR = "/Users/polina/Variant_Drug_KG/scr/pathways/pathway_libs"
OUTPUT_PATH = "/Users/polina/Variant_Drug_KG/scr/pathways/output/pathways_triples.tsv"

# Pathway labels are written without a library prefix; the library name is
# recorded in the separate `source` column instead.
stats = build_memberof_triples_from_folder(
    INPUT_DIR,
    OUTPUT_PATH,
    patterns=["*.gmt", "*.txt"],
    namespace_pathways=False,
)
print(dict(wrote=stats, output=OUTPUT_PATH))


{'wrote': {'files_processed': 5, 'rows': 403582, 'unique_rows': 403582}, 'output': '/Users/polina/Variant_Drug_KG/scr/pathways/output/pathways_triples.tsv'}
