In [None]:
"""
Author: Wen-Jou Chang
Baylor College of Medicine

This script performs gene-set enrichment analysis using Enrichr API.
"""

In [2]:

"""
Initialization
"""
# imports
import os
import pandas as pd
from collections import Counter
import time
import numpy as np
import math
import requests, json
# constant, data

# Disease categories analysed by this notebook, in the order used for the
# heatmap columns. Each name is also the stem of its probe CSV, except
# "metabolic", which read_in_probes maps to "metabolic_diseases".
category_names = [
    "cancer",
    "cardiovascular",
    "digestive",
    "endocrine",
    "hematological",
    "immune",
    "metabolic",
    "neurological",
    "obesity",
    "respiratory",
    "urogenital",
]


In [None]:
"""
Read all probes. The list of probes can be obtained by running the code in probe_collection/pubmed_search_pipeline.ipynb.
"""
# One {probeId: occurrence count} dict per category, appended in call order.
# (Despite the name, this is a list of dicts.)
cat_probes_dict = []


def read_in_probes(input_cat):
    """Count probe occurrences for one category and store the result.

    Reads ``data/probe/<input_cat>_all_probes.csv`` (the "metabolic"
    category uses the ``metabolic_diseases`` file stem instead), counts how
    many times each value of the ``probeId`` column appears, and appends the
    resulting dict to the module-level ``cat_probes_dict`` list.

    Parameters
    ----------
    input_cat : str
        Category name used to build the CSV path.

    Returns
    -------
    None
    """
    csv_path = (
        "data/probe/metabolic_diseases_all_probes.csv"
        if input_cat == "metabolic"
        else f"data/probe/{input_cat}_all_probes.csv"
    )
    probe_ids = pd.read_csv(csv_path)["probeId"].to_list()
    # Counter preserves first-seen order, which later fixes the row order of
    # the table built from these dicts.
    cat_probes_dict.append(dict(Counter(probe_ids)))


# Load the probe counts for every category in order, printing the category
# name first and the wall-clock time taken afterwards.
for cat in category_names:
    started_at = time.time()
    print(cat)
    read_in_probes(cat)
    elapsed = time.time() - started_at
    print(f'Time for {cat} code to run: ', elapsed)

In [4]:
"""
Get genes associated with probes. The official Illumina manifests can be downloaded from the Illumina website.
"""

# Load the cleaned EPIC and HM450 manifests, stack them into one lookup
# table and drop rows that are exact duplicates across the two arrays.
e = pd.read_csv("data/humanData/Illumina/EPIC_manifest_cleaned.csv")
hm = pd.read_csv("data/humanData/Illumina/HM450_manifest_cleaned.csv")
ref = pd.concat([e, hm]).drop_duplicates()


def get_gene_illumina(target_id):
    """Collect the gene names annotated to a collection of probes.

    Looks up every row of the module-level ``ref`` table whose ``Probe_ID``
    is in ``target_id`` and splits its ``UCSC_RefGene_Name`` value on ``;``.
    Missing (NaN) annotations are skipped and empty names are discarded.

    Parameters
    ----------
    target_id : collection of str
        Probe identifiers to look up.

    Returns
    -------
    tuple
        ``(genes, n_probes)`` where ``genes`` is the set of distinct gene
        names and ``n_probes`` is ``len(target_id)``.
    """
    annotations = ref.loc[ref["Probe_ID"].isin(target_id), "UCSC_RefGene_Name"]
    genes = set()
    for annotation in annotations:
        # Probes without a gene annotation come through as float NaN.
        if isinstance(annotation, float) and math.isnan(annotation):
            continue
        genes.update(annotation.split(";"))
    # Consecutive or trailing ';' separators produce empty names.
    genes.discard('')
    return genes, len(target_id)

In [6]:
"""
Calls Enrichr API to perform enrichment analysis.
"""

# Enrichr libraries queried by default; extend this list to analyse more
# libraries (e.g. "GO_Cellular_Component_2023", "GO_Molecular_Function_2023").
target_libraries = [
    "Jensen_DISEASES",
    "DisGeNET",
    "GO_Biological_Process_2023",
    "GWAS_Catalog_2023",
    "Human_Gene_Atlas",
    "Human_Phenotype_Ontology",
]


def run_enrichr(input_genes, pval=0.05, input_libraries=target_libraries, output_fp=None,
                description='Gene list for obesity', timeout=120):
    """Run Enrichr enrichment for a gene list and collect significant terms.

    The gene list is uploaded once (``addList``), then each library in
    ``input_libraries`` is queried (``enrich``). Result rows whose adjusted
    p-value (element 6 of each row) is below ``pval`` are kept, numeric
    columns are rounded to 3 significant figures, and rows are sorted by
    adjusted p-value within each library.

    Parameters
    ----------
    input_genes : collection of str
        Gene symbols to analyse. If empty, nothing is sent and ``None`` is
        returned.
    pval : float, default 0.05
        Exclusive upper bound on the adjusted p-value.
    input_libraries : list of str
        Enrichr library names to query.
    output_fp : str or None
        If given, the combined table is written to this CSV path.
    description : str
        Description attached to the uploaded gene list.
    timeout : float or None, default 120
        Per-request timeout in seconds passed to ``requests``; ``None``
        waits indefinitely.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``Name, P-value, Adjusted p-value, Odds Ratio,
        Combined score, Genes, Database``; empty if no term passes the
        threshold or no library was requested. ``None`` if ``input_genes``
        is empty.

    Raises
    ------
    Exception
        If Enrichr answers either request with a non-OK status.
    """
    if len(input_genes) == 0:
        return

    # Step 1: upload the gene list and obtain its server-side id.
    response = requests.post(
        'https://maayanlab.cloud/Enrichr/addList',
        files={
            'list': (None, '\n'.join(input_genes)),
            'description': (None, description),
        },
        timeout=timeout,
    )
    if not response.ok:
        raise Exception('Error analyzing gene list')
    user_list_id = response.json()["userListId"]

    raw_columns = ["Index", "Name", "P-value", "Odds Ratio", "Combined score", "Genes",
                   "Adjusted p-value", "unknown1", "unknown2"]
    kept_columns = ["Name", "P-value", "Adjusted p-value", "Odds Ratio", "Combined score", "Genes"]
    numeric_columns = ["Adjusted p-value", "Odds Ratio", "Combined score", "P-value"]

    # Step 2: fetch and tidy the results of each library.
    per_library = []
    for lib in input_libraries:
        response = requests.get(
            'https://maayanlab.cloud/Enrichr/enrich',
            # Let requests build (and URL-encode) the query string.
            params={'userListId': user_list_id, 'backgroundType': lib},
            timeout=timeout,
        )
        if not response.ok:
            raise Exception('Error fetching enrichment results')
        significant = [r for r in response.json()[lib] if r[6] < pval]
        if not significant:
            # Nothing to report for this library; skipping also keeps empty
            # frames out of the final concat.
            continue
        # .copy() so the column assignments below act on an independent frame
        # rather than a slice (avoids SettingWithCopyWarning).
        temp = pd.DataFrame(significant, columns=raw_columns)[kept_columns].copy()
        temp[numeric_columns] = temp[numeric_columns].map(
            lambda x: round_to_significant_figures(x, 3)
        )
        temp["Genes"] = temp["Genes"].apply(lambda x: "; ".join(x))
        temp["Database"] = lib
        per_library.append(temp.sort_values(by="Adjusted p-value", ascending=True))

    if per_library:
        result_df = pd.concat(per_library, ignore_index=True)
    else:
        # pd.concat raises on an empty list; return an empty, well-formed table.
        result_df = pd.DataFrame(columns=kept_columns + ["Database"])

    if output_fp:
        result_df.to_csv(output_fp, index=False)
    return result_df

# import numpy as np
def round_to_significant_figures(x, sig):
    """Round ``x`` to ``sig`` significant figures.

    Parameters
    ----------
    x : int or float
        Value to round.
    sig : int
        Number of significant figures to keep.

    Returns
    -------
    int or float
        ``0`` when ``x`` is zero; ``x`` unchanged when it is NaN or
        infinite (no order of magnitude exists to round against);
        otherwise ``x`` rounded to ``sig`` significant figures.
    """
    if x == 0:
        return 0
    if not np.isfinite(x):
        # log10 of NaN/inf cannot be converted to int; pass the value through.
        return x
    # Position of the leading digit relative to the decimal point.
    magnitude = int(np.floor(np.log10(abs(x))))
    return round(x, sig - magnitude - 1)


In [None]:
"""
Generate a dataframe where rows are probes and columns are categories, cell value is the number of papers that report the probe for the category.
"""

# Minimum number of papers a probe must be reported in for a category; probes
# below it are dropped for that category and end up as 0 in the table.
paper_threshold = 2

# One filtered {probe: paper count} dict per category, in category order.
counts_above_threshold = [
    {probe: n_papers for probe, n_papers in counts.items() if n_papers >= paper_threshold}
    for counts in cat_probes_dict
]

# Rows of the intermediate frame are categories; transpose so that probes are
# rows and categories are columns, with 0 where a probe was not retained.
heatmap_df = pd.DataFrame(counts_above_threshold).fillna(0).transpose()
heatmap_df.columns = category_names

heatmap_df.to_csv(f"data/humanData/probe_based_heatmap_{paper_threshold}papers.csv") # this is in supplementary tables

In [None]:
"""
Runs GSEA analysis on each category
"""

paper_threshold = 2
df = pd.read_csv(f"data/humanData/probe_based_heatmap_{paper_threshold}papers.csv", index_col=0)
for cat in category_names:
    target_category = cat
    # Probes with a non-zero count for this category and zero for all others.
    reported_here = df[target_category] != 0
    absent_elsewhere = (df.drop(columns=[target_category]) == 0).all(axis=1)
    unique_probes = set(df.loc[reported_here & absent_elsewhere].index)
    # The second return value (number of probes queried) is not used here.
    g, l = get_gene_illumina(unique_probes)
    print(f"{len(g):,} Genes Annotated to {len(unique_probes):,} Probes Reported in ≥ {paper_threshold} {target_category.capitalize()} Papers")
    # One CSV per category, written to the working directory.
    run_enrichr(g, output_fp=f"{target_category}_{paper_threshold}papers_unique.csv") # DisGeNet results are in supplementary tables


In [None]:
# Optional: print the genes of one category, one per line, e.g. to paste
# into the enrichr-kg web interface.
# NOTE(review): this reads the heatmap CSV for paper_threshold = 1, while the
# heatmap cell in this notebook writes the file for paper_threshold = 2;
# generate the threshold-1 file first or this cell cannot find its input.
paper_threshold = 1
target_category = "cancer"

df = pd.read_csv(f"data/humanData/probe_based_heatmap_{paper_threshold}papers.csv", index_col=0)
# Probes with a non-zero count for the target category and zero elsewhere.
reported_here = df[target_category] != 0
absent_elsewhere = (df.drop(columns=[target_category]) == 0).all(axis=1)
unique_probes = set(df.loc[reported_here & absent_elsewhere].index)

g, l = get_gene_illumina(unique_probes)
for gene in g:
    print(gene)