In [1]:
import pandas as pd
import os
import subprocess
import gseapy as gp
from gseapy.plot import barplot, dotplot

# Ask git for the top-level directory of the repository that contains the
# current working directory.
git_stdout = subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
root_path = git_stdout.decode("utf-8").strip()

# Work from the repository root so the relative "data/..." paths resolve,
# and make sure this notebook's output folder is present.
os.chdir(root_path)
os.makedirs("data/10.GO_Kegg", exist_ok=True)

In [56]:
def _select_deg_genes(deg_table, lfc_cutoff, p_cutoff, p_column):
    """Return unique, non-missing gene names of the rows that pass both cutoffs."""
    is_deg = (deg_table["log2FoldChange"].abs() >= lfc_cutoff) & (
        deg_table[p_column] < p_cutoff
    )
    # A missing name is not a usable query entry and a repeated one adds
    # nothing to a gene-set query, so drop both; first-seen order is kept.
    return deg_table.loc[is_deg, "GeneName"].dropna().drop_duplicates().tolist()


def _try_plot(plot_func, table, **plot_kwargs):
    """Call one gseapy plot function, printing a notice instead of raising on ValueError."""
    try:
        plot_func(table, **plot_kwargs)
    except ValueError as err:
        # gseapy's plot helpers raise ValueError when no term passes `cutoff`;
        # one empty figure should not abort the remaining plots or comparisons.
        print(f"Skipped {plot_kwargs.get('ofname')}: {err}")


def run_go_kegg_analysis(sample_name, lfc_cutoff=2, p_cutoff=0.05, p_column="pvalue"):
    """Run KEGG and GO enrichment (Enrichr via gseapy) for one DEG comparison.

    Reads ``data/07.DEG/DEG_DESeq2_{sample_name}.tsv``, keeps the genes whose
    ``abs(log2FoldChange) >= lfc_cutoff`` and whose ``p_column`` value is below
    ``p_cutoff``, queries Enrichr with them, and writes the enrichment output
    plus bar/dot plots (PDF) under ``data/10.GO_Kegg/{sample_name}``.

    Args:
        sample_name: Comparison label used in the input file name, the output
            directory, and the plot titles (e.g. ``"BvsA"``).
        lfc_cutoff: Minimum absolute log2 fold change (inclusive).
        p_cutoff: Upper bound (exclusive) on the ``p_column`` value.
        p_column: Column used for the significance filter. The default
            ``"pvalue"`` keeps the existing selection on the unadjusted
            p-value; pass e.g. ``"padj"`` to filter on an adjusted p-value,
            provided the table has that column.

    Returns:
        None. Results are written to disk.

    Raises:
        FileNotFoundError: If the DEG table for ``sample_name`` does not exist.
        KeyError: If ``log2FoldChange``, ``GeneName`` or ``p_column`` is missing.
    """
    # NOTE(review): an earlier comment here described the filter as
    # "log2FC > 2 and padj < 0.05", but the code has always filtered on
    # abs(log2FC) >= 2 and the raw `pvalue`. The defaults keep that behaviour;
    # confirm whether p_column="padj" was the intended selection.
    deg_table = pd.read_csv(
        f"data/07.DEG/DEG_DESeq2_{sample_name}.tsv", sep="\t", index_col=0
    )
    gene_list = _select_deg_genes(deg_table, lfc_cutoff, p_cutoff, p_column)
    if not gene_list:
        # Nothing to query: avoid sending an empty gene list to Enrichr.
        print(f"[{sample_name}] no genes pass the DEG filter; skipping enrichment")
        return

    gene_set = [
        "KEGG_2019_Mouse",
        "GO_Biological_Process_2023",
        "GO_Cellular_Component_2023",
        "GO_Molecular_Function_2023",
    ]
    os.makedirs(f"data/10.GO_Kegg/{sample_name}", exist_ok=True)

    # KEGG is queried on its own; the three GO libraries are queried together
    # so their combined results can be grouped by "Gene_set" in the GO plots.
    kegg_df = gp.enrichr(
        gene_list=gene_list,
        organism="Mouse",
        gene_sets=gene_set[0],
        outdir=f"data/10.GO_Kegg/{sample_name}",
        cutoff=0.05,
        no_plot=True,
        format="pdf",
    )
    go_df = gp.enrichr(
        gene_list=gene_list,
        organism="Mouse",
        gene_sets=gene_set[1:],
        outdir=f"data/10.GO_Kegg/{sample_name}",
        no_plot=True,
        format="pdf",
    )

    _try_plot(
        barplot,
        kegg_df.res2d,
        column="P-value",
        title=f"KEGG - {sample_name}",
        xticklabels_rot=45,
        cutoff=0.05,
        ofname=f"data/10.GO_Kegg/{sample_name}/KEGG_{sample_name}_barplot.pdf",
    )
    _try_plot(
        dotplot,
        kegg_df.res2d,
        column="P-value",
        title=f"KEGG - {sample_name}",
        xticklabels_rot=45,
        cutoff=0.05,
        ofname=f"data/10.GO_Kegg/{sample_name}/KEGG_{sample_name}_dotplot.pdf",
    )
    _try_plot(
        barplot,
        go_df.results,
        title=f"GO - {sample_name}",
        group="Gene_set",
        color=["#e41a1c", "#377eb8", "#4daf4a"],
        column="P-value",
        xticklabels_rot=45,
        cutoff=0.05,
        ofname=f"data/10.GO_Kegg/{sample_name}/GO_{sample_name}_barplot.pdf",
        figsize=(10, 8),
    )
    _try_plot(
        dotplot,
        go_df.results,
        title=f"GO - {sample_name}",
        x="Gene_set",
        column="P-value",
        xticklabels_rot=45,
        cutoff=0.05,
        ofname=f"data/10.GO_Kegg/{sample_name}/GO_{sample_name}_dotplot.pdf",
        figsize=(8, 8),
    )


# Run the GO/KEGG enrichment once per pairwise comparison: BvsA, CvsA, CvsB.
comparisons = ("BvsA", "CvsA", "CvsB")
for sample_name in comparisons:
    run_go_kegg_analysis(sample_name)