# Description

This notebooks writes a markdown table (in the manuscript) with the LVs that are enriched with the lipids-altering gene sets.

# Modules loading

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import re
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML
from statsmodels.stats.multitest import multipletests

from entity import Trait
from data.cache import read_data
import conf

# Settings

In [3]:
FGSEA_INPUT_FILEPATH = Path(
    conf.RESULTS["CRISPR_ANALYSES"]["BASE_DIR"], "fgsea-hi_conf-all_lvs.tsv"
).resolve()

display(FGSEA_INPUT_FILEPATH)

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier/bases_data/base_orig/results/crispr_analyses/fgsea-hi_conf-all_lvs.tsv')

In [4]:
assert "CONTENT_DIR" in conf.MANUSCRIPT

OUTPUT_FILE_PATH = conf.MANUSCRIPT["CONTENT_DIR"] / "50.00.supplementary_material.md"
display(OUTPUT_FILE_PATH)
assert OUTPUT_FILE_PATH.exists()

PosixPath('/home/miltondp/projects/labs/greenelab/phenoplier_manuscript/content/50.00.supplementary_material.md')

In [5]:
PVAL_THRESHOLD = 0.01

# Data loading

## MultiPLIER summary

In [6]:
multiplier_model_summary = pd.read_pickle(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [7]:
multiplier_model_summary.shape

(2157, 5)

In [8]:
multiplier_model_summary.head()

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
1,KEGG_LYSINE_DEGRADATION,1,0.388059,0.866078,0.956005
2,REACTOME_MRNA_SPLICING,1,0.733057,4.8e-05,0.000582
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628,0.011366
4,KEGG_DNA_REPLICATION,1,0.549473,0.312155,0.539951
5,PID_MYC_ACTIVPATHWAY,1,0.639303,0.021702,0.083739


In [9]:
well_aligned_lvs = multiplier_model_summary[
    (
        multiplier_model_summary["FDR"] < 0.05
    )  # & (multiplier_model_summary["AUC"] >= 0.75)
]

display(well_aligned_lvs.shape)
display(well_aligned_lvs.head())

(463, 5)

Unnamed: 0,pathway,LV index,AUC,p-value,FDR
2,REACTOME_MRNA_SPLICING,1,0.733057,4.772691e-05,0.0005816211
3,MIPS_NOP56P_ASSOCIATED_PRE_RRNA_COMPLEX,1,0.680555,0.001628217,0.0113659
8,REACTOME_MITOTIC_G1_G1_S_PHASES,1,0.68617,0.0002517619,0.002392292
9,IRIS_Monocyte-Day0,2,0.890036,4.315812e-25,1.329887e-22
10,DMAP_MONO2,2,0.904676,1.31397e-16,1.574574e-14


In [10]:
well_aligned_lv_codes = set([f"LV{lvi}" for lvi in well_aligned_lvs["LV index"]])

In [11]:
len(well_aligned_lv_codes)

199

In [12]:
list(well_aligned_lv_codes)[:5]

['LV662', 'LV92', 'LV176', 'LV675', 'LV913']

## LVs enrichment on DEG from CRISPR screen

In [13]:
deg_enrich = pd.read_csv(
    FGSEA_INPUT_FILEPATH,
    sep="\t",
)

In [14]:
deg_enrich.shape

(1974, 11)

In [15]:
deg_enrich = deg_enrich.assign(
    lv_aligned=deg_enrich["lv"].apply(lambda x: x in well_aligned_lv_codes)
)

In [16]:
deg_enrich.head()

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr,lv_aligned
0,gene_set_decrease,0.116883,0.233766,0.12564,0.909509,1.237006,5,"PCYT2, UBE2J2, FBXW7",LV1,8,0.967285,True
1,gene_set_increase,0.285714,0.285714,0.07218,0.840253,1.258282,3,"ACACA, MBTPS1",LV1,2,0.967285,True
2,gene_set_decrease,0.741259,0.741259,0.026956,0.751964,1.005845,5,"TCF7L2, UBE2J2, PTEN",LV10,10,0.967285,True
3,gene_set_increase,0.427572,0.72028,0.052805,0.81414,1.320263,3,"MBTPS1, DGAT2",LV10,4,0.967285,True
4,gene_set_decrease,0.908092,0.908092,0.014514,0.5404,0.778719,5,"PTEN, TCF7L2",LV100,1,0.97425,False


# Get significantly enriched modules

In [17]:
df = deg_enrich[(deg_enrich["pval"] < PVAL_THRESHOLD)].sort_values(
    "pval", ascending=True
)

In [18]:
# df = deg_enrich[(deg_enrich["pval"] < PVAL_THRESHOLD) & (deg_enrich["lv_aligned"])].sort_values(
#     "pval", ascending=True
# )

In [19]:
# df = deg_enrich[(deg_enrich["pval"] < PVAL_THRESHOLD)].sort_values(
#     "pval", ascending=True
# )

In [20]:
df.shape

(15, 12)

In [21]:
df.sort_values("fdr")

Unnamed: 0,pathway,pval,padj,log2err,ES,NES,size,leadingEdge,lv,rep_idx,fdr,lv_aligned
936,gene_set_decrease,0.000554,0.001108,0.477271,0.999703,1.388986,5,"FBXW7, TCF7L2",LV520,6,0.967285,False
1560,gene_set_decrease,0.002201,0.004403,0.431708,0.997776,1.3596,5,"UBE2J2, TCF7L2",LV801,6,0.967285,False
918,gene_set_decrease,0.00246,0.004921,0.431708,0.997331,1.382906,5,"FBXW7, TCF7L2",LV512,2,0.967285,False
327,gene_set_increase,0.003533,0.007067,0.431708,0.998221,1.582398,3,"DGAT2, ACACA",LV246,6,0.967285,True
1140,gene_set_decrease,0.00357,0.007141,0.431708,0.999299,1.400503,5,"PTEN, FBXW7",LV612,3,0.967285,True
690,gene_set_decrease,0.004053,0.008105,0.407018,0.996738,1.348694,5,"PCYT2, TCF7L2",LV41,4,0.967285,False
1341,gene_set_increase,0.004573,0.009147,0.407018,0.998666,1.49578,3,"ACACA, DGAT2",LV702,9,0.967285,False
1129,gene_set_increase,0.005837,0.011675,0.407018,0.996739,1.51066,3,"ACACA, DGAT2",LV607,10,0.967285,True
1757,gene_set_increase,0.00673,0.013459,0.407018,0.99748,1.545604,3,"ACACA, DGAT2",LV890,10,0.967285,False
1640,gene_set_decrease,0.006953,0.013906,0.407018,0.99407,1.374274,5,"UBE2J2, TCF7L2",LV838,6,0.967285,True


# Prepare table

In [22]:
df = df.assign(pval=df["pval"].apply(lambda x: f"{x:.4f}"))

In [23]:
df = df.assign(fdr=df["fdr"].apply(lambda x: f"{x:.2e}"))

In [24]:
df = df.rename(
    columns={
        "pathway": "Lipids gene-set",
        "pval": "p-value",
        "leadingEdge": "Leading edge",
        #         "fdr": "FDR",
        "lv": "Gene module",
    }
)

In [25]:
df = df.replace(
    {
        "Lipids gene-set": {
            "gene_set_decrease": "decrease",
            "gene_set_increase": "increase",
        },
    }
)

In [26]:
df = df.replace(
    {
        "Leading edge": {
            "([A-Z\d]+)": "*\\1*",
        }
    },
    regex=True,
)

In [27]:
df["Gene module"] = df.apply(
    lambda x: f"**{x['Gene module']}**" if x["lv_aligned"] else x["Gene module"], axis=1
)

In [28]:
df = df[["Gene module", "Lipids gene-set", "Leading edge", "p-value"]]

In [29]:
df

Unnamed: 0,Gene module,Lipids gene-set,Leading edge,p-value
936,LV520,decrease,"*FBXW7*, *TCF7L2*",0.0006
1560,LV801,decrease,"*UBE2J2*, *TCF7L2*",0.0022
918,LV512,decrease,"*FBXW7*, *TCF7L2*",0.0025
327,**LV246**,increase,"*DGAT2*, *ACACA*",0.0035
1140,**LV612**,decrease,"*PTEN*, *FBXW7*",0.0036
690,LV41,decrease,"*PCYT2*, *TCF7L2*",0.0041
1341,LV702,increase,"*ACACA*, *DGAT2*",0.0046
1129,**LV607**,increase,"*ACACA*, *DGAT2*",0.0058
1757,LV890,increase,"*ACACA*, *DGAT2*",0.0067
1640,**LV838**,decrease,"*UBE2J2*, *TCF7L2*",0.007


# Save lipids-increasing

In [30]:
gene_set_name = "increase"

In [31]:
# result_set is either phenomexcan or emerge
LV_FILE_MARK_TEMPLATE = (
    "<!-- lipids_gene_sets:modules_enriched_{gene_set}:{position} -->"
)

In [32]:
TABLE_CAPTION = f"Table: Gene modules (LVs) nominally enriched for the lipids-increasing gene-set from the CRISPR-screen (*P* < {PVAL_THRESHOLD}). LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {{table_id}}"

In [33]:
TABLE_CAPTION_ID = "#tbl:sup:lipids_crispr:modules_enriched_{gene_set}"

In [34]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    gene_set=gene_set_name, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(gene_set=gene_set_name, position="end")
display(lv_file_mark_end)

# table caption
TABLE_CAPTION_ID = TABLE_CAPTION_ID.format(gene_set=gene_set_name)
display(TABLE_CAPTION_ID)

'<!-- lipids_gene_sets:modules_enriched_increase:start -->'

'<!-- lipids_gene_sets:modules_enriched_increase:end -->'

'#tbl:sup:lipids_crispr:modules_enriched_increase'

In [35]:
new_content = df[df["Lipids gene-set"] == gene_set_name].to_markdown(
    index=False, disable_numparse=True
)

In [36]:
# add table caption
table_caption = TABLE_CAPTION.format(
    table_id="{" + TABLE_CAPTION_ID + "}",
)
display(table_caption)

'Table: Gene modules (LVs) nominally enriched for the lipids-increasing gene-set from the CRISPR-screen (*P* < 0.01). LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {#tbl:sup:lipids_crispr:modules_enriched_increase}'

In [37]:
new_content += "\n\n" + table_caption

In [38]:
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [39]:
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [40]:
new_file_content = re.sub(
    lv_file_mark_start + ".*?" + lv_file_mark_end,
    full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [41]:
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))

# Save lipids-decreasing

In [42]:
gene_set_name = "decrease"

In [43]:
# result_set is either phenomexcan or emerge
LV_FILE_MARK_TEMPLATE = (
    "<!-- lipids_gene_sets:modules_enriched_{gene_set}:{position} -->"
)

In [44]:
TABLE_CAPTION = f"Table: Gene modules (LVs) nominally enriched for the lipids-decreasing gene-set from the CRISPR-screen (*P* < {PVAL_THRESHOLD}). LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {{table_id}}"

In [45]:
TABLE_CAPTION_ID = "#tbl:sup:lipids_crispr:modules_enriched_{gene_set}"

In [46]:
# start
lv_file_mark_start = LV_FILE_MARK_TEMPLATE.format(
    gene_set=gene_set_name, position="start"
)
display(lv_file_mark_start)

# end
lv_file_mark_end = LV_FILE_MARK_TEMPLATE.format(gene_set=gene_set_name, position="end")
display(lv_file_mark_end)

# table caption
TABLE_CAPTION_ID = TABLE_CAPTION_ID.format(gene_set=gene_set_name)
display(TABLE_CAPTION_ID)

'<!-- lipids_gene_sets:modules_enriched_decrease:start -->'

'<!-- lipids_gene_sets:modules_enriched_decrease:end -->'

'#tbl:sup:lipids_crispr:modules_enriched_decrease'

In [47]:
new_content = df[df["Lipids gene-set"] == gene_set_name].to_markdown(
    index=False, disable_numparse=True
)

In [48]:
# add table caption
table_caption = TABLE_CAPTION.format(
    table_id="{" + TABLE_CAPTION_ID + "}",
)
display(table_caption)

'Table: Gene modules (LVs) nominally enriched for the lipids-decreasing gene-set from the CRISPR-screen (*P* < 0.01). LVs significantly aligned with pathways (FDR < 0.05) from the MultiPLIER models are shown in boldface. {#tbl:sup:lipids_crispr:modules_enriched_decrease}'

In [49]:
new_content += "\n\n" + table_caption

In [50]:
full_new_content = (
    lv_file_mark_start + "\n" + new_content.strip() + "\n" + lv_file_mark_end
)

In [51]:
with open(OUTPUT_FILE_PATH, "r", encoding="utf8") as f:
    file_content = f.read()

In [52]:
new_file_content = re.sub(
    lv_file_mark_start + ".*?" + lv_file_mark_end,
    full_new_content,
    file_content,
    flags=re.DOTALL,
)

In [53]:
with open(OUTPUT_FILE_PATH, "w", encoding="utf8") as f:
    f.write(new_file_content)  # .replace("\beta", r"\beta"))