# Description

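Reads the fgsea enrichment results of all LVs on the differentially expressed gene sets from the CRISPR screen, keeps for each LV/pathway pair the run with the largest (least significant) p-value, adjusts those p-values for multiple testing with the Benjamini-Hochberg FDR procedure, and overwrites the input file with the adjusted p-values.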

# Modules loading

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import HTML
from statsmodels.stats.multitest import multipletests

from entity import Trait
from data.cache import read_data
import conf

# Settings

In [3]:
# Absolute path of the fgsea results file (all LVs), located in the base
# directory configured for the CRISPR analyses results.
_crispr_base_dir = conf.RESULTS["CRISPR_ANALYSES"]["BASE_DIR"]
FGSEA_INPUT_FILEPATH = (Path(_crispr_base_dir) / "fgsea-all_lvs.tsv").resolve()

# Data loading

## LVs enrichment on DEG from CRISPR screen

In [4]:
# Load the tab-separated fgsea results, then discard the "padj" column stored
# in the file (adjusted p-values are recomputed later in this notebook).
deg_enrich = pd.read_csv(FGSEA_INPUT_FILEPATH, sep="\t")
deg_enrich = deg_enrich.drop(columns=["padj"])

In [5]:
# (rows, columns) of the table right after loading
deg_enrich.shape

(19730, 9)

In [6]:
# Preview the first five rows of the loaded table
deg_enrich.head(5)

Unnamed: 0,pathway,pval,log2err,ES,NES,size,leadingEdge,lv,rep_idx
0,gene_set_decrease,0.184815,0.095921,0.578418,1.126666,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,1
1,gene_set_increase,0.235764,0.082205,0.539155,1.076511,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,1
2,gene_set_decrease,0.198801,0.09168,0.578418,1.123767,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,2
3,gene_set_increase,0.251748,0.078711,0.539155,1.078565,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,2
4,gene_set_decrease,0.162837,0.103576,0.578418,1.13718,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,3


In [7]:
# Remove every row that has a missing value in any column
deg_enrich = deg_enrich.dropna(axis="index", how="any")

In [8]:
# fgsea was run 10 times for each lv/pathway pair; for every pair, find the row
# label of the run with the maximum p-value (i.e., the least significant one)
_pval_by_lv_pathway = deg_enrich.groupby(["lv", "pathway"])["pval"]
deg_enrich_max_idx = _pval_by_lv_pathway.idxmax()

In [9]:
# Keep only the selected run for each lv/pathway pair and renumber the rows
deg_enrich = deg_enrich.loc[deg_enrich_max_idx]
deg_enrich = deg_enrich.reset_index(drop=True)

# Show the new dimensions followed by a preview of the reduced table
display(deg_enrich.shape, deg_enrich.head())

(1973, 9)

Unnamed: 0,pathway,pval,log2err,ES,NES,size,leadingEdge,lv,rep_idx
0,gene_set_decrease,0.198801,0.09168,0.578418,1.123767,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,2
1,gene_set_increase,0.251748,0.078711,0.539155,1.078565,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,2
2,gene_set_decrease,0.963037,0.008938,0.434799,0.678421,20,"VDR, ACVR1B, KEAP1, NDUFV2, PEX14, NDUFS3, WDR26",LV10,5
3,gene_set_increase,1.0,0.0,0.236619,0.378164,25,"USP39, SRP19, ZNF3, RPL18, RPS19, RAP1GDS1, CH...",LV10,2
4,gene_set_decrease,0.042957,0.216543,0.831205,1.711885,27,"NDUFB7, PTBP1, RRAGC, PPP2R2B, SQLE",LV100,6


## MultiPLIER summary

In [10]:
# NOTE(review): loading the MultiPLIER model summary is disabled; the visible
# cells of this notebook do not reference it -- confirm whether this section can be removed.
# multiplier_model_summary = read_data(conf.MULTIPLIER["MODEL_SUMMARY_FILE"])

In [11]:
# multiplier_model_summary.shape

In [12]:
# multiplier_model_summary.head()

# Adjust p-values

In [13]:
# Benjamini-Hochberg FDR correction over the p-values of all lv/pathway pairs.
# The result is a tuple: (reject flags, corrected p-values, and two corrected
# alpha values).
adj_pvals = multipletests(
    pvals=deg_enrich["pval"],
    alpha=0.05,
    method="fdr_bh",
)

In [14]:
# Inspect the raw tuple returned by multipletests
adj_pvals

(array([False, False, False, ..., False, False, False]),
 array([0.75562051, 0.77994705, 1.        , ..., 0.94094348, 0.71867843,
        0.91289763]),
 2.5997277061207136e-05,
 2.5342118601115056e-05)

In [15]:
# Number of tests flagged as rejected (first element of the multipletests result)
adj_pvals[0].sum()

4

In [16]:
# Attach the corrected p-values (second element of the multipletests result)
# as a new "padj" column
_corrected_pvals = adj_pvals[1]
deg_enrich = deg_enrich.assign(padj=_corrected_pvals)

In [17]:
# Preview the first five rows, now including the "padj" column
deg_enrich.head(5)

Unnamed: 0,pathway,pval,log2err,ES,NES,size,leadingEdge,lv,rep_idx,padj
0,gene_set_decrease,0.198801,0.09168,0.578418,1.123767,35,"PTBP1, KEAP1, PEX14, DLST, PCYT2, MAD2L2, GLRX...",LV1,2,0.755621
1,gene_set_increase,0.251748,0.078711,0.539155,1.078565,63,"CHERP, RANGAP1, HNRNPL, RPS2, E4F1, TAF1C, GAT...",LV1,2,0.779947
2,gene_set_decrease,0.963037,0.008938,0.434799,0.678421,20,"VDR, ACVR1B, KEAP1, NDUFV2, PEX14, NDUFS3, WDR26",LV10,5,1.0
3,gene_set_increase,1.0,0.0,0.236619,0.378164,25,"USP39, SRP19, ZNF3, RPL18, RPS19, RAP1GDS1, CH...",LV10,2,1.0
4,gene_set_decrease,0.042957,0.216543,0.831205,1.711885,27,"NDUFB7, PTBP1, RRAGC, PPP2R2B, SQLE",LV100,6,0.718678


# Analysis

In [18]:
# Rows whose adjusted p-value is below 0.05, ordered from most to least significant
_is_significant = deg_enrich["padj"] < 0.05
df = deg_enrich.loc[_is_significant].sort_values(by="padj")

In [19]:
# (rows, columns) of the subset with padj < 0.05
df.shape

(4, 10)

In [20]:
# Show the significant rows ordered by increasing unadjusted p-value
df.sort_values(by="pval", ascending=True)

Unnamed: 0,pathway,pval,log2err,ES,NES,size,leadingEdge,lv,rep_idx,padj
1350,gene_set_increase,1.738856e-07,0.690132,0.905734,1.565818,49,"RPS14, RPL31, RPS19, RPS11, RPS6, RPL37, RPSA,...",LV707,3,0.000257
1283,gene_set_decrease,2.605291e-07,0.674963,0.858542,1.636153,43,"NDUFA4, COX6A1, ATP5O, NDUFB10, COX5A, NDUFS3,...",LV678,3,0.000257
1790,gene_set_increase,4.293645e-05,0.557332,0.732205,1.486815,60,"RPS6, RPL6, RPLP0, RPL19, RPL31, RPL7, RPS13, ...",LV905,6,0.028238
1812,gene_set_increase,6.373604e-05,0.538434,0.627667,1.444149,83,"SAFB, LUC7L3, HSP90B1, CHD4, SNRPD3, ISY1, DKC...",LV915,4,0.031438


# Save

In [21]:
# Overwrite the original file with the adjusted p-values.
#
# index=False keeps the written file's schema equal to what was read (plus the
# recomputed "padj" column). Writing the default integer index would add an
# unnamed first column, which is read back as a spurious "Unnamed: 0" data
# column the next time this file is loaded (and one more on every re-run).
#
# NOTE(review): this replaces the notebook's own input, so a later re-run starts
# from the already-reduced table (one row per lv/pathway pair), not the raw runs.
deg_enrich.to_csv(
    FGSEA_INPUT_FILEPATH,
    sep="\t",
    index=False,
)