# Negative control: comparing identically generated, unperturbed cohorts

## imports + setup

In [1]:
import importlib
import logging
import warnings

import cloudpathlib
import dask.dataframe as dd
import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io
import scipy.stats
from statsmodels.stats.multitest import multipletests

In [2]:
warnings.filterwarnings("ignore", message="divide by zero")
warnings.filterwarnings("ignore", message="invalid value encountered in double_scalars")

In [3]:
# Notebook-level logger; emits everything from DEBUG upwards to whatever handlers exist.
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

# Disabled alternative setup, kept for reference. Re-enabling the UDP handler also
# requires `import logging.handlers`, which `import logging` alone does not provide.
# helpers.logging.configure_logging()
# udp_handler = logging.handlers.DatagramHandler("localhost", 12000)
# udp_handler.setLevel("DEBUG")
# logging.getLogger().addHandler(udp_handler)

# Attach exactly one INFO-level stream handler to the root logger. The handler is
# tagged with a name and looked up first, so re-running this cell reuses it instead of
# stacking another handler (which would print every record once per re-run).
for stream_handler in logging.getLogger().handlers:
    if stream_handler.name == "notebook_stream_handler":
        break
else:
    stream_handler = logging.StreamHandler()
    stream_handler.name = "notebook_stream_handler"
    logging.getLogger().addHandler(stream_handler)
stream_handler.setFormatter(helpers.logging.formatter)
stream_handler.setLevel("INFO")

In [4]:
# Let the project's `helpers` package emit DEBUG records. Note that the stream handler
# attached to the root logger during setup is set to INFO, so these DEBUG records are
# only shown by handlers with a lower threshold.
logging.getLogger("helpers").setLevel("DEBUG")

In [5]:
# Root of the experiment run analysed in this notebook. The listing below shows it
# contains `data/` (the bulk RNA-seq input) and `outdir/` (CIBERSORTx output files).
path_root = cloudpathlib.CloudPath(
    "gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57"
)

In [6]:
# Recursively list the run directory (human-readable sizes) as a sanity check that the
# expected input and output files exist. `$path_root` is expanded by IPython.
!gsutil ls -hlR $path_root

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/:

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/data/:
  24.6 MiB  2022-09-14T19:42:35Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/data/bulkrnaseq.txt

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/:
 30.78 MiB  2022-09-14T19:42:30Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_B_Window36.txt
 30.96 MiB  2022-09-14T19:42:34Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_CAF_Window36.txt
 30.75 MiB  2022-09-14T19:42:29Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_Endothelial_Window36.txt
 27.44 KiB  2022-09-14T19:42:30Z  gs://liulab/evaluating_cibersortx/ident

In [7]:
# janky plotly hack: render a throwaway figure before any real plot.
# NOTE(review): presumably this forces plotly's notebook renderer (and MathJax, which
# the LaTeX axis titles used later need) to load — confirm and document the real reason.
px.line(x=[1, 2], y=[1, 2])

## load data and results

## DEG analysis

### bulk level

In [8]:
# Read the wide bulk RNA-seq matrix (rows: genes, columns: "<cohort_id>/<sample_id>"),
# split the column labels into a two-level (cohort_id, sample_id) index, and move
# sample_id into the row index so each cohort becomes a single column.
# (For quick iteration, subsample genes right after reading, e.g.
#  `.sample(n=100, random_state=0)`.)
df_bulk_rnaseq = (
    pd.read_csv(
        path_root / "data" / "bulkrnaseq.txt",
        sep="\t",
        engine="pyarrow",
        index_col="GeneSymbol",
    )
    .pipe(
        lambda wide: wide.set_axis(
            pd.MultiIndex.from_tuples(
                [tuple(label.split("/")) for label in wide.columns],
                names=["cohort_id", "sample_id"],
            ),
            axis="columns",
        )
    )
    .stack(level="sample_id")
)
df_bulk_rnaseq

Unnamed: 0_level_0,cohort_id,no_perturbations_1,no_perturbations_2
GeneSymbol,sample_id,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,113.839703,16.548933
A1BG,TCGA-3N-A9WC-06A-11R-A38C-07,37.796220,24.874189
A1BG,TCGA-BF-AAP0-06A-11R-A39D-07,39.781144,18.760724
A1BG,TCGA-D3-A1Q3-06A-11R-A18T-07,63.064430,17.427817
A1BG,TCGA-D3-A1Q5-06A-11R-A18T-07,16.496005,102.289767
...,...,...,...
ZZZ3,TCGA-GN-A4U4-06A-11R-A32P-07,0.782295,36.108743
ZZZ3,TCGA-HR-A2OG-06A-21R-A18U-07,5.491985,47.471332
ZZZ3,TCGA-WE-A8ZX-06A-11R-A37K-07,9.234191,35.222933
ZZZ3,TCGA-YG-AA3P-06A-11R-A38C-07,73.656539,162.861163


```
cohort_id  GeneSymbol  sample_id                   
0          A1BG        TCGA-3N-A9WB-06A-11R-A38C-07     44.044627
                       TCGA-3N-A9WC-06A-11R-A38C-07     89.240604
```

In [9]:
# Development aid: re-import helpers.deg_analysis so edits made to the module on disk
# take effect without restarting the kernel. `importlib` comes from the notebook's
# import cell rather than being imported mid-notebook.
importlib.reload(helpers.deg_analysis)

<module 'helpers.deg_analysis' from '/home/jupyter/deconv/helpers/deg_analysis.py'>

In [10]:
# Group the long-format expression table by gene; each group holds that gene's
# per-sample values with one column per cohort.
groups = df_bulk_rnaseq.groupby("GeneSymbol")
# Project helper. Judging by the displayed result it returns one row per gene with
# p-value, fold change, sparsity and Benjamini-Hochberg-adjusted columns — see
# helpers/deg_analysis.py for the exact definitions.
df_bulk_rnaseq_stats = helpers.deg_analysis.compute_stats(groups)
df_bulk_rnaseq_stats

Unnamed: 0,GeneSymbol,pval,fold_change,sparsity_overall,-log10_pval,log2_fold_change,-log10_pval_signed,significant_bh_fdr=0.5,pval_adj_bh,-log10_pval_adj_bh,-log10_pval_adj_bh_signed
0,A1BG,0.390738,1.180582,0.01,0.408115,0.239499,0.408115,False,1.0,-0.0,-0.0
1,A2M,0.239836,1.212499,0.00,0.620085,0.277983,0.620085,False,1.0,-0.0,-0.0
2,A2ML1,0.958765,0.851313,0.00,0.018288,-0.232238,-0.018288,False,1.0,-0.0,0.0
3,A4GALT,0.497107,0.959941,0.03,0.303551,-0.058982,-0.303551,False,1.0,-0.0,0.0
4,A4GNT,0.257732,23.467142,0.91,0.588832,4.552570,0.588832,False,1.0,-0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...
16058,ZYG11A,0.475530,0.930321,0.00,0.322822,-0.104200,-0.322822,False,1.0,-0.0,0.0
16059,ZYG11B,0.764268,0.970503,0.00,0.116754,-0.043196,-0.116754,False,1.0,-0.0,0.0
16060,ZYX,0.641693,1.040088,0.00,0.192673,0.056706,0.192673,False,1.0,-0.0,-0.0
16061,ZZEF1,0.849638,1.019877,0.00,0.070766,0.028396,0.070766,False,1.0,-0.0,-0.0


In [11]:
# Volcano plot of the bulk-level comparison: effect size on x, unadjusted significance
# on y, points coloured by the BH significance flag computed upstream.
fig = px.scatter(
    data_frame=df_bulk_rnaseq_stats,
    x="log2_fold_change",
    y="-log10_pval",
    color="significant_bh_fdr=0.5",
    hover_name="GeneSymbol",
    hover_data=["pval", "pval_adj_bh", "sparsity_overall"],
)
fig.update_traces(marker={"size": 5})
# All layout tweaks in one call; the legend is pinned inside the top-left corner of the
# plotting area.
fig.update_layout(
    title="Differential expression in independent, identically-generated in silico cohorts",
    xaxis_title=r"$\log_{2} [\text{fold change}]$",
    yaxis_title=r"$-\log_{10} [\text{p-value (Mann-Whitney U)}]$",
    legend_title="Significant with FDR=0.5?",
    legend_yanchor="top",
    legend_y=0.99,
    legend_xanchor="left",
    legend_x=0.01,
    font={"family": "Courier New, monospace", "color": "RebeccaPurple"},
    height=750,
)
# Horizontal line at the threshold stored in the stats frame's attrs under this key.
fig.add_hline(y=df_bulk_rnaseq_stats.attrs["-log10_pval_threshold_bh"])
fig