# Negative control: comparing identically generated, unperturbed cohorts

I generated two cohorts of 50 simulated bulk RNA-seq samples each:
- Both cohorts use the same set of cell type fraction vectors.
- Each cohort uses its own, separately sampled cell type gene expression profiles (GEPs).

Here I compare gene expression between the two cohorts. Because both were generated the same way, with no perturbation applied, I expect no significant differences.

## imports + setup

In [22]:
import logging
import warnings

import upath
import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [23]:
# Silence two floating-point warnings that would otherwise clutter the output.
# `message` is a regular expression matched against the start of the warning text.
warnings.filterwarnings("ignore", message="divide by zero")
# NOTE(review): newer NumPy releases appear to word this warning differently
# (e.g. "... in scalar divide"), so this pattern may no longer match -- confirm
# against the installed NumPy version.
warnings.filterwarnings("ignore", message="invalid value encountered in double_scalars")

In [24]:
# Notebook-level logger. Its DEBUG level only matters for handlers that accept
# DEBUG records; the stream handler configured below shows INFO and above.
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

# helpers.logging.configure_logging()
# udp_handler = logging.handlers.DatagramHandler("localhost", 12000)
# udp_handler.setLevel("DEBUG")
# logging.getLogger().addHandler(udp_handler)

# Attach a single INFO-level stream handler to the root logger. The handler is
# tagged with a name and any earlier copy is removed first, so re-running this
# cell does not stack duplicate handlers (which would print each record twice,
# three times, ...).
root_logger = logging.getLogger()
for stale_handler in list(root_logger.handlers):
    if stale_handler.name == "notebook_stream":
        root_logger.removeHandler(stale_handler)

stream_handler = logging.StreamHandler()
stream_handler.name = "notebook_stream"
stream_handler.setFormatter(helpers.logging.formatter)
stream_handler.setLevel("INFO")
root_logger.addHandler(stream_handler)

In [25]:
# Let the project's "helpers" logger emit DEBUG records (handlers may still filter them by their own level).
logging.getLogger("helpers").setLevel("DEBUG")

In [26]:
# Root of the timestamped run being analysed; all inputs ("data/") and
# CIBERSORTx outputs ("outdir/") used below are resolved relative to it.
path_root = upath.UPath(
    "gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57"
)

In [27]:
# Sanity check: recursive long listing (human-readable sizes) of everything under path_root.
!gsutil ls -hlR $path_root

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/:

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/data/:
  24.6 MiB  2022-09-14T19:42:35Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/data/bulkrnaseq.txt

gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/:
 30.78 MiB  2022-09-14T19:42:30Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_B_Window36.txt
 30.96 MiB  2022-09-14T19:42:34Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_CAF_Window36.txt
 30.75 MiB  2022-09-14T19:42:29Z  gs://liulab/evaluating_cibersortx/identically_generated_cohorts_100/2022-09-14_19:37:57/outdir/CIBERSORTxHiRes_NA_Endothelial_Window36.txt
 27.44 KiB  2022-09-14T19:42:30Z  gs://liulab/evaluating_cibersortx/ident

In [28]:
# janky plotly hack
# NOTE(review): presumably renders a throwaway figure so that plotly's notebook
# renderer is initialised before the real plots -- confirm, and record the
# underlying issue here if it is known.
px.line(x=[1, 2], y=[1, 2])

## load data and results

In [None]:
# Load the fractions table that this run wrote to outdir/fractions.txt.
# NOTE(review): later code unstacks a "cohort_id" index level and stacks a
# "cell_type" column level, so the loader presumably returns that layout --
# confirm in helpers.data_io_and_formatting.
df_fractions = helpers.data_io_and_formatting.load_concatenated_fractions(
    path_root / "outdir" / "fractions.txt"
)
df_fractions

In [None]:
# Reshape for side-by-side comparison: move the "cohort_id" index level into
# the columns and the "cell_type" column level into the index.
df_fractions.unstack("cohort_id").stack("cell_type")

## DEG analysis

In [71]:
# Development aid: re-import the helpers package and two of its submodules so
# that edits to their source are picked up without restarting the kernel.
# NOTE(review): helpers.deg_analysis is used in this notebook but is not
# reloaded here; `%autoreload 2` in the setup section would cover every module
# and remove the need for this mid-notebook import.
import importlib

importlib.reload(helpers)
importlib.reload(helpers.running_cibersortx)
importlib.reload(helpers.running_cibersortx.reading_output_files)

<module 'helpers.running_cibersortx.reading_output_files' from '/Users/william/src/deconv/helpers/running_cibersortx/reading_output_files.py'>

### Comparing gene expression in simulated bulk RNA-seq

In [None]:
# Load the simulated bulk RNA-seq table for this run from data/bulkrnaseq.txt.
df_bulk_rnaseq = helpers.data_io_and_formatting.load_concatenated_bulk_rnaseq(
    path_root / "data" / "bulkrnaseq.txt"
)

In [None]:
# Per-gene statistics: group the bulk expression table by "GeneSymbol" and
# hand the groups to helpers.deg_analysis.compute_stats.
groups = df_bulk_rnaseq.groupby("GeneSymbol")
df_bulk_rnaseq_stats = helpers.deg_analysis.compute_stats(groups)
df_bulk_rnaseq_stats

In [None]:
# Volcano plot of the per-gene statistics for the simulated bulk RNA-seq data.
# The plotting helper is referenced through its module: the bare name
# `make_volcano_figure` is never imported or defined in this notebook, so it
# would raise NameError on a fresh kernel.
fig = helpers.deg_analysis.make_volcano_figure(df_bulk_rnaseq_stats)
fig.update_layout(
    title="Differential expression in independent, identically-generated in silico cohorts"
)
fig

### Comparing gene expression in malignant cells, inferred by CIBERSORTx

In [34]:
# Glob-style pattern for the CIBERSORTx high-resolution output for malignant
# cells; the "*" stands in for the window-size part of the file name (the
# other cell types listed in outdir use "Window36").
path_pattern = path_root / "outdir" / "CIBERSORTxHiRes_NA_Malignant_Window*txt"
# The trailing underscore marks the table as read, before its index is rebuilt below.
df_rnaseq_inferred_malignant_ = helpers.running_cibersortx.read_hires_cell_type_geps(
    path_pattern
)

In [61]:
def _(df):
    """Rebuild the index as (GeneSymbol, cohort_id, sample_id) and pivot cohorts into columns.

    Each entry of ``df.index`` is read positionally: element 1 becomes
    ``GeneSymbol`` and element 2 is split on "/" into ``cohort_id`` and
    ``sample_id``. Element 0 is discarded. The input frame is not modified.

    Returns a copy of ``df`` with the new three-level index, after moving the
    ``cohort_id`` level into the columns.
    """
    # NOTE(review): a function named `_` shadows IPython's last-output
    # variable; consider a descriptive name if nothing else relies on `_`.

    def to_index_key(entry):
        cohort_and_sample = entry[2].split("/")
        return (entry[1], *cohort_and_sample)

    reindexed = df.copy()
    reindexed.index = pd.MultiIndex.from_tuples(
        reindexed.index.map(to_index_key),
        names=["GeneSymbol", "cohort_id", "sample_id"],
    )
    return reindexed.unstack("cohort_id")


df_rnaseq_inferred_malignant = _(df_rnaseq_inferred_malignant_)

In [63]:
# Show the reshaped table: one row per (GeneSymbol, sample_id), one column per cohort.
df_rnaseq_inferred_malignant

Unnamed: 0_level_0,cohort_id,no_perturbations_1,no_perturbations_2
GeneSymbol,sample_id,Unnamed: 2_level_1,Unnamed: 3_level_1
A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,95.255786,24.372390
A1BG,TCGA-3N-A9WC-06A-11R-A38C-07,53.174111,34.702169
A1BG,TCGA-BF-AAP0-06A-11R-A39D-07,60.294705,26.482029
A1BG,TCGA-D3-A1Q3-06A-11R-A18T-07,79.520935,24.490696
A1BG,TCGA-D3-A1Q5-06A-11R-A18T-07,22.564014,93.590395
...,...,...,...
ZZZ3,TCGA-GN-A4U4-06A-11R-A32P-07,2.025979,63.830709
ZZZ3,TCGA-HR-A2OG-06A-21R-A18U-07,14.495677,71.295664
ZZZ3,TCGA-WE-A8ZX-06A-11R-A37K-07,19.927466,59.921914
ZZZ3,TCGA-YG-AA3P-06A-11R-A38C-07,102.752720,134.138110


In [64]:
# Per-gene statistics for the inferred malignant-cell expression, computed
# with the same helper as for the bulk RNA-seq data.
df_rnaseq_inferred_malignant_stats = helpers.deg_analysis.compute_stats(
    df_rnaseq_inferred_malignant.groupby("GeneSymbol")
)
df_rnaseq_inferred_malignant_stats

Unnamed: 0,GeneSymbol,pval,fold_change,sparsity_overall,-log10_pval,log2_fold_change,-log10_pval_signed,significant_bh_fdr=0.5,pval_adj_bh,-log10_pval_adj_bh,-log10_pval_adj_bh_signed
0,A1BG,0.383171,1.144704,0.0,0.416607,0.194975,0.416607,False,1.0,-0.0,-0.0
1,A2M,0.215924,1.195021,0.0,0.665700,0.257036,0.665700,False,1.0,-0.0,-0.0
2,A2ML1,0.991749,0.976970,0.0,0.003598,-0.033613,-0.003598,False,1.0,-0.0,0.0
3,A4GALT,1.000000,1.000000,0.0,-0.000000,0.000000,-0.000000,False,1.0,-0.0,-0.0
4,A4GNT,1.000000,1.000000,0.0,-0.000000,0.000000,-0.000000,False,1.0,-0.0,-0.0
...,...,...,...,...,...,...,...,...,...,...,...
16058,ZYG11A,0.514746,0.956705,0.0,0.288407,-0.063854,-0.288407,False,1.0,-0.0,0.0
16059,ZYG11B,0.732920,0.974143,0.0,0.134944,-0.037794,-0.134944,False,1.0,-0.0,0.0
16060,ZYX,0.702011,1.030262,0.0,0.153656,0.043011,0.153656,False,1.0,-0.0,-0.0
16061,ZZEF1,0.865876,0.958538,0.0,0.062544,-0.061093,-0.062544,False,1.0,-0.0,0.0


In [70]:
# Volcano plot of the per-gene statistics for the inferred malignant-cell
# expression; "<br>" breaks the title onto two lines in the rendered figure.
fig = helpers.deg_analysis.make_volcano_figure(df_rnaseq_inferred_malignant_stats)
fig.update_layout(
    title="Differential expression analysis of inferred gene expression<br>in malignant cells, by CIBERSORTx"
)
fig