# Negative control: comparing identically generated, unperturbed cohorts

I generated two sets of 50 simulated bulk RNA-seq samples:
- Both sets use the same set of fraction vectors.
- Each set uses independently sampled cell type gene expression profiles (GEPs).

Here I compare their gene expression. I expect no significant differences.

## imports + setup

In [None]:
import logging
import warnings

import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import upath

In [None]:
import importlib

# Re-execute the local helpers package and the submodules used below,
# presumably so that edits made on disk are picked up without restarting the
# kernel. The parent package is reloaded before its submodules.
importlib.reload(helpers)
importlib.reload(helpers.running_cibersortx)
importlib.reload(helpers.running_cibersortx.reading_output_files)

In [None]:
# Silence numeric warnings whose text starts with one of these patterns
# (filterwarnings treats `message` as a regex matched at the start).
for _ignored_message in (
    "divide by zero",
    "invalid value encountered in double_scalars",
):
    warnings.filterwarnings("ignore", message=_ignored_message)

In [None]:
# Notebook-level logger; DEBUG lets every record logged from this notebook through.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Alternative setup kept for reference: project logging config plus a UDP handler.
# helpers.logging.configure_logging()
# udp_handler = logging.handlers.DatagramHandler("localhost", 12000)
# udp_handler.setLevel("DEBUG")
# logging.getLogger().addHandler(udp_handler)

# Attach a stream handler to the root logger that emits INFO-and-above records
# using the project's formatter.
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_handler.setFormatter(helpers.logging.formatter)
logging.getLogger().addHandler(stream_handler)

In [None]:
# Let DEBUG records from the helpers package reach the handlers.
helpers_logger = logging.getLogger("helpers")
helpers_logger.setLevel(logging.DEBUG)

In [None]:
# Root directory of the experiment run analysed in this notebook.
# NOTE(review): the directory name mentions perturbation, while this notebook
# describes unperturbed cohorts — confirm this is the intended run.
bucket_dir = upath.UPath("gs://liulab/evaluating_cibersortx/")
experiment_name = "perturbing_100_genes_in_malignant_cells_by_many_factors_of_2"
run_timestamp = "20220920_08h43m05s"
path_root = bucket_dir / experiment_name / run_timestamp

In [None]:
# Recursively list the run directory (long format, human-readable sizes) and
# show only the first lines of the listing.
!gsutil ls -hlR $path_root | head

In [None]:
# janky plotly hack
# NOTE(review): presumably renders a throwaway figure so that plotly's notebook
# renderer is initialized before the real figures below — confirm.
px.line(x=[1, 2], y=[1, 2])

## Comparing the two unperturbed cohorts

### Comparing gene expression in simulated bulk RNA-seq

In [None]:
# Load the concatenated simulated bulk RNA-seq table from the run's data directory.
path_bulk_rnaseq = path_root / "data" / "bulkrnaseq.txt"
df_bulk_rnaseq = helpers.data_io_and_formatting.load_concatenated_bulk_rnaseq(
    path_bulk_rnaseq
)

In [None]:
# Group the bulk expression table by gene and pass the groups to compute_stats;
# the last line displays the resulting table.
# NOTE(review): assumes "GeneSymbol" is an index level or column of
# df_bulk_rnaseq — confirm against the loader.
groups = df_bulk_rnaseq.groupby("GeneSymbol")
df_bulk_rnaseq_stats = helpers.deg_analysis.compute_stats(groups)
df_bulk_rnaseq_stats

In [None]:
# Volcano plot of the per-gene statistics for the simulated bulk RNA-seq cohorts.
# make_volcano_figure is accessed through helpers.deg_analysis: the bare name is
# never imported into this notebook, so calling it unqualified raises NameError.
fig = helpers.deg_analysis.make_volcano_figure(df_bulk_rnaseq_stats)
fig.update_layout(
    title="Differential expression in independent, identically-generated in silico cohorts"
)
# Last expression of the cell: displays the figure.
fig

### Comparing gene expression in malignant cells, inferred by CIBERSORTx

In [None]:
# Glob pattern for the CIBERSORTx high-resolution output files for the
# Malignant cell type in this run's output directory.
path_pattern = path_root / "outdir" / "CIBERSORTxHiRes_NA_Malignant_Window*txt"
# Read the matching files into a single table; the trailing underscore marks
# this as the raw form that is reshaped further below.
df_rnaseq_inferred_malignant_ = helpers.running_cibersortx.read_hires_cell_type_geps(
    path_pattern
)

In [None]:
def _(df):
    """Return a copy of *df* re-indexed by gene/cohort/sample, cohorts in columns.

    Each existing index entry is read as a tuple: its second element becomes
    ``GeneSymbol`` and its third element is split on ``"/"`` to give
    ``cohort_id`` and ``sample_id`` (the first element is dropped). The
    ``cohort_id`` level is then unstacked into the columns. The input object
    is not modified.

    NOTE(review): assumes the third element contains exactly one "/" so that
    the three level names line up — confirm against the reader's output.
    """

    def _to_gene_cohort_sample(key):
        # (gene symbol, cohort id, sample id)
        return (key[1], *key[2].split("/"))

    reshaped = df.copy()
    new_keys = reshaped.index.map(_to_gene_cohort_sample)
    reshaped.index = pd.MultiIndex.from_tuples(
        new_keys,
        names=["GeneSymbol", "cohort_id", "sample_id"],
    )
    return reshaped.unstack("cohort_id")


# Re-index the raw CIBERSORTx table by (GeneSymbol, sample_id), with cohort_id
# moved to the columns.
df_rnaseq_inferred_malignant = _(df_rnaseq_inferred_malignant_)

In [None]:
# Display the reshaped table for a visual check.
df_rnaseq_inferred_malignant

In [None]:
# Group the inferred malignant expression by gene and pass the groups to
# compute_stats; the last line displays the resulting table.
groups_inferred_malignant = df_rnaseq_inferred_malignant.groupby("GeneSymbol")
df_rnaseq_inferred_malignant_stats = helpers.deg_analysis.compute_stats(
    groups_inferred_malignant
)
df_rnaseq_inferred_malignant_stats

In [None]:
# Volcano plot of the per-gene statistics for the CIBERSORTx-inferred
# malignant-cell expression; the last line displays the figure.
volcano_title = "Differential expression analysis of inferred gene expression<br>in malignant cells, by CIBERSORTx"
fig = helpers.deg_analysis.make_volcano_figure(df_rnaseq_inferred_malignant_stats)
fig.update_layout(title=volcano_title)
fig