# Perturbing cell type composition

In [None]:
import importlib
import logging

import dask.dataframe as dd
import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from upath import UPath

In [None]:
import plotly.io as pio

# Scale factor used by the static PNG renderer.
pio.renderers["png"].scale = 5

# Add the static PNG renderer to whatever renderer(s) are currently the default.
# Guarded so that re-running this cell does not keep appending "+png".
if "png" not in pio.renderers.default.split("+"):
    pio.renderers.default += "+png"

In [None]:
# Call the project's logging setup helper, then switch both the `helpers`
# package logger and this notebook's own logger to DEBUG.
helpers.logging.configure_logging()
logger = logging.getLogger(__name__)
for _debug_logger in (logging.getLogger("helpers"), logger):
    _debug_logger.setLevel(logging.DEBUG)
# Emit one record so it is visible whether DEBUG messages are being shown.
logger.debug("test")

In [None]:
# Timestamped run directories that have been inspected with this notebook:
#   20221208_14h18m38s  last week
#   20221215_05h09m48s  like last week but with more 0 perturbation experiments
#   20221215_05h06m59s  recreation of last week, showed a significantly different perturbation=0
#   20221215_04h42m35s  with N = 1000
#   20221215_06h29m44s  a few experiments per perturbation
#   20230120_04h22m54s  better fraction perturbation and sampling
experiment_run = "20230120_04h22m54s"
path_root = UPath("gs://liulab/differential_composition") / experiment_run

In [None]:
# Shell out to gsutil to list what is stored directly under the selected run directory.
!gsutil ls {path_root}

## check fractions

In [None]:
# Read every fractions.parquet found below the run directory and materialize
# the result as a single in-memory DataFrame.
df_fractions = dd.read_parquet(path_root / "**" / "fractions.parquet").compute()

# The "sample_id" index level is split on "/" into a group part and a sample
# part, which are stored as two regular columns.
sample_id_parts = df_fractions.index.get_level_values("sample_id").str.split("/", expand=True)
df_fractions[["group_id", "sample_id"]] = sample_id_parts.tolist()


def process_row(row):
    """Return the entry of the comma-separated "malignant_means" for the row's group.

    The first entry is returned when "group_id" is "low", the second entry
    otherwise. The value is returned as a string, exactly as it appears in
    "malignant_means".
    """
    means = row["malignant_means"].split(",")
    if row["group_id"] == "low":
        return means[0]
    return means[1]


df_fractions["malignant_mean_value"] = df_fractions.apply(process_row, axis=1)

# Move the experiment/sample identifiers into the index.
index_levels = ["malignant_means", "group_id", "malignant_mean_value", "sample_id", "run_id"]
df_fractions = df_fractions.set_index(index_levels)

### dist plot of fractions per experiment

In [None]:
# eCDF of the "Malignant" column, one curve per "malignant_mean_value".
# The index is flattened first so every level is available as a plain column.
df = df_fractions.reset_index()
ecdf_title = "eCDFs of fraction values sampled for different values of mean malignant fraction"
fig = px.ecdf(df, x="Malignant", color="malignant_mean_value", title=ecdf_title)
fig.update_layout(width=900, height=600, showlegend=False)
# Display through the static "jpg" renderer.
fig.show(renderer="jpg")

In [None]:
# Box plots of the "Malignant" column: one facet column per "malignant_means"
# value, one facet row per run, colored by group.
df = (
    df_fractions.reset_index()
    .astype({"run_id": int})
    # Restrict the figure to runs with run_id < 5.
    .query("run_id < 5")
    # Sort without inplace=True: mutating the frame returned by `query` in
    # place can raise pandas' SettingWithCopyWarning.
    .sort_values(["malignant_means"], ascending=False)
)
fig = px.box(
    df,
    x="Malignant",
    facet_col="malignant_means",
    facet_row="run_id",
    color="group_id",
    labels={"Malignant": "Malignant fraction"},
)
# Keep only the text after the last "=" in each facet annotation.
fig.for_each_annotation(
    lambda a: a.update(text=a.text.split("=")[-1])
)  # https://github.com/plotly/plotly_express/issues/36#issuecomment-589718357
fig.update_layout(
    showlegend=False,
    width=1000,
    height=500,
    title="Columns are different experiments, rows are independent runs",
)
# NOTE(review): `plotly_config` is not defined anywhere in the visible part of
# this notebook; confirm it is set before this cell runs, otherwise this line
# raises NameError.
fig.show(config=plotly_config)

### what are fraction means for each group and experiment?

In [None]:
# Box plots of the "Malignant" column, faceted by "perturbation" (columns) and
# "simulation" (rows) and colored by "treatment".
# NOTE(review): `df` is whatever an earlier cell left in the notebook namespace,
# and the columns "perturbation", "simulation" and "treatment" are not created
# anywhere in the visible notebook — presumably they belong to an older data
# schema; confirm before relying on this cell.
fig = px.box(
    df.reset_index(),
    x="Malignant",
    facet_col="perturbation",
    facet_row="simulation",
    color="treatment",
    labels={"Malignant": "Malignant fraction"},
)
# update_layout returns the figure, so as the last expression of the cell it is
# also what the notebook displays.
fig.update_layout(
    width=1500,
    height=600,
    title="Columns are different perturbations, rows are independent runs",
)

## volcano plots from run_id=00

- for bulk RNA-seq
- for inferred malignant-specific RNA-seq

In [None]:
importlib.reload(helpers.deg_analysis)

from helpers.deg_analysis import add_fdr_lines, make_volcano_facets

fig = make_volcano_facets(all_dge_results, horizontal=True)
fig = add_fdr_lines(fig, all_dge_results, horizontal=True)
# fig.update_layout(title=title, height=1200)
fig.update_layout(height=700, width=1000)

In [None]:
importlib.reload(helpers.running_cibersortx.reading_output_files)

from helpers.running_cibersortx.reading_output_files import read_hires_cell_type_gep

experiment_paths = list(path_root.glob("run_id=00/malignant_means=*"))
experiment_paths

In [None]:
key_names = ["experiment_name", "data_origin"]

experiments_data = dict()

for path in experiment_paths:
    experiments_data[(path.name, "bulk RNA-seq")] = pd.read_parquet(path / "deg_analysis/gene_stats_bulk.parquet")
    experiments_data[(path.name, "malignant cells (cibersortx)")] = pd.read_parquet(
        path / "deg_analysis/gene_stats_malignant_cibersortx.parquet"
    )

all_dge_results = pd.concat(experiments_data, names=key_names).reset_index(key_names).reset_index(drop=True)

all_dge_results

## plot of false positives

## plot of p-values