# analyze false positive genes

what explains false positives?

for each gene:
- get p-values, fold changes predicted by cibersortx
  - load from pre-computed results
- compute the ratio of mean expression in malignant cells to mean expression in immune cells (malignant / immune)

make plots
- plot scatter: 
  - expression ratio vs average FPR (across trials)
  - expression ratio vs average p-value

looking for:
- does low malignant ratio predict false discovery?


In [2]:
import helpers
import numpy as np
import pandas as pd
import upath

In [3]:
# Root of one differential-composition experiment on GCS.
path_root = upath.UPath("gs://liulab/differential_composition/20230120_04h22m54s")
# Collect the malignant-cell CIBERSORTx gene statistics file of every
# <first-level dir>/malignant_means=<...>/deg_analysis/ directory under the root.
parquet_paths = [
    *path_root.glob("*/malignant_means=*/deg_analysis/gene_stats_malignant_cibersortx.parquet")
]
# Sanity check: show the first few matches.
print(parquet_paths[:3])

[GCSPath('gs://liulab/differential_composition/20230120_04h22m54s/run_id=00/malignant_means=0.55,0.85/deg_analysis/gene_stats_malignant_cibersortx.parquet'), GCSPath('gs://liulab/differential_composition/20230120_04h22m54s/run_id=00/malignant_means=0.6,0.8/deg_analysis/gene_stats_malignant_cibersortx.parquet'), GCSPath('gs://liulab/differential_composition/20230120_04h22m54s/run_id=00/malignant_means=0.65,0.75/deg_analysis/gene_stats_malignant_cibersortx.parquet')]


In [4]:
def extract_from_path(path: str, var_name: str) -> str:
    """Return the value of a ``var_name=value`` segment of a ``/``-separated path.

    Only segments that *start* with ``var_name=`` are considered, so a short
    key such as ``"id"`` is not confused with a longer one such as ``"run_id"``.
    The first matching segment wins.

    Args:
        path: Path string made of ``/``-separated segments.
        var_name: Key to look up.

    Returns:
        Everything after ``var_name=`` in the matching segment.

    Raises:
        IndexError: If no segment starts with ``var_name=``.
    """
    prefix = var_name + "="
    for segment in path.split("/"):
        if segment.startswith(prefix):
            return segment[len(prefix):]
    # IndexError (rather than KeyError/ValueError) keeps the exception type
    # callers saw from the previous split-and-index implementation.
    raise IndexError(f"no '{prefix}<value>' segment found in path: {path!r}")


def load_gene_stats_malignant_cibersortx(paths=None):
    """Load every gene-statistics parquet file and stack them into one table.

    Args:
        paths: Iterable of parquet file locations. When omitted, the
            module-level ``parquet_paths`` is used.

    Returns:
        DataFrame indexed by ``(run_id, malignant_means, gene_symbol)``, where
        ``run_id`` and ``malignant_means`` are the strings parsed from each
        file's path and ``gene_symbol`` is a column of the parquet files.

    Raises:
        ValueError: If there are no files to load.
    """
    if paths is None:
        paths = parquet_paths
    paths = list(paths)
    if not paths:
        raise ValueError("no gene_stats_malignant_cibersortx parquet files to load")

    df = pd.concat(
        {str(path): pd.read_parquet(path) for path in paths},
        names=["path", "index"],
    )
    source_paths = df.index.get_level_values("path")
    # Parse each distinct path once and map the result onto the rows, instead
    # of re-parsing the same string for every gene row.
    distinct_paths = source_paths.unique()
    run_id_by_path = {p: extract_from_path(p, "run_id") for p in distinct_paths}
    means_by_path = {p: extract_from_path(p, "malignant_means") for p in distinct_paths}
    df["run_id"] = source_paths.map(run_id_by_path)
    df["malignant_means"] = source_paths.map(means_by_path)
    # Replaces the (path, index) index with the experiment keys.
    return df.set_index(["run_id", "malignant_means", "gene_symbol"])


gene_stats_malignant_cibersortx = load_gene_stats_malignant_cibersortx()

In [5]:
# Display the per-run table, indexed by (run_id, malignant_means, gene_symbol).
gene_stats_malignant_cibersortx

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,pval,fold_change,sparsity_overall,-log10_pval,log2_fold_change,-log10_pval_signed,significant_bh_fdr=0.10,significant_bh_fdr=0.25,perturbed
run_id,malignant_means,gene_symbol,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
00,"0.55,0.85",A1BG,0.045960,0.762946,0.0,1.337620,-0.390348,-1.337620,False,True,False
00,"0.55,0.85",A2M,0.671588,0.979047,0.0,0.172897,-0.030550,-0.172897,False,False,False
00,"0.55,0.85",A2ML1,0.006673,0.734658,0.0,2.175654,-0.444854,-2.175654,True,True,False
00,"0.55,0.85",A4GALT,1.000000,1.000000,0.0,-0.000000,0.000000,-0.000000,False,False,False
00,"0.55,0.85",A4GNT,1.000000,1.000000,0.0,-0.000000,0.000000,-0.000000,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
09,"0.71,0.71",ZYG11A,0.142938,1.115149,0.0,0.844852,0.157237,0.844852,False,False,False
09,"0.71,0.71",ZYG11B,0.121704,1.105519,0.0,0.914697,0.144724,0.914697,False,False,False
09,"0.71,0.71",ZYX,0.148667,1.112070,0.0,0.827785,0.153247,0.827785,False,False,False
09,"0.71,0.71",ZZEF1,0.336206,1.119979,0.0,0.473394,0.163472,0.473394,False,False,False


In [6]:
def load_gene_stats_malignant_cibersortx_aggregated(df):
    """Average selected gene statistics within each (malignant_means, gene_symbol) group.

    Args:
        df: Gene statistics with ``malignant_means`` and ``gene_symbol``
            available as index levels or columns.

    Returns:
        DataFrame indexed by ``(malignant_means, gene_symbol)`` holding the
        group mean of each selected column. For the boolean
        ``significant_bh_fdr=0.10`` column the mean is the fraction of rows in
        the group that were called significant.
    """
    group_keys = ["malignant_means", "gene_symbol"]
    averaged_columns = [
        "pval",
        "sparsity_overall",
        "-log10_pval",
        "log2_fold_change",
        "significant_bh_fdr=0.10",
    ]
    grouped = df.groupby(group_keys)
    return grouped[averaged_columns].mean()


# Collapse the per-run statistics to one row per (malignant_means, gene_symbol)
# by averaging over the rows that share those keys.
gene_stats_malignant_cibersortx_aggregated = load_gene_stats_malignant_cibersortx_aggregated(
    gene_stats_malignant_cibersortx
)

# Display the aggregated table.
gene_stats_malignant_cibersortx_aggregated

Unnamed: 0_level_0,Unnamed: 1_level_0,pval,sparsity_overall,-log10_pval,log2_fold_change,significant_bh_fdr=0.10
malignant_means,gene_symbol,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"0.55,0.85",A1BG,0.559052,0.0,0.356863,0.059536,0.0
"0.55,0.85",A2M,0.410739,0.0,0.482232,-0.144573,0.0
"0.55,0.85",A2ML1,0.132443,0.0,1.466375,-0.363758,0.3
"0.55,0.85",A4GALT,1.000000,0.0,0.000000,0.000000,0.0
"0.55,0.85",A4GNT,1.000000,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...
"0.71,0.71",ZYG11A,0.685009,0.0,0.224542,0.049764,0.0
"0.71,0.71",ZYG11B,0.633558,0.0,0.317697,-0.016852,0.0
"0.71,0.71",ZYX,0.543524,0.0,0.433230,0.002201,0.0
"0.71,0.71",ZZEF1,0.456068,0.0,0.619656,0.096869,0.0


In [7]:
# Load the single-cell expression matrix and its per-cell metadata.
# NOTE(review): in the recorded run this call failed with a MemoryError while
# allocating a (7186, 23686) float64 array (~1.27 GiB); the cells below that
# use df_sc_rnaseq / df_sc_metadata appear not to have been executed.
df_sc_rnaseq, df_sc_metadata = helpers.datasets.load_jerby_arnon_hg19_tpm()

MemoryError: Unable to allocate 1.27 GiB for an array with shape (7186, 23686) and data type float64

In [None]:
def compute_gene_expression_ratios(df_sc_rnaseq: pd.DataFrame, df_sc_metadata: pd.DataFrame):
    """Compare each gene's mean expression in malignant cells with all other kept cells.

    Cells with a missing ``cell_type`` and cells typed ``"CAF"`` or
    ``"Endothelial"`` are excluded. Every remaining cell is labelled
    ``"malignant"`` (``cell_type == "Malignant"``) or ``"other"``.

    Args:
        df_sc_rnaseq: Expression values with genes on the rows and cell ids on
            the columns (it is transposed before being joined to the metadata).
        df_sc_metadata: Per-cell metadata indexed by cell id, with a
            ``cell_type`` column.

    Returns:
        DataFrame indexed by ``gene_symbol`` with columns ``malignant`` and
        ``other`` (group means) and ``malignant_ratio`` (their quotient),
        sorted by ``malignant_ratio`` in descending order. The quotient is
        plain float division, so a zero ``other`` mean is not special-cased.
    """
    # Work on a standalone Series rather than assigning a new column onto a
    # filtered slice of the metadata (which triggers SettingWithCopyWarning).
    cell_types = df_sc_metadata["cell_type"].dropna()
    cell_types = cell_types[~cell_types.isin(["CAF", "Endothelial"])]
    cell_group = cell_types.map(
        lambda cell_type: "malignant" if cell_type == "Malignant" else "other"
    ).rename("cell_group")

    # Attach each kept cell's expression profile (cells x genes) to its group;
    # the inner join keeps only cells present in both inputs.
    expression_by_cell = pd.merge(
        cell_group, df_sc_rnaseq.T, left_index=True, right_index=True
    )
    group_means = expression_by_cell.groupby("cell_group").mean()
    ratios = group_means.rename_axis(columns="gene_symbol").T
    ratios["malignant_ratio"] = ratios["malignant"] / ratios["other"]
    return ratios.sort_values("malignant_ratio", ascending=False)

In [None]:
def merge_gene_data(df_sc_rnaseq: pd.DataFrame, gene_stats: pd.DataFrame) -> pd.DataFrame:
    """Join per-gene expression ratios onto the gene statistics table.

    Args:
        df_sc_rnaseq: Expression values with genes on the rows.
        gene_stats: Gene statistics whose index has a ``gene_symbol`` level
            (and possibly further named levels such as ``malignant_means``).

    Returns:
        DataFrame with the same index levels as ``gene_stats``, containing the
        expression-ratio columns followed by the statistics columns, restricted
        to genes present in both inputs.
    """
    # Only compute ratios for genes that have statistics.
    stat_genes = gene_stats.index.get_level_values("gene_symbol")
    df_sc_rnaseq = df_sc_rnaseq.loc[df_sc_rnaseq.index.isin(stat_genes)]
    # NOTE: df_sc_metadata is read from module scope, not passed in.
    gene_expression_ratios = compute_gene_expression_ratios(df_sc_rnaseq, df_sc_metadata)

    # Merging on one level of a MultiIndex drops the remaining levels, so move
    # every index level to a column, merge on the gene column, then restore
    # the original index levels of gene_stats.
    index_names = list(gene_stats.index.names)
    merged = pd.merge(
        gene_expression_ratios.reset_index(),
        gene_stats.reset_index(),
        on="gene_symbol",
    )
    return merged.set_index(index_names)


# Use the run-averaged statistics so there is one row per
# (malignant_means, gene_symbol).
df_gene_data = merge_gene_data(df_sc_rnaseq, gene_stats_malignant_cibersortx_aggregated)

df_gene_data

In [None]:
# Row count per malignant_means group; with one row per gene in each group
# this is the number of genes per malignant_means setting.
df_gene_data.groupby("malignant_means").size()

In [None]:
import plotly.express as px

In [None]:
def make_scatter(
    df_gene_data: pd.DataFrame,
    color=None,
):
    """Scatter malignant expression ratio against CIBERSORTx significance.

    Draws one facet per ``malignant_means`` value, with a log-scaled x axis.

    Args:
        df_gene_data: Gene table with a two-level index and the columns or
            index levels ``malignant_ratio``, ``-log10_pval``, ``gene_symbol``
            and ``malignant_means``.
        color: Optional name of a column to colour the points by. ``None``
            (the default) draws all points in a single colour.

    Returns:
        The plotly figure.
    """
    # First index level descending, second ascending.
    df_gene_data = df_gene_data.sort_index(ascending=[False, True])
    df_gene_data = df_gene_data.reset_index()
    fig = px.scatter(
        df_gene_data,
        x="malignant_ratio",
        # Alternative y: "significant_bh_fdr=0.10"
        y="-log10_pval",
        hover_name="gene_symbol",
        hover_data=df_gene_data.columns,
        color=color,
        facet_col="malignant_means",
    )
    # update_xaxes touches every facet; layout.xaxis_title would only label
    # the first one.
    fig.update_xaxes(type="log", title_text="Malignant / Other")
    fig.update_layout(
        title="Gene expression ratio vs. CIBERSORTx significance",
        yaxis_title="CIBERSORTx significance",
    )
    # TODO: add histograms along y-axis
    return fig


fig = make_scatter(df_gene_data)
fig.show()

In [None]:
# density contour of pval and ratio
def make_plot(df_gene_data: pd.DataFrame):
    """Build a density-contour figure of ``-log10_pval`` against ``malignant_ratio``.

    Args:
        df_gene_data: Gene table with a two-level index; its index levels are
            turned into columns before plotting.

    Returns:
        The plotly figure.
    """
    # First index level descending, second ascending, then flatten the index.
    flat = df_gene_data.sort_index(ascending=[False, True]).reset_index()
    return px.density_contour(flat, x="malignant_ratio", y="-log10_pval")


fig = make_plot(df_gene_data)
fig

NameError: name 'pd' is not defined