# imports & setup

In [1]:
import logging
import os
from pprint import pprint

import dask.dataframe as dd
import helpers
import numpy as np
import pandas as pd
import scipy.stats
from cloudpathlib import AnyPath as Path


In [2]:
import warnings

# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation and numerical warnings raised by the
# analysis cells further down. Consider narrowing it to specific categories.
warnings.filterwarnings("ignore")


In [3]:
# Send all root-logger output through a single stream handler whose format
# puts timestamp / process / thread / logger name / level on one line and the
# message on the following line.
LOG_FORMAT = (
    "%(asctime)s %(process)d/%(threadName)s %(name)s %(levelname)s\n%(message)s"
)
formatter = logging.Formatter(LOG_FORMAT)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# Replace (rather than append to) the root handlers so that re-running this
# cell does not produce duplicated log lines.
logging.getLogger().handlers = [handler]


In [4]:
# Turn on debug-level logging for the libraries this notebook relies on.
for library_logger_name in (
    "gcsfs",
    "google.cloud.bigquery",
    "helpers",
    "pandas",
    "pyarrow",
):
    logging.getLogger(library_logger_name).setLevel("DEBUG")


In [5]:
# Notebook-level logger, set to DEBUG so this notebook's own debug messages
# are emitted.
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
# Smoke test: emits one debug record so the logging configuration can be
# checked in the cell output.
logger.debug("test debug-level message")


2022-07-17 00:21:29,214 40252/MainThread __main__ DEBUG
test debug-level message


# loading data

## TCGA SKCM (real) bulk rna-seq

In [6]:
uri_tcga_skcm_bulk_rnaseq = (
    "gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/"
    "mixtures_real_tcga_skcm/tpm.parquet"
)


In [7]:
!gsutil ls -lh $uri_tcga_skcm_bulk_rnaseq

 45.23 MiB  2022-05-26T05:32:00Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures_real_tcga_skcm/tpm.parquet
TOTAL: 1 objects, 47427179 bytes (45.23 MiB)


In [None]:
# Open the TCGA SKCM bulk RNA-seq TPM parquet file as a dask dataframe.
ddf_tcga_skcm_bulk_rnaseq = dd.read_parquet(uri_tcga_skcm_bulk_rnaseq, engine="pyarrow")
# Log the column dtypes as a quick schema check.
logger.debug(ddf_tcga_skcm_bulk_rnaseq.dtypes)


In [None]:
ddf_tcga_skcm_bulk_rnaseq


## TCGA SKCM fractions (estimated)

In [8]:
df_tcga_skcm_fractions_from_csx = helpers.datasets.load_tcga_skcm_fractions_from_csx()


2022-07-17 00:21:38,411 40252/MainThread helpers.datasets DEBUG
loading TCGA SKCM fractions estimated by CIBERSORTx
No project ID could be determined. Consider running `gcloud config set project` or setting the GOOGLE_CLOUD_PROJECT environment variable
2022-07-17 00:21:39,334 40252/MainThread gcsfs.credentials DEBUG
Connected with method google_default
2022-07-17 00:21:39,336 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'downloaded_manually/derek_csx_tcga_skcm/CIBERSORTx_Job8_Results.txt'), None
2022-07-17 00:21:39,337 40252/fsspecIO gcsfs.credentials DEBUG
GCS refresh
2022-07-17 00:21:39,725 40252/fsspecIO gcsfs DEBUG
GET: https://storage.googleapis.com/download/storage/v1/b/liulab/o/downloaded_manually%2Fderek_csx_tcga_skcm%2FCIBERSORTx_Job8_Results.txt?alt=media, (), {'Range': 'bytes=0-86860'}


## TCGA SKCM sample types (metastatic, primary, etc)

In [None]:
from google.cloud import bigquery

# Biospecimen rows for metastatic TCGA-SKCM samples, ordered by sample barcode.
query_string = """
SELECT * 
FROM `isb-cgc-bq.TCGA.biospecimen_gdc_current`
where project_short_name = "TCGA-SKCM"
    and sample_type_name = "Metastatic"
order by sample_barcode
"""

bqclient = bigquery.Client()

# Run the query, wait for it to finish, and download the rows as a pandas
# dataframe (with a tqdm progress bar during the download).
query_job = bqclient.query(query_string)
df_tcga_sample_metadata = query_job.result().to_dataframe(progress_bar_type="tqdm")


## pseudo bulk rna-seq

In [11]:
uri_pseudobulk_rnaseq = (
    "gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/"
    "mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet"
)


In [12]:
!gsutil ls -lh $uri_pseudobulk_rnaseq

 39.95 MiB  2022-05-26T05:33:49Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet
TOTAL: 1 objects, 41894774 bytes (39.95 MiB)


In [13]:
# Value / dtype / column-name normalisation applied right after loading.
string_to_bool = {"True": True, "False": False}
column_dtypes = {"n_cells": "uint8", "malignant_from_one_sample": "bool"}
column_renames = {"tcga_aliquot_barcode_for_fractions": "aliquot_barcode"}

ddf_pseudobulk_rnaseq = (
    dd.read_parquet(uri_pseudobulk_rnaseq, engine="pyarrow")
    # the flag is stored as the strings "True"/"False"; map them to real booleans
    .replace({"malignant_from_one_sample": string_to_bool})
    .astype(column_dtypes)
    .rename(columns=column_renames)
)

logger.debug(ddf_pseudobulk_rnaseq.dtypes)


2022-07-17 00:22:11,238 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:22:11,445 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:22:11,594 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:22:11,745 40252/fsspecIO gcsfs DEBUG
GET: https://storage.googleapis.com/download/storage/v1/b/liulab/o/data%2Fpseudobulk_optimization%2F3_with_tcga_qc%2Fmixtures%2Fn_cells=5%2Fmalignant_from_one_sample=True%2Fdata.parquet?alt=media, (), {'Range': 'bytes=41829238-41894773'}
2022-07-17 00:22:12,274 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtur

In [14]:
ddf_pseudobulk_rnaseq


Unnamed: 0_level_0,gene_symbol,aliquot_barcode,tpm,n_cells,malignant_from_one_sample
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,category[unknown],category[unknown],float64,uint8,bool
,...,...,...,...,...


In [15]:
df_pseudobulk_rnaseq = ddf_pseudobulk_rnaseq.compute()


2022-07-17 00:23:17,066 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:23:17,317 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:23:17,469 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=5/malignant_from_one_sample=True/data.parquet'), None
2022-07-17 00:23:17,622 40252/fsspecIO gcsfs DEBUG
GET: https://storage.googleapis.com/download/storage/v1/b/liulab/o/data%2Fpseudobulk_optimization%2F3_with_tcga_qc%2Fmixtures%2Fn_cells=5%2Fmalignant_from_one_sample=True%2Fdata.parquet?alt=media, (), {'Range': 'bytes=41829238-41894773'}
2022-07-17 00:23:18,301 40252/fsspecIO gcsfs DEBUG
GET: b/{}/o/{}, ('liulab', 'data/pseudobulk_optimization/3_with_tcga_qc/mixtur

In [17]:
df_pseudobulk_rnaseq


Unnamed: 0,gene_symbol,aliquot_barcode,tpm,n_cells,malignant_from_one_sample
0,A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,106.644738,5,True
1,A1BG,TCGA-3N-A9WC-06A-11R-A38C-07,72.082156,5,True
2,A1BG,TCGA-3N-A9WD-06A-11R-A38C-07,48.453905,5,True
3,A1BG,TCGA-BF-AAP0-06A-11R-A39D-07,29.958744,5,True
4,A1BG,TCGA-D3-A1Q1-06A-21R-A18T-07,38.982455,5,True
...,...,...,...,...,...
5911179,ZZZ3,TCGA-YG-AA3O-06A-11R-A38C-07,51.707960,5,True
5911180,ZZZ3,TCGA-YG-AA3P-06A-11R-A38C-07,52.941551,5,True
5911181,ZZZ3,TCGA-Z2-A8RT-06A-11R-A37K-07,34.450564,5,True
5911182,ZZZ3,TCGA-Z2-AA3S-06A-11R-A39D-07,190.622405,5,True


## determine immune high and low

In [None]:
def make_labels_for_aliquots(df_aliquot_fractions, df_sample_metadata):
    """Attach a total immune fraction to each row of the sample metadata.

    For every row of ``df_aliquot_fractions`` the immune cell-type columns
    (B, Macrophage, NK, T, T CD4, T CD8) are summed, and the last five
    characters of each index label are dropped.

    Parameters
    ----------
    df_aliquot_fractions : pandas.DataFrame
        Cell-type fractions; must contain the immune cell-type columns listed
        above and have string index labels.
    df_sample_metadata : pandas.DataFrame
        Metadata to annotate. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_sample_metadata`` with an added ``immune_fraction``
        column.

    Notes
    -----
    The new column is aligned on the index: a metadata row only gets a value
    when its index label equals a trimmed label from ``df_aliquot_fractions``;
    every other row gets NaN.
    """
    # NOTE(review): "T" is summed together with "T CD4" and "T CD8" -- confirm
    # these are disjoint populations so that T cells are not double counted.
    immune_cell_types = ["B", "Macrophage", "NK", "T", "T CD4", "T CD8"]
    aliquot_immune_fractions = (
        df_aliquot_fractions[immune_cell_types]
        .sum(axis="columns")
        # NOTE(review): presumably strips a fixed-width barcode suffix so the
        # labels match the metadata index -- confirm against the real barcodes.
        .rename(index=lambda x: x[:-5])
    )
    # `assign` takes column names as keyword arguments (not a dict).
    return df_sample_metadata.assign(immune_fraction=aliquot_immune_fractions)


make_labels_for_aliquots(df_tcga_skcm_fractions_from_csx, df_tcga_sample_metadata)

In [19]:
def do_stuff(df_sample_metadata, low_quantile=0.2, high_quantile=0.8):
    """Flag rows as immune-low / immune-high by quantile of ``immune_fraction``.

    Parameters
    ----------
    df_sample_metadata : pandas.DataFrame
        Must contain a numeric ``immune_fraction`` column. It is not modified.
    low_quantile : float, default 0.2
        Rows whose ``immune_fraction`` is at or below this quantile are
        flagged ``immune_low``.
    high_quantile : float, default 0.8
        Rows whose ``immune_fraction`` is at or above this quantile are
        flagged ``immune_high``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input with boolean ``immune_low`` and ``immune_high``
        columns added.

    Raises
    ------
    ValueError
        If the quantiles are not ordered as ``0 <= low <= high <= 1``.
    """
    if not 0 <= low_quantile <= high_quantile <= 1:
        raise ValueError(
            "quantiles must satisfy 0 <= low_quantile <= high_quantile <= 1"
        )
    immune_fraction = df_sample_metadata["immune_fraction"]
    immune_threshold_low = immune_fraction.quantile(low_quantile)
    immune_threshold_high = immune_fraction.quantile(high_quantile)
    # `assign` returns a new frame, so the caller's dataframe stays untouched.
    return df_sample_metadata.assign(
        immune_low=immune_fraction <= immune_threshold_low,
        immune_high=immune_fraction >= immune_threshold_high,
    )


do_stuff(
    make_labels_for_aliquots(df_tcga_skcm_fractions_from_csx, df_tcga_sample_metadata)
)


Unnamed: 0,gene_symbol,aliquot_barcode,tpm,n_cells,malignant_from_one_sample,immune_fraction
0,A1BG,TCGA-3N-A9WB-06A-11R-A38C-07,106.644738,5,True,0.006247
1,A2M,TCGA-3N-A9WB-06A-11R-A38C-07,285.508321,5,True,0.006247
2,A2ML1,TCGA-3N-A9WB-06A-11R-A38C-07,3.088469,5,True,0.006247
3,A4GALT,TCGA-3N-A9WB-06A-11R-A38C-07,1.554341,5,True,0.006247
4,A4GNT,TCGA-3N-A9WB-06A-11R-A38C-07,0.000000,5,True,0.006247
...,...,...,...,...,...,...
5911179,ZYG11A,TCGA-Z2-AA3V-06A-11R-A39D-07,43.598773,5,True,0.300102
5911180,ZYG11B,TCGA-Z2-AA3V-06A-11R-A39D-07,33.301372,5,True,0.300102
5911181,ZYX,TCGA-Z2-AA3V-06A-11R-A39D-07,138.176400,5,True,0.300102
5911182,ZZEF1,TCGA-Z2-AA3V-06A-11R-A39D-07,14.382751,5,True,0.300102


In [None]:
# left off here


# analysis

### analysis - computing stats on each dataset individually

In [None]:
# NOTE(review): `df_immune_fraction_by_aliquot_barcode` and
# `df_rnaseq_with_immune` are not defined in any cell visible in this
# notebook, so this cell will raise NameError on a fresh kernel unless they
# are created elsewhere.
# NOTE(review): the result of the join is not assigned to a name, so it has no
# effect on the lines that follow.
ddf_tcga_skcm_bulk_rnaseq.join(df_immune_fraction_by_aliquot_barcode)

# 20th / 80th percentile cut-offs of the immune fraction.
immune_threshold_low = df_rnaseq_with_immune["immune_fraction"].quantile(0.2)
immune_threshold_high = df_rnaseq_with_immune["immune_fraction"].quantile(0.8)


### analysis - combining all data, then computing stats

In [None]:
# merge real and pseudo data
# Inner join on (aliquot, gene): only pairs present in both tables are kept.
# Non-key columns that exist in both inputs get the `_tcga_skcm` / `_pseudo`
# suffix; `compute_stats` further down reads `tpm_tcga_skcm` and `tpm_pseudo`.
# NOTE(review): assumes the TCGA table has `aliquot_barcode`, `gene_symbol`
# and `tpm` columns -- its schema is not shown in this notebook; confirm.
ddf_bulk_rnaseq_all = ddf_tcga_skcm_bulk_rnaseq.merge(
    ddf_pseudobulk_rnaseq,
    on=["aliquot_barcode", "gene_symbol"],
    how="inner",
    suffixes=["_tcga_skcm", "_pseudo"],
)


In [None]:
ddf_bulk_rnaseq_all


### join these fractions onto bulk rna-seq data


In [None]:
# Attach the per-aliquot immune fraction to every (aliquot, gene) row by
# matching `aliquot_barcode` on the left with `sample_id` on the right.
# NOTE(review): `df_immune_fraction_by_aliquot_barcode` is not defined in any
# cell visible in this notebook -- confirm where it is supposed to come from.
ddf_bulk_rnaseq_all_with_immune = ddf_bulk_rnaseq_all.merge(
    df_immune_fraction_by_aliquot_barcode,
    left_on="aliquot_barcode",
    right_on="sample_id",
)

ddf_bulk_rnaseq_all_with_immune


In [None]:
# compute immune high and low
# 20th / 80th percentile cut-offs of `immune_fraction`.
# NOTE(review): the quantiles are taken over all (aliquot, gene) rows of the
# merged table rather than over one row per aliquot -- confirm this is intended.
immune_threshold_low = ddf_bulk_rnaseq_all_with_immune["immune_fraction"].quantile(0.2)
immune_threshold_high = ddf_bulk_rnaseq_all_with_immune["immune_fraction"].quantile(0.8)


In [None]:
immune_threshold_low, immune_threshold_high


In [None]:
# Boolean flag: row's immune fraction is at or below the low cut-off.
ddf_bulk_rnaseq_all_with_immune["immune_low"] = (
    ddf_bulk_rnaseq_all_with_immune["immune_fraction"] <= immune_threshold_low
)

# Boolean flag: row's immune fraction is at or above the high cut-off.
# Rows strictly between the two cut-offs have both flags False.
ddf_bulk_rnaseq_all_with_immune["immune_high"] = (
    ddf_bulk_rnaseq_all_with_immune["immune_fraction"] >= immune_threshold_high
)


In [None]:
df_bulk_rnaseq_all_with_immune = ddf_bulk_rnaseq_all_with_immune.compute()


In [None]:
df_bulk_rnaseq_all_with_immune["aliquot_barcode"].value_counts()


In [None]:
df_bulk_rnaseq_all_with_immune[["immune_low", "immune_high"]].value_counts()


#### compute stats for each gene

In [None]:
### compute p-values...


def _compare_immune_groups(high_values, low_values):
    """Compare expression between the immune-high and immune-low groups.

    Returns a dict with the two-sided Mann-Whitney U p-value, the ratio of
    group means (high / low), their log transforms, and the -log10 p-value
    signed by the direction of the fold change. Every value is NaN when either
    group is empty, because no comparison is possible.
    """
    if len(high_values) == 0 or len(low_values) == 0:
        return dict(
            pval=np.nan,
            foldchange=np.nan,
            log2foldchange=np.nan,
            neglog10pval=np.nan,
            signedneglog10pval=np.nan,
        )
    # Spell out the alternative so the p-value does not depend on the default
    # of the installed SciPy version.
    pval = scipy.stats.mannwhitneyu(
        high_values.values, low_values.values, alternative="two-sided"
    )[1]
    neglog10pval = -np.log10(pval)
    foldchange = high_values.mean() / low_values.mean()
    log2foldchange = np.log2(foldchange)
    return dict(
        pval=pval,
        foldchange=foldchange,
        log2foldchange=log2foldchange,
        neglog10pval=neglog10pval,
        signedneglog10pval=neglog10pval * np.sign(log2foldchange),
    )


def compute_stats(df):
    """Immune-high vs immune-low expression statistics for one group of rows.

    Intended to be used with ``groupby("gene_symbol").apply(...)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain boolean ``immune_low`` / ``immune_high`` columns and the
        numeric ``tpm_pseudo`` and ``tpm_tcga_skcm`` columns.

    Returns
    -------
    pandas.Series
        ``pval``, ``foldchange``, ``log2foldchange``, ``neglog10pval`` and
        ``signedneglog10pval``, each suffixed with ``_pseudo`` (computed from
        ``tpm_pseudo``) and ``_real`` (computed from ``tpm_tcga_skcm``).
    """
    immune_low = df[df["immune_low"]]
    immune_high = df[df["immune_high"]]

    stats = {}
    for suffix, tpm_column in (("pseudo", "tpm_pseudo"), ("real", "tpm_tcga_skcm")):
        group_stats = _compare_immune_groups(
            immune_high[tpm_column], immune_low[tpm_column]
        )
        for stat_name, value in group_stats.items():
            stats[f"{stat_name}_{suffix}"] = value
    return pd.Series(stats)


# One row of immune-high vs immune-low statistics per gene; `reset_index`
# turns the `gene_symbol` group key back into an ordinary column.
df_gene_stats_by_immune = (
    df_bulk_rnaseq_all_with_immune.groupby("gene_symbol")
    .apply(compute_stats)
    .reset_index()
)


In [None]:
df_gene_stats_by_immune


In [None]:
# how many genes have valid stats?

# For each statistic, log how many genes have a missing (True) vs valid
# (False) value.
for stat_column in (
    "pval_pseudo",
    "foldchange_pseudo",
    "pval_real",
    "foldchange_real",
):
    logger.debug(df_gene_stats_by_immune[stat_column].isna().value_counts())


In [None]:
import plotly.express as px


In [None]:
# Volcano plot for the real TCGA SKCM data: log2 fold change (immune high /
# low) against -log10 p-value, one point per gene.
fig = (
    px.scatter(
        df_gene_stats_by_immune,
        x="log2foldchange_real",
        y="neglog10pval_real",
        title="real (tcga skcm): immune high / low",
        hover_name="gene_symbol",
        hover_data=["foldchange_real", "pval_real"],
    )
    .update_xaxes(range=(-10, 10))
    .update_yaxes(range=(0, 30))
    .update_traces(marker={"size": 3})
)
fig.show(renderer="png", scale=1, width=1000, height=500)


In [None]:
fig.show(renderer="browser")


In [None]:
# Volcano plot for the pseudobulk data: log2 fold change (immune high / low)
# against -log10 p-value, one point per gene.
fig = (
    px.scatter(
        df_gene_stats_by_immune,
        x="log2foldchange_pseudo",
        y="neglog10pval_pseudo",
        title="pseudobulks: immune high / low",
        hover_name="gene_symbol",
        hover_data=["foldchange_pseudo", "pval_pseudo"],
    )
    .update_xaxes(range=(-10, 10))
    .update_yaxes(range=(0, 30))
    .update_traces(marker={"size": 3})
)
fig.show(renderer="png", scale=2, width=800, height=600)


In [None]:
fig.show(renderer="browser")


In [None]:
# Agreement between datasets: signed -log10 p-value of each gene in the real
# data (x) against the pseudobulk data (y), with an OLS trend line.
fig = (
    px.scatter(
        df_gene_stats_by_immune,
        x="signedneglog10pval_real",
        y="signedneglog10pval_pseudo",
        trendline="ols",
        title="signed -log10(p-values): pseudobulks vs tcga skcm",
        hover_name="gene_symbol",
        hover_data=["log2foldchange_real", "log2foldchange_pseudo"],
    )
    .update_xaxes(range=(-25, 25))
    .update_yaxes(range=(-25, 25))
    .update_traces(marker={"size": 3})
)
fig.show(renderer="png", scale=1, width=500, height=500)


In [None]:
fig.show(renderer="browser")


In [None]:
df_gene_stats_by_immune.info()


In [None]:
# Agreement between datasets: log2 fold change of each gene in the real data
# (x) against the pseudobulk data (y), with an OLS trend line. Axis ranges are
# left to plotly's automatic scaling.
fig = px.scatter(
    df_gene_stats_by_immune,
    x="log2foldchange_real",
    y="log2foldchange_pseudo",
    trendline="ols",
    title="log2(fold-change): pseudobulks vs tcga skcm",
    hover_name="gene_symbol",
)
fig.update_traces(marker=dict(size=3))
# Static render only; the interactive browser view is opened by the next cell,
# matching the other plots in this notebook.
fig.show(renderer="png", scale=1, width=500, height=500)


In [None]:
fig.show(renderer="browser")


#### what's the overlap for significant genes between real, pseudo?

In [None]:
# Percentile rank (0-1] of each gene's -log10 p-value within each dataset;
# higher means more significant relative to the other genes.
for percentile_column, source_column in (
    ("percentile_neglog10pval_pseudo", "neglog10pval_pseudo"),
    ("percentile_neglog10pval_real", "neglog10pval_real"),
):
    df_gene_stats_by_immune[percentile_column] = df_gene_stats_by_immune[
        source_column
    ].rank(pct=True)


In [None]:
# Genes whose percentile rank exceeds this value count as "top" genes.
THRESHOLD = 0.9

is_top_pseudo = df_gene_stats_by_immune["percentile_neglog10pval_pseudo"].gt(THRESHOLD)
is_top_real = df_gene_stats_by_immune["percentile_neglog10pval_real"].gt(THRESHOLD)

df_gene_stats_by_immune["top_pseudo"] = is_top_pseudo
df_gene_stats_by_immune["top_real"] = is_top_real
df_gene_stats_by_immune["top_both"] = is_top_pseudo & is_top_real

# 2x2 contingency table: top in pseudobulk (rows) vs top in real data (columns).
pd.crosstab(df_gene_stats_by_immune["top_pseudo"], df_gene_stats_by_immune["top_real"])


In [None]:
# TODO - a good summary stat here would be the odds ratio of this contingency table


In [None]:
# TODO - benjamini hochberg for a FDR-corrected significance test (use FDR of 0.1)


In [None]:
# TODO - also do scatter for fold change


In [None]:
# is it the same genes that showed up in the PCA analysis?
