In [1]:
import logging
import os
from pprint import pprint

import dask.dataframe as dd
import numpy as np
import pandas as pd
import scipy.stats
from cloudpathlib import AnyPath as Path

In [2]:
import warnings

# Silence RuntimeWarning for the rest of the session. This installs a global
# filter; it is not scoped with a `warnings.catch_warnings()` context manager.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
# Send every record that reaches the root logger through one stream handler
# whose lines carry timestamp, process id / thread name, logger name and level.
formatter = logging.Formatter(
    fmt="%(asctime)s %(process)d/%(threadName)s %(name)s %(levelname)s - %(message)s"
)
handler = logging.StreamHandler()
handler.setFormatter(formatter)
# Replace (rather than append to) the root handler list so that re-running
# this cell does not produce duplicated log lines.
logging.getLogger().handlers = [handler]

In [4]:
# Logger for messages emitted by this notebook, named after the running module.
logger = logging.getLogger(__name__)

In [5]:
# List what is stored under the pseudobulk_optimization prefix in the bucket.
!gsutil ls gs://liulab/data/pseudobulk_optimization

gs://liulab/data/pseudobulk_optimization/1_no_qc_subset/
gs://liulab/data/pseudobulk_optimization/2_no_qc/
gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/


In [6]:
# Location of the TPM table read below, and a lazy dask handle onto it
# (nothing is loaded until `.compute()` is called).
uri_real_bulks = (
    "gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/"
    "mixtures_real_tcga_skcm/tpm.parquet"
)
real_bulks = dd.read_parquet(uri_real_bulks, engine="pyarrow")

In [7]:
# Materialize the lazy dask dataframe into memory.
df_real_bulks = real_bulks.compute()

In [8]:
# Reshape the long table to genes (rows) x aliquots (columns), correlate the
# aliquot columns against each other, then flatten the square matrix into one
# long Series of correlation values.
# NOTE(review): the stacked matrix keeps the diagonal (each aliquot against
# itself, always 1.0) and both (a, b) and (b, a) copies of every pair, so
# summary statistics over `corrs` are not restricted to distinct pairs.
aliquot_corr_matrix = df_real_bulks.pivot(
    index="gene_symbol",
    columns="aliquot_barcode",
    values="tpm",
).corr()
corrs = aliquot_corr_matrix.stack()

In [9]:
# Summary statistics (count, mean, std, quartiles) over every entry of the
# flattened aliquot-by-aliquot correlation matrix.
tcga_intersample_corr_stats = corrs.describe()
tcga_intersample_corr_stats

count    135424.000000
mean          0.714757
std           0.117332
min           0.123389
25%           0.651614
50%           0.735886
75%           0.799794
max           1.000000
dtype: float64

In [10]:
# Show the mean correlation at full precision.
tcga_intersample_corr_stats["mean"]

0.7147571313366748

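Note: with 368 aliquots there are 368 * 367 / 2 = 67528 distinct sample pairs. The count of 135424 (= 368 * 368) reported above is larger because it includes each aliquot's correlation with itself and counts every pair twice.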

# ended here

In [None]:
def compute_intersample_metrics(df):
    """Summarize correlations between distinct samples' pseudobulk profiles.

    The long-format input is pivoted into a genes x samples matrix, the
    sample-by-sample correlation matrix is computed with ``DataFrame.corr``
    (Pearson by default), and the correlations of distinct sample pairs are
    summarized. Each unordered pair is counted once and self-correlations
    (the diagonal, always 1.0) are excluded so they cannot inflate the
    statistics.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format frame with the columns ``gene_symbol``,
        ``tcga_aliquot_barcode_for_fractions`` and ``tpm_pseudo``. Each
        (gene, sample) combination must appear at most once, otherwise
        ``pivot`` raises ``ValueError``.

    Returns
    -------
    pandas.Series
        ``intersample_corr_median``, ``intersample_corr_mean`` and
        ``intersample_corr_stddev`` (population standard deviation, ddof=0).
        All three are NaN when fewer than two samples are present, and any
        undefined (NaN) pairwise correlation propagates to NaN in the result.
    """
    metric_names = (
        "intersample_corr_median",
        "intersample_corr_mean",
        "intersample_corr_stddev",
    )
    corr_matrix = df.pivot(
        index="gene_symbol",
        columns="tcga_aliquot_barcode_for_fractions",
        values="tpm_pseudo",
    ).corr()
    # k=1 selects the strict upper triangle: every unordered sample pair
    # exactly once, with the diagonal left out.
    pair_rows, pair_cols = np.triu_indices(len(corr_matrix), k=1)
    pairwise = corr_matrix.values[pair_rows, pair_cols]
    if pairwise.size == 0:
        # Fewer than two samples: there is no pair to correlate.
        return pd.Series({name: np.nan for name in metric_names})
    return pd.Series(
        {
            "intersample_corr_median": np.median(pairwise),
            "intersample_corr_mean": np.mean(pairwise),
            "intersample_corr_stddev": np.std(pairwise),
        }
    )


# Output schema (column name, dtype) for the Series that
# compute_intersample_metrics returns, passed to `apply` as `meta`.
# NOTE(review): `merged_groupby` is not defined in the cells visible here;
# presumably a dask groupby object built elsewhere. Confirm it exists on a
# fresh kernel before relying on Restart & Run All.
metadata = [
    (metric_name, "float64")
    for metric_name in (
        "intersample_corr_median",
        "intersample_corr_mean",
        "intersample_corr_stddev",
    )
]
# Apply per group, discard groups with any NaN metric, materialize the
# result, and order it by its index.
results_intersample = (
    merged_groupby.apply(compute_intersample_metrics, meta=metadata)
    .dropna()
    .compute()
    .sort_index()
)