In [1]:
import logging
import os
from pprint import pprint

import dask.dataframe as dd
import helpers
import helpers.creating_mixtures
import numpy as np
import pandas as pd
from cloudpathlib import AnyPath as Path

In [2]:
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)

In [3]:
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

In [4]:
uri = "gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures"

!gsutil ls -lhR {uri} | grep 2022

 31.03 MiB  2022-05-26T05:32:30Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=1/malignant_from_one_sample=False/data.parquet
 30.76 MiB  2022-05-26T05:32:14Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=1/malignant_from_one_sample=True/data.parquet
  42.6 MiB  2022-05-26T05:36:22Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=10/malignant_from_one_sample=False/data.parquet
 42.33 MiB  2022-05-26T05:36:04Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=10/malignant_from_one_sample=True/data.parquet
 42.82 MiB  2022-05-26T05:36:51Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=11/malignant_from_one_sample=False/data.parquet
 42.59 MiB  2022-05-26T05:36:40Z  gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=11/malignant_from_one_sample=True/data.parquet
 43.14 MiB  2022-05-26T05:37:26Z  gs://liulab/data/pseudobulk_optimization/

In [5]:
# df = pd.read_parquet(uri)

df = dd.read_parquet(
    uri,
    engine="pyarrow",  # explicitly specify the pyarrow engine
)

In [6]:
df

Unnamed: 0_level_0,gene_symbol,tcga_aliquot_barcode_for_fractions,tpm,n_cells,malignant_from_one_sample
npartitions=40,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,category[unknown],category[unknown],float64,category[known],category[known]
,...,...,...,...,...
...,...,...,...,...,...
,...,...,...,...,...
,...,...,...,...,...


In [7]:
real = dd.read_parquet(
    "gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures_real_tcga_skcm/tpm.parquet",
    engine="pyarrow",  # explicitly specify the pyarrow engine
)

In [8]:
real

Unnamed: 0_level_0,gene_symbol,aliquot_barcode,tpm
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,category[unknown],category[unknown],float64
,...,...,...


In [9]:
merged_ = dd.merge(
    df,
    real,
    left_on=["gene_symbol", "tcga_aliquot_barcode_for_fractions"],
    right_on=["gene_symbol", "aliquot_barcode"],
    suffixes=["_pseudo", "_real"],
)

In [10]:
merged_

Unnamed: 0_level_0,gene_symbol,tcga_aliquot_barcode_for_fractions,tpm_pseudo,n_cells,malignant_from_one_sample,aliquot_barcode,tpm_real
npartitions=40,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
,category[unknown],category[unknown],float64,category[known],category[known],category[unknown],float64
,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...
,...,...,...,...,...,...,...


In [11]:
columns = [
    "n_cells",
    "malignant_from_one_sample",
    "aliquot_barcode",
    "gene_symbol",
    "tpm_pseudo",
    "tpm_real",
]
merged = merged_[columns].compute()

MemoryError: Unable to allocate 3.52 GiB for an array with shape (2, 236447360) and data type float64

In [None]:
merged

In [None]:
merged.info()

In [None]:
merged

In [None]:
merged_groupby = merged.groupby(["n_cells", "malignant_from_one_sample"])

In [None]:
merged_groupby.groups.keys()  # [["tpm_pseudo", "tpm_real"]]

In [None]:
merged_groupby[["tpm_pseudo", "tpm_real"]].corr("pearson")

In [None]:
merged_groupby[["tpm_pseudo", "tpm_real"]].corr("spearman")

In [None]:
corr_pearson = (
    merged_groupby[["tpm_pseudo", "tpm_real"]]
    .corr("pearson")
    .loc[:, :, "tpm_real"]["tpm_pseudo"]
)
corr_pearson

In [None]:
corr_spearman = (
    merged_groupby[["tpm_pseudo", "tpm_real"]]
    .corr("spearman")
    .loc[:, :, "tpm_real"]["tpm_pseudo"]
)
corr_spearman

In [None]:
import scipy.stats

In [None]:
merged_groupby.agg(func=scipy.stats.ks_2samp)

In [None]:
merged_groupby.agg(func=lambda x: x.shape)

In [None]:
agg_metrics = merged_groupby.apply(
    lambda df: np.corrcoef(df["tpm_pseudo"], df["tpm_real"])[0, 1]
).to_frame(name="corr_pearson")
agg_metrics["corr_spearman"] = merged_groupby.apply(
    lambda df: scipy.stats.spearmanr(df["tpm_pseudo"], df["tpm_real"])[0]
)

In [None]:
agg_metrics