In [1]:
import logging

import helpers
import numpy as np
import pandas as pd

In [8]:
# --- data quality ---
# Not read by any cell shown here; presumably restricts TCGA to metastatic
# samples -- TODO confirm against the helpers that consume it.
mets_only_tcga = True
# A gene is flagged as sparse when its fraction of zero-valued TCGA entries
# is strictly greater than this ceiling.
gene_sparsity_ceiling_tcga = 0.5

# --- pseudobulk generation parameters ---
# presumably the number of cells drawn per cell type -- TODO confirm
n_cells_per_cell_type = 10
# presumably: take all malignant cells for a mixture from one sample -- TODO confirm
malignant_from_one_sample = True

In [None]:
import pathlib

# Local output directory for this notebook (presumably where figures are
# saved); created up front so later writes cannot fail on a missing folder.
figure_path = pathlib.Path("figures-9c")
figure_path.mkdir(parents=True, exist_ok=True)

# Remote data location. This is a gs:// URI kept as a plain string, so there
# is no local directory to create for it.
data_path = "gs://liulab/data/9c"

# Display the figure directory as the cell output.
figure_path


In [2]:
# Attach a stream handler with a timestamped format to the root logger, so
# records propagated from any named logger are emitted through it.
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)

# Notebook-level logger used by later cells (`logger.info(...)`).
# The root logger defaults to WARNING, which would silently drop INFO records,
# so this logger is opted in to INFO explicitly; its records still propagate
# to the root handler attached above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

In [3]:
# Single seeded generator shared by the notebook so random draws are reproducible.
rng = np.random.default_rng(0)

In [4]:
%%time
# Load the TCGA SKCM bulk expression table via the project helper.
bulk_tcga_skcm = helpers.datasets.load_tcga_skcm_hg19_scaled_estimate_firebrowse()
# Rescale in place so that every column sums to 1,000,000
# (columns are presumably samples -- TODO confirm).
bulk_tcga_skcm *= 1_000_000 / bulk_tcga_skcm.sum(axis=0)

CPU times: user 13.6 s, sys: 1.69 s, total: 15.3 s
Wall time: 38.9 s


In [5]:
%%time
# Load the TCGA SKCM cell-type fractions via the project helper
# ("csx" is presumably CIBERSORTx -- TODO confirm).
fractions_tcga_skcm = helpers.datasets.load_tcga_skcm_fractions_from_csx()

CPU times: user 10.4 ms, sys: 3.8 ms, total: 14.2 ms
Wall time: 121 ms


In [6]:
%%time
# Load the Jerby-Arnon single-cell expression table together with its
# metadata, requesting hg19 gene annotations and TPM units.
(
    sc_jerby_arnon,
    sc_metadata_jerby_arnon,
) = helpers.datasets.load_jerby_arnon(ref_genome="hg19", units="tpm")
# Rescale in place so that every column sums to 1,000,000
# (columns are presumably single cells -- TODO confirm).
sc_jerby_arnon *= 1_000_000 / sc_jerby_arnon.sum(axis=0)

CPU times: user 1min 14s, sys: 27.7 s, total: 1min 42s
Wall time: 45.6 s


# calculate exclusions

In [7]:
# determine gene exclusions
# One row per gene present in either cohort; every boolean column added below
# marks one reason for dropping that gene.

exclusions_genes = pd.DataFrame(index=bulk_tcga_skcm.index.union(sc_jerby_arnon.index))

## sparsity in tcga
# A gene is sparse when the fraction of zero entries across its TCGA row is
# strictly greater than the configured ceiling.
genes_sparse_tcga_skcm = bulk_tcga_skcm[
    (bulk_tcga_skcm == 0).mean(axis=1) > gene_sparsity_ceiling_tcga
].index
# Vectorised membership test; yields a plain boolean array for the column.
exclusions_genes["sparse_in_tcga_skcm"] = exclusions_genes.index.isin(
    genes_sparse_tcga_skcm
)

## genes not in both cohorts
genes_in_both = bulk_tcga_skcm.index.intersection(sc_jerby_arnon.index)
exclusions_genes["not_in_both_cohorts"] = ~exclusions_genes.index.isin(genes_in_both)

# Keep only the genes that no exclusion rule flagged.
good_genes = exclusions_genes.loc[~exclusions_genes.any(axis=1)].index

NameError: name 'gene_sparsity_ceiling_tcga' is not defined

# compute pseudobulks

In [None]:
# Sweep the number of cells sampled per cell-type GEP (1..20) and build one
# pseudobulk mixture set per setting.
# NOTE(review): the loop variable rebinds the config constant
# `n_cells_per_cell_type`; after this cell runs it holds 20, not the value
# set in the configuration cell.
# NOTE(review): `sc_jerby_arnon_cleaned` and `fractions_tcga_skcm_cleaned` are
# not defined in the cells shown here -- confirm where they are produced.
for n_cells_per_cell_type in range(1, 21):
    for kth_trial in range(10):
        logger.info(
            f"creating pseudobulk, cell type GEPs for {n_cells_per_cell_type}, trial {kth_trial}"
        )
        # Each iteration overwrites the previous pair; nothing is accumulated.
        (
            bulk_pseudo,
            bulk_pseudo_cell_type_geps,
        ) = helpers.creating_mixtures.make_mixtures(
            sc_jerby_arnon_cleaned,
            sc_metadata_jerby_arnon,
            # Prefix the sample labels so pseudobulks are distinguishable from
            # the TCGA samples whose fractions they mimic.
            fractions_tcga_skcm_cleaned.rename(
                index=lambda sample: f"pseudo_like_{sample}"
            ),
            n_cells_per_gep=n_cells_per_cell_type,
            normalization_factor=1_000_000,
            # Honour the notebook-level config flag instead of a hard-coded True.
            malignant_from_one_sample=malignant_from_one_sample,
            rng=rng,
        )
        # NOTE(review): this exits the trial loop after the first trial, so
        # only trial 0 runs for each setting -- looks like a debugging
        # leftover; confirm before relying on 10 trials.
        break

# evaluate pseudobulks

In [None]:
# Empty results table: no columns yet, one row for each integer 1..20.
# NOTE(review): the index is unnamed and there are no columns, so a later
# groupby on "n_cells_per_cell_type" has nothing to resolve that key against
# until this frame is populated -- confirm the intended layout.
results = pd.DataFrame(index=pd.RangeIndex(start=1, stop=21))

In [None]:
# Mean and standard deviation of every results column for each group.
# The grouping key must exist on `results` as a column or a named index level.
(
    results
    .groupby("n_cells_per_cell_type")
    .agg(["mean", "std"])
)