# about

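Optimizing generation of pseudobulk RNA-seq samples: this notebook filters the TCGA SKCM bulk data and the Jerby-Arnon single-cell data to a shared set of genes, then writes simulated mixtures for a range of cells-per-expression-profile settings.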

In [1]:
# when True, TCGA aliquots that are not labelled 'Metastatic' are excluded
mets_only_tcga = True
# genes whose expression is zero in more than this fraction of TCGA SKCM
# samples are excluded
gene_sparsity_ceiling_tcga = 0.5

# imports

In [15]:
import logging

import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io
from upath import UPath as Path

In [3]:
# Route all log records through a stream handler on the root logger, prefixed
# with timestamp, logger name and level.
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler = logging.StreamHandler()
handler.setFormatter(formatter)
root_logger = logging.getLogger()
root_logger.addHandler(handler)

In [4]:
# Verbosity per logger name; a child logger (e.g. "helpers.creating_mixtures")
# carries its own level independently of its parent.
log_levels = {
    "gcsfs": "INFO",
    "google.cloud.bigquery": "DEBUG",
    "helpers": "DEBUG",
    "helpers.creating_mixtures": "INFO",
    "pandas": "DEBUG",
    "pyarrow": "DEBUG",
}
for logger_name, level_name in log_levels.items():
    logging.getLogger(logger_name).setLevel(level_name)

In [5]:
# Logger for messages emitted by this notebook itself; DEBUG and above pass.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

In [6]:
# NumPy random generator with a fixed seed (0); note that the cell that writes
# the mixtures re-creates `rng` with the same seed before using it
rng = np.random.default_rng(seed=0)

# loading data

In [7]:
%%time
# TCGA SKCM bulk expression from the project helper; later cells treat the
# index as genes and the columns as samples
bulk_tcga_skcm = helpers.datasets.load_tcga_skcm_hg19_scaled_estimate_firebrowse()
# rescale in place so that every column sums to 1,000,000
bulk_tcga_skcm *= 1_000_000 / bulk_tcga_skcm.sum()

2022-05-26 05:28:30,539 - helpers.datasets - DEBUG - reading gs://liulab/firebrowse.org/SKCM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt


CPU times: user 15.7 s, sys: 1.01 s, total: 16.7 s
Wall time: 28.5 s


In [8]:
%%time
# TCGA SKCM fractions from the project helper (the helper logs them as
# "estimated by CIBERSORTx"); later cells select rows by TCGA sample label
fractions_tcga_skcm = helpers.datasets.load_tcga_skcm_fractions_from_csx()

2022-05-26 05:28:59,017 - helpers.datasets - DEBUG - loading TCGA SKCM fractions estimated by CIBERSORTx


CPU times: user 13.8 ms, sys: 3.79 ms, total: 17.6 ms
Wall time: 80.4 ms


In [9]:
%%time
# Jerby-Arnon scRNA-seq expression matrix plus its metadata table, requested
# on hg19 in TPM units; later cells treat the expression index as genes
sc_jerby_arnon, sc_metadata_jerby_arnon = helpers.datasets.load_jerby_arnon(
    ref_genome="hg19", units="tpm"
)
# rescale in place so that every column (presumably one cell — TODO confirm)
# sums to 1,000,000
sc_jerby_arnon *= 1_000_000 / sc_jerby_arnon.sum()

2022-05-26 05:28:59,105 - helpers.datasets - DEBUG - loading Jerby-Arnon scRNA-seq data
2022-05-26 05:29:26,880 - helpers.datasets - DEBUG - loading Jerby-Arnon metadata


CPU times: user 55.1 s, sys: 12.8 s, total: 1min 7s
Wall time: 28.4 s


## apply filters to data

In [10]:
# determine gene exclusions
#
# `exclusions_genes` has one row per gene seen in either cohort and one boolean
# column per exclusion reason; a gene is kept only if every column is False.

exclusions_genes = pd.DataFrame(index=bulk_tcga_skcm.index.union(sc_jerby_arnon.index))

## sparsity in tcga
# fraction of TCGA samples in which each gene has exactly zero expression
gene_zero_fraction_tcga_skcm = (bulk_tcga_skcm == 0).mean(axis=1)
genes_sparse_tcga_skcm = gene_zero_fraction_tcga_skcm[
    gene_zero_fraction_tcga_skcm > gene_sparsity_ceiling_tcga
].index
# vectorized membership test instead of a per-gene Python lambda
exclusions_genes["sparse_in_tcga_skcm"] = exclusions_genes.index.isin(
    genes_sparse_tcga_skcm
)

## genes not in both cohorts
genes_in_both = bulk_tcga_skcm.index.intersection(sc_jerby_arnon.index)
exclusions_genes["not_in_both_cohorts"] = ~exclusions_genes.index.isin(genes_in_both)

# genes with no exclusion flag set
good_genes = exclusions_genes.loc[~exclusions_genes.any(axis=1)].index

BigQuery console for the project queried in the next cell: https://console.cloud.google.com/bigquery?authuser=1&project=keen-dispatch-316219

In [11]:
# determine tcga sample exclusions
#
# `exclusions_tcga_samples` has one row per TCGA sample (column of the bulk
# matrix) and one boolean column per exclusion reason; with no reasons
# configured, every sample is kept.

exclusions_tcga_samples = pd.DataFrame(index=bulk_tcga_skcm.columns)

## limit to metastases

if mets_only_tcga:
    query_text = """
        SELECT aliquot_barcode
        FROM `keen-dispatch-316219.gdc_tcga_skcm_subset.aliquot2caseIDmap_current`
        WHERE sample_type_name = 'Metastatic'
    """
    # NOTE(review): pd.read_gbq is deprecated in recent pandas releases in
    # favour of pandas_gbq.read_gbq — confirm against the installed version.
    metastatic_aliquot_barcodes = pd.read_gbq(query_text)["aliquot_barcode"].values
    # one hashed membership test for all samples, rather than a linear scan of
    # the barcode array for every sample
    exclusions_tcga_samples["is_not_metastatic"] = ~exclusions_tcga_samples.index.isin(
        metastatic_aliquot_barcodes
    )

# samples with no exclusion flag set
good_tcga_samples = exclusions_tcga_samples.loc[
    ~exclusions_tcga_samples.any(axis=1)
].index

In [12]:
# apply exclusions
# Dropping genes changes the column totals, so each column is rescaled to sum
# to 1,000,000 again after subsetting.
sc_jerby_arnon_cleaned = sc_jerby_arnon.loc[good_genes]
sc_jerby_arnon_cleaned *= 1_000_000 / sc_jerby_arnon_cleaned.sum()
# select rows and columns in a single .loc call instead of chained indexing
bulk_tcga_skcm_cleaned = bulk_tcga_skcm.loc[good_genes, good_tcga_samples]
bulk_tcga_skcm_cleaned *= 1_000_000 / bulk_tcga_skcm_cleaned.sum()
fractions_tcga_skcm_cleaned = fractions_tcga_skcm.loc[good_tcga_samples]

# report the shapes before and after filtering
print(
    "without exclusions:",
    [df.shape for df in (sc_jerby_arnon, bulk_tcga_skcm, fractions_tcga_skcm)],
)

print(
    "with exclusions:",
    [
        df.shape
        for df in (
            sc_jerby_arnon_cleaned,
            bulk_tcga_skcm_cleaned,
            fractions_tcga_skcm_cleaned,
        )
    ],
)

without exclusions: [(23686, 7186), (20501, 473), (473, 9)]
with exclusions: [(16063, 7186), (16063, 368), (368, 9)]


In [None]:
# let this notebook's DEBUG messages through
logger.setLevel("DEBUG")

# fresh generator seeded with 0; replaces any earlier `rng`
rng = np.random.default_rng(0)
# root location under which this cell writes all of its parquet outputs
uri_base = Path("gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc")
print(uri_base)
# WARNING: destructive — recursively removes everything already stored under
# uri_base before new outputs are written
!gsutil rm -r {str(uri_base)}


def reformat_rna_seq_dataframe(df, sample_column_name, gene_column_name="gene_symbol"):
    """Reshape a wide expression matrix into long (one value per row) form.

    Args:
        df: DataFrame with one row per gene and one column per sample.
        sample_column_name: name of the output column that holds the original
            column labels of ``df``.
        gene_column_name: name of the output column that holds the original
            row labels of ``df``. Defaults to ``"gene_symbol"``.

    Returns:
        DataFrame with three columns, in this order: ``gene_column_name`` and
        ``sample_column_name`` (both of dtype ``category``) and ``"tpm"``
        (the matrix values), with one row per (gene, sample) pair.
    """
    return (
        # Name both axes explicitly so the columns produced by reset_index()
        # are predictable even if the input index is unnamed or named
        # differently (previously that case raised a KeyError in astype()).
        df.rename_axis(index=gene_column_name, columns=sample_column_name)
        # NOTE(review): depending on the pandas version, stack() may drop
        # cells that are NaN — confirm if inputs can contain missing values.
        .stack()
        .to_frame(name="tpm")
        .reset_index()
        .astype(
            {
                gene_column_name: "category",
                sample_column_name: "category",
            }
        )
    )


# Persist the filtered real TCGA SKCM data: the expression matrix in long
# format, then the matching fractions table.
uri_real_tcga_skcm_dir = uri_base / "mixtures_real_tcga_skcm"

# expression values
uri_bulk_tcga_skcm_cleaned = str(uri_real_tcga_skcm_dir / "tpm.parquet")
bulk_tcga_skcm_cleaned_out = reformat_rna_seq_dataframe(
    bulk_tcga_skcm_cleaned, "aliquot_barcode"
)
logger.debug(f"writing real tcga skcm bulk samples to {uri_bulk_tcga_skcm_cleaned}")
bulk_tcga_skcm_cleaned_out.to_parquet(uri_bulk_tcga_skcm_cleaned, engine="pyarrow")

# fractions for the same samples
uri_bulk_tcga_skcm_fractions = str(
    uri_real_tcga_skcm_dir / "estimated_fractions.parquet"
)
logger.debug(
    f"writing fractions of real tcga bulk samples to {uri_bulk_tcga_skcm_fractions}"
)
fractions_tcga_skcm_cleaned.to_parquet(uri_bulk_tcga_skcm_fractions, engine="pyarrow")


# Sweep over the number of cells per expression profile (1 through 20) and both
# settings of `malignant_from_one_sample`, writing one parquet file of
# simulated mixtures per combination (40 files in total).
# The same `rng` object is passed to every call, so the generated data
# presumably depends on the iteration order — TODO confirm in
# helpers.creating_mixtures.
for n_cells in range(1, 21):
    for malignant_from_one_sample in (True, False):
        logger.info(f"{n_cells}, {malignant_from_one_sample}")
        logger.debug("making")
        # mixtures are built from the filtered single-cell data, using the
        # filtered TCGA fractions as the per-sample fractions
        mixtures, cell_type_geps = helpers.creating_mixtures.make_mixtures(
            sc_data=sc_jerby_arnon_cleaned,
            sc_metadata=sc_metadata_jerby_arnon,
            sample_fractions=fractions_tcga_skcm_cleaned,
            n_cells_per_gep=n_cells,
            malignant_from_one_sample=malignant_from_one_sample,
            rng=rng,
        )
        logger.debug("reformatting")
        # wide -> long, with the mixture columns labelled as the TCGA aliquot
        # whose fractions were used
        mixtures_out = reformat_rna_seq_dataframe(
            mixtures, "tcga_aliquot_barcode_for_fractions"
        )
        # key=value directory names encode the sweep parameters in the path
        uri_mixtures = str(
            uri_base
            / "mixtures"
            / f"n_cells={n_cells}"
            / f"malignant_from_one_sample={malignant_from_one_sample}"
            / "data.parquet"
        )
        logger.debug(f"writing mixtures to {uri_mixtures}")
        mixtures_out.to_parquet(
            uri_mixtures,
            engine="pyarrow",
        )
        # NOTE: `cell_type_geps` is currently unused; writing it out is
        # disabled in the commented block below.
        # logger.debug("writing cell type GEPs")
        # for sample_name, cell_type_gep in cell_type_geps.items():
        #     uri_cell_type_geps = str(
        #         uri_base
        #         / "cell_type_geps"
        #         / f"sample_name={sample_name}"
        #         / f"n_cells={n_cells}"
        #         / f"malignant_from_one_sample={malignant_from_one_sample}"
        #         / "data.parquet"
        #     )
        #     cell_type_gep.to_parquet(uri_cell_type_geps, engine="pyarrow")

gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc
Removing gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures_real_tcga_skcm/estimated_fractions.parquet#1653543074488854...
Removing gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures_real_tcga_skcm/tpm.parquet#1653543074158280...
/ [2 objects]                                                                   
Operation completed over 2 objects.                                              


2022-05-26 05:31:59,151 - __main__ - DEBUG - writing mixtures to gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures_real_tcga_skcm/tpm.parquet
2022-05-26 05:32:00,951 - __main__ - INFO - 1, True
2022-05-26 05:32:00,952 - __main__ - DEBUG - making
2022-05-26 05:32:12,452 - __main__ - DEBUG - reformatting
2022-05-26 05:32:13,782 - __main__ - DEBUG - writing mixtures to gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=1/malignant_from_one_sample=True/data.parquet
2022-05-26 05:32:14,850 - __main__ - INFO - 1, False
2022-05-26 05:32:14,850 - __main__ - DEBUG - making
2022-05-26 05:32:27,571 - __main__ - DEBUG - reformatting
2022-05-26 05:32:29,121 - __main__ - DEBUG - writing mixtures to gs://liulab/data/pseudobulk_optimization/3_with_tcga_qc/mixtures/n_cells=1/malignant_from_one_sample=False/data.parquet
2022-05-26 05:32:30,213 - __main__ - INFO - 2, True
2022-05-26 05:32:30,214 - __main__ - DEBUG - making
2022-05-26 05:32:38,065 - __main__ - DEBUG - 