In [1]:
import numpy as np
import pandas as pd

In [2]:
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io

# Default renderer string names two plotly renderers joined by "+": "jupyterlab" and "png".
plotly.io.renderers.default = "jupyterlab+png"

In [3]:
import helpers

In [4]:
import logging

# Attach one formatted stream handler to the root logger. The handler is looked
# up by name first, so re-running this cell reuses it instead of stacking a
# duplicate handler (which would print every log record more than once).
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
root_logger = logging.getLogger()
handler = next(
    (h for h in root_logger.handlers if h.get_name() == "notebook-stream"),
    None,
)
if handler is None:
    handler = logging.StreamHandler()
    handler.set_name("notebook-stream")
    root_logger.addHandler(handler)
# Always (re)apply the formatter so edits to the format string take effect on re-run.
handler.setFormatter(formatter)

# Emit DEBUG and above from the project's `helpers` package and from this notebook.
logging.getLogger("helpers").setLevel("DEBUG")
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")

In [5]:
# One generator with a fixed seed (0), shared by the cells below that take `rng`.
rng = np.random.default_rng(seed=0)

# load data

## from `gs://liulab/ftp/GSE115978/GSE115978_tpm.csv`

In [None]:
from helpers.datasets import GENE_SYMBOL_COLUMN_NAME, SINGLE_CELL_COLUMN_NAME

# Read the TPM table (first column becomes the index, only the first 300 data
# rows are read), name both axes with the project's column-name constants,
# then order rows and columns by label.
x = (
    pd.read_csv(
        "gs://liulab/ftp/GSE115978/GSE115978_tpm.csv",
        index_col=0,
        nrows=300,
    )
    .rename_axis(index=GENE_SYMBOL_COLUMN_NAME, columns=SINGLE_CELL_COLUMN_NAME)
    .sort_index(axis="rows")
    .sort_index(axis="columns")
)
x

## TCGA SKCM bulk RNA-seq

### from Derek's file

In [None]:
# Load the TCGA SKCM mixtures through the project helper; the optional
# normalization step below is left disabled.
mixtures_tcga_skcm = helpers.datasets.load_tcga_skcm()
# mixtures_tcga_skcm = helpers.creating_mixtures.normalize_expression(mixtures_tcga_skcm, 1_000_000)

In [None]:
# Show the loaded mixtures.
mixtures_tcga_skcm

In [None]:
# Select one sample by its dot-separated ID (the BigQuery cells further down
# query the same barcode written with hyphens).
mixtures_tcga_skcm["TCGA.3N.A9WB.06A.11R.A38C.07"]

In [None]:
# Same sample, narrowed to the entry with row label "A1BG".
mixtures_tcga_skcm["TCGA.3N.A9WB.06A.11R.A38C.07"].loc["A1BG"]

### from BigQuery table (isb-cgc-bq)

```
isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current
```

docs:
1. https://isb-cancer-genomics-cloud.readthedocs.io/en/latest/sections/BigQuery.html
1. https://isb-cgc.appspot.com/bq_meta_search/
1. https://isb-cgc.appspot.com/cohorts/new_cohort/


my queries:
1. https://console.cloud.google.com/bigquery?p=isb-cgc-bq&d=TCGA&t=RNAseq_hg38_gdc_current&page=table&project=text-to-speech-api-329815&ws=!1m5!1m4!4m3!1sisb-cgc-bq!2sTCGA!3sRNAseq_hg38_gdc_current

In [53]:
query = """
select *
from `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current` 
where aliquot_barcode = "TCGA-3N-A9WB-06A-11R-A38C-07"
limit 10
"""

# Fetch a 10-row sample for one aliquot and key it by gene name, keeping
# gene_name available as a regular column too (drop=False).
df = pd.read_gbq(query, project_id="keen-dispatch-316219").set_index(
    "gene_name", drop=False
)

In [None]:
# Show the current contents of `df`.
df

In [52]:
# All columns for the same aliquot, restricted to rows whose gene_type is "TEC".
# The result is only displayed, not stored.
pd.read_gbq(
    """
select 
    *
from 
    `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current` 
where 
    aliquot_barcode = "TCGA-3N-A9WB-06A-11R-A38C-07"
    and gene_type = "TEC"
""",
    project_id="keen-dispatch-316219",
)

Unnamed: 0,project_short_name,case_barcode,primary_site,sample_barcode,aliquot_barcode,gene_name,gene_type,Ensembl_gene_id,Ensembl_gene_id_v,HTSeq__Counts,HTSeq__FPKM,HTSeq__FPKM_UQ,sample_type_name,case_gdc_id,sample_gdc_id,aliquot_gdc_id,file_gdc_id_counts,file_gdc_id_fpkm,file_gdc_id_fpkm_uq,platform
0,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-849N15.3,TEC,ENSG00000279660,ENSG00000279660.1,16,0.224792,5512.420862,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
1,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,CEP83-AS1,TEC,ENSG00000278916,ENSG00000278916.1,48,0.361099,8854.964536,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
2,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-113K21.6,TEC,ENSG00000279900,ENSG00000279900.1,2,0.082619,2025.997601,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
3,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-170K4.2,TEC,ENSG00000279311,ENSG00000279311.1,2,0.019409,475.962014,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
4,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-131H24.5,TEC,ENSG00000279593,ENSG00000279593.1,0,0.000000,0.000000,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1040,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-893F2.18,TEC,ENSG00000279792,ENSG00000279792.1,6,0.049440,1212.379853,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
1041,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-178H8.7,TEC,ENSG00000279696,ENSG00000279696.1,75,0.444285,10894.879232,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
1042,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,CTD-2286N8.1,TEC,ENSG00000279474,ENSG00000279474.1,0,0.000000,0.000000,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina
1043,TCGA-SKCM,TCGA-3N-A9WB,Skin,TCGA-3N-A9WB-06A,TCGA-3N-A9WB-06A-11R-A38C-07,RP11-642A1.1,TEC,ENSG00000280326,ENSG00000280326.1,0,0.000000,0.000000,Metastatic,5564e6a7-2195-4b0d-994e-b0617b58e889,5b0c4070-1e90-4562-85e3-96dd34444e29,4c243ea9-dfe1-42f0-a887-3c901fb38542,48151fe4-7aa5-448f-9a3f-c69b77917c81,6f0f09df-cb9e-4ee6-9f0d-e0c7bb2aa001,a44af3c3-8e6d-4b34-a92c-993d687f0211,Illumina


In [None]:
query = """
SELECT 
    --project_short_name,
    --case_barcode,
    --sample_barcode,
    aliquot_barcode,
    --primary_site,
    gene_name,
    gene_type,
    Ensembl_gene_id,
    HTSeq__Counts,
    HTSeq__FPKM,
    HTSeq__FPKM_UQ,
    sample_type_name,
    platform
FROM `isb-cgc-bq.TCGA.RNAseq_hg38_gdc_current` 
where aliquot_barcode = "TCGA-3N-A9WB-06A-11R-A38C-07"
"""

# Every row for the aliquot (no LIMIT this time), keyed by gene name and
# sorted by that key; gene_name also stays as a column (drop=False).
df = pd.read_gbq(query, project_id="keen-dispatch-316219")
df = df.set_index("gene_name", drop=False).sort_index()

In [None]:
# Show the current contents of `df`.
df

In [None]:
# Rows whose gene_name begins with "PU"; missing names count as non-matches.
df.loc[df["gene_name"].str.startswith("PU", na=False)]

In [None]:
# Rows whose gene_type is exactly "protein_coding".
df.loc[df["gene_type"] == "protein_coding"]

In [None]:
# The distinct gene_type labels, in the order value_counts() returns them.
df["gene_type"].value_counts().index

In [None]:
# Per gene_type: number of genes, summed counts / FPKM / FPKM-UQ, and the
# fraction of genes with a raw count of exactly zero. Largest groups first.
(
    df.groupby("gene_type")
    .agg(
        count=("gene_name", "count"),
        HTSeq__Counts_sum=("HTSeq__Counts", "sum"),
        HTSeq__Counts_zero_frac=(
            "HTSeq__Counts",
            lambda counts: counts.eq(0).mean(),
        ),
        HTSeq__FPKM=("HTSeq__FPKM", "sum"),
        HTSeq__FPKM_UQ=("HTSeq__FPKM_UQ", "sum"),
    )
    .sort_values("count", ascending=False)
)

In [None]:
# Summary statistics of log10(count + 1); the +1 keeps zero counts finite.
df["HTSeq__Counts"].add(1).pipe(np.log10).describe()

In [None]:
# Histogram of log10(count + 1); the +1 keeps zero counts finite. A title and
# an explicit x-axis label are set so the figure says what was transformed
# instead of showing plotly's generic wide-form "value" label.
px.histogram(
    np.log10(df["HTSeq__Counts"] + 1),
    title="Distribution of HTSeq counts (log10 scale)",
    labels={"value": "log10(HTSeq__Counts + 1)"},
)

In [None]:
# Row(s) of `df` with index label "A1BG".
df.loc["A1BG"]

### clinical metadata from isb-cgc-bq

In [15]:
# Every biospecimen row recorded for a single case barcode.
query = """
select *
from `isb-cgc-bq.TCGA.biospecimen_gdc_current`
where case_barcode = "TCGA-3N-A9WB"
"""

# Displayed only; the result is not stored.
pd.read_gbq(query, project_id="keen-dispatch-316219")

Unnamed: 0,sample_barcode,sample_gdc_id,case_barcode,case_gdc_id,sample_type,sample_type_name,program_name,project_short_name,batch_number,bcr,...,max_percent_tumor_cells,max_percent_tumor_nuclei,min_percent_lymphocyte_infiltration,min_percent_monocyte_infiltration,min_percent_necrosis,min_percent_neutrophil_infiltration,min_percent_normal_cells,min_percent_stromal_cells,min_percent_tumor_cells,min_percent_tumor_nuclei
0,TCGA-3N-A9WB-10A,2aad1db1-196a-4699-a5a3-24f8214d8b7d,TCGA-3N-A9WB,5564e6a7-2195-4b0d-994e-b0617b58e889,10,Blood Derived Normal,TCGA,TCGA-SKCM,393,Nationwide Children's Hospital,...,,,,,,,,,,
1,TCGA-3N-A9WB-06A,5b0c4070-1e90-4562-85e3-96dd34444e29,TCGA-3N-A9WB,5564e6a7-2195-4b0d-994e-b0617b58e889,6,Metastatic,TCGA,TCGA-SKCM,393,Nationwide Children's Hospital,...,80.0,80.0,3.0,1.0,0.0,2.0,3.0,17.0,80.0,80.0


In [16]:
# Every biospecimen row for the TCGA-SKCM project, all columns.
query = """
select *
from `isb-cgc-bq.TCGA.biospecimen_gdc_current`
where project_short_name = "TCGA-SKCM"
"""

# Rebinds `df`, replacing whatever frame it held before this cell ran.
df = pd.read_gbq(query, project_id="keen-dispatch-316219")

In [20]:
# Row with index label 0, shown as a Series so each field is listed.
df.loc[0]

sample_barcode                                             TCGA-GN-A4U4-06A
sample_gdc_id                          12dc3f5a-49a4-4f56-935d-eb09a6789523
case_barcode                                                   TCGA-GN-A4U4
case_gdc_id                            6dc9c5b0-77e6-4c53-a75c-8f01731c54e8
sample_type                                                              06
sample_type_name                                                 Metastatic
program_name                                                           TCGA
project_short_name                                                TCGA-SKCM
batch_number                                                            332
bcr                                          Nationwide Children's Hospital
days_to_collection                                                    526.0
days_to_sample_procurement                                             65.0
is_ffpe                                                                  NO
num_portions

In [19]:
# Number of distinct values in each column.
df.nunique()

sample_barcode                         948
sample_gdc_id                          946
case_barcode                           471
case_gdc_id                            470
sample_type                              5
sample_type_name                         5
program_name                             1
project_short_name                       1
batch_number                            14
bcr                                      1
days_to_collection                     429
days_to_sample_procurement             518
is_ffpe                                  1
num_portions                             2
num_slides                               2
avg_percent_lymphocyte_infiltration     18
avg_percent_monocyte_infiltration        9
avg_percent_necrosis                    15
avg_percent_neutrophil_infiltration      4
avg_percent_normal_cells                19
avg_percent_stromal_cells               34
avg_percent_tumor_cells                 27
avg_percent_tumor_nuclei                19
max_percent

In [33]:
# Row count for each sample_type_name.
df["sample_type_name"].value_counts()

Blood Derived Normal     470
Metastatic               369
Primary solid Tumor      104
Solid Tissue Normal        3
Additional Metastatic      2
Name: sample_type_name, dtype: int64

In [37]:
# One entry per (case, sample type, sample barcode) combination, ordered by
# that key; the value is the number of distinct sample barcodes in the group.
(
    df.groupby(["case_barcode", "sample_type_name", "sample_barcode"])["sample_barcode"]
    .nunique()
    .sort_index()
)

case_barcode  sample_type_name      sample_barcode  
TCGA-3N-A9WB  Blood Derived Normal  TCGA-3N-A9WB-10A    1
              Metastatic            TCGA-3N-A9WB-06A    1
TCGA-3N-A9WC  Blood Derived Normal  TCGA-3N-A9WC-10A    1
              Metastatic            TCGA-3N-A9WC-06A    1
TCGA-3N-A9WD  Blood Derived Normal  TCGA-3N-A9WD-10A    1
                                                       ..
TCGA-Z2-A8RT  Metastatic            TCGA-Z2-A8RT-06A    1
TCGA-Z2-AA3S  Blood Derived Normal  TCGA-Z2-AA3S-10A    1
              Metastatic            TCGA-Z2-AA3S-06A    1
TCGA-Z2-AA3V  Blood Derived Normal  TCGA-Z2-AA3V-10A    1
              Metastatic            TCGA-Z2-AA3V-06A    1
Name: sample_barcode, Length: 948, dtype: int64

In [46]:
# Identity columns only, restricted to TCGA-SKCM rows labelled "Metastatic".
query = """
select sample_barcode, sample_gdc_id, case_barcode, case_gdc_id, sample_type, sample_type_name, program_name, project_short_name
from `isb-cgc-bq.TCGA.biospecimen_gdc_current`
where sample_type_name = "Metastatic" and project_short_name = "TCGA-SKCM"
"""

# The query result is stored as `samples_mets`; `df` is not touched here.
samples_mets = pd.read_gbq(query, project_id="keen-dispatch-316219")

In [47]:
# Peek at the first row of the metastatic-sample query result. That frame is
# bound to `samples_mets`; `df` still holds the full biospecimen table on a
# top-to-bottom run, which would show all columns rather than the 8 selected.
samples_mets.loc[0]

sample_barcode                            TCGA-GN-A4U4-06A
sample_gdc_id         12dc3f5a-49a4-4f56-935d-eb09a6789523
case_barcode                                  TCGA-GN-A4U4
case_gdc_id           6dc9c5b0-77e6-4c53-a75c-8f01731c54e8
sample_type                                             06
sample_type_name                                Metastatic
program_name                                          TCGA
project_short_name                               TCGA-SKCM
Name: 0, dtype: object

In [50]:
# Barcodes of the metastatic samples only. `df` is the full TCGA-SKCM
# biospecimen frame on a top-to-bottom run, so filter on sample_type_name
# explicitly; taking df["sample_barcode"] unfiltered would also include the
# blood-derived normals and primary tumours. The index is reset to 0..n-1.
samples_mets = df.loc[
    df["sample_type_name"] == "Metastatic", "sample_barcode"
].reset_index(drop=True)

In [51]:
# Show the current contents of `samples_mets`.
samples_mets

0      TCGA-GN-A4U4-06A
1      TCGA-D3-A2JG-06A
2      TCGA-EE-A29H-06A
3      TCGA-BF-AAP0-06A
4      TCGA-EE-A3AD-06A
             ...       
364    TCGA-EE-A29B-06A
365    TCGA-FS-A4F5-06A
366    TCGA-QB-AA9O-06A
367    TCGA-GN-A261-06A
368    TCGA-EE-A3AF-06A
Name: sample_barcode, Length: 369, dtype: object

## Jerby-Arnon scRNA-seq

In [None]:
# Load the single-cell data and its metadata through the project helper, then
# pass the data through normalize_expression.
# NOTE(review): 1_000_000 is presumably the total each profile is scaled to —
# confirm in helpers.creating_mixtures.
sc_data, sc_metadata = helpers.datasets.load_jerby_arnon()
sc_data = helpers.creating_mixtures.normalize_expression(sc_data, 1_000_000)

In [None]:
# Show the normalized single-cell data.
sc_data

In [None]:
# Show the single-cell metadata.
sc_metadata

## pseudobulks from Jerby-Arnon scRNA-seq

### fractions from csx of TCGA SKCM

In [None]:
# Fractions loaded by the project helper; handed to make_mixtures further down.
fractions = helpers.datasets.load_tcga_skcm_fractions_from_csx()

In [None]:
# Show the loaded fractions.
fractions

### compute pseudobulks

In [None]:
# import importlib
# importlib.reload(helpers.creating_mixtures)

In [None]:
# Build the in-silico mixtures from the single-cell data, its metadata and the
# loaded fractions. The call returns two objects, unpacked here.
# NOTE(review): the shared `rng` is passed in, so the outcome presumably depends
# on how many draws were made from it before this cell — confirm in helpers.
mixtures_in_silico, cell_type_geps = helpers.creating_mixtures.make_mixtures(
    sc_data,
    sc_metadata,
    fractions,
    n_cells_per_gep=5,
    normalization_factor=1_000_000,
    rng=rng,
)

In [None]:
# Call make_cell_type_geps on its own to inspect what it returns; the result is
# displayed, not stored. The shared `rng` is passed in here as well.
helpers.creating_mixtures.make_cell_type_geps(sc_data, sc_metadata, rng=rng)

In [None]:
# Group the metadata rows by the "cell.types" column.
# NOTE: this rebinds `x`, which the data-loading cell near the top used for the
# TPM frame; that frame is no longer reachable under this name afterwards.
x = sc_metadata.groupby("cell.types")

In [None]:
# Print the value stored for each group in GroupBy.groups. `thing` stays bound
# to the last value after the loop ends.
for thing in x.groups.values():
    print(thing)

In [None]:
# Type of the last value left in `thing` by the loop in the preceding cell.
type(thing)

In [None]:
# For each cell type, draw three index labels with the shared generator.
# NOTE(review): `replace` is not passed to rng.choice, so numpy's default
# applies — confirm that default is the intended sampling scheme.
(
    sc_metadata
    .groupby("cell.types")
    .apply(lambda members: list(rng.choice(members.index, size=3)))
)

In [None]:
# Read the cell annotations ("?" in cell.types is treated as missing), map the
# cell-type labels through the project's weird_to_nice mapping, rename the
# "cells" column to the project's single-cell column name, index by that
# column while keeping it as a column (drop=False), and sort by the index.
metadata = (
    pd.read_csv(
        "gs://liulab/ftp/GSE115978/GSE115978_cell.annotations.csv",
        na_values={"cell.types": "?"},
    )
    .replace({"cell.types": helpers.cell_type_naming.weird_to_nice})
    .rename(columns={"cells": helpers.datasets.SINGLE_CELL_COLUMN_NAME})
    .set_index(helpers.datasets.SINGLE_CELL_COLUMN_NAME, drop=False)
    .sort_index()
)
# metadata = metadata.sort_index(axis="columns")

In [None]:
# Show the prepared annotation table.
metadata