## Explore TCGA healthy control samples

We want to answer the following:

* How many healthy (normal tissue) control samples are there per cancer type?
* Do any of the healthy controls have positive labels?
* Are the healthy controls being included in the final dataset? (Probably, but this should be confirmed.)

In [2]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import mpmp.config as cfg
from mpmp.data_models.tcga_data_model import TCGADataModel
import mpmp.utilities.data_utilities as du

In [7]:
# Run the analysis on a single data type for now; the overlap between data
# types could be examined in the future if needed.
# NOTE(review): this comment previously said "expression data", but the key
# passed below is 'me_27k' (presumably the 27K methylation sample info) --
# confirm which data type is intended.
sample_info_df = du.load_sample_info('me_27k')

# list the distinct sample types so the label used for normals is visible
print(sample_info_df['sample_type'].unique())
sample_info_df.head()

['Primary Solid Tumor' 'Recurrent Solid Tumor' 'Additional - New Primary'
 'Metastatic' 'Primary Blood Derived Cancer - Peripheral Blood'
 'Additional Metastatic' 'Solid Tissue Normal']


Unnamed: 0_level_0,sample_type,cancer_type,id_for_stratification
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
TCGA-02-0001-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-0003-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-0006-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-0007-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor
TCGA-02-0009-01,Primary Solid Tumor,GBM,GBMPrimary Solid Tumor


In [16]:
# Count normal (healthy control) vs. cancer samples per cancer type.
#
# all of the normal samples have the term "Normal" in their sample_type
# see: https://gdc.cancer.gov/resources-tcga-users/tcga-code-tables/sample-type-codes
is_normal = sample_info_df.sample_type.str.contains('Normal')

# Select the boolean count column explicitly before summing. A bare
# .groupby(...).sum() over the whole frame relies on pandas silently dropping
# the string columns (sample_type, id_for_stratification); pandas >= 2.0
# instead concatenates those strings per group, which would add unwanted
# columns to the merged table below.
normal_count_df = (sample_info_df
    .assign(normal_count=is_normal)
    .groupby('cancer_type')[['normal_count']]
    .sum()
)

cancer_count_df = (sample_info_df
    .assign(cancer_count=~is_normal)
    .groupby('cancer_type')[['cancer_count']]
    .sum()
)

# one row per cancer type: normal count, cancer count, and the proportion of
# that cancer type's samples that are normal
count_df = (normal_count_df
    .merge(cancer_count_df, left_index=True, right_index=True)
    .assign(normal_prop=lambda df: (
        df.normal_count / (df.cancer_count + df.normal_count)
    ))
)
count_df

Unnamed: 0_level_0,normal_count,cancer_count,normal_prop
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ACC,0,79,0.0
BLCA,21,413,0.048387
BRCA,112,1076,0.094276
CESC,3,308,0.009646
CHOL,9,36,0.2
COAD,70,445,0.135922
DLBC,0,48,0.0
ESCA,15,184,0.075377
GBM,2,432,0.004608
HNSC,50,525,0.086957
