In [3]:
# How many genes pass our filters? And other assorted questions
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pancancer_evaluation.config as cfg
import pancancer_evaluation.utilities.data_utilities as du

In [4]:
# Load the inputs used by the rest of the notebook: the gene label table,
# the pan-cancer data bundle, and the gene expression matrix.
# TODO: this needs to generalize to more than just the top 50 genes
# TODO: loading genes and loading mutation data can probably be decoupled
print('Loading gene label data...', file=sys.stderr)
genes_df = du.load_top_50()

# The pan-cancer loader hands back five objects; keep the bundle around and
# also give each piece its own name.
pancancer_data = du.load_pancancer_data(verbose=True)
sample_freeze_df, mutation_df, copy_loss_df, copy_gain_df, mut_burden_df = pancancer_data

rnaseq_df = du.load_expression_data(verbose=True)

Loading gene label data...
Loading pan-cancer data from cached pickle file...
Loading gene expression data...


In [5]:
# Sanity check on the expression matrix: overall dimensions first, then the
# top-left 5x5 corner as the cell's displayed value.
print(rnaseq_df.shape)
rnaseq_df.head(5).iloc[:, :5]

(11060, 16148)


Unnamed: 0_level_0,1,10,100,1000,10000
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-02-0047-01,125.0,10.4,136.0,2300.0,1300.0
TCGA-02-0055-01,392.0,1.12,222.0,1820.0,903.0
TCGA-02-2483-01,272.0,4.64,256.0,2890.0,1320.0
TCGA-02-2485-01,83.9,20.0,129.0,6970.0,10100.0
TCGA-02-2486-01,108.0,3.66,205.0,2250.0,873.0


In [6]:
# Keep only the genes that appear as columns in all three tables
# (mutation, copy loss, copy gain).
mutation_genes = set(mutation_df.columns)
copy_loss_genes = set(copy_loss_df.columns)
copy_gain_genes = set(copy_gain_df.columns)
overlap_genes = mutation_genes & copy_loss_genes & copy_gain_genes
print('Genes with mutation information: {}'.format(len(overlap_genes)))

Genes with mutation information: 19660


In [8]:
# Read the per-cancer-type sample count table and report how many
# (cancer type, gene) pairs exist before any filtering.
sample_counts_path = Path(cfg.data_dir, 'tcga_sample_counts.tsv').resolve()
cancer_types_df = pd.read_csv(sample_counts_path, sep='\t')

n_possible_combos = len(cancer_types_df) * len(overlap_genes)
print('Total cancer type/mutation combinations: {}'.format(n_possible_combos))
cancer_types_df.head()

Total cancer type/mutation combinations: 648780


Unnamed: 0,cancertype,n =
0,BRCA,1218
1,KIRC,606
2,LUAD,576
3,THCA,572
4,UCEC,567


In [9]:
def filter_cancer_types(gene, y_df, sample_freeze, mutation_burden):
    """Flag, for one gene, which cancer types pass the count/proportion filters.

    Copies most code from process_y_matrix in
    pancancer_utilities.tcga_utilities.

    Note this is not including copy number variants; to do that we have to
    know oncogene/TSG status for every gene (need to figure out where to get
    this info).

    Parameters
    ----------
    gene : str
        Gene identifier. Not used by the computation; kept so existing
        callers do not have to change.
    y_df : pd.Series
        Per-sample status values for the gene, indexed by sample barcode.
        The per-disease sum is treated as the number of mutated samples,
        so values are presumably 0/1 -- TODO confirm against the loader.
    sample_freeze : pd.DataFrame
        Sample metadata with at least 'SAMPLE_BARCODE' and 'DISEASE' columns.
    mutation_burden : pd.DataFrame
        Indexed by sample barcode. It is inner-joined, so samples without a
        mutation burden entry are excluded from the counts.

    Returns
    -------
    pd.DataFrame
        Indexed by 'DISEASE', with a single boolean column
        'disease_included' that is True when the summed status is strictly
        greater than cfg.filter_count AND the mutated proportion is strictly
        greater than cfg.filter_prop.
    """
    status_df = pd.DataFrame(y_df)
    status_df.columns = ['status']
    status_df = (
        status_df.merge(
            sample_freeze, how='left', left_index=True, right_on='SAMPLE_BARCODE'
        )
        .set_index('SAMPLE_BARCODE')
        .merge(mutation_burden, left_index=True, right_index=True)
    )

    # Aggregate only the 'status' column. Summing every merged column and
    # then selecting 'status' wastes work on each call and makes the result
    # depend on how the installed pandas treats non-numeric columns in
    # groupby().sum().
    status_by_disease = status_df.groupby('DISEASE')['status']
    mutated_counts = status_by_disease.sum()
    # size() counts rows per disease, giving the denominator for the
    # proportion on exactly the same index as the counts.
    mutated_proportions = mutated_counts / status_by_disease.size()

    passes_filters = (
        (mutated_counts > cfg.filter_count)
        & (mutated_proportions > cfg.filter_prop)
    )
    return passes_filters.to_frame('disease_included')


# Build the table of valid (cancer type, gene) combinations across all genes.
#
# Per-gene results are gathered in a list and concatenated once at the end:
# calling pd.concat inside the loop re-copies the accumulated frame on every
# iteration, which is quadratic in the number of genes with valid combos.
#
# NOTE: overlap_genes is a set, so the iteration order (and therefore the row
# order of valid_combos_df) is not guaranteed to be the same between kernel
# sessions; the set of rows is.
valid_frames = []
for counter, gene in enumerate(overlap_genes, start=1):
    filter_df = filter_cancer_types(gene, mutation_df.loc[:, gene],
                                    sample_freeze_df, mut_burden_df)
    valid_df = (
        filter_df.query('disease_included')
        .drop(['disease_included'], axis='columns')
        .reset_index()
        .rename({'DISEASE': 'disease'}, axis='columns')
    )
    if len(valid_df) > 0:
        valid_df['gene'] = gene
        valid_frames.append(valid_df)
    # progress goes to stderr so it stays out of the cell's stdout
    if counter % 500 == 0:
        print('{} done'.format(counter), file=sys.stderr)

# pd.concat raises on an empty list, so fall back to an empty frame when no
# gene passes the filters. Per-gene indices are kept as-is (not reset).
valid_combos_df = pd.concat(valid_frames) if valid_frames else pd.DataFrame()

print('done.', file=sys.stderr)

500 done
1000 done
1500 done
2000 done
2500 done
3000 done
3500 done
4000 done
4500 done
5000 done
5500 done
6000 done
6500 done
7000 done
7500 done
8000 done
8500 done
9000 done
9500 done
10000 done
10500 done
11000 done
11500 done
12000 done
12500 done
13000 done
13500 done
14000 done
14500 done
15000 done
15500 done
16000 done
16500 done
17000 done
17500 done
18000 done
18500 done
19000 done
19500 done


Unnamed: 0,disease,gene
0,UCEC,HPS3
0,COAD,NRXN2
1,SKCM,NRXN2
2,STAD,NRXN2
3,UCEC,NRXN2


In [12]:
# Compare the combinations that passed the filters with the full
# cancer type x gene grid, then preview the first rows.
n_total_combos = cancer_types_df.shape[0] * len(overlap_genes)
print(len(valid_combos_df), '/', n_total_combos, 'valid mutation/cancer type combinations')
valid_combos_df.head()

13422 / 648780 valid mutation/cancer type combinations


Unnamed: 0,disease,gene
0,UCEC,HPS3
0,COAD,NRXN2
1,SKCM,NRXN2
2,STAD,NRXN2
3,UCEC,NRXN2


In [30]:
# Restrict the valid combinations to the top-50 gene list and summarize how
# many genes / cancer types still have at least one valid combination.
top_genes_df = du.load_top_50()
top_valid_df = valid_combos_df[valid_combos_df['gene'].isin(top_genes_df.gene)]

# Derive the grid size from the loaded tables instead of hard-coding 50 * 33.
# Assumes cancer_types_df has one row per cancer type -- TODO confirm.
n_top_genes = top_genes_df.gene.nunique()
n_cancer_types = cancer_types_df.shape[0]

print(len(top_valid_df), 'combos out of', n_top_genes * n_cancer_types, 'possibilities')
unique_genes = np.unique(top_valid_df.gene)
print(len(unique_genes), 'genes have valid combinations, out of', n_top_genes)
unique_cancers = np.unique(top_valid_df.disease)
# The original statement here was an unterminated print( call (syntax error).
print(len(unique_cancers), 'cancers have valid combinations, out of', n_cancer_types)
print(unique_cancers)

536 combos out of 1650 possibiliites
50 genes have valid
24 cancers have valid
['BLCA' 'BRCA' 'CESC' 'COAD' 'ESCA' 'GBM' 'HNSC' 'KICH' 'KIRC' 'KIRP'
 'LGG' 'LIHC' 'LUAD' 'LUSC' 'OV' 'PAAD' 'PRAD' 'READ' 'SARC' 'SKCM' 'STAD'
 'THCA' 'UCEC' 'UCS']
