In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

import mpmp.config as cfg
from mpmp.data_models.tcga_data_model import TCGADataModel
import mpmp.utilities.data_utilities as du

In [2]:
# Build the TCGA data model once with the project's default seed; the cells
# below reuse this single instance.
tcga_data = TCGADataModel(
    seed=cfg.default_seed,
    verbose=True,
)

Loading pan-cancer data from cached pickle file...
Loading expression data...
Loading sample info...


In [8]:
def gene_sample_count(gene, data_model, classification='neither'):
    """Count the rows of ``data_model.X_df`` after processing one gene.

    Parameters
    ----------
    gene : str
        Gene identifier. It is also echoed to stderr as a progress marker.
    data_model : object
        Data model to query. It must provide a
        ``process_data_for_gene(gene, classification, <third arg>)`` method
        and expose an ``X_df`` attribute with a ``.shape`` afterwards.
    classification : str, default 'neither'
        Gene classification, forwarded unchanged to the data model.

    Returns
    -------
    tuple
        ``(gene, sample_count)``, where ``sample_count`` is
        ``data_model.X_df.shape[0]``, or ``np.nan`` when processing the gene
        raised ``KeyError``.
    """
    print(gene, file=sys.stderr)
    try:
        # Query the model handed in by the caller instead of the notebook-level
        # `tcga_data` global, so the function does not depend on hidden kernel
        # state and works with whichever data model is passed.
        # NOTE(review): the third positional argument is passed as None; its
        # meaning is defined by process_data_for_gene and is not visible here.
        data_model.process_data_for_gene(gene, classification, None)
        sample_count = data_model.X_df.shape[0]
    except KeyError:
        # Presumably raised when the gene is absent from the loaded data
        # (TODO confirm). Record "no count" rather than aborting the run.
        sample_count = np.nan

    # TODO: get cancer types?
    return (gene, sample_count)

In [5]:
# cache partial results and load them
output_file = Path('./gene_sample_count.tsv')
if output_file.is_file():
    output_df = pd.read_csv(output_file, sep='\t', index_col=0)
else:
    output_df = pd.DataFrame()
    
print(output_df.shape)
output_df.head()

In [9]:
# Sanity-check the helper on a single gene (TP53, classified as 'TSG').
print(
    gene_sample_count('TP53', tcga_data, classification='TSG')
)

TP53


Loading sample IDs for mutation data
Loading sample IDs for expression data
Loading sample IDs for me_27k data
Loading sample IDs for me_27k_bmiq data
Loading sample IDs for me_450k data
Loading sample IDs for rppa data
Loading sample IDs for mirna data
Loading sample IDs for mut_sigs data
Taking intersection of sample IDs...done
('TP53', 3804)


In [6]:
# Load the Vogelstein gene table via the project's data utilities and preview
# its first rows.
vogelstein_df = du.load_vogelstein()
vogelstein_df.head(n=5)

Unnamed: 0,gene,Gene Name,# Mutated Tumor Samples**,Ocogene score*,Tumor Suppressor Gene score*,classification,Core pathway,Process
0,ABL1,"c-abl oncogene 1, receptor tyrosine kinase",851,0.926904,0.003046,Oncogene,Cell Cycle/Apoptosis,Cell Survival
1,ACVR1B,"activin A receptor, type IB",17,0.0,0.423077,TSG,TGF-b,Cell Survival
2,AKT1,v-akt murine thymoma viral oncogene homolog 1,155,0.929487,0.00641,Oncogene,PI3K,Cell Survival
3,ALK,anaplastic lymphoma receptor tyrosine kinase,189,0.72,0.01,Oncogene,PI3K; RAS,Cell Survival
4,APC,adenomatous polyposis coli,2561,0.024553,0.917222,TSG,APC,Cell Fate


In [None]:
# Number of rows between checkpoint writes of the results frame.
save_every = 50

# Walk the gene table row by row; `gene_ix` is the row's index label and
# `gene_series` the row itself.
# NOTE(review): the modulo test assumes the index labels are integers (the
# preview above shows 0, 1, 2, ...) — confirm.
# NOTE(review): within the visible lines the loop body only writes the
# checkpoint; nothing here adds rows to `output_df` or calls
# `gene_sample_count`. Confirm whether the per-gene computation is still to be
# written.
for gene_ix, gene_series in vogelstein_df.iterrows():
    # Write the current results to disk on every `save_every`-th index,
    # skipping index 0.
    if (gene_ix % save_every == 0) and (gene_ix != 0):
        output_df.to_csv(output_file, sep='\t')
    
        