## Random gene sampling

In [1]:
import os

import numpy as np
import pandas as pd

import mpmp.config as cfg
import mpmp.utilities.data_utilities as du

In [2]:
# load sample metadata for the expression data type
sample_info_df = du.load_sample_info('expression', verbose=True)
# load_pancancer_data returns an indexable collection; the element at
# position 1 is used as the mutation data here
mutation_df = du.load_pancancer_data(verbose=True)[1]
print(sample_info_df.shape, mutation_df.shape, sep='\n')

Loading sample info...
Loading pan-cancer data from cached pickle file...


(11060, 3)
(9074, 20938)


In [3]:
# discard the sample info columns that are not needed before joining, so only
# the remaining sample info (used below as 'cancer_type') is attached to the
# mutation data; the join is on the index of both frames
cancer_type_df = sample_info_df.drop(
    columns=['sample_type', 'id_for_stratification']
)
mutations_df = mutation_df.merge(
    cancer_type_df, how='inner', left_index=True, right_index=True
)
print(mutations_df.shape)

(9074, 20939)


In [4]:
# group once by cancer type and reuse the grouping for both aggregates
by_cancer_type = mutations_df.groupby('cancer_type')
# per cancer type: column sums, and number of non-null entries per column
sum_df = by_cancer_type.sum()
count_df = by_cancer_type.count()
# per cancer type: sum divided by count for each column
ratio_df = sum_df / count_df
sum_df.iloc[:5, :5]

Unnamed: 0_level_0,5S_rRNA,A1BG,A1CF,A2M,A2ML1
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACC,0,0,0,0,0
BLCA,0,4,3,15,12
BRCA,0,3,8,12,10
CESC,0,1,4,5,6
CHOL,0,0,1,1,0


In [5]:
# A (cancer type, gene) entry is "valid" when its per-cancer-type sum exceeds
# SUM_THRESHOLD and its sum/count ratio exceeds PROP_THRESHOLD.
SUM_THRESHOLD = 10
PROP_THRESHOLD = 0.1

# Store the boolean masks under their own names instead of overwriting
# sum_df / ratio_df. Overwriting made this cell non-idempotent: on a second
# run the already-boolean sum_df would be compared against SUM_THRESHOLD,
# giving an all-False mask (and therefore an all-False valid_df).
sum_mask_df = (sum_df > SUM_THRESHOLD)
ratio_mask_df = (ratio_df > PROP_THRESHOLD)
valid_df = sum_mask_df & ratio_mask_df

# number of entries passing each individual threshold
print(sum_mask_df.sum().sum())
print(ratio_mask_df.sum().sum())
valid_df.iloc[:5, :5]

40436
2798


Unnamed: 0_level_0,5S_rRNA,A1BG,A1CF,A2M,A2ML1
cancer_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
ACC,False,False,False,False,False
BLCA,False,False,False,False,False
BRCA,False,False,False,False,False
CESC,False,False,False,False,False
CHOL,False,False,False,False,False


In [6]:
# for each gene (column), the number of cancer types (rows) marked valid
valid_counts_per_gene = valid_df.sum()
# total number of valid (cancer type, gene) entries
print(valid_counts_per_gene.sum())
# the ten genes that are valid in the most cancer types
print(valid_counts_per_gene.sort_values(ascending=False).head(10))

2687
TTN       22
TP53      22
MUC16     17
FLG       12
RYR2      12
CSMD3     11
SYNE1     11
LRP1B     11
PIK3CA    11
SPTA1     10
dtype: int64


In [7]:
# minimum number of cancer types in which a gene must be valid to be kept
NUM_CANCERS = 3

# count valid cancer types per gene once, then filter on that count
cancers_per_gene = valid_df.sum()
valid_genes = cancers_per_gene[cancers_per_gene >= NUM_CANCERS]
print(valid_genes.head(10))

ABCA12      3
ABCA13      8
ADAMTS12    3
ADAMTS16    3
ADAMTS20    4
AFF2        3
AHNAK       6
AHNAK2      7
AKAP9       3
ALMS1       3
dtype: int64


In [8]:
# draw a fixed-size random sample of the valid genes, seeded from the project
# config so the same genes are drawn on every run
NUM_SAMPLED_GENES = 50
sampled_genes = valid_genes.sample(
    n=NUM_SAMPLED_GENES,
    random_state=cfg.default_seed,
)
print(sampled_genes.head())

ATRX     3
AHNAK    6
NF1      5
TENM2    4
SDK1     4
dtype: int64


In [9]:
# get oncogene/TSG status from other gene lists
top50_df = du.load_top_50()
vogelstein_df = du.load_vogelstein()
gene_to_class_map = dict(zip(top50_df.gene, top50_df.classification))
for gene in vogelstein_df.gene:
    if gene not in gene_to_class_map:
        gene_to_class_map[gene] = vogelstein_df.loc[vogelstein_df.gene == gene, 'classification'].values[0]
        
print(list(gene_to_class_map.items())[:5])

[('TP53', 'TSG'), ('TTN', 'neither'), ('MUC16', 'neither'), ('PIK3CA', 'Oncogene'), ('CSMD3', 'neither')]


In [10]:
def get_class(gene):
    """Look up a gene's classification in gene_to_class_map.

    Genes that are absent from the map (i.e. not in the other gene lists)
    are labeled 'neither'. This could be done in a more sophisticated way
    in the future, if needed.
    """
    return gene_to_class_map.get(gene, 'neither')
    
# classify every sampled gene, then build a one-column frame indexed by gene
sampled_gene_names = sampled_genes.index.values
classes = list(map(get_class, sampled_gene_names))
random_df = pd.DataFrame(
    {'gene': sampled_gene_names, 'classification': classes}
).set_index('gene')

random_df.head()

Unnamed: 0_level_0,classification
gene,Unnamed: 1_level_1
ATRX,TSG
AHNAK,neither
NF1,TSG
TENM2,neither
SDK1,neither


In [11]:
# write the sampled genes and their classifications as a tab-separated file
# at the path given by cfg.random_genes
random_df.to_csv(cfg.random_genes, sep='\t')