# <span style='font-family:"Times New Roman"'>**MASTER FILE CREATION**</span>

*Emile Cohen* 
    
*March 2020*

**Goal:** In this Notebook, we create a master file that summarizes all useful information.

The Notebook is divided into 6 parts (parts 1–4 mirror the four sections of our master file):
   
* **1. Patient/Sample Information**
* **2. TP53 Mutations**
* **3. TP53 Copy Numbers**
* **4. TP53 Computed Metrics**
* **5. Subgroup columns creation**
* **6. Merge tables**

**NB1:** In each part, you must run the cells from the beginning in order to initialize the variables.

**NB2:** In order to launch the last script (Merge Tables), you have to define the functions in each part.

**NB3:** All functions used for the plots are located in utils/custom_tools.py

---

In [4]:
%run -i '../../../utils/setup_environment.ipy'

import warnings
warnings.filterwarnings('ignore')
from scipy.stats import fisher_exact, ranksums, chi2, norm
from statsmodels.sandbox.stats.multicomp import multipletests
import matplotlib.gridspec as gridspec
import pickle

# Root of the data directory, given relative to this notebook's location
data_path = '../../../data/'
# Processed tables for the WGD and non-WGD cohorts (loaded in the next cells)
data_wgd = data_path + 'impact-facets-tp53/processed/wgd/'
data_no_wgd = data_path + 'impact-facets-tp53/processed/no_wgd/'

Setup environment... done!


<span style="color:green">✅ Working on **mskimpact_env** conda environment.</span>

In [82]:
# Load the four WGD tables (mutations, cohort, arm-level and gene-level copy number).
# Each TSV carries a leftover 'Unnamed: 0' column, which is discarded right after reading.
maf_cohort_wgd = pd.read_csv(data_wgd + 'maf_cohort_wgd.txt', sep='\t').drop(columns='Unnamed: 0')
cohort_wgd = pd.read_csv(data_wgd + 'cohort_wgd.txt', sep='\t').drop(columns='Unnamed: 0')
arm_level_wgd = pd.read_csv(data_wgd + 'arm_level_wgd.txt', sep='\t').drop(columns='Unnamed: 0')
gene_level_wgd = pd.read_csv(data_wgd + 'gene_level_wgd.txt', sep='\t').drop(columns='Unnamed: 0')

In [83]:
# Load the same four tables for the non-WGD cohort.
# As above, the leftover 'Unnamed: 0' column is discarded right after reading.

maf_cohort_nowgd = pd.read_csv(data_no_wgd + 'maf_cohort_nowgd.txt', sep='\t').drop(columns='Unnamed: 0')
cohort_nowgd = pd.read_csv(data_no_wgd + 'cohort_nowgd.txt', sep='\t').drop(columns='Unnamed: 0')
arm_level_nowgd = pd.read_csv(data_no_wgd + 'arm_level_nowgd.txt', sep='\t').drop(columns='Unnamed: 0')
gene_level_nowgd = pd.read_csv(data_no_wgd + 'gene_level_nowgd.txt', sep='\t').drop(columns='Unnamed: 0')

In [84]:
# Creating keys for mutations in maf files

def _add_mutation_keys(maf):
    """Add the `mut_key` and `sample_mut_key` columns to `maf` in place and return it.

    * `mut_key` is '<Chromosome>_<Start_Position>_<Reference_Allele>_<Tumor_Seq_Allele2>'
      and identifies a mutation independently of the sample.
    * `sample_mut_key` is the Tumor_Sample_Barcode immediately followed by `mut_key`
      (no separator) and identifies a mutation within a sample, so duplicates can be told apart.

    The keys are built column-wise instead of with a row-wise `apply`, which gives the same
    strings (every component goes through `str`) without one Python call per row.
    """
    parts = [maf[col].map(str)
             for col in ('Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2')]
    maf['mut_key'] = parts[0] + '_' + parts[1] + '_' + parts[2] + '_' + parts[3]
    maf['sample_mut_key'] = maf['Tumor_Sample_Barcode'] + maf['mut_key']
    return maf


maf_cohort_wgd = _add_mutation_keys(maf_cohort_wgd)
maf_cohort_nowgd = _add_mutation_keys(maf_cohort_nowgd)

In [22]:
# Load clinical data (tab-separated export stored under cbioportal/raw; covers all samples,
# it is restricted to each cohort in the next cell)
clinical_data = pd.read_csv(data_path + 'cbioportal/raw/mskimpact_clinical_data-2.tsv', sep= '\t')

In [34]:
# Restrict the clinical table to the tumor samples present in each cohort
samples_wgd = list(set(cohort_wgd['tumor_sample']))
samples_nowgd = list(set(cohort_nowgd['tumor_sample']))

# 'Sample ID' in the clinical export is matched against the cohort's tumor_sample values
clinical_wgd = clinical_data.loc[clinical_data['Sample ID'].isin(samples_wgd)]
clinical_nowgd = clinical_data.loc[clinical_data['Sample ID'].isin(samples_nowgd)]

# IMPORTANT: Defining the master type you want
This script can create two master files: one for WGD samples and one for non-WGD samples.
The `cohort_type` parameter below selects which one is built: `'wgd'` or `'no_wgd'`.

In [101]:
# Which master file to build: 'wgd' or 'no_wgd' (the two values the create_* functions test for)
cohort_type = 'wgd'

---
# Patient/Sample Information

In this part, we focus on clinical information exported from CbioPortal.

The following columns are selected:
* Sample_Id
* Tumor_Id
* Patient_Id
* Patient Current Age
* Cancer_Type
* Cancer_Type_Detailed
* Sample_Type
* purity
* ploidy
* Overall Survival Status
* Overall Survival (Months)
* MSI Score
* MSI Type
* Tumor Mutational Burden

In [77]:
def create_sample_info(cohort:str):
    """Build the patient/sample information table of the master file.

    Arguments:
        - cohort: 'wgd' or 'no_wgd'; selects which module-level tables are used
          (cohort_*, maf_cohort_* and clinical_*).

    Returns:
        A DataFrame with one row per sample found in all three sources and the columns
        Sample_Id, Tumor_Id, Patient_Id, Cancer_Type, Cancer_Type_Detailed, Patient_Current_Age,
        Sample_Type, purity, ploidy, Overall_Survival_Months, Overall_Survival_Status,
        MSI_Score, MSI_Type, TMB_Score.

    Raises:
        ValueError: if `cohort` is neither 'wgd' nor 'no_wgd'.
    """
    if cohort == 'wgd':
        cohort_table, maf_cohort, clinical = cohort_wgd, maf_cohort_wgd, clinical_wgd
    elif cohort == 'no_wgd':
        cohort_table, maf_cohort, clinical = cohort_nowgd, maf_cohort_nowgd, clinical_nowgd
    else:
        # Fail early instead of hitting an unrelated error further down
        raise ValueError("cohort must be 'wgd' or 'no_wgd', got %r" % (cohort,))

    cohort_filt = cohort_table[['sample_id', 'tumor_sample', 'patient', 'ploidy']]
    clinical_filt = clinical[['Sample ID', 'Patient Current Age', 'Cancer Type', 'Cancer Type Detailed', 'Sample Type',
                              'Overall Survival (Months)', 'Overall Survival Status', 'MSI Score', 'MSI Type',
                              'Impact TMB Score']]
    # purity is repeated on every mutation row of a sample; the first row per sample is kept
    purity = maf_cohort.drop_duplicates('Tumor_Sample_Barcode')[['Tumor_Sample_Barcode', 'purity']]

    # Merging these files. Both merges are inner joins: a sample missing from the maf or from
    # the clinical table is dropped from the result.
    sample_info = pd.merge(cohort_filt, purity, left_on='tumor_sample', right_on='Tumor_Sample_Barcode')
    sample_info = pd.merge(sample_info, clinical_filt, left_on='tumor_sample', right_on='Sample ID')

    # Source column -> master-file column, in output order
    column_names = {
        'sample_id': 'Sample_Id',
        'tumor_sample': 'Tumor_Id',
        'patient': 'Patient_Id',
        'Cancer Type': 'Cancer_Type',
        'Cancer Type Detailed': 'Cancer_Type_Detailed',
        'Patient Current Age': 'Patient_Current_Age',
        'Sample Type': 'Sample_Type',
        'purity': 'purity',
        'ploidy': 'ploidy',
        'Overall Survival (Months)': 'Overall_Survival_Months',
        'Overall Survival Status': 'Overall_Survival_Status',
        'MSI Score': 'MSI_Score',
        'MSI Type': 'MSI_Type',
        'Impact TMB Score': 'TMB_Score',
    }
    sample_info = sample_info[list(column_names)].rename(columns=column_names)

    return sample_info


In [102]:
# Keep the result in `sample_info` so that later cells can reuse the table, then display it
sample_info = create_sample_info(cohort=cohort_type)
sample_info

Unnamed: 0,Sample_Id,Tumor_Id,Patient_Id,Cancer_Type,Cancer_Type_Detailed,Patient_Current_Age,Sample_Type,purity,ploidy,Overall_Survival_Months,Overall_Survival_Status,MSI_Score,MSI_Type,TMB_Score
0,P-0025956-T01-IM6_P-0025956-N01-IM6,P-0025956-T01-IM6,P-0025956,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,71.0,Primary,0.273767,3.496971,3.584,DECEASED,0.00,Stable,5.3
1,P-0036909-T01-IM6_P-0036909-N01-IM6,P-0036909-T01-IM6,P-0036909,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,47.0,Metastasis,0.391316,2.871793,14.137,LIVING,0.37,Stable,3.5
2,P-0023546-T01-IM6_P-0023546-N01-IM6,P-0023546-T01-IM6,P-0023546,Prostate Cancer,Prostate Neuroendocrine Carcinoma,50.0,Primary,0.865628,3.115253,4.800,DECEASED,2.37,Stable,3.5
3,P-0023546-T02-IM6_P-0023546-N01-IM6,P-0023546-T02-IM6,P-0023546,Prostate Cancer,Prostate Adenocarcinoma,50.0,Primary,0.312907,3.136841,4.800,DECEASED,0.82,Stable,2.6
4,P-0018837-T01-IM6_P-0018837-N01-IM6,P-0018837-T01-IM6,P-0018837,Colorectal Cancer,Colon Adenocarcinoma,60.0,Primary,0.351778,3.727190,34.060,LIVING,0.45,Stable,5.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9893,P-0050750-T01-IM6_P-0050750-N01-IM6,P-0050750-T01-IM6,P-0050750,Pancreatic Cancer,Pancreatic Adenocarcinoma,55.0,Metastasis,0.301523,3.845451,0.099,LIVING,0.05,Stable,5.3
9894,P-0050643-T01-IM6_P-0050643-N01-IM6,P-0050643-T01-IM6,P-0050643,Breast Cancer,Breast Invasive Ductal Carcinoma,53.0,Primary,0.667936,4.419218,1.315,LIVING,0.30,Stable,4.4
9895,P-0050223-T02-IM6_P-0050223-N02-IM6,P-0050223-T02-IM6,P-0050223,Hepatobiliary Cancer,Cholangiocarcinoma,67.0,Primary,0.215128,4.725790,1.940,LIVING,0.00,Stable,3.5
9896,P-0048760-T01-IM6_P-0048760-N01-IM6,P-0048760-T01-IM6,P-0048760,Bone Cancer,Osteosarcoma,18.0,Primary,0.614603,3.947721,3.682,LIVING,5.79,Indeterminate,6.1


# TP53 Mutations
In this part, we focus on tp53 mutation information.

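We gather all mutations per sample and split them into different columns. In the output every column below except `Tumor_Id` carries the `tp53_` prefix (e.g. `tp53_key_1`), and the suffix `_1` to `_5` is the rank of the mutation within the sample: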
* Tumor_Id	
* key_1 (2,3,4,5) --> Mutation key allowing to filter duplicates
* vc_1 (2,3,4,5) --> Variant Classification
* ccf_1 (2,3,4,5) --> Cancer Cell Fraction of the mutation
* vaf_1 (2,3,4,5) --> Variant Allele Frequency of the mutation
* HGVSp_1 (2,3,4,5) --> protein change
* spot_1 (2,3,4,5) --> Integer that defines the spot of the tp53 mutation
* tp53_count --> Number of tp53 mutations of the sample

In [104]:
def f_(x):
    """Collapse the packed mutation strings of one sample group into a single string.

    `x` is the group of rows of one sample. The result keeps one row per input row, with the
    group's Tumor_Sample_Barcode values and, in `muts`, the comma-joined
    'sample_mut_key_vc_ccf_vaf_hgv_spot' strings of the whole group repeated on every row.
    """
    joined_muts = ','.join(x['sample_mut_key_vc_ccf_vaf_hgv_spot'])
    return pd.DataFrame({'Tumor_Sample_Barcode': x['Tumor_Sample_Barcode'],
                         'muts': joined_muts})

def count_tp53_muts(x):
    """Return how many of the five `tp53_key_<i>` slots of row `x` hold a mutation key.

    A slot counts when its value is neither missing (None / NaN) nor empty. NaN is checked
    explicitly because it is truthy in Python: a plain truthiness test would count an empty
    slot as a mutation unless the caller had converted every NaN to None beforehand.
    """
    count = 0
    for i in range(1, 6):
        key = x['tp53_key_' + str(i)]
        if pd.notna(key) and key:
            count += 1
    return count

# Module-level alias of the selected cohort type. The create_* functions of this notebook take
# their own `cohort` parameter, which shadows this name inside them.
cohort = cohort_type


# NOTE: this function needs the sample list of the cohort; pass `sample_info` explicitly or let
# the function build it with create_sample_info()
def create_tp53_muts(cohort:str, sample_info=None, max_muts:int=5):
    '''
    This function aims to gather all tp53 mutation characteristics.
    For each sample we gather the tp53 mutations and their characteristics for all patients.

    Arguments:
        - cohort: 'wgd' or 'no_wgd'; selects the module-level maf table to read.
        - sample_info: optional table with a Tumor_Id column listing every sample of the cohort.
          When omitted it is built with create_sample_info(cohort).
        - max_muts: number of mutation slots per sample (tp53_*_1 .. tp53_*_<max_muts>).

    Returns:
        A DataFrame with one row per sample: Tumor_Id, then for each slot i the columns
        tp53_key_i, tp53_vc_i, tp53_ccf_i, tp53_vaf_i, tp53_HGVSp_i, tp53_spot_i, and finally
        tp53_count. ccf/vaf columns are float64 (NaN when empty); the other slot columns hold
        strings, with None for an empty slot. Samples without any TP53 mutation are appended
        after the mutated ones with tp53_count = 0.

    Raises:
        ValueError: if `cohort` is unknown, or if a sample has more than `max_muts` TP53 mutations.
    '''
    if cohort == 'wgd':
        maf_cohort = maf_cohort_wgd
    elif cohort == 'no_wgd':
        maf_cohort = maf_cohort_nowgd
    else:
        raise ValueError("cohort must be 'wgd' or 'no_wgd', got %r" % (cohort,))

    if sample_info is None:
        sample_info = create_sample_info(cohort=cohort)

    fields = ['key', 'vc', 'ccf', 'vaf', 'HGVSp', 'spot']
    numeric_fields = ('ccf', 'vaf')

    # We keep only the TP53 rows of the maf
    maf_tp53 = maf_cohort[maf_cohort['Hugo_Symbol'] == 'TP53']

    # One row per mutation, already in the representation used by the master file.
    # Text fields go through str(), so a missing HGVSp / spot is stored as the string 'nan'.
    long = pd.DataFrame({
        'Tumor_Id': maf_tp53['Tumor_Sample_Barcode'],
        'key': maf_tp53['sample_mut_key'].map(str),
        'vc': maf_tp53['Variant_Classification'].map(str),
        'ccf': maf_tp53['ccf_expected_copies'].astype('float64'),
        'vaf': maf_tp53['t_var_freq'].astype('float64'),
        'HGVSp': maf_tp53['HGVSp'].map(str),
        # the spot is the first run of digits of the protein change (e.g. 'p.Arg273Cys' -> '273')
        'spot': maf_tp53['HGVSp'].str.extract(r'(\d+)', expand=False).map(str),
    })
    # Rank of each mutation within its sample, following the row order of the maf
    long['slot'] = long.groupby('Tumor_Id', sort=False).cumcount() + 1

    if len(long) and int(long['slot'].max()) > max_muts:
        raise ValueError('a sample has %d TP53 mutations but only %d slots are available'
                         % (int(long['slot'].max()), max_muts))

    # One row per mutated sample (the row of its first mutation keeps its original index) ...
    final = long.loc[long['slot'] == 1, ['Tumor_Id']].copy()
    # ... followed by the cohort samples that are not tp53 positive
    seen = set(final['Tumor_Id'])
    missing = [s for s in sample_info['Tumor_Id'].drop_duplicates() if s not in seen]
    if missing:
        final = pd.concat([final, pd.DataFrame({'Tumor_Id': missing})])

    # We spread the mutations of each sample over the tp53_<field>_<slot> columns
    tumor_ids = final['Tumor_Id'].tolist()
    for i in range(1, max_muts + 1):
        slot_i = long[long['slot'] == i].set_index('Tumor_Id')
        for field in fields:
            lookup = slot_i[field].to_dict()
            values = [lookup.get(tumor_id) for tumor_id in tumor_ids]
            column = 'tp53_%s_%d' % (field, i)
            if field in numeric_fields:
                # None (empty slot) becomes NaN so the column is a plain float64
                final[column] = pd.Series(values, dtype='float64').values
            else:
                final[column] = values

    # Last column: number of tp53 mutations of the sample (= number of filled key slots)
    key_columns = ['tp53_key_%d' % i for i in range(1, max_muts + 1)]
    final['tp53_count'] = final[key_columns].notna().sum(axis=1)

    return final

In [105]:
# Display the TP53 mutation table for the selected cohort (the result is not stored)
create_tp53_muts(cohort=cohort_type)

Unnamed: 0,Tumor_Id,tp53_key_1,tp53_vc_1,tp53_ccf_1,tp53_vaf_1,tp53_HGVSp_1,tp53_spot_1,tp53_key_2,tp53_vc_2,tp53_ccf_2,tp53_vaf_2,tp53_HGVSp_2,tp53_spot_2,tp53_key_3,tp53_vc_3,tp53_ccf_3,tp53_vaf_3,tp53_HGVSp_3,tp53_spot_3,tp53_key_4,tp53_vc_4,tp53_ccf_4,tp53_vaf_4,tp53_HGVSp_4,tp53_spot_4,tp53_key_5,tp53_vc_5,tp53_ccf_5,tp53_vaf_5,tp53_HGVSp_5,tp53_spot_5,tp53_count
8,P-0036909-T01-IM6,P-0036909-T01-IM617_7577121_G_A,Missense_Mutation,0.798,0.312169,p.Arg273Cys,273,,,,,,,,,,,,,,,,,,,,,,,,,1
13,P-0023546-T01-IM6,P-0023546-T01-IM617_7578442_T_C,Missense_Mutation,0.933,0.845070,p.Tyr163Cys,163,,,,,,,,,,,,,,,,,,,,,,,,,1
16,P-0023546-T02-IM6,P-0023546-T02-IM617_7578442_T_C,Missense_Mutation,1.000,0.636735,p.Tyr163Cys,163,,,,,,,,,,,,,,,,,,,,,,,,,1
21,P-0018837-T01-IM6,P-0018837-T01-IM617_7578406_C_T,Missense_Mutation,1.000,0.325843,p.Arg175His,175,,,,,,,,,,,,,,,,,,,,,,,,,1
48,P-0019444-T01-IM6,P-0019444-T01-IM617_7578461_C_A,Missense_Mutation,1.000,0.169002,p.Val157Phe,157,,,,,,,,,,,,,,,,,,,,,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3190,P-0029543-T01-IM6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3191,P-0023447-T01-IM6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3192,P-0002901-T01-IM3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0
3193,P-0027887-T01-IM6,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0


# TP53 Copy Numbers

In this part, we gather the information from gene_level table.
We create the following columns:
* Sample_Id 
* tcn --> total copy number
* mcn --> major copy number
* lcn --> lower copy number
* seg_length --> length of the segment
* cn_state --> copy number state
* cf --> Cell fraction of the cn_state
* wgd --> Whole Genome Doubling (True or False)

In [119]:
def create_copy_number_state(cohort:str):
    """Build the TP53 copy-number table of the master file.

    Arguments:
        - cohort: 'wgd' or 'no_wgd'; selects the module-level gene_level_* and cohort_* tables.

    Returns:
        A DataFrame with the columns Sample_Id, tp53_tcn, tp53_mcn, tp53_lcn, tp53_seg_length,
        tp53_cn_state, tp53_cf and wgd, restricted to the gene-level rows of TP53 whose sample
        is also present in the cohort table (inner join).

    Raises:
        ValueError: if `cohort` is neither 'wgd' nor 'no_wgd'.
    """
    if cohort == 'wgd':
        cohort_table, gene_level = cohort_wgd, gene_level_wgd
    elif cohort == 'no_wgd':
        cohort_table, gene_level = cohort_nowgd, gene_level_nowgd
    else:
        raise ValueError("cohort must be 'wgd' or 'no_wgd', got %r" % (cohort,))

    # We want TP53 locus so we have to filter the gene, keeping only the columns of interest
    tp53_level = gene_level.loc[gene_level['gene'] == 'TP53',
                                ['sample', 'tcn', 'mcn', 'lcn', 'seg_length', 'cn_state', 'cf.em']]

    # We rename the columns to their master-file names
    tp53_level = tp53_level.rename(columns={'cf.em': 'tp53_cf',
                                            'sample': 'Sample_Id',
                                            'tcn': 'tp53_tcn',
                                            'mcn': 'tp53_mcn',
                                            'lcn': 'tp53_lcn',
                                            'seg_length': 'tp53_seg_length',
                                            'cn_state': 'tp53_cn_state'})

    # We add WGD information
    wgd = cohort_table[['sample_id', 'wgd']]

    final = pd.merge(tp53_level, wgd, left_on='Sample_Id', right_on='sample_id').drop(['sample_id'], axis=1)

    return final

In [120]:
%%time 
# Build the TP53 copy-number table for the selected cohort, keep it and display it
copy_number_info = create_copy_number_state(cohort = cohort_type)
copy_number_info

CPU times: user 19.6 ms, sys: 2.26 ms, total: 21.8 ms
Wall time: 20.4 ms


Unnamed: 0,Sample_Id,tp53_tcn,tp53_mcn,tp53_lcn,tp53_seg_length,tp53_cn_state,tp53_cf,wgd
0,P-0025956-T01-IM6_P-0025956-N01-IM6,3,2.0,1.0,80668431,LOSS AFTER,0.169170,True
1,P-0036909-T01-IM6_P-0036909-N01-IM6,2,2.0,0.0,18101929,LOSS BEFORE,0.306121,True
2,P-0023546-T01-IM6_P-0023546-N01-IM6,3,3.0,0.0,25250470,CNLOH BEFORE & LOSS,0.835504,True
3,P-0023546-T02-IM6_P-0023546-N01-IM6,2,2.0,0.0,25237770,LOSS BEFORE,0.312907,True
4,P-0018837-T01-IM6_P-0018837-N01-IM6,3,3.0,0.0,25231975,CNLOH BEFORE & LOSS,0.257400,True
...,...,...,...,...,...,...,...,...
9893,P-0050750-T01-IM6_P-0050750-N01-IM6,2,2.0,0.0,13849975,LOSS BEFORE,0.286332,True
9894,P-0050643-T01-IM6_P-0050643-N01-IM6,3,3.0,0.0,11917531,CNLOH BEFORE & LOSS,0.545007,True
9895,P-0050223-T02-IM6_P-0050223-N02-IM6,4,2.0,2.0,80668650,TETRAPLOID,,True
9896,P-0048760-T01-IM6_P-0048760-N01-IM6,3,3.0,0.0,7438283,CNLOH BEFORE & LOSS,0.598599,True


# Computed metrics
In this part we define functions to be applied on the master file to compute specific metrics.


In [None]:
def create_gene_count(maf_cohort):
    '''
    This function creates the count of distinct mutated genes for each sample.

    Arguments:
        - maf_cohort: table with at least the columns Sample_Id and Gene_Id, one row per
          mutation; a sample without mutation is expected as a row with a missing Gene_Id.

    Returns:
        A DataFrame with the columns Sample_Id and gene_count (integer), one row per sample,
        sorted by Sample_Id. Missing Gene_Id values are not counted, so a sample that only has
        missing Gene_Id rows gets gene_count = 0, and a sample that has both real and missing
        Gene_Id rows appears once, with the count of its real genes.
    '''
    gene_count = (
        maf_cohort.groupby('Sample_Id')['Gene_Id']
        .nunique()                 # distinct non-missing genes per sample
        .rename('gene_count')
        .reset_index()
    )

    return gene_count

def create_mut_count(maf_cohort):
    '''
    Return the per-sample mutation count table.

    The Sample_Id, Tumor_Id and Gene_Id columns of `maf_cohort` are handed to get_groupby(),
    grouped on 'Sample_Id' with 'mutation_count' as the name of the count.
    '''
    return get_groupby(maf_cohort[['Sample_Id','Tumor_Id', 'Gene_Id']], 'Sample_Id', 'mutation_count')


def get_driver_count(x, maf_cohort):
    '''
    Return the number of driver mutations of the sample of row `x`.

    The rows of `maf_cohort` whose Sample_Id equals x.Sample_Id are grouped on their
    'oncogenic' value with get_groupby(); the counts found under the labels 'Oncogenic',
    'Likely Oncogenic' and 'Predicted Oncogenic' are added up (an absent label counts as 0).
    '''
    sample_rows = maf_cohort[maf_cohort['Sample_Id'] == x.Sample_Id]
    per_label = get_groupby(sample_rows, 'oncogenic', 'count')

    total = 0
    for label in ('Oncogenic', 'Likely Oncogenic', 'Predicted Oncogenic'):
        if label in per_label.index:
            total += int(per_label.loc[label])
    return total

def create_driver_count(maf_cohort):
    '''
    Count the driver mutations of every sample of `maf_cohort`.

    Returns a DataFrame with one row per distinct Sample_Id and the columns Sample_Id and
    driver_count, the latter computed row by row with get_driver_count().
    '''
    unique_samples = list(set(maf_cohort.Sample_Id))
    # driver_count starts empty and is filled by the row-wise apply below
    driver_count = pd.DataFrame({'Sample_Id': unique_samples}, columns=['Sample_Id', 'driver_count'])
    driver_count['driver_count'] = driver_count.apply(get_driver_count, maf_cohort=maf_cohort, axis=1)

    return driver_count
    

# The following function needs to be called on the complete master file because it needs info from different parts
# It computes the expected number of tp53 mutant copies in a cell
def create_copies_tp53_muts(master):
    """Add the columns tp53_exp_nb_1 .. tp53_exp_nb_5 to `master` (in place) and return it.

    For each mutation slot i:
        tp53_exp_nb_i = (tp53_vaf_i / purity) * (tp53_tcn * purity + 2 * (1 - purity))

    `master` must contain the columns purity, tp53_tcn and tp53_vaf_1 .. tp53_vaf_5.
    An empty slot (NaN vaf) gives NaN. The computation is done column-wise, so a purity of 0
    yields inf/NaN instead of raising.
    """
    purity = master['purity']
    # Second factor of the formula, shared by the five slots
    locus_copies = master['tp53_tcn'] * purity + 2 * (1 - purity)

    for i in range(1, 6):
        master['tp53_exp_nb_%d' % i] = (master['tp53_vaf_%d' % i] / purity) * locus_copies

    return master


# The following computes the expected number of copies of tp53 residuals 
def create_tp53_res(master):
    """Add the columns tp53_res_1 .. tp53_res_5 to `master` (in place) and return it.

    For each mutation slot i: tp53_res_i = total copy number - tp53_exp_nb_i, so `master` must
    already contain tp53_exp_nb_1 .. tp53_exp_nb_5.

    The total copy number is read from `tp53_tcn`, the name this column has in the master
    file; a table that only has a plain `tcn` column is still accepted.
    """
    tcn_column = 'tp53_tcn' if 'tp53_tcn' in master.columns else 'tcn'

    for i in range(1, 6):
        master['tp53_res_%d' % i] = master[tcn_column] - master['tp53_exp_nb_%d' % i]

    return master



# The following functions allow to group the Mutation Types
_VC_TRUNCATED = ('Splice_Site', 'Intron', 'Nonsense_Mutation', 'Splice_Region', 'Frame_Shift_Del', 'Frame_Shift_Ins')
_VC_IN_FRAME = ('In_Frame_Ins', 'In_Frame_Del')
_VC_MISSENSE = ('Missense_Mutation',)
# Missense spots reported under their own name, and spots pooled under 'hotspot'.
# Spots are compared as strings.
_SPOTS_NAMED = ('273', '248', '175')
_SPOTS_HOTSPOT = ('245', '282', '213', '352', '220', '196')


def _vc_group(x, slot):
    """Return the mutation group of slot `slot` (1..5) of row `x`.

    Reads x.tp53_vc_<slot> and, for missense mutations only, x.tp53_spot_<slot>.
    Returns 'truncated', 'in_frame', the spot itself for 273/248/175, 'hotspot' for the other
    listed hotspots, 'missense' for any other missense mutation, and None when the variant
    classification belongs to none of the groups (e.g. an empty slot).
    """
    variant_class = getattr(x, 'tp53_vc_%d' % slot)
    if variant_class in _VC_TRUNCATED:
        return 'truncated'
    if variant_class in _VC_IN_FRAME:
        return 'in_frame'
    if variant_class in _VC_MISSENSE:
        spot = getattr(x, 'tp53_spot_%d' % slot)
        if spot in _SPOTS_NAMED:
            return spot
        if spot in _SPOTS_HOTSPOT:
            return 'hotspot'
        return 'missense'
    return None


def vc_group_cond_1(x):
    """Mutation group of the 1st tp53 mutation of row `x` (see _vc_group)."""
    return _vc_group(x, 1)


def vc_group_cond_2(x):
    """Mutation group of the 2nd tp53 mutation of row `x` (see _vc_group)."""
    return _vc_group(x, 2)


def vc_group_cond_3(x):
    """Mutation group of the 3rd tp53 mutation of row `x` (see _vc_group)."""
    return _vc_group(x, 3)


def vc_group_cond_4(x):
    """Mutation group of the 4th tp53 mutation of row `x` (see _vc_group)."""
    return _vc_group(x, 4)


def vc_group_cond_5(x):
    """Mutation group of the 5th tp53 mutation of row `x` (see _vc_group)."""
    return _vc_group(x, 5)

        
        
# The following functions are for WGD cohort, computing the tp53 allelic state before WGD
def get_bi_nobi(x):
    """Classify one row as 'bi', 'no_bi' or 'uncertain' from its TP53 copy-number state.

    Reads x.tumor_sample, x.tp53_cn_state and x.tp53_count from the row, then looks up the
    rows of `maf_cohort` whose Tumor_Sample_Barcode equals x.tumor_sample and compares their
    `tp53_res` value(s) with thresholds that depend on the copy-number state.

    `maf_cohort` is not a parameter: the name is resolved outside this function at call
    time, so a suitable table must exist under that name before the function is applied.
    It must provide the columns Tumor_Sample_Barcode and tp53_res, plus tp53_timing_wgd for
    samples with 2 or 3 rows.

    Returns:
        'bi', 'no_bi' or 'uncertain'. Samples with 0 or more than 3 rows get 'uncertain'.
        None is returned implicitly when no branch matches, e.g. a cn_state that is not
        listed, or (1-mutation case) a tp53_res lying exactly on thr - 0.1 / thr + 0.1
        because all comparisons there are strict.
    """
    tumor = x.tumor_sample
    cn_state = x.tp53_cn_state
    tp53_count = x.tp53_count  # read but not used below
    # NOTE(review): every row of maf_cohort for this barcode is counted as a tp53 mutation —
    # presumably maf_cohort is already restricted to TP53 rows; confirm against the caller.
    maf_muts = maf_cohort[maf_cohort['Tumor_Sample_Barcode'] == tumor]
    nb_tp53muts = maf_muts.shape[0]
    
    # Samples with only one tp53 mutation
    if nb_tp53muts == 1:
        tp53_res = maf_muts.tp53_res.values[0]
        
        if cn_state == 'LOSS BEFORE' or cn_state == 'CNLOH BEFORE & LOSS':
            # 0.1-wide band on each side of the threshold is reported as 'uncertain'
            thr = 0.5
            if tp53_res < thr - 0.1:
                return 'bi'
            elif (tp53_res < thr + 0.1) and (tp53_res > thr - 0.1):
                return 'uncertain'
            elif tp53_res > thr + 0.1:
                return 'no_bi'

        elif cn_state == 'CNLOH BEFORE':
            # hard threshold here, no 'uncertain' band
            thr = 1.5
            if tp53_res < thr:
                return 'bi'
            elif tp53_res >= thr:
                return 'no_bi'

        elif cn_state in ['LOSS AFTER','DOUBLE LOSS AFTER','TETRAPLOID','CNLOH AFTER']: 
            return 'no_bi'

        elif cn_state == 'CNLOH BEFORE & GAIN':
            thr = 1.5
            if tp53_res < thr - 0.1:
                return 'bi'
            elif (tp53_res < thr + 0.1) and (tp53_res > thr - 0.1):
                return 'uncertain'
            elif tp53_res > thr + 0.1:
                return 'no_bi'
            
            
            
    # Samples with 2 tp53 mutations
    elif nb_tp53muts == 2:
        tp53_res_1 = maf_muts.tp53_res.values[0]
        tp53_res_2 = maf_muts.tp53_res.values[1]
        # timings are read (the column must exist) but not used in the decisions below
        timing_1 = maf_muts.tp53_timing_wgd.values[0]
        timing_2 = maf_muts.tp53_timing_wgd.values[1]
        
        if cn_state == 'LOSS BEFORE':
            thr = 0.5
            if (tp53_res_1 < thr - 0.1) or (tp53_res_2 < thr - 0.1):
                return 'bi'
            elif (tp53_res_1 > thr + 0.1) and (tp53_res_2 > thr + 0.1):
                return 'no_bi'
            else: return 'uncertain'
            
        # plain `if` rather than `elif`: equivalent here, the branch above always returns
        if cn_state == 'CNLOH BEFORE & LOSS':
            thr_1 = 0.5
            thr_2 = 1.5
            if (tp53_res_1 < thr_1 - 0.1) or (tp53_res_2 < thr_1 - 0.1):
                return 'bi'
            elif (tp53_res_1 < thr_2 and tp53_res_2 < thr_2):
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state == 'CNLOH BEFORE':
            thr_1 = 1.5
            thr_2 = 2.5
            if (tp53_res_1 < thr_1) or (tp53_res_2 < thr_1):
                return 'bi'
            elif (tp53_res_1 < thr_2 and tp53_res_2 < thr_2):
                return 'bi'
            else: return 'no_bi'

        elif cn_state =='LOSS AFTER': 
            thr = 1.5
            if (tp53_res_1 < thr) and (tp53_res_2 < thr):
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state =='DOUBLE LOSS AFTER': 
            thr = 0.5
            if (tp53_res_1 < thr) and (tp53_res_2 < thr):
                return 'bi'
            else: return 'no_bi'
        
        elif cn_state =='TETRAPLOID': 
            thr = 2.5
            if (tp53_res_1 < thr) and (tp53_res_2 < thr):
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state =='CNLOH AFTER': 
            thr = 1.5
            if (tp53_res_1 < thr) and (tp53_res_2 < thr):
                return 'bi'
            else: return 'no_bi'
        
        elif cn_state == 'CNLOH BEFORE & GAIN':
            thr_1 = 1.5
            thr_2 = 2.5
            if tp53_res_1 < thr_1 - 0.1 or tp53_res_2 < thr_1 - 0.1:
                return 'bi'
            elif (tp53_res_1 > thr_2) and (tp53_res_2 > thr_2):
                return 'no_bi'
            else: return 'uncertain'
            
            
    # Samples with 3 tp53 mutations
    elif nb_tp53muts == 3:
        tp53_res_1 = maf_muts.tp53_res.values[0]
        tp53_res_2 = maf_muts.tp53_res.values[1]
        tp53_res_3 = maf_muts.tp53_res.values[2]
        tp53_res = [tp53_res_1,tp53_res_2,tp53_res_3]
        # ascending order: tp53_res[0] is the smallest value, tp53_res[1] the middle one
        tp53_res.sort()
        
        timing_1 = maf_muts.tp53_timing_wgd.values[0]
        timing_2 = maf_muts.tp53_timing_wgd.values[1]
        timing_3 = maf_muts.tp53_timing_wgd.values[2]
        timings = [timing_1, timing_2, timing_3]
        BEF = [1 for x in timings if x=='BEFORE']
        
        # Number of mutations happening before wgd
        # (computed but not used in the decisions below)
        nb_bef = len(BEF)
        
        if cn_state == 'LOSS BEFORE':
            thr = 0.5
            if (min(tp53_res) < thr - 0.1):
                return 'bi'
            # NOTE(review): this uses max(), i.e. one value above the band is enough for
            # 'no_bi', whereas the 2-mutation branch requires both values above it — confirm
            # this difference is intended.
            elif (max(tp53_res) > thr + 0.1) :
                return 'no_bi'
            else: return 'uncertain'
            
        # plain `if` rather than `elif`: equivalent here, the branch above always returns
        if cn_state == 'CNLOH BEFORE & LOSS':
            thr_1 = 0.5
            thr_2 = 1.5
            if (min(tp53_res) < thr_1 - 0.1):
                return 'bi'
            elif (tp53_res[1] < thr_2): # we want that exactly 2 mutation arose before WGD
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state == 'CNLOH BEFORE':
            thr_1 = 1.5
            thr_2 = 2.5
            if (min(tp53_res) < thr_1):
                return 'bi'
            elif (tp53_res[1] < thr_2):
                return 'bi'
            else: return 'no_bi'

        elif cn_state =='LOSS AFTER': 
            thr = 1.5
            if (tp53_res[1] < thr):
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state =='DOUBLE LOSS AFTER': 
            thr = 0.5
            if (tp53_res[1] < thr):
                return 'bi'
            else: return 'no_bi'
        
        elif cn_state =='TETRAPLOID': 
            thr = 2.5
            if (tp53_res[1] < thr):
                return 'bi'
            else: return 'no_bi'
            
        elif cn_state =='CNLOH AFTER': 
            thr = 1.5
            if (tp53_res[1] < thr):
                return 'bi'
            else: return 'no_bi'
        
        elif cn_state == 'CNLOH BEFORE & GAIN':
            thr_1 = 1.5
            thr_2 = 2.5
            if min(tp53_res) < thr_1 - 0.1 :
                return 'bi'
            elif (tp53_res[1] > thr_2):
                return 'no_bi'
            else: return 'uncertain'
    
    # 0 rows or more than 3 rows for this sample
    else: return 'uncertain'


# This function can only be called after get_bi_nobi() has been called on master
def get_mono(x):
    """Refine the bi/no_bi state of one row into 'bi', 'uncertain_bi', 'mono', '2WT' or 'uncertain_mono'.

    Reads x.tumor_sample, x.tp53_cn_state, x.tp53_count and x.bi_state_wgd from the row, and
    the `tp53_res` value(s) of the rows of `maf_cohort` whose Tumor_Sample_Barcode equals
    x.tumor_sample. `maf_cohort` is not a parameter: the name is resolved outside this
    function at call time.

    Returns:
        - 'bi' when bi_state_wgd is 'bi', 'uncertain_bi' when it is 'uncertain';
        - when bi_state_wgd is 'no_bi': 'mono' for 'LOSS BEFORE'; for 'CNLOH BEFORE' and
          'TETRAPLOID', 'mono' if a tp53_res is below 2.5 and '2WT' if all are above 2.5;
          'uncertain_mono' for any other cn_state or when the sample has 0 or more than 3 rows;
        - None implicitly when bi_state_wgd has any other value, or when the deciding
          tp53_res is exactly 2.5 (both comparisons are strict).
    """
    tumor = x.tumor_sample
    cn_state = x.tp53_cn_state
    tp53_count = x.tp53_count  # read but not used below
    # NOTE(review): every row of maf_cohort for this barcode is counted as a tp53 mutation —
    # presumably maf_cohort is already restricted to TP53 rows; confirm against the caller.
    maf_muts = maf_cohort[maf_cohort['Tumor_Sample_Barcode'] == tumor]
    nb_tp53muts = maf_muts.shape[0]
    bi_state = x.bi_state_wgd
    
    if bi_state == 'bi':
        return 'bi'
    
    elif bi_state == 'uncertain':
        return 'uncertain_bi'
    
    
    elif bi_state == 'no_bi': # We are already in the  mono/2WT distinction 
        
        #1 mut samples
        if nb_tp53muts == 1:
            tp53_res = maf_muts.tp53_res.values[0]
            
            if cn_state == 'LOSS BEFORE':
                return 'mono'
            elif cn_state == 'CNLOH BEFORE' or cn_state == 'TETRAPLOID':
                if tp53_res < 2.5:
                    return 'mono'
                elif tp53_res > 2.5:
                    return '2WT'
                
            else: return 'uncertain_mono'

        
        # 2 mut samples   
        elif nb_tp53muts == 2:
            tp53_res_1 = maf_muts.tp53_res.values[0]
            tp53_res_2 = maf_muts.tp53_res.values[1]
            
            if cn_state == 'LOSS BEFORE':
                return 'mono'
            elif cn_state == 'CNLOH BEFORE' or cn_state == 'TETRAPLOID':
                if tp53_res_1 < 2.5 or tp53_res_2 < 2.5:
                    return 'mono'
                elif tp53_res_1 > 2.5 and tp53_res_2 > 2.5:
                    return '2WT'
                
            else: return 'uncertain_mono'
            
        # 3 mut samples
        elif nb_tp53muts == 3:
            tp53_res_1 = maf_muts.tp53_res.values[0]
            tp53_res_2 = maf_muts.tp53_res.values[1]
            tp53_res_3 = maf_muts.tp53_res.values[2]
            tp53_res = [tp53_res_1,tp53_res_2,tp53_res_3]
            # ascending order: tp53_res[0] is the smallest value
            tp53_res.sort()
            
            if cn_state == 'LOSS BEFORE':
                return 'mono'
            elif cn_state == 'CNLOH BEFORE' or cn_state == 'TETRAPLOID':
                if min(tp53_res) < 2.5:
                    return 'mono'
                elif tp53_res[0] > 2.5:
                    return '2WT'
                
            else: return 'uncertain_mono'
        
        # 0 rows or more than 3 rows for this sample
        else: return 'uncertain_mono' 

# Merge Tables

In [123]:
def merge_tables(cohort_type: str):
    '''
    This function builds the master table of one cohort by joining its three sub-tables.
    Arguments:
        - cohort_type: 'wgd' or 'no_wgd'; forwarded as `cohort` to create_sample_info,
          create_tp53_muts and create_copy_number_state
    Returns:
        - the DataFrame obtained by merging sample info with TP53 mutations on 'Tumor_Id',
          then with the copy number table on 'Sample_Id' (pd.merge default, i.e. inner joins)
    Raises:
        - ValueError if cohort_type is not one of the two supported cohorts
    '''
    # Validate the selector itself: the parameter is `cohort_type`, there is no `cohort` variable here.
    if cohort_type not in ('wgd', 'no_wgd'):
        raise ValueError("cohort_type must be 'wgd' or 'no_wgd', got {!r}".format(cohort_type))

    # Each builder picks the tables of the requested cohort on its own.
    # NOTE(review): the builders are defined in earlier cells; confirm they accept exactly these two values.
    sample_info = create_sample_info(cohort=cohort_type)
    tp53_muts = create_tp53_muts(cohort=cohort_type)
    copy_number_info = create_copy_number_state(cohort=cohort_type)

    master = pd.merge(sample_info, tp53_muts, on='Tumor_Id')
    master = pd.merge(master, copy_number_info, on='Sample_Id')

    return master

In [125]:
%%time
# Build the master table for the cohort named by `cohort_type`.
# NOTE(review): `cohort_type` is not assigned in the cells visible here - confirm it is set before this cell runs.
master = merge_tables(cohort_type=cohort_type)

CPU times: user 5.65 s, sys: 71.6 ms, total: 5.73 s
Wall time: 5.7 s


In [131]:
def create_gene_count(maf_cohort):
    '''
    This function creates the count of distinct mutated genes for each sample.
    Arguments:
        - maf_cohort: the maf_cohort file located in data/merged/data; it must contain
          the columns 'Sample_Id' and 'Gene_Id'
    Returns:
        - a DataFrame with the columns ['Sample_Id', 'gene_count'], one row per sample,
          sorted by Sample_Id. gene_count is an integer; it is 0 for samples whose
          rows all have a missing Gene_Id.
    '''

    # nunique() skips NaN, so a sample without any Gene_Id keeps its row with a count of 0
    # and a sample mixing missing and real Gene_Id values is reported exactly once.
    # This also avoids DataFrame.append, which no longer exists in pandas >= 2.0.
    gene_count = (
        maf_cohort.groupby('Sample_Id')['Gene_Id']
        .nunique()
        .rename('gene_count')
        .reset_index()
    )

    return gene_count

def create_mut_count(x, maf=None):
    '''
    This function counts the mutations (MAF rows) of a single sample.
    Arguments:
        - x: a row of the master table; only x.Tumor_Id is read
        - maf: optional MAF DataFrame with a 'Tumor_Sample_Barcode' column. When omitted,
          the notebook-level variable `maf_cohort` is used, so existing calls such as
          master.apply(create_mut_count, axis=1) keep working.
    Returns:
        - the number of rows whose Tumor_Sample_Barcode equals x.Tumor_Id (int)
    '''
    # Fall back to the global only when no table is supplied; passing `maf` explicitly
    # removes the dependency on hidden notebook state.
    source = maf_cohort if maf is None else maf

    return int((source['Tumor_Sample_Barcode'] == x.Tumor_Id).sum())


def get_driver_count(x, maf_cohort):
    '''
    This function sums the driver mutation counts of one sample.
    Arguments:
        - x: row-like object; only x.Sample_Id is read
        - maf_cohort: DataFrame with a 'Sample_Id' column, filtered on the sample and handed
          to get_groupby together with the 'oncogenic' column name
    Returns:
        - the sum of the values found under the labels 'Oncogenic', 'Likely Oncogenic' and
          'Predicted Oncogenic'; a label missing from the index contributes 0
    '''
    sample_rows = maf_cohort[maf_cohort['Sample_Id'] == x.Sample_Id]
    # NOTE(review): get_groupby is defined outside this cell; presumably it returns one
    # 'count' per 'oncogenic' value, indexed by that value - confirm.
    oncogenic_counts = get_groupby(sample_rows, 'oncogenic', 'count')

    total = 0
    for label in ('Oncogenic', 'Likely Oncogenic', 'Predicted Oncogenic'):
        if label in oncogenic_counts.index:
            total += int(oncogenic_counts.loc[label])
    return total

def create_driver_count(maf_cohort):
    '''
    In this function we count the number of driver mutations per sample.
    Arguments:
        - maf_cohort: DataFrame with a 'Sample_Id' column; it is also passed unchanged to
          get_driver_count for every sample
    Returns:
        - a DataFrame with the columns ['Sample_Id', 'driver_count'], one row per distinct
          Sample_Id, in order of first appearance in maf_cohort
    '''
    # drop_duplicates keeps first-seen order, so the row order is reproducible between
    # kernel restarts (iterating over a set of strings is not).
    sample_ids = maf_cohort['Sample_Id'].drop_duplicates().tolist()
    driver_count = pd.DataFrame({'Sample_Id': sample_ids})

    if driver_count.empty:
        # apply(axis=1) on a frame without rows does not yield one value per row;
        # return the empty, correctly shaped table directly.
        driver_count['driver_count'] = pd.Series(dtype='int64')
        return driver_count

    driver_count['driver_count'] = driver_count.apply(get_driver_count, maf_cohort=maf_cohort, axis=1)

    return driver_count

In [132]:
# Add the per-sample mutation count, computed row by row with create_mut_count.
# NOTE(review): this mutates `master` in place, so re-running the cell overwrites the column.
master['mut_count'] = master.apply(create_mut_count, axis=1)

In [133]:
# Display the master table, now including the mut_count column.
master

Unnamed: 0,Sample_Id,Tumor_Id,Patient_Id,Cancer_Type,Cancer_Type_Detailed,Patient_Current_Age,Sample_Type,purity,ploidy,Overall_Survival_Months,Overall_Survival_Status,MSI_Score,MSI_Type,TMB_Score,tp53_key_1,tp53_vc_1,tp53_ccf_1,tp53_vaf_1,tp53_HGVSp_1,tp53_spot_1,tp53_key_2,tp53_vc_2,tp53_ccf_2,tp53_vaf_2,tp53_HGVSp_2,tp53_spot_2,tp53_key_3,tp53_vc_3,tp53_ccf_3,tp53_vaf_3,tp53_HGVSp_3,tp53_spot_3,tp53_key_4,tp53_vc_4,tp53_ccf_4,tp53_vaf_4,tp53_HGVSp_4,tp53_spot_4,tp53_key_5,tp53_vc_5,tp53_ccf_5,tp53_vaf_5,tp53_HGVSp_5,tp53_spot_5,tp53_count,tp53_tcn,tp53_mcn,tp53_lcn,tp53_seg_length,tp53_cn_state,tp53_cf,wgd,mut_count
0,P-0025956-T01-IM6_P-0025956-N01-IM6,P-0025956-T01-IM6,P-0025956,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,71.0,Primary,0.273767,3.496971,3.584,DECEASED,0.00,Stable,5.3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,3,2.0,1.0,80668431,LOSS AFTER,0.169170,True,6
1,P-0036909-T01-IM6_P-0036909-N01-IM6,P-0036909-T01-IM6,P-0036909,Non-Small Cell Lung Cancer,Lung Adenocarcinoma,47.0,Metastasis,0.391316,2.871793,14.137,LIVING,0.37,Stable,3.5,P-0036909-T01-IM617_7577121_G_A,Missense_Mutation,0.798,0.312169,p.Arg273Cys,273,,,,,,,,,,,,,,,,,,,,,,,,,1,2,2.0,0.0,18101929,LOSS BEFORE,0.306121,True,4
2,P-0023546-T01-IM6_P-0023546-N01-IM6,P-0023546-T01-IM6,P-0023546,Prostate Cancer,Prostate Neuroendocrine Carcinoma,50.0,Primary,0.865628,3.115253,4.800,DECEASED,2.37,Stable,3.5,P-0023546-T01-IM617_7578442_T_C,Missense_Mutation,0.933,0.845070,p.Tyr163Cys,163,,,,,,,,,,,,,,,,,,,,,,,,,1,3,3.0,0.0,25250470,CNLOH BEFORE & LOSS,0.835504,True,4
3,P-0023546-T02-IM6_P-0023546-N01-IM6,P-0023546-T02-IM6,P-0023546,Prostate Cancer,Prostate Adenocarcinoma,50.0,Primary,0.312907,3.136841,4.800,DECEASED,0.82,Stable,2.6,P-0023546-T02-IM617_7578442_T_C,Missense_Mutation,1.000,0.636735,p.Tyr163Cys,163,,,,,,,,,,,,,,,,,,,,,,,,,1,2,2.0,0.0,25237770,LOSS BEFORE,0.312907,True,3
4,P-0018837-T01-IM6_P-0018837-N01-IM6,P-0018837-T01-IM6,P-0018837,Colorectal Cancer,Colon Adenocarcinoma,60.0,Primary,0.351778,3.727190,34.060,LIVING,0.45,Stable,5.3,P-0018837-T01-IM617_7578406_C_T,Missense_Mutation,1.000,0.325843,p.Arg175His,175,,,,,,,,,,,,,,,,,,,,,,,,,1,3,3.0,0.0,25231975,CNLOH BEFORE & LOSS,0.257400,True,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9893,P-0050750-T01-IM6_P-0050750-N01-IM6,P-0050750-T01-IM6,P-0050750,Pancreatic Cancer,Pancreatic Adenocarcinoma,55.0,Metastasis,0.301523,3.845451,0.099,LIVING,0.05,Stable,5.3,P-0050750-T01-IM617_7578406_C_T,Missense_Mutation,0.946,0.285229,p.Arg175His,175,,,,,,,,,,,,,,,,,,,,,,,,,1,2,2.0,0.0,13849975,LOSS BEFORE,0.286332,True,5
9894,P-0050643-T01-IM6_P-0050643-N01-IM6,P-0050643-T01-IM6,P-0050643,Breast Cancer,Breast Invasive Ductal Carcinoma,53.0,Primary,0.667936,4.419218,1.315,LIVING,0.30,Stable,4.4,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,3,3.0,0.0,11917531,CNLOH BEFORE & LOSS,0.545007,True,5
9895,P-0050223-T02-IM6_P-0050223-N02-IM6,P-0050223-T02-IM6,P-0050223,Hepatobiliary Cancer,Cholangiocarcinoma,67.0,Primary,0.215128,4.725790,1.940,LIVING,0.00,Stable,3.5,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,4,2.0,2.0,80668650,TETRAPLOID,,True,4
9896,P-0048760-T01-IM6_P-0048760-N01-IM6,P-0048760-T01-IM6,P-0048760,Bone Cancer,Osteosarcoma,18.0,Primary,0.614603,3.947721,3.682,LIVING,5.79,Indeterminate,6.1,P-0048760-T01-IM617_7578413_C_T,Missense_Mutation,0.958,0.675637,p.Val173Met,173,,,,,,,,,,,,,,,,,,,,,,,,,1,3,3.0,0.0,7438283,CNLOH BEFORE & LOSS,0.598599,True,7


In [134]:
# Spot-check: list the WGD MAF rows of sample P-0025956-T01-IM6 to compare against its mut_count in master.
maf_cohort_wgd[maf_cohort_wgd['Tumor_Sample_Barcode'] == 'P-0025956-T01-IM6']

Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,Center,NCBI_Build,Chromosome,Start_Position,End_Position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,Tumor_Seq_Allele2,dbSNP_RS,dbSNP_Val_Status,Tumor_Sample_Barcode,Matched_Norm_Sample_Barcode,Match_Norm_Seq_Allele1,Match_Norm_Seq_Allele2,Tumor_Validation_Allele1,Tumor_Validation_Allele2,Match_Norm_Validation_Allele1,Match_Norm_Validation_Allele2,Verification_Status,Validation_Status,Mutation_Status,Sequencing_Phase,Sequence_Source,Validation_Method,Score,BAM_File,Sequencer,Tumor_Sample_UUID,Matched_Norm_Sample_UUID,HGVSc,HGVSp,HGVSp_Short,Transcript_ID,Exon_Number,t_depth,t_ref_count,t_alt_count,n_depth,n_ref_count,n_alt_count,all_effects,Allele,Gene,Feature,Feature_type,Consequence,cDNA_position,CDS_position,Protein_position,Amino_acids,Codons,Existing_variation,ALLELE_NUM,DISTANCE,STRAND_VEP,SYMBOL,SYMBOL_SOURCE,HGNC_ID,BIOTYPE,CANONICAL,CCDS,ENSP,SWISSPROT,TREMBL,UNIPARC,RefSeq,SIFT,PolyPhen,EXON,INTRON,DOMAINS,GMAF,AFR_MAF,AMR_MAF,ASN_MAF,EAS_MAF,EUR_MAF,SAS_MAF,AA_MAF,EA_MAF,CLIN_SIG,SOMATIC,PUBMED,MOTIF_NAME,MOTIF_POS,HIGH_INF_POS,MOTIF_SCORE_CHANGE,IMPACT,PICK,VARIANT_CLASS,TSL,HGVS_OFFSET,PHENO,MINIMISED,ExAC_AF,ExAC_AF_AFR,ExAC_AF_AMR,ExAC_AF_EAS,ExAC_AF_FIN,ExAC_AF_NFE,ExAC_AF_OTH,ExAC_AF_SAS,GENE_PHENO,FILTER,flanking_bps,variant_id,variant_qual,ExAC_AF_Adj,ExAC_AC_AN_Adj,ExAC_AC_AN,ExAC_AC_AN_AFR,ExAC_AC_AN_AMR,ExAC_AC_AN_EAS,ExAC_AC_AN_FIN,ExAC_AC_AN_NFE,ExAC_AC_AN_OTH,ExAC_AC_AN_SAS,ExAC_FILTER,Caller,is-a-hotspot,is-a-3d-hotspot,mutation_effect,oncogenic,LEVEL_1,LEVEL_2A,LEVEL_2B,LEVEL_3A,LEVEL_3B,LEVEL_4,LEVEL_R1,LEVEL_R2,LEVEL_R3,Highest_level,citations,driver,tcn,lcn,cf,purity,t_var_freq,expected_alt_copies,ccf_Mcopies,ccf_Mcopies_lower,ccf_Mcopies_upper,ccf_Mcopies_prob95,ccf_Mcopies_prob90,ccf_1copy,ccf_1copy_lower,ccf_1copy_upper,ccf_1copy_prob95,ccf_1copy_prob90,ccf_expected_copies,ccf_expected_copies_lower,ccf_expected_copies_upper,ccf_expected_copies_prob95,ccf_expected
_copies_prob90,facets_fit,reviewer_set_purity,use_only_purity_run,use_edited_cncf,cncf_file_used,mut_key,sample_mut_key
0,KRAS,3845,MSKCC,GRCh37,12,25398285,25398285,+,Missense_Mutation,SNP,C,C,A,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.34G>T,p.Gly12Cys,p.G12C,ENST00000256078,2/6,588.0,495.0,93.0,908,908,0,"KRAS,missense_variant,p.Gly12Cys,ENST00000311936,NM_004985.3;KRAS,missense_variant,p.Gly12Cys,ENST00000556131,;KRAS,missense_variant,p.Gly12Cys,ENST00000256078,NM_033360.2;KRAS,missense_variant,p.Gly12Cys,ENST00000557334,;",A,ENSG00000133703,ENST00000256078,Transcript,missense_variant,98/1119,34/570,12/189,G/C,Ggt/Tgt,,1,,-1,KRAS,HGNC,6407.0,protein_coding,YES,CCDS8703.1,ENSP00000256078,P01116,"Q9UM97,Q71SP6,P78460,L7RSL8,I1SRC5",UPI0000133132,NM_033360.2,deleterious(0.04),probably_damaging(0.993),2/6,,"Gene3D:3.40.50.300,Pfam_domain:PF00071,Prints_domain:PR00449,PROSITE_profiles:PS51421,hmmpanther:PTHR24070,hmmpanther:PTHR24070:SF186,Low_complexity_(Seg):seg,SMART_domains:SM00173,SMART_domains:SM00174,SMART_domains:SM00175,SMART_domains:SM00176,Superfamily_domains:SSF52540,TIGRFAM_domain:TIGR00231",,,,,,,,,,,,,,,,,MODERATE,1.0,indel,,,,,,,,,,,,,1.0,,CCA,.,.,,,,,,,,,,,,,Y,Y,Gain-of-function,Oncogenic,,,,,,"Binimetinib,Cobimetinib,Trametinib",,,,LEVEL_4,"16051643;25705018;26841430;24256730;28783719;29247021;Bhagwat et al. Abstract# 4973, AACR 2017(http://cancerres.aacrjournals.org/content/77/13_Supplement/4973);Robarge et al. Abstract# DDT02-03, AACR 2014(http://cancerres.aacrjournals.org/content/74/19_Supplement/DDT02-03);Burrows et al. 
Abstract# 5168, AACR 2017(http://cancerres.aacrjournals.org/content/77/13_Supplement/5168)",True,4.0,2.0,0.138437,0.273767,0.158163,1.0,0.736,0.655,0.821,0.002236,0.014834,1.0,0.979,1.0,0.860769,0.984692,1.0,0.979,1.0,0.860769,0.984692,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,12_25398285_C_A,P-0025956-T01-IM612_25398285_C_A
1,STK11,6794,MSKCC,GRCh37,19,1221314,1221314,+,Frame_Shift_Del,DEL,C,C,-,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.837delC,p.Pro281ArgfsTer6,p.P281Rfs*6,ENST00000326873,6/10,615.0,492.0,123.0,885,884,1,"STK11,frameshift_variant,p.Pro281ArgfsTer6,ENST00000326873,NM_000455.4;STK11,frameshift_variant,p.Pro37ArgfsTer6,ENST00000586243,;STK11,downstream_gene_variant,,ENST00000585851,;STK11,downstream_gene_variant,,ENST00000585748,;STK11,3_prime_UTR_variant,,ENST00000593219,;STK11,non_coding_transcript_exon_variant,,ENST00000589152,;STK11,non_coding_transcript_exon_variant,,ENST00000591133,;STK11,non_coding_transcript_exon_variant,,ENST00000586358,;STK11,upstream_gene_variant,,ENST00000585465,;,regulatory_region_variant,,ENSR00000377818,;",-,ENSG00000118046,ENST00000326873,Transcript,frameshift_variant,2010/3328,837/1302,279/433,G/X,ggC/gg,,1,,1,STK11,HGNC,11389.0,protein_coding,YES,CCDS45896.1,ENSP00000324856,Q15831,Q9NS52,UPI0000136105,NM_000455.4,,,6/10,,"Gene3D:1.10.510.10,Pfam_domain:PF00069,PROSITE_profiles:PS50011,hmmpanther:PTHR24347,hmmpanther:PTHR24347:SF1,SMART_domains:SM00220,Superfamily_domains:SSF56112",,,,,,,,,,,,,,,,,HIGH,1.0,sequence_alteration,,,,,,,,,,,,,1.0,,GGCC,.,.,,,,,,,,,,,,,,,Likely Loss-of-function,Likely Oncogenic,,,,,,,,,,,24652667;19892943;25079552;21516316;19340305,True,2.0,0.0,0.18372,0.273767,0.2,1.0,0.731,0.662,0.802,0.000266,0.003571,1.0,0.984,1.0,0.926527,0.996079,1.0,0.984,1.0,0.926527,0.996079,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,19_1221314_C_-,P-0025956-T01-IM619_1221314_C_-
2,KEAP1,9817,MSKCC,GRCh37,19,10602584,10602584,+,Missense_Mutation,SNP,C,C,T,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.994G>A,p.Gly332Ser,p.G332S,ENST00000171111,3/6,641.0,523.0,118.0,471,471,0,"KEAP1,missense_variant,p.Gly332Ser,ENST00000171111,NM_203500.1;KEAP1,missense_variant,p.Gly332Ser,ENST00000393623,NM_012289.3;KEAP1,upstream_gene_variant,,ENST00000592478,;KEAP1,downstream_gene_variant,,ENST00000591419,;KEAP1,downstream_gene_variant,,ENST00000592055,;CTC-429L19.3,upstream_gene_variant,,ENST00000592671,;KEAP1,downstream_gene_variant,,ENST00000588024,;KEAP1,downstream_gene_variant,,ENST00000585845,;KEAP1,upstream_gene_variant,,ENST00000590593,;KEAP1,upstream_gene_variant,,ENST00000590237,;",T,ENSG00000079999,ENST00000171111,Transcript,missense_variant,1542/2955,994/1875,332/624,G/S,Ggc/Agc,,1,,-1,KEAP1,HGNC,23177.0,protein_coding,YES,CCDS12239.1,ENSP00000171111,Q14145,"K7ESE0,K7EJD8,K7EJ49",UPI000007139C,NM_203500.1,deleterious(0),probably_damaging(0.998),3/6,,"hmmpanther:PTHR24412:SF162,hmmpanther:PTHR24412,Pfam_domain:PF01344,Gene3D:1k3iA02,PIRSF_domain:PIRSF037037,SMART_domains:SM00612,Superfamily_domains:0052715",,,,,,,,,,,,,,,,,MODERATE,1.0,indel,,,,,,,,,,,,,,,CCC,.,.,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,2.0,0.0,0.18372,0.273767,0.184087,1.0,0.672,0.608,0.74,3e-06,9.3e-05,1.0,0.98,1.0,0.880435,0.989551,1.0,0.98,1.0,0.880435,0.989551,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,19_10602584_C_T,P-0025956-T01-IM619_10602584_C_T
3,SMARCA4,6597,MSKCC,GRCh37,19,11113836,11113836,+,Splice_Site,SNP,G,G,A,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.1943+1G>A,,p.X648_splice,ENST00000344626,,445.0,371.0,74.0,362,361,1,"SMARCA4,splice_donor_variant,,ENST00000358026,NM_001128849.1;SMARCA4,splice_donor_variant,,ENST00000429416,NM_001128844.1;SMARCA4,splice_donor_variant,,ENST00000344626,NM_003072.3;SMARCA4,splice_donor_variant,,ENST00000413806,NM_001128845.1,NM_001128847.1;SMARCA4,splice_donor_variant,,ENST00000450717,NM_001128846.1,NM_001128848.1;SMARCA4,splice_donor_variant,,ENST00000590574,;SMARCA4,splice_donor_variant,,ENST00000589677,;SMARCA4,splice_donor_variant,,ENST00000541122,;SMARCA4,splice_donor_variant,,ENST00000444061,;SMARCA4,splice_donor_variant,,ENST00000591545,;",A,ENSG00000127616,ENST00000344626,Transcript,splice_donor_variant,-/5392,1943/4944,648/1647,,,,1,,1,SMARCA4,HGNC,11100.0,protein_coding,,CCDS12253.1,ENSP00000343896,P51532,"B4DSI8,A7E2E1",UPI000006F973,NM_003072.3,,,,12/34,,,,,,,,,,,,,,,,,,HIGH,,indel,,,,,,,,,,,,,1.0,,GGT,.,.,,,,,,,,,,,,,,,Likely Loss-of-function,Likely Oncogenic,,,,,,,,,,,24658002;24658001;24658004;25060813;18301784,True,2.0,0.0,0.18372,0.273767,0.166292,1.0,0.607,0.534,0.686,1e-06,2.7e-05,1.0,0.959,1.0,0.659996,0.904988,1.0,0.959,1.0,0.659996,0.904988,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,19_11113836_G_A,P-0025956-T01-IM619_11113836_G_A
4,PTPRT,11122,MSKCC,GRCh37,20,40735560,40735560,+,Missense_Mutation,SNP,C,C,A,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.3313G>T,p.Ala1105Ser,p.A1105S,ENST00000373198,25/32,471.0,378.0,93.0,349,349,0,"PTPRT,missense_variant,p.Ala1105Ser,ENST00000373198,NM_133170.3;PTPRT,missense_variant,p.Ala1076Ser,ENST00000373201,;PTPRT,missense_variant,p.Ala1089Ser,ENST00000373193,NM_007050.5;PTPRT,missense_variant,p.Ala1085Ser,ENST00000373190,;PTPRT,missense_variant,p.Ala1096Ser,ENST00000373184,;PTPRT,missense_variant,p.Ala1095Ser,ENST00000356100,;PTPRT,missense_variant,p.Ala1086Ser,ENST00000373187,;",A,ENSG00000196090,ENST00000373198,Transcript,"missense_variant,splice_region_variant",3549/12746,3313/4383,1105/1460,A/S,Gct/Tct,,1,,-1,PTPRT,HGNC,9682.0,protein_coding,,CCDS68127.1,ENSP00000362294,O14522,,UPI00001AF6FA,NM_133170.3,deleterious(0),probably_damaging(0.973),25/32,,"Gene3D:3.90.190.10,Pfam_domain:PF00102,Prints_domain:PR00700,PROSITE_patterns:PS00383,PROSITE_profiles:PS50055,PROSITE_profiles:PS50056,hmmpanther:PTHR19134,hmmpanther:PTHR19134:SF208,SMART_domains:SM00194,SMART_domains:SM00404,Superfamily_domains:SSF52799",,,,,,,,,,,,,,,,,MODERATE,,indel,,,,,,,,,,,,,1.0,,GCA,.,.,,,,,,,,,,,,,,,,,,,,,,,,,,,,False,2.0,0.0,0.18372,0.273767,0.197452,1.0,0.721,0.644,0.803,0.000762,0.006532,1.0,0.979,1.0,0.859171,0.98442,1.0,0.979,1.0,0.859171,0.98442,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,20_40735560_C_A,P-0025956-T01-IM620_40735560_C_A
5,ERG,2078,MSKCC,GRCh37,21,39755607,39755607,+,Nonsense_Mutation,SNP,G,G,C,novel,,P-0025956-T01-IM6,,,,,,,,,Unknown,SOMATIC,,,,MSK-IMPACT,,,,,c.1158C>G,p.Tyr386Ter,p.Y386*,ENST00000288319,10/10,584.0,490.0,94.0,371,371,0,"ERG,stop_gained,p.Tyr369Ter,ENST00000442448,NM_004449.4;ERG,stop_gained,p.Tyr393Ter,ENST00000417133,NM_001136154.1,NM_001243432.1;ERG,stop_gained,p.Tyr370Ter,ENST00000398910,;ERG,stop_gained,p.Tyr386Ter,ENST00000288319,NM_182918.3;ERG,stop_gained,p.Tyr369Ter,ENST00000398911,;ERG,stop_gained,p.Tyr363Ter,ENST00000398907,;ERG,stop_gained,p.Tyr362Ter,ENST00000398905,;ERG,stop_gained,p.Tyr270Ter,ENST00000398897,NM_001243429.1;ERG,stop_gained,p.Tyr393Ter,ENST00000398919,NM_001243428.1;ERG,stop_gained,p.Tyr294Ter,ENST00000453032,NM_001136155.1;",C,ENSG00000157554,ENST00000288319,Transcript,stop_gained,1261/4919,1158/1440,386/479,Y/*,taC/taG,,1,,-1,ERG,HGNC,3446.0,protein_coding,,CCDS13658.1,ENSP00000288319,P11308,"Q16031,B4DVX5",UPI0000074389,NM_182918.3,,,10/10,,"PROSITE_profiles:PS50061,hmmpanther:PTHR11849,hmmpanther:PTHR11849:SF161,Pfam_domain:PF00178,Gene3D:1.10.10.10,SMART_domains:SM00413,Superfamily_domains:SSF46785,Prints_domain:PR00454",,,,,,,,,,,,,,,,,HIGH,,indel,,,,,,,,,,,,,1.0,,CGT,.,.,,,,,,,,,,,PASS,,,,,,,,,,,,,,,,,False,4.0,2.0,0.138437,0.273767,0.160959,1.0,0.749,0.668,0.835,0.003877,0.023297,1.0,0.98,1.0,0.871368,0.986954,1.0,0.98,1.0,0.871368,0.986954,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default,,False,False,/juno/work/ccs/shared/resources/impact/facets/all/P-00259/P-0025956-T01-IM6_P-0025956-N01-IM6//default/P-0025956-T01-IM6_P-0025956-N01-IM6_hisens.cncf.txt,21_39755607_G_C,P-0025956-T01-IM621_39755607_G_C
