# Read and Standardize Mutation Information
This notebook reads a .csv or .tsv file with one mutation per line. This notebook is a template that you can modify for your specific use case.

To prepare your data for subsequent analysis, you need to:

1. Read the file with your mutation information
2. Create a column 'var_id' with the genomic location using the [HGVS sequence variant nomenclature](http://varnomen.hgvs.org/recommendations/general/), e.g. chr5:g.149440497C>T
3. Filter out any variations that are not SNPs
4. Save the file as 'mutations.csv'

The mutations.csv file is the input for the next step: 2_Map-to-3D.ipynb

In [1]:
# Data-handling libraries used throughout the notebook.
import pandas as pd
# Print wide DataFrames on one line instead of wrapping their columns across several blocks.
pd.set_option('display.expand_frame_repr', False)
# Display up to 500 columns before pandas truncates the view.
pd.set_option('display.max_columns', 500)
import numpy as np
import os

# Plotting setup.
import matplotlib.pyplot as plt
from matplotlib import interactive
# Turn on matplotlib's interactive mode.
interactive(True)

# Show every column when a DataFrame is displayed (None disables column truncation).
# pandas is already imported at the top of this cell, so the import and the
# repeated set_option call are not duplicated here.
pd.set_option('display.max_columns', None)

### Input Parameters

In [2]:
# Input: mutation table (chromosome number and position are required).
# When adapting this template, point input_file_name at your own input file.
input_file_name = '../../../data/CCLE/CCLE_DepMap_18q3_maf_20180718.txt'

# Output 1: mutation info in standard format (e.g., chr5:g.149440497C>T).
output_file_name1 = "../analysis/NRF2_pathway/dataframes/step1/mutations_NRF2v2_step1.csv"

# Output 2: DepMap details of the mutation file.
output_file_name2 = "../analysis/NRF2_pathway/dataframes/step1/mutations_NRF2v2_step1_detailed.csv"


In [3]:
# Load the mutation table: tab-separated, with the first row as the header.
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the truncated warning saved in this
# cell's output looks like pandas' mixed-type DtypeWarning -- confirm. With
# chunked inference a column can end up holding both ints and strings
# (presumably Chromosome, if it mixes numeric and lettered chromosomes), and
# such values would not compare equal in later duplicate detection.
depmap = pd.read_csv(input_file_name, header=0, sep='\t', low_memory=False)

# Preview the first two rows.
depmap.head(2)

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,Hugo_Symbol,Entrez_Gene_Id,NCBI_Build,Chromosome,Start_position,End_position,Strand,Variant_Classification,Variant_Type,Reference_Allele,Tumor_Seq_Allele1,dbSNP_RS,dbSNP_Val_Status,Genome_Change,Annotation_Transcript,Tumor_Sample_Barcode,cDNA_Change,Codon_Change,Protein_Change,isDeleterious,isTCGAhotspot,TCGAhsCnt,isCOSMIChotspot,COSMIChsCnt,ExAC_AF,WES_AC,SangerWES_AC,SangerRecalibWES_AC,RNAseq_AC,HC_AC,RD_AC,WGS_AC,Broad_ID
0,DVL1,1855,37,1,1277461,1277461,+,Silent,SNP,C,T,,,g.chr1:1277461C>T,ENST00000378888.5,127399_SOFT_TISSUE,c.438G>A,c.(436-438)gaG>gaA,p.E146E,False,False,0,False,0,,87:39,,,,,,,ACH-001270
1,AL590822.1,0,37,1,2144416,2144416,+,Missense_Mutation,SNP,G,A,,,g.chr1:2144416G>A,ENST00000545087.1,127399_SOFT_TISSUE,c.604C>T,c.(604-606)Cgc>Tgc,p.R202C,False,False,0,False,0,,23:43,,,,,,,ACH-001270


#### NRF2v2 gene set

In [4]:
# Read the tab-separated gene-set file without a header row, then transpose it
# so the gene names run down a single column instead of across one row.
gene_set_path = '../data/GSEA/NFE2L2.V2_gene_set.gmt'
gene_set = pd.read_csv(gene_set_path, header=None, sep='\t').T
gene_set.columns = ['gene_name']

# Preview the first two gene names.
gene_set.head(2)

Unnamed: 0,gene_name
0,ABCB6
1,ABCC2


In [5]:
# Number of entries in the gene set.
len(gene_set['gene_name'])

481

#### Genes with mutations associated with NRF2 pathway activation

Note: this gene list is taken from the match panel.

In [7]:
# Genes whose variants are selected from the mutation table below.
# Alternative, shorter list (disabled):
#   nrf2_genes = ['AKR1B10','AKR1B15','AKR1C4','KEAP1','NFE2L2','NOX5','FOS','JUNB','MAFK','CUL3']
genes = [
    'NFE2L2', 'KEAP1', 'CUL3', 'SIRT1',
    'UCHL1', 'TRIM16L', 'AKR1C4', 'AKR1B10',
    'AKR1C2', 'AKR1C3', 'G6PD', 'GCLC',
    'GCLM', 'GSTM3', 'NTRK2', 'OSGIN1',
    'RAB6B', 'SLC7A11', 'SPP1', 'SRXN1',
    'TXNRD1', 'AKR1B15', 'NQO1', 'MAFG',
    'NOX5', 'FOS', 'JUNB', 'MAFK',
]

In [12]:
# Sanity check: number of genes in the list defined above.
len(genes)

28

## Select Variants by Gene

In [8]:
# Keep only the variants that fall in the genes of interest, and only the
# columns used further down.
# NOTE(review): the notebook introduction lists "filter out any variations that
# are not SNPs" as a step, but Variant_Type is dropped here without being used
# as a filter -- confirm whether non-SNP variants should be removed.
keep_columns = ['Hugo_Symbol', 'Entrez_Gene_Id', 'Variant_Classification', 'Genome_Change',
                'Chromosome', 'Tumor_Sample_Barcode', 'Protein_Change',
                'Reference_Allele', 'Tumor_Seq_Allele1']

# .copy() yields an independent frame, so adding columns to it afterwards does
# not trigger pandas' SettingWithCopyWarning.
depmap = depmap.loc[depmap['Hugo_Symbol'].isin(genes), keep_columns].copy()

# Readable identifier of the form <gene>_<protein change>, e.g. DVL1_p.E146E.
# Built column-wise instead of looking up every row with depmap.loc[i];
# map(str) turns a missing Protein_Change into 'nan', exactly as str() did.
depmap['ID'] = depmap['Hugo_Symbol'] + '_' + depmap['Protein_Change'].map(str)

#### Put the genome change in the correct format
(e.g., chr2:g.178098804C>T)

In [10]:
# Build the genomic identifier in the form chr<chromosome>:g.<position><change>,
# e.g. chr2:g.178098804C>T.
# Genome_Change looks like 'g.chr1:1277461C>T'; the piece after the first ':'
# carries the position and the base change.
position_and_change = depmap['Genome_Change'].str.split(':').str[1]

# Column-wise concatenation replaces the per-row depmap.loc[i] lookups.
depmap['var_id'] = 'chr' + depmap['Chromosome'].map(str) + ':g.' + position_and_change

# The row-by-row version raised on a missing or malformed Genome_Change; the
# vectorized form would yield NaN instead, so fail loudly here.
assert depmap['var_id'].notna().all(), 'Genome_Change is missing or has no ":" separator'

In [11]:
# Number of distinct var_id values (missing values, if any, count as one value).
depmap['var_id'].nunique(dropna=False)

1231

In [8]:
# Table of distinct variants for the next (mapping) step.
# Columns are renamed through an explicit mapping so that each new name is tied
# to its source column rather than to its position in the selection.
# Note that 'ID' here is the gene symbol, not the depmap 'ID' column.
DF_for_mapping = (
    depmap[['Hugo_Symbol', 'Chromosome', 'Tumor_Seq_Allele1', 'Reference_Allele', 'var_id']]
    .drop_duplicates()
    .rename(columns={
        'Hugo_Symbol': 'ID',
        'Chromosome': '#CHROM',
        'Tumor_Seq_Allele1': 'ALT',
        'Reference_Allele': 'REF',
    })
)

# Create the output directory if it does not exist yet, so that a fresh
# checkout does not fail on the write below.
mapping_out_dir = os.path.dirname(output_file_name1)
if mapping_out_dir:
    os.makedirs(mapping_out_dir, exist_ok=True)

# The DataFrame index is written as the first CSV column (to_csv default).
DF_for_mapping.to_csv(output_file_name1)

In [9]:
# Save the gene-filtered mutation table, with every column currently in depmap,
# to the second output file. The DataFrame index is written as the first CSV
# column (to_csv default).
depmap.to_csv(output_file_name2)