# Generating OncoPrint Data Files

The script will process all variant files and output files in an ingestible format for the R OncoPrint function.

It will output oncoprint data for both replicate files and the merged variant callsets.

In [1]:
import os
import pandas as pd

In [2]:
# Load all cosmic variants called in this dataset
# This file was generated in filter_variants.ipynb
cosmic_all_file = os.path.join('results', 'all_cosmic_variants.tsv')
cosmic_all_df = pd.read_table(cosmic_all_file)
cosmic_all_df.head(3)

Unnamed: 0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,phyloP7way_vertebrate,phyloP20way_mammalian,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,Otherinfo,het,quality,depth,sample_name
0,1,874779,874826,CCTCCCCAGCCACGGTGAGGACCCACCCTGGCATGATCCCCCTCATCA,-,exonic,SAMD11,.,nonframeshift deletion,SAMD11:NM_152486:exon7:c.645_692del:p.G220Dfs*447,...,.,.,.,.,.,het\t.\t20,het,.,20,019-F0
1,1,26510311,26510311,C,-,exonic,CNKSR1,.,frameshift deletion,CNKSR1:NM_001297647:exon9:c.866delC:p.P291Hfs*...,...,.,.,.,.,.,het\t.\t31,het,.,31,019-F0
2,1,158576883,158576883,G,A,exonic,OR10Z1,.,nonsynonymous SNV,OR10Z1:NM_001004478:exon1:c.G655A:p.A219T,...,0.917,0.949,0.822,0.761,4.671,het\t.\t127,het,.,127,019-F0


In [3]:
# What are the 50 most commonly altered COSMIC genes?
top_n = 50
paad_genes = cosmic_all_df['Gene.refGene'].value_counts().head(top_n).index.tolist()
cosmic_all_df['Gene.refGene'].value_counts().head(20)

NCOR2                38
OR11G2               29
KRAS                 29
MUC4;MUC4            28
DNHD1                27
TSPYL1               26
CBWD1                25
ARHGEF11;ETV3L       24
MARF1                24
WDR66                24
LCE4A                23
PHF2                 23
GOLGA8M              22
GOLGA6D              22
POTEI                21
ACIN1                21
KRTAP5-11;FAM86C1    21
BRCA1                20
CGB8                 20
MROH8;MROH8          19
Name: Gene.refGene, dtype: int64

## Generate OncoPrint Data

### For All Replicates

In [4]:
%%time

# Process each replicate by observed COSMIC mutation
variant_file_path = os.path.join('results', 'processed_vcfs')
variant_assign = []
case_id = []
for variant_file in os.listdir(variant_file_path):
    # Load and subset file to only variants in the COSMIC db
    variant_df = pd.read_table(os.path.join(variant_file_path, variant_file), index_col=0)
    variant_sub_df = variant_df[variant_df['cosmic70'] != '.']
    
    # Define mutated genes if they exist for the given variant
    variant_class = ['MUT;' if x in variant_sub_df['Gene.refGene'].tolist() else ''
                     for x in paad_genes]
    
    # Store results
    variant_assign.append(variant_class)
    case_id.append(variant_file.replace('_001_processed_variants.tsv.bz2', ''))

CPU times: user 13.4 s, sys: 244 ms, total: 13.7 s
Wall time: 13.7 s


In [5]:
# Generate and save oncoprint data for all replicates
oncoprint_file = os.path.join('results', 'oncoprint_replicates.tsv')
oncoprint_df = pd.DataFrame(variant_assign, index=case_id, columns=paad_genes)
oncoprint_df.index.name = 'Case.ID'
oncoprint_df.to_csv(oncoprint_file, sep='\t')

In [6]:
oncoprint_df.head(3)

Unnamed: 0_level_0,NCOR2,OR11G2,KRAS,MUC4;MUC4,DNHD1,TSPYL1,CBWD1,ARHGEF11;ETV3L,MARF1,WDR66,...,TP53,DPCR1,PARG,FAM161B,AHNAK2,SNAP47,RAD23B,PRELID3B,SMAD4,STEAP2
Case.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
KS26_S5_L003,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,,MUT;,MUT;,MUT;,...,,,,MUT;,,MUT;,,,,MUT;
KS28_S3_L001,MUT;,MUT;,MUT;,MUT;,,MUT;,,MUT;,MUT;,MUT;,...,MUT;,,,,,MUT;,,,MUT;,MUT;
KS26_S5_L002,MUT;,MUT;,MUT;,,MUT;,MUT;,MUT;,MUT;,MUT;,,...,,,,,,MUT;,,,,MUT;


### Merged Samples

In [7]:
variant_assign_consensus = []
case_id_consensus = []
for sample_id in set(cosmic_all_df['sample_name']):
    # Subset file to given sample ID
    variant_sub_df = cosmic_all_df.query('sample_name == @sample_id')
    
    # Define mutated genes if they exist for the given variant
    variant_class = ['MUT;' if x in variant_sub_df['Gene.refGene'].tolist() else ''
                     for x in paad_genes]
    
    # Store results
    variant_assign_consensus.append(variant_class)
    case_id_consensus.append(sample_id)

In [8]:
# Generate and save oncoprint data for consensus samples
oncoprint_consensus_file = os.path.join('results', 'oncoprint_merged.tsv')

oncoprint_consensus_df = (
    pd.DataFrame(variant_assign_consensus,
                 index=case_id_consensus,
                 columns=paad_genes)
    )
oncoprint_consensus_df.index.name = 'Case.ID'
oncoprint_consensus_df.to_csv(oncoprint_consensus_file, sep='\t')

In [9]:
oncoprint_consensus_df.head(3)

Unnamed: 0_level_0,NCOR2,OR11G2,KRAS,MUC4;MUC4,DNHD1,TSPYL1,CBWD1,ARHGEF11;ETV3L,MARF1,WDR66,...,TP53,DPCR1,PARG,FAM161B,AHNAK2,SNAP47,RAD23B,PRELID3B,SMAD4,STEAP2
Case.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
029-F0,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,,MUT;,MUT;,,...,MUT;,,,MUT;,,,,,,
008-F5,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,...,,MUT;,,MUT;,,MUT;,MUT;,,,MUT;
KS28,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,MUT;,...,MUT;,,MUT;,,,MUT;,,MUT;,MUT;,MUT;


## COSMIC Mutational Similarity

Output mutational similarity data for all replicates and consensus samples. The COSMIC mutational similarity is built from a (0,1) sample by COSMIC mutation matrix.

### For All Replicates

In [10]:
# How many COSMIC mutation IDs are in the entire set and how many are unique?
print('All COSMIC mutations: {}'.format(cosmic_all_df.shape[0]))
unique_cosmic_ids = set(cosmic_all_df['cosmic70'])
print('Unique COSMIC mutations: {}'.format(len(unique_cosmic_ids)))

All COSMIC mutations: 3120
Unique COSMIC mutations: 1113


In [11]:
case_id = []
cosmic_similarity_list = []
for variant_file in os.listdir(variant_file_path):
    # Load and subset file to only variants in the COSMIC db
    variant_df = pd.read_table(os.path.join(variant_file_path, variant_file), index_col=0)
    variant_sub_df = variant_df[variant_df['cosmic70'] != '.']
    
    # Define membership in COSMIC IDs
    cosmic_class = [1 if x in variant_sub_df['cosmic70'].tolist() else 0
                    for x in unique_cosmic_ids]
    
    # Store results
    cosmic_similarity_list.append(cosmic_class)
    case_id.append(variant_file.replace('_001_processed_variants.tsv.bz2', ''))

In [12]:
# Generate COSMIC id membership data (for downstream similarity matrix)
cosmic_common_file = os.path.join('results', 'cosmic_similarity_replicates.tsv')
cosmic_common_df = (
    pd.DataFrame(cosmic_similarity_list,
                 index=case_id,
                 columns=unique_cosmic_ids)
    )
cosmic_common_df.index.name = 'Case.ID'
cosmic_common_df.to_csv(cosmic_common_file, sep='\t')

### Consensus samples

In [13]:
case_id_consensus = []
cosmic_similarity_consensus_list = []
for sample_id in set(cosmic_all_df['sample_name']):
    # Subset file to given sample ID
    variant_sub_df = cosmic_all_df.query('sample_name == @sample_id')
    
    # Define membership in COSMIC IDs
    cosmic_class = [1 if x in variant_sub_df['cosmic70'].tolist() else 0
                    for x in unique_cosmic_ids]
    
    # Store results
    cosmic_similarity_consensus_list.append(cosmic_class)
    case_id_consensus.append(sample_id)

In [14]:
common_cosmic_consensus_file = os.path.join('results', 'cosmic_similarity_merged.tsv')
cosmic_common_consensus_df = pd.DataFrame(cosmic_similarity_consensus_list,
                                          index=case_id_consensus,
                                          columns=unique_cosmic_ids)
cosmic_common_consensus_df.index.name = 'Case.ID'
cosmic_common_consensus_df.to_csv(common_cosmic_consensus_file, sep='\t')