# Generating OncoPrint Data Files

The script will process all variant files and output files in an ingestible format for the R OncoPrint function.

In [1]:
import os
import pandas as pd

In [2]:
cosmic_file = os.path.join('results', 'all_common_replicate_COSMIC_variants.tsv')
cosmic_df = pd.read_table(cosmic_file, index_col=0)
print(cosmic_df.shape)
cosmic_df.head()

(1222, 62)


Unnamed: 0_level_0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,phastCons20way_mammalian,SiPhy_29way_logOdds,Otherinfo,het,quality,depth,replicate,variant_id,sample_id,base_id
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47,1.0,1117795,1117795,C,G,exonic,TTLL10,.,nonsynonymous SNV,"TTLL10:NM_153254:exon6:c.C666G:p.H222Q,TTLL10:...",...,0.006,0.659,het\t.\t38,het,.,38,L001,1_1117795_1117795_C_G_TTLL10,001-F0,1
633,1.0,17326767,17326767,C,T,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon10:c.G866A:p.R289Q,AT...",...,0.715,8.802,het\t.\t38,het,.,38,L001,1_17326767_17326767_C_T_ATP13A2,001-F0,1
4600,1.0,173915909,173915909,G,C,exonic,RC3H1,.,nonsynonymous SNV,"RC3H1:NM_001300850:exon15:c.C2803G:p.P935A,RC3...",...,0.988,9.916,het\t.\t24,het,.,24,L001,1_173915909_173915909_G_C_RC3H1,001-F0,1
5578,1.0,210010524,210010524,G,A,exonic,DIEXF,.,nonsynonymous SNV,DIEXF:NM_014388:exon6:c.G1030A:p.D344N,...,0.998,15.183,het\t.\t21,het,.,21,L001,1_210010524_210010524_G_A_DIEXF,001-F0,1
5584,1.0,210560817,210560817,G,A,exonic,HHAT,.,synonymous SNV,"HHAT:NM_001170587:exon3:c.G168A:p.A56A,HHAT:NM...",...,.,.,het\t.\t32,het,.,32,L001,1_210560817_210560817_G_A_HHAT,001-F0,1


In [3]:
# What are the 50 most commonly altered genes?
top_n = 50
paad_genes = cosmic_df['Gene.refGene'].value_counts().head(top_n).index.tolist()
print(paad_genes)

['KRAS', 'MARF1', 'TSPYL1', 'MUC4;MUC4', 'OR11G2', 'DNHD1', 'LCE4A', 'ARHGEF11;ETV3L', 'NCOR2', 'BRCA1', 'CA8', 'SNAP47', 'DCC', 'RYR3', 'PPP1R3A', 'PTEN', 'TP53', 'FAM161B', 'JCAD', 'IDH1', 'OR10Z1', 'SMAD4', 'SGK494', 'FNIP2', 'CNTRL', 'ZNF780B', 'WDR62', 'ZNF679', 'UBL3', 'STEAP2', 'CLK2P1', 'ANO6', 'RNF123', 'KRTAP9-2', 'CDH18', 'LAMC2', 'CCDC180', 'LINC01242;LINC01243', 'DIAPH3', 'SS18L1', 'SHCBP1', 'CD34', 'LAMA1', 'EP400', 'C19orf33', 'CGB8', 'MROH8;MROH8', 'FGF6', 'ZNF763', 'NEB']


## Generate OncoPrint Data

### For All Replicates

In [4]:
%%time
variant_file_path = os.path.join('results', 'processed_vcfs')
variant_assign = []
case_id = []
all_cosmic_ids = []
for variant_file in os.listdir(variant_file_path):
    # Load and subset file to only variants in the COSMIC db
    variant_df = pd.read_table(os.path.join(variant_file_path, variant_file), index_col=0)
    variant_sub_df = variant_df[variant_df['cosmic70'] != '.']
    
    # Build a list of all COSMIC IDs for separate R visualization (similarity heatmaps)
    all_cosmic_ids += variant_sub_df['cosmic70'].tolist()
    
    # Define mutated genes if they exist for the given variant
    variant_class = ['MUT;' if x in variant_sub_df['Gene.refGene'].tolist() else ''
                     for x in paad_genes]
    
    # Store results
    variant_assign.append(variant_class)
    case_id.append(variant_file.replace('_001_processed_variants.tsv.bz2', ''))

CPU times: user 12.5 s, sys: 68 ms, total: 12.6 s
Wall time: 12.6 s


In [5]:
# Generate and save oncoprint data for all replicates
oncoprint_file = os.path.join('results', 'oncoprint_replicates.tsv')
oncoprint_df = pd.DataFrame(variant_assign, index=case_id, columns=paad_genes)
oncoprint_df.index.name = 'Case.ID'
oncoprint_df.to_csv(oncoprint_file, sep='\t')

### Consensus samples

In [6]:
# Generate oncoprint data for consensus samples (COSMIC variant exists in all replicates)
full_variant_file = os.path.join('results', 'all_common_replicate_COSMIC_variants.tsv')
full_variant_df = pd.read_table(full_variant_file, index_col=0)
full_variant_df.head(2)

Unnamed: 0_level_0,Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,...,phastCons20way_mammalian,SiPhy_29way_logOdds,Otherinfo,het,quality,depth,replicate,variant_id,sample_id,base_id
X1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
47,1.0,1117795,1117795,C,G,exonic,TTLL10,.,nonsynonymous SNV,"TTLL10:NM_153254:exon6:c.C666G:p.H222Q,TTLL10:...",...,0.006,0.659,het\t.\t38,het,.,38,L001,1_1117795_1117795_C_G_TTLL10,001-F0,1
633,1.0,17326767,17326767,C,T,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon10:c.G866A:p.R289Q,AT...",...,0.715,8.802,het\t.\t38,het,.,38,L001,1_17326767_17326767_C_T_ATP13A2,001-F0,1


In [7]:
variant_assign_consensus = []
case_id_consensus = []
for sample_id in set(full_variant_df['sample_id']):
    # Subset file to given sample ID
    variant_sub_df = full_variant_df[full_variant_df['sample_id'] == sample_id]
    
    # Define mutated genes if they exist for the given variant
    variant_class = ['MUT;' if x in variant_sub_df['Gene.refGene'].tolist() else ''
                     for x in paad_genes]
    
    # Store results
    variant_assign_consensus.append(variant_class)
    case_id_consensus.append(sample_id)

In [8]:
# Generate and save oncoprint data for consensus samples
oncoprint_consensus_file = os.path.join('results', 'oncoprint_consensus.tsv')

oncoprint_consensus_df = (
    pd.DataFrame(variant_assign_consensus,
                 index=case_id_consensus,
                 columns=paad_genes)
    )
oncoprint_consensus_df.index.name = 'Case.ID'
oncoprint_consensus_df.to_csv(oncoprint_consensus_file, sep='\t')

In [9]:
oncoprint_consensus_df.head(3)

Unnamed: 0_level_0,KRAS,MARF1,TSPYL1,MUC4;MUC4,OR11G2,DNHD1,LCE4A,ARHGEF11;ETV3L,NCOR2,BRCA1,...,SHCBP1,CD34,LAMA1,EP400,C19orf33,CGB8,MROH8;MROH8,FGF6,ZNF763,NEB
Case.ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
006-F0,MUT;,MUT;,,MUT;,MUT;,MUT;,,MUT;,,,...,,,,,,,,,,
030-F0,MUT;,MUT;,MUT;,MUT;,,,,,MUT;,,...,,,,,,,,,,
KS27_F0,MUT;,MUT;,MUT;,MUT;,MUT;,,MUT;,MUT;,MUT;,MUT;,...,MUT;,,,,,,,,,


## COSMIC Mutational Similarity

Output mutational similarity data for all replicates and consensus samples. The COSMIC mutational similarity is built from a (0,1) sample by COSMIC mutation matrix.

### For All Replicates

In [10]:
# How many COSMIC mutation IDs are in the entire set and how many are unique?
print('All COSMIC mutations: {}'.format(len(all_cosmic_ids)))
unique_cosmic_ids = set(all_cosmic_ids)
print('Unique COSMIC mutations: {}'.format(len(unique_cosmic_ids)))

All COSMIC mutations: 7403
Unique COSMIC mutations: 751


In [11]:
case_id = []
cosmic_similarity_list = []
for variant_file in os.listdir(variant_file_path):
    # Load and subset file to only variants in the COSMIC db
    variant_df = pd.read_table(os.path.join(variant_file_path, variant_file), index_col=0)
    variant_sub_df = variant_df[variant_df['cosmic70'] != '.']
    
    # Define membership in COSMIC IDs
    cosmic_class = [1 if x in variant_sub_df['cosmic70'].tolist() else 0
                    for x in unique_cosmic_ids]
    
    # Store results
    cosmic_similarity_list.append(cosmic_class)
    case_id.append(variant_file.replace('_001_processed_variants.tsv.bz2', ''))

In [12]:
# Generate COSMIC id membership data (for downstream similarity matrix)
cosmic_common_file = os.path.join('results', 'cosmic_similarity_replicates.tsv')
cosmic_common_df = (
    pd.DataFrame(cosmic_similarity_list,
                 index=case_id,
                 columns=unique_cosmic_ids)
    )
cosmic_common_df.index.name = 'Case.ID'
cosmic_common_df.to_csv(cosmic_common_file, sep='\t')

### Consensus samples

In [13]:
case_id_consensus = []
cosmic_similarity_consensus_list = []
for sample_id in set(full_variant_df['sample_id']):
    # Subset file to given sample ID
    variant_sub_df = full_variant_df[full_variant_df['sample_id'] == sample_id]
    
    # Define membership in COSMIC IDs
    cosmic_class = [1 if x in variant_sub_df['cosmic70'].tolist() else 0
                    for x in unique_cosmic_ids]
    
    # Store results
    cosmic_similarity_consensus_list.append(cosmic_class)
    case_id_consensus.append(sample_id)

In [14]:
common_cosmic_consensus_file = os.path.join('results', 'cosmic_similarity_consensus.tsv')
cosmic_common_consensus_df = pd.DataFrame(cosmic_similarity_consensus_list,
                                          index=case_id_consensus,
                                          columns=unique_cosmic_ids)
cosmic_common_consensus_df.index.name = 'Case.ID'
cosmic_common_consensus_df.to_csv(common_cosmic_consensus_file, sep='\t')