# Exploring annotated variants across tumor samples

The script will output visualizations exploring the relationship between deleterious predictions (SIFT scores) and population allele frequency (gnomAD minor allele frequency).

Two sets of figures are generated:

1. SIFT/gnomAD across four technical replicates for all samples
2. SIFT/gnomAD for concatenated samples pre- and post-filtration (filtering removes common variation and low read depth variants)

In [1]:
import os
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline
# Apply the 'seaborn-notebook' matplotlib style sheet to all subsequent figures
# NOTE(review): newer matplotlib releases renamed the seaborn style sheets
# (e.g. 'seaborn-v0_8-notebook') — confirm against the installed version
plt.style.use('seaborn-notebook')

In [3]:
# Read the comma-separated phenotype table and preview its first two rows
file = 'pdx_phenotype.csv'
pheno_df = pd.read_csv(file, sep=',')
pheno_df.head(2)

Unnamed: 0,sample,final_id,patient_id,sample_name,pt,pool,read_id,wes_id,extract,location,passage,tumor,date,ln2,rna,pedigree
0,KS1,001-F0,1,001-F0,1,2,001-F0_S3_L001_R1_001,001-F0_S3_L001_001,FNA,,F0,F0-R-2464,5.2.12,1.0,0.0,1.0
1,KS1,001-F0,1,001-F0,1,2,001-F0_S3_L001_R2_001,001-F0_S3_L001_001,FNA,,F0,F0-R-2464,5.2.12,1.0,0.0,1.0


In [4]:
# Map the prefix of every read_id (the text before its first '_') to the
# final_id recorded on the same phenotype row
id_updater = {
    read_id.split('_')[0]: final_id
    for read_id, final_id in zip(pheno_df.read_id, pheno_df.final_id)
}
id_updater

{'001-F0': '001-F0',
 '001-F5': '001-F5',
 '004-F0': '004-M-F0',
 '004-F5': '004-M-F5',
 '004-primary': '004-M',
 '005-F0': '005-M-F0',
 '005-F5': '005-M-F5',
 '005-primary': '005-M',
 '006-F0': '006-F0',
 '006-F5': '006-F5',
 '008-F0': '008-F0',
 '008-F5': '008-F5',
 '018-F0': '008-M1-F0',
 '018-F5': '008-M1-F5',
 '019-F0': '008-M2-F0',
 '019-F5': '008-M2-F5',
 '029-F0': '029-F0',
 '029-F5': '029-F5',
 '030-F0': '030-F0',
 '030-F5': '030-F5',
 '032-F0': '032-F0',
 '032-F5': '032-F5',
 '040-F0': '040-F0',
 '040-F5': '040-F5',
 'KS25': '008-ORTH-M',
 'KS26': '008-ORTH',
 'KS27': '048-F0',
 'KS28': '048-F5',
 'KS29': '048-M1-F0',
 'KS30': '048-M1-F5'}

## Across four technical replicates

In [5]:
# Plot per replicate variants gnomAD vs. SIFT scores
#
# For each sample, read the annotated variant table of every replicate (one
# file per `wes_id`), keep the variants that have both a SIFT score and a
# gnomAD exome allele frequency, and save a 2D KDE figure with one panel per
# replicate.
score_columns = ['SIFT_score', 'gnomAD_exome_ALL']

# Create the output directory up front so `plt.savefig` cannot fail on a
# fresh checkout
replicate_fig_dir = os.path.join('figures', 'sift_gnomad', 'replicates')
os.makedirs(replicate_fig_dir, exist_ok=True)

for unique_sample, final_id in id_updater.items():

    # `final_id` is the updated sample name used for publication
    pheno_subset_df = pheno_df.query('final_id == @final_id')

    # generate filenames for each unique sample
    fig_name = os.path.join(replicate_fig_dir,
                            '{}_sift_gnomad_kde.pdf'.format(final_id))

    multi_plots = []
    # `unique()` keeps first-appearance order, so the panel order is the same
    # on every run (iterating a `set` gives no such guarantee)
    for wes_id in pheno_subset_df['wes_id'].unique():
        variant_file = os.path.join('results', 'annotated_vcfs',
                                    '{}.annotated.hg19_multianno.csv'.format(wes_id))

        # low_memory=False disables chunked parsing, so each column gets a
        # single inferred dtype instead of triggering a mixed-type DtypeWarning
        variant_df = pd.read_csv(variant_file, low_memory=False)
        variant_df = variant_df.assign(replicate=wes_id)

        # Subset variants: entries without a score are stored as "."; coerce
        # both score columns to numbers (non-numeric -> NaN) and drop the rows
        # that lack either value, so the KDE only ever sees floats
        variant_df[score_columns] = variant_df[score_columns].apply(
            pd.to_numeric, errors='coerce')
        filtered_variant_df = variant_df.dropna(subset=score_columns)
        multi_plots.append(filtered_variant_df)

    sample_results = pd.concat(multi_plots)
    g = sns.FacetGrid(sample_results, col='replicate', col_wrap=2)
    g = (g.map(sns.kdeplot, 'SIFT_score', 'gnomAD_exome_ALL', shade=True)
         .set_titles("{col_name}"))
    # Leave room above the panels for the figure-level title
    plt.subplots_adjust(top=0.9)
    g.fig.suptitle('{} SIFT/gnomAD distributions'.format(final_id))
    plt.savefig(fig_name)
    # Close the figure so figures do not accumulate across loop iterations
    plt.close()

  interactivity=interactivity, compiler=compiler, result=result)
  s)


## Pre- and post-filtering of concatenated samples

In [6]:
# Plot concatenated variants gnomAD vs. SIFT scores
#
# For each sample, compare the merged annotated variants ("unfiltered") with
# the processed variants ("processed") in a two-panel 2D KDE figure of SIFT
# score against gnomAD exome allele frequency.
score_columns = ['SIFT_score', 'gnomAD_exome_ALL']


def _scored_variants(df, label):
    """Return the rows of `df` with numeric SIFT and gnomAD values.

    Entries without a score are stored as "."; both score columns are coerced
    to numbers (non-numeric -> NaN) and rows lacking either value are dropped.
    The result carries `label` in a new `variant_type` column.
    """
    numeric = {col: pd.to_numeric(df[col], errors='coerce')
               for col in score_columns}
    scored = df.assign(**numeric).dropna(subset=score_columns)
    return scored.assign(variant_type=label)


# Create the output directory up front so `plt.savefig` cannot fail on a
# fresh checkout
merged_fig_dir = os.path.join('figures', 'sift_gnomad')
os.makedirs(merged_fig_dir, exist_ok=True)

for unique_sample, final_id in id_updater.items():

    # `final_id` is the updated sample name used for publication
    # generate filenames for each unique sample
    fig_name = os.path.join(merged_fig_dir,
                            'merged_{}_sift_gnomad_kde.pdf'.format(final_id))

    # Read in file (annotated files are keyed by the original sample id,
    # processed files by the updated final id)
    variant_file = os.path.join('results', 'annotated_merged_vcfs',
                                '{}.annotated.hg19_multianno.csv'.format(unique_sample))
    # low_memory=False disables chunked parsing, so each column gets a single
    # inferred dtype instead of triggering a mixed-type DtypeWarning
    variant_df = pd.read_csv(variant_file, low_memory=False)

    processed_variant_file = os.path.join('results', 'processed_merged_vcfs',
                                '{}_processed_variants.tsv.bz2'.format(final_id))
    processed_variant_df = pd.read_table(processed_variant_file,
                                         low_memory=False)

    # Subset variants
    filtered_variant_df = _scored_variants(variant_df, 'unfiltered')
    processed_variant_df = _scored_variants(processed_variant_df, 'processed')

    variant_full_df = pd.concat([filtered_variant_df, processed_variant_df], axis=0)

    g = sns.FacetGrid(variant_full_df, col='variant_type', col_wrap=2, sharey=False)
    g = (g.map(sns.kdeplot, 'SIFT_score', 'gnomAD_exome_ALL', shade=True)
         .set_titles("{col_name}"))
    # Label every panel that exists rather than indexing a fixed second panel,
    # which would raise IndexError if one variant type had no scored variants
    for ax in g.axes.flat:
        ax.set_xlabel("SIFT score")
    g.axes.flat[0].set_ylabel("gnomAD Minor Allele Freq")
    # Leave room above the panels for the figure-level title
    plt.subplots_adjust(top=0.8)

    g.fig.suptitle('{} SIFT/gnomAD distributions'.format(final_id))
    plt.savefig(fig_name)
    # Close the figure so figures do not accumulate across loop iterations
    plt.close()

  interactivity=interactivity, compiler=compiler, result=result)
  s)
  interactivity=interactivity, compiler=compiler, result=result)
