# Observing intersecting gene variants across patient sets

We visualize overlapping variants using [UpSetR](https://github.com/hms-dbmi/UpSetR) plots ([Conway et al. 2017](https://doi.org/10.1093/bioinformatics/btx364)).

In [1]:
library(UpSetR)
library(reshape2)
library(dplyr)


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



In [2]:
cosmic_file <- file.path('results', 'all_cosmic_variants.tsv')
cosmic_df <- readr::read_tsv(cosmic_file) %>%
    dplyr::mutate(base_sample_id = sapply(final_id, function(x) unlist(strsplit(x, '-'))[1]))
head(cosmic_df)

Parsed with column specification:
cols(
  .default = col_character(),
  Start = col_integer(),
  End = col_integer(),
  depth = col_integer()
)
See spec(...) for full column specifications.


Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,⋯,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,Otherinfo,sample_name,final_id,het,quality,depth,base_sample_id
1,1117795,1117795,C,G,exonic,TTLL10,.,nonsynonymous SNV,"TTLL10:NM_153254:exon6:c.C666G:p.H222Q,TTLL10:NM_001130045:exon10:c.C885G:p.H295Q",⋯,0.035,0.006,0.659,het	.	158,001-F0,001-F0,het,.,158,1
1,1571791,1571791,G,A,exonic,CDK11B,.,nonsynonymous SNV,"CDK11B:NM_033487:exon16:c.C1202T:p.A401V,CDK11B:NM_033490:exon17:c.C1319T:p.A440V,CDK11B:NM_001291345:exon19:c.C1934T:p.A645V,CDK11B:NM_001787:exon19:c.C2003T:p.A668V,CDK11B:NM_033486:exon19:c.C1964T:p.A655V,CDK11B:NM_033489:exon20:c.C1862T:p.A621V",⋯,0.916,0.815,16.864,het	.	126,001-F0,001-F0,het,.,126,1
1,5933384,5933384,C,G,exonic,NPHP4,.,synonymous SNV,"NPHP4:NM_001291594:exon19:c.G1707C:p.G569G,NPHP4:NM_001291593:exon20:c.G1704C:p.G568G,NPHP4:NM_015102:exon23:c.G3243C:p.G1081G",⋯,.,.,.,het	.	45,001-F0,001-F0,het,.,45,1
1,6500762,6500762,G,A,exonic,ESPN,.,nonsynonymous SNV,ESPN:NM_031475:exon4:c.G752A:p.G251D,⋯,0.998,0.995,12.642,het	.	59,001-F0,001-F0,het,.,59,1
1,12885090,12885090,T,G,exonic,PRAMEF11,.,unknown,UNKNOWN,⋯,0.003,0.004,4.382,het	.	161,001-F0,001-F0,het,.,161,1
1,17326767,17326767,C,T,exonic,ATP13A2,.,nonsynonymous SNV,"ATP13A2:NM_001141973:exon10:c.G866A:p.R289Q,ATP13A2:NM_001141974:exon10:c.G866A:p.R289Q,ATP13A2:NM_022089:exon10:c.G881A:p.R294Q",⋯,0.463,0.715,8.802,het	.	162,001-F0,001-F0,het,.,162,1


In [3]:
for (sample_group in unique(cosmic_df$base_sample_id)) {
    
    upset_fig_file <- file.path('figures', 'upset', paste0('upset_sample_', sample_group, '.pdf'))
   
    
    patient_df <- cosmic_df %>% dplyr::filter(base_sample_id == sample_group)
    
    sample_set <- sort(unique(patient_df$final_id), decreasing = TRUE)

    patient_df_melt <- reshape2::melt(patient_df, id.vars = 'final_id', measure.vars = 'Gene.refGene')
    patient_pivot <- reshape2::dcast(patient_df_melt, value ~ final_id, fun.aggregate = function(x) length(x) )
    patient_pivot[patient_pivot == 2] <- 1
    
    pdf(upset_fig_file, height = 6, width = 7, onefile = FALSE)         
    upset(patient_pivot, order.by = 'freq', sets = sample_set, keep.order = TRUE,
          queries = list(list(query = intersects, params = sample_set,
                              color = 'orange', active = T)), mb.ratio = c(0.7, 0.3),
          text.scale = c(1.8, 1.5, 1.8, 1.5, 1.8, 1.2))
    dev.off()
}

## Generate UpSetR plots before COSMIC filtering


In [4]:
file = file.path('results', 'all_cosmic_prefiltered_variants.tsv')
prefiltered_cosmic_df <- readr::read_tsv(file) %>%
    dplyr::mutate(base_sample_id = sapply(final_id, function(x) unlist(strsplit(x, '-'))[1]))
head(prefiltered_cosmic_df)

Parsed with column specification:
cols(
  .default = col_character(),
  Chr = col_integer(),
  Start = col_integer(),
  End = col_integer(),
  gnomAD_exome_ALL = col_double(),
  gnomAD_exome_ASJ = col_double(),
  gnomAD_exome_NFE = col_double(),
  gnomAD_exome_OTH = col_double(),
  depth = col_integer()
)
See spec(...) for full column specifications.
“2475431 parsing failures.
row # A tibble: 5 x 5 col     row col   expected   actual file                                          expected   <int> <chr> <chr>      <chr>  <chr>                                         actual 1  2135 Chr   an integer X      'results/all_cosmic_prefiltered_variants.tsv' file 2  2136 Chr   an integer X      'results/all_cosmic_prefiltered_variants.tsv' row 3  2137 Chr   an integer X      'results/all_cosmic_prefiltered_variants.tsv' col 4  2138 Chr   an integer X      'results/all_cosmic_prefiltered_variants.tsv' expected 5  2139 Chr   an integer X      'results/all_cosmic_prefiltered_variants.tsv'
... ......

Chr,Start,End,Ref,Alt,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,⋯,phastCons7way_vertebrate,phastCons20way_mammalian,SiPhy_29way_logOdds,Otherinfo,sample_name,final_id,het,quality,depth,base_sample_id
1,13503,13503,C,T,ncRNA_exonic,DDX11L1,.,.,.,⋯,.,.,.,het	.	780,001-F0,001-F0,het,.,780,1
1,13657,13658,AG,-,ncRNA_exonic,DDX11L1,.,.,.,⋯,.,.,.,het	.	21,001-F0,001-F0,het,.,21,1
1,984971,984971,G,A,exonic,AGRN,.,nonsynonymous SNV,AGRN:NM_198576:exon26:c.G4540A:p.A1514T,⋯,0.003,0.000,4.685,het	.	63,001-F0,001-F0,het,.,63,1
1,1117795,1117795,C,G,exonic,TTLL10,.,nonsynonymous SNV,"TTLL10:NM_153254:exon6:c.C666G:p.H222Q,TTLL10:NM_001130045:exon10:c.C885G:p.H295Q",⋯,0.035,0.006,0.659,het	.	158,001-F0,001-F0,het,.,158,1
1,1396132,1396132,T,G,exonic,ATAD3C,.,nonsynonymous SNV,ATAD3C:NM_001039211:exon10:c.T815G:p.F272C,⋯,0.970,0.762,9.316,het	.	17,001-F0,001-F0,het,.,17,1
1,1571791,1571791,G,A,exonic,CDK11B,.,nonsynonymous SNV,"CDK11B:NM_033487:exon16:c.C1202T:p.A401V,CDK11B:NM_033490:exon17:c.C1319T:p.A440V,CDK11B:NM_001291345:exon19:c.C1934T:p.A645V,CDK11B:NM_001787:exon19:c.C2003T:p.A668V,CDK11B:NM_033486:exon19:c.C1964T:p.A655V,CDK11B:NM_033489:exon20:c.C1862T:p.A621V",⋯,0.916,0.815,16.864,het	.	126,001-F0,001-F0,het,.,126,1


In [5]:
for (sample_group in unique(prefiltered_cosmic_df$base_sample_id)) {
    
    upset_fig_file <- file.path('figures', 'upset', 'prefiltered',
                                paste0('upset_sample_', sample_group, '.pdf'))
   
    
    patient_df <- prefiltered_cosmic_df %>% dplyr::filter(base_sample_id == sample_group)
    
    sample_set <- sort(unique(patient_df$final_id), decreasing = TRUE)

    patient_df_melt <- reshape2::melt(patient_df, id.vars = 'final_id', measure.vars = 'Gene.refGene')
    patient_pivot <- reshape2::dcast(patient_df_melt, value ~ final_id, fun.aggregate = function(x) length(x) )
    patient_pivot[patient_pivot >= 2] <- 1
    
    pdf(upset_fig_file, height = 6, width = 7, onefile = FALSE)         
    upset(patient_pivot, order.by = 'freq', sets = sample_set, keep.order = TRUE,
          queries = list(list(query = intersects, params = sample_set,
                              color = 'orange', active = T)), mb.ratio = c(0.7, 0.3),
          text.scale = c(1.8, 1.5, 1.8, 1.5, 1.8, 1.2))
    dev.off()
}