add yieldSize to VCF reading (#406)

* add yieldSize to VCF reading * truncate summary table to 1000 * yieldSize config value * while loop instead of repeat/break * add defaults * fix while loop * lower rvc yieldsize default * Update batch_data_table.R Co-authored-by: Smith Nicholas <smith@in.tum.de>
gagneurlab · Dec 5, 2022 · ffc786d · ffc786d
1 parent 7c06532
commit ffc786d
Show file tree

Hide file tree

Showing 8 changed files with 92 additions and 72 deletions.
diff --git a/docs/source/prepare.rst b/docs/source/prepare.rst
@@ -105,6 +105,7 @@ implementation                character  Either 'autoencoder', 'pca' or 'peer'.
 zScoreCutoff                  numeric    A non-negative number. Z scores (in absolute value) greater than this cutoff are considered as outliers.                           ``0``
 padjCutoff                    numeric    A number between (0, 1] indicating the maximum FDR an event can have in order to be considered an outlier.                         ``0.05``
 maxTestedDimensionProportion  numeric    An integer that controls the maximum value that the encoding dimension can take. Refer to `advanced-options`_.                     ``3``
+yieldSize                     numeric    An integer that sets the batch size for counting reads within a bam file. If memory issues persist lower the yieldSize.            ``2000000``
 ============================  =========  =================================================================================================================================  ======
 
 Aberrant splicing dictionary
@@ -174,7 +175,7 @@ maxAF                  numeric    Maximum allele frequency (of the minor allele)
 maxVarFreqCohort       numeric    Maximum variant frequency among the cohort.                                                                                                                                                       ``0.05``
 minAlt                 numeric    Integer describing the minimum required reads that support the alternative allele. We recommend a minimum of 3 if further filtering on your own. 10 otherwise.                                    ``3``
 hcArgs                 character  String describing additional arguments for GATK haplocaller. Refer to `advanced-options`_.                                                                                                        ``""``
-
+yieldSize              numeric    An integer that sets the batch size for counting reads within a vcf file. If memory issues persist during ``batch_data_table`` lower the yieldSize.                                               ``100000``
 =====================  =========  ================================================================================================================================================================================================  =========
 
 

diff --git a/drop/config/submodules/AberrantExpression.py b/drop/config/submodules/AberrantExpression.py
@@ -38,6 +38,7 @@ def setDefaultKeys(self, dict_):
         setKey(dict_, None, "padjCutoff", .05)
         setKey(dict_, None, "zScoreCutoff", 0)
         setKey(dict_, None, "maxTestedDimensionProportion", 3)
+        setKey(dict_, None, "yieldSize", 2000000)
         return dict_
 
     def getCountFiles(self, annotation, group):

diff --git a/drop/config/submodules/RNAVariantCalling.py b/drop/config/submodules/RNAVariantCalling.py
@@ -68,9 +68,10 @@ def setDefaultKeys(self, dict_):
         setKey(dict_, None, "hcArgs", "")
         setKey(dict_, None, "addAF", False)
         setKey(dict_, None, "maxAF", 0.001 )
-        setKey(dict_, None, "maxVarFreqCohort", 0.04 )
+        setKey(dict_, None, "maxVarFreqCohort", 0.05 )
         setKey(dict_, None, "minAlt", 3)
         setKey(dict_, None, "createSingleVCF", False)
+        setKey(dict_, None, "yieldSize", 100000)
 
         if dict_["run"]:
             dict_ = utils.checkKeys(dict_, keys=["repeat_mask","highQualityVCFs","dbSNP"], check_files=True)

diff --git a/drop/demo/config_relative.yaml b/drop/demo/config_relative.yaml
@@ -29,6 +29,10 @@ aberrantExpression:
     padjCutoff: 1
     zScoreCutoff: 0
     maxTestedDimensionProportion: 3
+    dassie:
+        tssWindow: 500
+        pasWindow: 1000
+    yieldSize: 2000000
 
 aberrantSplicing:
     run: true
@@ -63,21 +67,22 @@ mae:
       - mae
 
 rnaVariantCalling:
-  run: true
-  groups:
-  - batch_0
-  - batch_1
-  highQualityVCFs:
-  - Data/high_confidence_snps.vcf.gz
-  - Data/high_confidence_indels.vcf.gz
-  dbSNP: Data/dbSNP_chr21.vcf.gz
-  repeat_mask: Data/repeat_mask_chr21.bed
-  createSingleVCF: true
-  addAF: false
-  maxAF: 0.001
-  maxVarFreqCohort: 1
-  hcArgs: null
-  minAlt: 3
+    run: true
+    groups:
+      - batch_0
+      - batch_1
+    highQualityVCFs:
+      - Data/high_confidence_snps.vcf.gz
+      - Data/high_confidence_indels.vcf.gz
+    dbSNP: Data/dbSNP_chr21.vcf.gz
+    repeat_mask: Data/repeat_mask_chr21.bed
+    createSingleVCF: true
+    addAF: false
+    maxAF: 0.001
+    maxVarFreqCohort: 1
+    hcArgs: null
+    minAlt: 3
+    yieldSize: 100000
 
 tools:
   gatkCmd: gatk

diff --git a/drop/modules/aberrant-expression-pipeline/Counting/countReads.R b/drop/modules/aberrant-expression-pipeline/Counting/countReads.R
@@ -50,7 +50,7 @@ if (strand == "yes") {
 }
 
 # read files
-bam_file <- BamFile(snakemake@input$sample_bam, yieldSize = 2e6)
+bam_file <- BamFile(snakemake@input$sample_bam, yieldSize = snakemake@config$aberrantExpression$yieldSize)
 count_ranges <- readRDS(snakemake@input$count_ranges)
 # set chromosome style
 seqlevelsStyle(count_ranges) <- seqlevelsStyle(bam_file)

diff --git a/drop/modules/rvc-pipeline/countVariants/Results.R b/drop/modules/rvc-pipeline/countVariants/Results.R
@@ -34,7 +34,7 @@ res_plot[,VARIANT := NULL]
 
 res$cohortFreq <- round(res$cohortFreq,3)
 
-#' ## Variant Calling Tables
+#' ## Variant Calling Tables (first 1,000)
 DT::datatable(
     head(res[grepl("PASS", FILTER)], 1000),
     caption = 'Variants called from RNA (up to 1,000 rows shown)',
@@ -52,10 +52,10 @@ if (!all(is.na(res$MAX_AF))) {
 # melt filters by GT. Exclude reference calls
 res_plot <- melt(res_plot,id.vars = "FILTER",value.name = "GT")[GT != "0/0",.N,by = c("FILTER","variable","GT")]
 
-#' ## Table of variant calls by GT
+#' ## Table of variant calls by GT (first 1,000)
 summary_dt <- dcast(res_plot, FILTER + GT ~ variable, value.var = "N")
 DT::datatable(
-    summary_dt,
+    head(summary_dt,1000),
     caption = "Variant filters by GT", 
     options=list(scrollY=TRUE),
     filter = 'top')

diff --git a/drop/modules/rvc-pipeline/countVariants/batch_data_table.R b/drop/modules/rvc-pipeline/countVariants/batch_data_table.R
@@ -18,7 +18,7 @@
 #'                        "rnaVariantCalling/data_tables", "{dataset}",
 #'                        "{dataset}_{annotation}_data_table.Rds")`'
 #'  type: script
-#'---  
+#'---
 
 #+ echo=FALSE
 library(data.table)
@@ -36,7 +36,7 @@ saveRDS(snakemake, snakemake@log$snakemake)
   x[which(x == "1/0" | x == "1|0" | x == "0/1" | x == "0|1")] <- "0/1"
   x[which(x == "./." | x == ".|." | x == "0/0 "| x == "0|0")] <- "0/0"
   x[grepl("QD|FS|SnpCluster",x)] <- "Seq_filter"
-  
+
   return(x)
 }
 
@@ -49,53 +49,60 @@ saveRDS(snakemake, snakemake@log$snakemake)
   })
 }
 
-vcf <- VariantAnnotation::readVcf(snakemake@input$annotatedVCF) # read the batch vcf
-canonical_chr <- c(paste0("chr",c(1:22,"X","Y","M")),1:22,"X","Y","MT")
-vcf <- vcf[seqnames(vcf) %in% canonical_chr]
-
-vcf_dt <- as.data.table(geno(vcf)$GT)
-vcf_dt[,FILTER := vcf@fixed$FILTER]
-vcf_dt[,VARIANT := names(vcf)]
-vcf_dt[,GENE_ID := .extract_info(vcf,"gene_id")]
-vcf_dt[,GENE_NAME := .extract_info(vcf,"gene_name")]
-
-dt <- as.data.table(lapply(vcf_dt, .clean_vars))
-
-# calculate variant frequency within the cohort
-max_var_freq_cutoff <- snakemake@config$rnaVariantCalling$maxVarFreqCohort
-dt$cohortFreq <- apply(dt[,1:ncol(vcf)],1,function(x){
-  1-sum(x == "0/0")/length(x) })
-
-
-if (snakemake@config$rnaVariantCalling$addAF){
-  #tMAE code for adding gnomad frequency
-  max_af_cutoff <- snakemake@config$rnaVariantCalling$maxAF
-  pops <- c('AF', 'AF_afr', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_popmax')
-  maf_dt <- add_gnomAD_AF(granges(vcf),genome_assembly = snakemake@config$genomeAssembly) %>% as.data.table()
-
-  # Compute the MAX_AF based on all provided population columns
-  # return -1 if only NAs are present (to avoid a warning)
-  maf_dt$MAX_AF <- apply(maf_dt[, ..pops], 1, 
-                    FUN=function(x){ max(x, -1, na.rm=TRUE) })
-
-  # Replace Inf/-1 with NA
-  maf_dt[is.infinite(MAX_AF) | MAX_AF == -1, MAX_AF := NA]
-
-  res <- cbind(dt,maf_dt[,"MAX_AF"])
-
-  # Label variants based on their MAX_AF
-  res[FILTER == "PASS",FILTER:= "PASS_common"]
-  res[MAX_AF <= max_af_cutoff & cohortFreq <= max_var_freq_cutoff & FILTER == "PASS_common",FILTER:= "PASS_rare"]
-
-  #Reorder FILTER
-  res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS_common","PASS_rare"))
-
-} else{
-  res <- dt
-  #Reorder FILTER
-  res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS"))
-  res$MAX_AF <- NA
-}
+vcffile <- open(VcfFile(snakemake@input$annotatedVCF,yieldSize = snakemake@config$rnaVariantCalling$yieldSize)) # read the batch vcf
+res_final <- data.table()
+
+#while there are rows to read in. Process the vcf file until nrow is 0.
+while(nrow(vcf <- readVcf(vcffile,param =  ScanVcfParam(fixed="FILTER",geno="GT")))) {
+  canonical_chr <- c(paste0("chr",c(1:22,"X","Y","M")),1:22,"X","Y","MT")
+  vcf <- vcf[seqnames(vcf) %in% canonical_chr]
+
+  vcf_dt <- as.data.table(geno(vcf)$GT)
+  vcf_dt[,FILTER := vcf@fixed$FILTER]
+  vcf_dt[,VARIANT := names(vcf)]
+  vcf_dt[,GENE_ID := .extract_info(vcf,"gene_id")]
+  vcf_dt[,GENE_NAME := .extract_info(vcf,"gene_name")]
+
+  dt <- as.data.table(lapply(vcf_dt, .clean_vars))
+
+  # calculate variant frequency within the cohort
+  max_var_freq_cutoff <- snakemake@config$rnaVariantCalling$maxVarFreqCohort
+  dt$cohortFreq <- apply(dt[,1:ncol(vcf)],1,function(x){
+    1-sum(x == "0/0")/length(x) })
+
 
-setcolorder(res,c("VARIANT","GENE_ID","GENE_NAME","FILTER","MAX_AF","cohortFreq"))
-saveRDS(res,snakemake@output$data_table) # save res to a data_table object
+  if (snakemake@config$rnaVariantCalling$addAF){
+    #tMAE code for adding gnomad frequency
+    max_af_cutoff <- snakemake@config$rnaVariantCalling$maxAF
+    pops <- c('AF', 'AF_afr', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_popmax')
+    maf_dt <- add_gnomAD_AF(granges(vcf),genome_assembly = snakemake@config$genomeAssembly) %>% as.data.table()
+
+    # Compute the MAX_AF based on all provided population columns
+    # return -1 if only NAs are present (to avoid a warning)
+    maf_dt$MAX_AF <- apply(maf_dt[, ..pops], 1, 
+                      FUN=function(x){ max(x, -1, na.rm=TRUE) })
+
+    # Replace Inf/-1 with NA
+    maf_dt[is.infinite(MAX_AF) | MAX_AF == -1, MAX_AF := NA]
+
+    res <- cbind(dt,maf_dt[,"MAX_AF"])
+
+    # Label variants based on their MAX_AF
+    res[FILTER == "PASS",FILTER:= "PASS_common"]
+    res[MAX_AF <= max_af_cutoff & cohortFreq <= max_var_freq_cutoff & FILTER == "PASS_common",FILTER:= "PASS_rare"]
+
+    #Reorder FILTER
+    res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS_common","PASS_rare"))
+
+  } else{
+    res <- dt
+    #Reorder FILTER
+    res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS"))
+    res$MAX_AF <- NA
+  }
+  res_final <- rbind(res_final,res)
+}
+
+close(vcffile) 
+setcolorder(res_final,c("VARIANT","GENE_ID","GENE_NAME","FILTER","MAX_AF","cohortFreq"))
+saveRDS(res_final,snakemake@output$data_table) # save res to a data_table object
diff --git a/drop/template/config.yaml b/drop/template/config.yaml
@@ -33,6 +33,10 @@ aberrantExpression:
     padjCutoff: 0.05
     zScoreCutoff: 0
     maxTestedDimensionProportion: 3
+    dassie:
+        tssWindow: 500
+        pasWindow: 1000
+    yieldSize: 2000000
 
 aberrantSplicing:
     run: true
@@ -82,6 +86,7 @@ rnaVariantCalling:
     maxVarFreqCohort: 0.05
     hcArgs: ""
     minAlt: 3
+    yieldSize: 100000
 
 tools:
     gatkCmd: gatk