Skip to content

Commit

Permalink
add yieldSize to VCF reading (#406)
Browse files Browse the repository at this point in the history
* add yieldSize to VCF reading

* truncate summary table to 1000

* yieldSize config value

* while loop instead of repeat/break

* add defaults

* fix while loop

* lower rvc yieldsize default

* Update batch_data_table.R

Co-authored-by: Smith Nicholas <smith@in.tum.de>
  • Loading branch information
nickhsmith and Smith Nicholas committed Dec 5, 2022
1 parent 7c06532 commit ffc786d
Show file tree
Hide file tree
Showing 8 changed files with 92 additions and 72 deletions.
3 changes: 2 additions & 1 deletion docs/source/prepare.rst
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ implementation character Either 'autoencoder', 'pca' or 'peer'.
zScoreCutoff numeric A non-negative number. Z scores (in absolute value) greater than this cutoff are considered as outliers. ``0``
padjCutoff numeric A number between (0, 1] indicating the maximum FDR an event can have in order to be considered an outlier. ``0.05``
maxTestedDimensionProportion numeric An integer that controls the maximum value that the encoding dimension can take. Refer to `advanced-options`_. ``3``
yieldSize numeric An integer that sets the batch size for counting reads within a bam file. If memory issues persist lower the yieldSize. ``2000000``
============================ ========= ================================================================================================================================= ======

Aberrant splicing dictionary
Expand Down Expand Up @@ -174,7 +175,7 @@ maxAF numeric Maximum allele frequency (of the minor allele)
maxVarFreqCohort numeric Maximum variant frequency among the cohort. ``0.05``
minAlt numeric Integer describing the minimum required reads that support the alternative allele. We recommend a minimum of 3 if further filtering on your own. 10 otherwise. ``3``
hcArgs character String describing additional arguments for GATK haplocaller. Refer to `advanced-options`_. ``""``

yieldSize numeric An integer that sets the batch size for counting reads within a vcf file. If memory issues persist during ``batch_data_table`` lower the yieldSize. ``100000``
===================== ========= ================================================================================================================================================================================================ =========


Expand Down
1 change: 1 addition & 0 deletions drop/config/submodules/AberrantExpression.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def setDefaultKeys(self, dict_):
setKey(dict_, None, "padjCutoff", .05)
setKey(dict_, None, "zScoreCutoff", 0)
setKey(dict_, None, "maxTestedDimensionProportion", 3)
setKey(dict_, None, "yieldSize", 2000000)
return dict_

def getCountFiles(self, annotation, group):
Expand Down
3 changes: 2 additions & 1 deletion drop/config/submodules/RNAVariantCalling.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,9 +68,10 @@ def setDefaultKeys(self, dict_):
setKey(dict_, None, "hcArgs", "")
setKey(dict_, None, "addAF", False)
setKey(dict_, None, "maxAF", 0.001 )
setKey(dict_, None, "maxVarFreqCohort", 0.04 )
setKey(dict_, None, "maxVarFreqCohort", 0.05 )
setKey(dict_, None, "minAlt", 3)
setKey(dict_, None, "createSingleVCF", False)
setKey(dict_, None, "yieldSize", 100000)

if dict_["run"]:
dict_ = utils.checkKeys(dict_, keys=["repeat_mask","highQualityVCFs","dbSNP"], check_files=True)
Expand Down
35 changes: 20 additions & 15 deletions drop/demo/config_relative.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ aberrantExpression:
padjCutoff: 1
zScoreCutoff: 0
maxTestedDimensionProportion: 3
dassie:
tssWindow: 500
pasWindow: 1000
yieldSize: 2000000

aberrantSplicing:
run: true
Expand Down Expand Up @@ -63,21 +67,22 @@ mae:
- mae

rnaVariantCalling:
run: true
groups:
- batch_0
- batch_1
highQualityVCFs:
- Data/high_confidence_snps.vcf.gz
- Data/high_confidence_indels.vcf.gz
dbSNP: Data/dbSNP_chr21.vcf.gz
repeat_mask: Data/repeat_mask_chr21.bed
createSingleVCF: true
addAF: false
maxAF: 0.001
maxVarFreqCohort: 1
hcArgs: null
minAlt: 3
run: true
groups:
- batch_0
- batch_1
highQualityVCFs:
- Data/high_confidence_snps.vcf.gz
- Data/high_confidence_indels.vcf.gz
dbSNP: Data/dbSNP_chr21.vcf.gz
repeat_mask: Data/repeat_mask_chr21.bed
createSingleVCF: true
addAF: false
maxAF: 0.001
maxVarFreqCohort: 1
hcArgs: null
minAlt: 3
yieldSize: 100000

tools:
gatkCmd: gatk
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ if (strand == "yes") {
}

# read files
bam_file <- BamFile(snakemake@input$sample_bam, yieldSize = 2e6)
bam_file <- BamFile(snakemake@input$sample_bam, yieldSize = snakemake@config$aberrantExpression$yieldSize)
count_ranges <- readRDS(snakemake@input$count_ranges)
# set chromosome style
seqlevelsStyle(count_ranges) <- seqlevelsStyle(bam_file)
Expand Down
6 changes: 3 additions & 3 deletions drop/modules/rvc-pipeline/countVariants/Results.R
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ res_plot[,VARIANT := NULL]

res$cohortFreq <- round(res$cohortFreq,3)

#' ## Variant Calling Tables
#' ## Variant Calling Tables (first 1,000)
DT::datatable(
head(res[grepl("PASS", FILTER)], 1000),
caption = 'Variants called from RNA (up to 1,000 rows shown)',
Expand All @@ -52,10 +52,10 @@ if (!all(is.na(res$MAX_AF))) {
# melt filters by GT. Exclude reference calls
res_plot <- melt(res_plot,id.vars = "FILTER",value.name = "GT")[GT != "0/0",.N,by = c("FILTER","variable","GT")]

#' ## Table of variant calls by GT
#' ## Table of variant calls by GT (first 1,000)
summary_dt <- dcast(res_plot, FILTER + GT ~ variable, value.var = "N")
DT::datatable(
summary_dt,
head(summary_dt,1000),
caption = "Variant filters by GT",
options=list(scrollY=TRUE),
filter = 'top')
Expand Down
109 changes: 58 additions & 51 deletions drop/modules/rvc-pipeline/countVariants/batch_data_table.R
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
#' "rnaVariantCalling/data_tables", "{dataset}",
#' "{dataset}_{annotation}_data_table.Rds")`'
#' type: script
#'---
#'---

#+ echo=FALSE
library(data.table)
Expand All @@ -36,7 +36,7 @@ saveRDS(snakemake, snakemake@log$snakemake)
x[which(x == "1/0" | x == "1|0" | x == "0/1" | x == "0|1")] <- "0/1"
x[which(x == "./." | x == ".|." | x == "0/0 "| x == "0|0")] <- "0/0"
x[grepl("QD|FS|SnpCluster",x)] <- "Seq_filter"

return(x)
}

Expand All @@ -49,53 +49,60 @@ saveRDS(snakemake, snakemake@log$snakemake)
})
}

vcf <- VariantAnnotation::readVcf(snakemake@input$annotatedVCF) # read the batch vcf
canonical_chr <- c(paste0("chr",c(1:22,"X","Y","M")),1:22,"X","Y","MT")
vcf <- vcf[seqnames(vcf) %in% canonical_chr]

vcf_dt <- as.data.table(geno(vcf)$GT)
vcf_dt[,FILTER := vcf@fixed$FILTER]
vcf_dt[,VARIANT := names(vcf)]
vcf_dt[,GENE_ID := .extract_info(vcf,"gene_id")]
vcf_dt[,GENE_NAME := .extract_info(vcf,"gene_name")]

dt <- as.data.table(lapply(vcf_dt, .clean_vars))

# calculate variant frequency within the cohort
max_var_freq_cutoff <- snakemake@config$rnaVariantCalling$maxVarFreqCohort
dt$cohortFreq <- apply(dt[,1:ncol(vcf)],1,function(x){
1-sum(x == "0/0")/length(x) })


if (snakemake@config$rnaVariantCalling$addAF){
#tMAE code for adding gnomad frequency
max_af_cutoff <- snakemake@config$rnaVariantCalling$maxAF
pops <- c('AF', 'AF_afr', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_popmax')
maf_dt <- add_gnomAD_AF(granges(vcf),genome_assembly = snakemake@config$genomeAssembly) %>% as.data.table()

# Compute the MAX_AF based on all provided population columns
# return -1 if only NAs are present (to avoid a warning)
maf_dt$MAX_AF <- apply(maf_dt[, ..pops], 1,
FUN=function(x){ max(x, -1, na.rm=TRUE) })

# Replace Inf/-1 with NA
maf_dt[is.infinite(MAX_AF) | MAX_AF == -1, MAX_AF := NA]

res <- cbind(dt,maf_dt[,"MAX_AF"])

# Label variants based on their MAX_AF
res[FILTER == "PASS",FILTER:= "PASS_common"]
res[MAX_AF <= max_af_cutoff & cohortFreq <= max_var_freq_cutoff & FILTER == "PASS_common",FILTER:= "PASS_rare"]

#Reorder FILTER
res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS_common","PASS_rare"))

} else{
res <- dt
#Reorder FILTER
res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS"))
res$MAX_AF <- NA
}
vcffile <- open(VcfFile(snakemake@input$annotatedVCF,yieldSize = snakemake@config$rnaVariantCalling$yieldSize)) # read the batch vcf
res_final <- data.table()

#while there are rows to read in. Process the vcf file until nrow is 0.
while(nrow(vcf <- readVcf(vcffile,param = ScanVcfParam(fixed="FILTER",geno="GT")))) {
canonical_chr <- c(paste0("chr",c(1:22,"X","Y","M")),1:22,"X","Y","MT")
vcf <- vcf[seqnames(vcf) %in% canonical_chr]

vcf_dt <- as.data.table(geno(vcf)$GT)
vcf_dt[,FILTER := vcf@fixed$FILTER]
vcf_dt[,VARIANT := names(vcf)]
vcf_dt[,GENE_ID := .extract_info(vcf,"gene_id")]
vcf_dt[,GENE_NAME := .extract_info(vcf,"gene_name")]

dt <- as.data.table(lapply(vcf_dt, .clean_vars))

# calculate variant frequency within the cohort
max_var_freq_cutoff <- snakemake@config$rnaVariantCalling$maxVarFreqCohort
dt$cohortFreq <- apply(dt[,1:ncol(vcf)],1,function(x){
1-sum(x == "0/0")/length(x) })


setcolorder(res,c("VARIANT","GENE_ID","GENE_NAME","FILTER","MAX_AF","cohortFreq"))
saveRDS(res,snakemake@output$data_table) # save res to a data_table object
if (snakemake@config$rnaVariantCalling$addAF){
#tMAE code for adding gnomad frequency
max_af_cutoff <- snakemake@config$rnaVariantCalling$maxAF
pops <- c('AF', 'AF_afr', 'AF_amr', 'AF_eas', 'AF_nfe', 'AF_popmax')
maf_dt <- add_gnomAD_AF(granges(vcf),genome_assembly = snakemake@config$genomeAssembly) %>% as.data.table()

# Compute the MAX_AF based on all provided population columns
# return -1 if only NAs are present (to avoid a warning)
maf_dt$MAX_AF <- apply(maf_dt[, ..pops], 1,
FUN=function(x){ max(x, -1, na.rm=TRUE) })

# Replace Inf/-1 with NA
maf_dt[is.infinite(MAX_AF) | MAX_AF == -1, MAX_AF := NA]

res <- cbind(dt,maf_dt[,"MAX_AF"])

# Label variants based on their MAX_AF
res[FILTER == "PASS",FILTER:= "PASS_common"]
res[MAX_AF <= max_af_cutoff & cohortFreq <= max_var_freq_cutoff & FILTER == "PASS_common",FILTER:= "PASS_rare"]

#Reorder FILTER
res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS_common","PASS_rare"))

} else{
res <- dt
#Reorder FILTER
res$FILTER <- factor(res$FILTER,levels = c("Seq_filter","Mask","minALT","Mask;minALT","PASS"))
res$MAX_AF <- NA
}
res_final <- rbind(res_final,res)
}

close(vcffile)
setcolorder(res_final,c("VARIANT","GENE_ID","GENE_NAME","FILTER","MAX_AF","cohortFreq"))
saveRDS(res_final,snakemake@output$data_table) # save res to a data_table object
5 changes: 5 additions & 0 deletions drop/template/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ aberrantExpression:
padjCutoff: 0.05
zScoreCutoff: 0
maxTestedDimensionProportion: 3
dassie:
tssWindow: 500
pasWindow: 1000
yieldSize: 2000000

aberrantSplicing:
run: true
Expand Down Expand Up @@ -82,6 +86,7 @@ rnaVariantCalling:
maxVarFreqCohort: 0.05
hcArgs: ""
minAlt: 3
yieldSize: 100000

tools:
gatkCmd: gatk
Expand Down

0 comments on commit ffc786d

Please sign in to comment.