version 1.0.0

cran · Sep 5, 2018 · 7a9d817 · 7a9d817
commit 7a9d817
Show file tree

Hide file tree

Showing 44 changed files with 1,501 additions and 0 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -0,0 +1,26 @@
+Package: vanquish
+Title: Variant Quality Investigation Helper
+Version: 1.0.0
+Authors@R: c(person("Tao", "Jiang", role = c("aut", "cre"),
+                     email = "tjiang8@ncsu.edu"))
+Description: Imports Variant Calling Format file into R. It can detect
+             whether a sample contains contaminant from the same species.
+             In the first stage of the approach, a change-point detection
+             method is used to identify copy number variations for filtering.
+             Next, features are extracted from the data for a support vector
+             machine model. For log-likelihood calculation, the deviation
+             parameter is estimated by maximum likelihood method. Using
+             a radial basis function kernel support vector machine, the
+             contamination of a sample can be detected.
+Depends: R (>= 3.4.0)
+Imports: changepoint, e1071, ggplot2, stats, VGAM
+License: GPL-2
+Encoding: UTF-8
+LazyData: true
+RoxygenNote: 6.0.1
+NeedsCompilation: no
+Packaged: 2018-07-17 19:05:46 UTC; tjiang8
+Author: Tao Jiang [aut, cre]
+Maintainer: Tao Jiang <tjiang8@ncsu.edu>
+Repository: CRAN
+Date/Publication: 2018-09-05 14:50:04 UTC
diff --git a/MD5 b/MD5
@@ -0,0 +1,43 @@
+e402d07979542bdf757d93ef844d310f *DESCRIPTION
+ba5a0cc2f985c043ffd1f5d7f4340497 *NAMESPACE
+391f931190fba071b5820e4f5fa0c1a4 *R/data.R
+c772c7e2ac31b679ea4ff01cdc85f97e *R/defcon.R
+411ebfedd51550cc95b79ad46053c96a *R/generate_feature.R
+41c09d885cb307a6b4fa1afefce6d411 *R/read_vcf.R
+4a14d7a1cef3060a7fb13e99d517e720 *R/rho_est.R
+efafc7cb893b4e61e08d3e5ccd0eaaee *R/summary_vcf.R
+882fab2726081d7ecea656da3af3d820 *R/train_ct.R
+281160af53f0f77843eef77f7aef710b *R/update_vcf.R
+ee076e00c62a477c8e4e3a6f593c47cd *README.md
+e92adfd5499610f98cf0e14589ba7b81 *data/config_df.RData
+db8fb3929d667b29bae48adb0e7cb54f *data/svm_class_model.RData
+1796991ca93485b0d5551a2635b9646b *data/svm_regression_model.RData
+fa6bce5797e399cf80207a4252a842e8 *data/vcf_example.RData
+c6ff521654dc587b74a7032ee2072908 *inst/extdata/example.vcf.gz
+9b08f14d1b6b50afd7f423c1053d97da *man/config_df.Rd
+243c3bb00208d8122aba341621c96fcd *man/defcon.Rd
+43b202ad720a691b2eb4ec9c972cb255 *man/generate_feature.Rd
+9673c3128074dcd0a74dc08f3b04b269 *man/getAlt2.Rd
+514de8a7759c14d86e7fec59dbba0f3a *man/getAnnoRate.Rd
+34e83ac31c6496e13e3caf4e4c90f711 *man/getAvgLL.Rd
+c938e8539b6d2cc4af2db4db977b8346 *man/getLowDepth.Rd
+75c5ba75056e0612bf8551e155b2bb20 *man/getRatio.Rd
+78589eb934005080951ff81245ed2058 *man/getSNVRate.Rd
+5e67f00d4d97c8031e8fb8d971d12771 *man/getSkewness.Rd
+7d18db03a68849384f7fb4bb80d35aab *man/getVar.Rd
+245705c066024826f5ee5ead2f84af12 *man/locateFile.Rd
+782a63604a4aa879b94ab443d8d1155a *man/negll.Rd
+bddd2e2e5610364338bd32ede4c043c8 *man/readGATK.Rd
+e47455dccced75f344d51beca9f5ccba *man/readStrelka.Rd
+3e11bcd0a262ead483d9664ec1c2c267 *man/readVarDict.Rd
+979b92610d1c38541eb0dadb0f429ace *man/readVarPROWL.Rd
+4dc04935c108cb0d8492bd69419212c1 *man/read_vcf.Rd
+25faa94ed4aa68498ea0d972be4e2d3b *man/rho_est.Rd
+c585497cceb85ef326b8c1bde6dba015 *man/rmCNVinVCF.Rd
+28c11247f869f6e636bc58dd46f8c857 *man/rmChangePoint.Rd
+c2dcfa7b81cfe57ec1974456f9899c28 *man/summary_vcf.Rd
+1ea419482528030b444a9d1f38956e1e *man/svm_class_model.Rd
+cbf29d17ad187b1e6d1963bfaf1a790e *man/svm_regression_model.Rd
+0d52eed0c8e672f2d87e78dfce5588b4 *man/train_ct.Rd
+0d808161cdc090b9becad479ce8cad68 *man/update_vcf.Rd
+ff0baec20cf95332e5f8f905961e2793 *man/vcf_example.Rd
diff --git a/NAMESPACE b/NAMESPACE
@@ -0,0 +1,18 @@
+# Generated by roxygen2: do not edit by hand
+
+export(defcon)
+export(generate_feature)
+export(getAlt2)
+export(read_vcf)
+export(rho_est)
+export(summary_vcf)
+export(train_ct)
+export(update_vcf)
+import(e1071)
+import(ggplot2)
+import(stats)
+import(utils)
+importFrom(VGAM,dbetabinom)
+importFrom(changepoint,cpt.var)
+importFrom(stats,optim)
+importFrom(utils,read.table)
diff --git a/R/data.R b/R/data.R
@@ -0,0 +1,45 @@
+#' Default parameters of config.
+#'
+#' A dataframe containing default parameters.
+#'
+#' @format A data frame with 12 variables:
+#' \describe{
+#'   \item{\code{threshold}}{Threshold for allele frequency}
+#'   \item{\code{skew}}{Skewness for allele frequency}
+#'   \item{\code{lower}}{Lower bound for allele frequency region}
+#'   \item{\code{upper}}{Upper bound for allele frequency region}
+#'   \item{\code{ldpthred}}{Threshold to determine low depth}
+#'   \item{\code{hom_mle}}{Hom MLE of p in Beta-Binomial model}
+#'   \item{\code{het_mle}}{Het MLE of p in Beta-Binomial model}
+#'   \item{\code{Hom_thred}}{Threshold between hom and high}
+#'   \item{\code{High_thred}}{Threshold between high and het}
+#'   \item{\code{Het_thred}}{Threshold between het and low}
+#'   \item{\code{hom_rho}}{Hom MLE of rho in Beta-Binomial model}
+#'   \item{\code{het_rho}}{Het MLE of rho in Beta-Binomial model}
+#' }
+#' @source Created by Tao Jiang
+"config_df"
+
+#' Default svm classification model.
+#'
+#' An svm object containing default svm classification model.
+#'
+#' @format An svm object:
+#' @source Created by Tao Jiang
+"svm_class_model"
+
+#' Default svm regression model.
+#'
+#' An svm object containing default svm regression model.
+#'
+#' @format An svm object:
+#' @source Created by Tao Jiang
+"svm_regression_model"
+
+#' VCF example file.
+#'
+#' An example containing a list of 4 data frames.
+#'
+#' @format A list of 4 data frames:
+#' @source Created by Tao Jiang
+"vcf_example"
diff --git a/R/defcon.R b/R/defcon.R
@@ -0,0 +1,60 @@
+#' @title DEtection of Frequency CONtamination
+#' @description Detects whether a sample is contaminated another sample of its same species. The input file should be in vcf format.
+#'
+#' @param file VCF input object
+#' @param rmCNV Remove CNV regions, default is FALSE
+#' @param cnvobj CNV object, default is NULL
+#' @param config config information of parameters. A default set is generated as part of the model and is included in a model object, which contains
+#' @param class_model An SVM classification model
+#' @param regression_model An SVM regression model
+#'
+#' @return A list containing (1) stat: a data frame with all statistics for contamination estimation; (2) result: contamination estimation (Class = 0, pure; Class = 1, contaminated)
+#'
+#' @export
+#' @import e1071
+#' @import utils
+#'
+#' @examples
+#' data(vcf_example)
+#' result <- defcon(file = vcf_example)
+defcon <- function(file, rmCNV = FALSE, cnvobj = NULL, config = NULL, class_model = NULL, regression_model = NULL) {
+  config_df <- svm_class_model <- svm_regression_model <- NULL
+  data("config_df", envir = environment())
+  data("svm_class_model", envir = environment())
+  data("svm_regression_model", envir = environment())
+
+  if (is.null(config)) {
+    config <- config_df #config_df contains default parameters.
+  }
+  vcf <- file$VCF
+  vcf <- update_vcf(rmCNV = rmCNV, vcf = vcf, cnvobj = cnvobj, config$threshold, config$skew, config$lower, config$upper)
+  Name <- file$file_sample_name[1]
+
+  LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",])
+  HomVar <- getVar(df = vcf, state = "hom", config$hom_mle, config$het_mle)
+  HetVar <- getVar(df = vcf, state = "het", config$hom_mle, config$het_mle)
+  HomRate <- nrow(vcf[(vcf$AF > config$Hom_thred),]) / nrow(vcf)
+  HighRate <- nrow(vcf[((vcf$AF > config$High_thred) & (vcf$AF < config$Hom_thred)),]) / nrow(vcf)
+  HetRate <- nrow(vcf[((vcf$AF > config$Het_thred) & (vcf$AF < config$High_thred)),]) / nrow(vcf)
+  LowRate <- nrow(vcf[(vcf$AF < config$Het_thred),]) / nrow(vcf)
+  AvgLL <- getAvgLL(vcf, config$hom_mle, config$het_mle, config$hom_rho, config$het_rho)
+
+  res_df <- data.frame(Name, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL)
+
+  if (is.null(class_model)) {
+    Class <- predict(object = svm_class_model, newdata = res_df) # svm_class_model is the default classification model.
+  } else {
+    Class <- predict(object = class_model, newdata = res_df)
+  }
+
+  if (is.null(regression_model)) {
+    Regression <- predict(object = svm_regression_model, newdata = res_df) # svm_regression_model is the default regression model.
+  } else {
+    Regression <- predict(object = regression_model, newdata = res_df)
+  }
+
+  res <- data.frame(Name, Class, Regression)
+
+  res_list <- list("stat" = res_df, "result" = res)
+  return(res_list)
+}
diff --git a/R/generate_feature.R b/R/generate_feature.R
@@ -0,0 +1,117 @@
+#' Second alternative allele percentage
+#' @param f Input raw file
+#' @return Percent of the second alternative allele
+#' @export
+getAlt2 <- function(f) {
+  tmp <- f[grep(",", f$Alt), ]
+  percentage <- nrow(tmp)/nrow(f)
+  return(percentage)
+}
+
+#' Annotation rate
+#' @param f Input raw file
+#' @return Percentage of annotation locus
+getAnnoRate <- function(f) {
+  tmp <- f[ f$dbID !=".", ]
+  percentage <- nrow(tmp)/nrow(f)
+  return(percentage)
+}
+
+#' Low depth percentage
+#' @param f Input raw file
+#' @param ldpthred Threshold to determine low depth, default is 20
+#' @return Percentage of low depth
+getLowDepth <- function(f, ldpthred) {
+  tmp <- f[which(f$DP < ldpthred), ]
+  percentage <- nrow(tmp)/nrow(f)
+  return(percentage)
+}
+
+#' SNV percentage
+#' @param df Input raw file
+#' @return Percentage of SNV
+getSNVRate <- function(df) {
+  tmp_df <- df[nchar(df$Ref)==1 & nchar(df$Alt)==1, ]
+  return (nrow(tmp_df)/nrow(df))
+}
+
+#' Calculate zygosity variable
+#' @param df Input modified file
+#' @param state Zygosity state
+#' @param hom_mle MLE in hom model
+#' @param het_mle MLE in het model
+#' @return Zygosity variable
+getVar <- function(df, state, hom_mle, het_mle) {
+  df_sub <- df[df$ZG == state,]
+  if (state == "hom") {
+    expected <- hom_mle
+  } else if (state == "het") {
+    expected <- het_mle
+  }
+  df_sub <- df_sub[(abs(df_sub$AF - expected) < 0.49),]
+  tmp <- sum(((df_sub$AF - expected)^2) * df_sub$DP) / sum(df_sub$DP)
+  return(tmp)
+}
+
+#' Calculate average log-likelihood
+#' @param df Input modified file
+#' @param hom_mle Hom MLE of p in Beta-Binomial model, default is 0.9981416 from NA12878_1_L5
+#' @param het_mle Het MLE of p in Beta-Binomial model, default is 0.4737897 from NA12878_1_L5
+#' @param hom_rho Hom MLE of rho in Beta-Binomial model, default is 0.04570275 from NA12878_1_L5
+#' @param het_rho Het MLE of rho in Beta-Binomial model, default is 0.02224098 from NA12878_1_L5
+#' @importFrom VGAM dbetabinom
+#' @return meanLL
+getAvgLL <- function(df, hom_mle, het_mle, hom_rho, het_rho) {
+  df_hom <- df[df$ZG == "hom",]
+  df_het <- df[df$ZG == "het",]
+  df_hom$LL <- dbetabinom(x = df_hom$AC,
+                          size = df_hom$DP,
+                          prob = hom_mle,
+                          rho = hom_rho,
+                          log = TRUE)
+  df_het$LL <- dbetabinom(x = df_het$AC,
+                          size = df_het$DP,
+                          prob = het_mle,
+                          rho = het_rho,
+                          log = TRUE)
+  df <- rbind(df_hom, df_het)
+  meanLL <- mean(df$LL)
+  return (meanLL)
+}
+
+#' @title Feature Generation for Contamination Detection Model
+#' @description Generates features from each pair of input VCF objects for training contamination detection model.
+#'
+#' @param file VCF input object
+#' @param hom_p The initial value for p in Homozygous Beta-Binomial model, default is 0.999
+#' @param het_p The initial value for p in Heterozygous Beta-Binomial model, default is 0.5
+#' @param hom_rho The initial value for rho in Homozygous Beta-Binomial model, default is 0.005
+#' @param het_rho The initial value for rho in Heterozygous Beta-Binomial model, default is 0.1
+#' @param mixture A vector of whether the sample is contaminated: 0 for pure; 1 for contaminated
+#' @param homcut Cutoff allele frequency value between hom and high, default is 0.99
+#' @param highcut Cutoff allele frequency value between high and het, default is 0.7
+#' @param hetcut Cutoff allele frequency value between het and low, default is 0.3
+#'
+#' @return A data frame with all features for training model of contamination detection
+#' @export
+generate_feature <- function(file, hom_p = 0.999, het_p = 0.5, hom_rho = 0.005, het_rho = 0.1,
+                             mixture, homcut = 0.99, highcut = 0.7, hetcut = 0.3) {
+  Name <- file$file_sample_name[1]
+  if ((mixture != 0) & (mixture != 1)) {
+    stop("Use 0 and 1 for mixture!")
+  }
+  Mixture <- mixture
+  vcf <- file$VCF
+
+  LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",])
+  HomVar <- getVar(df = vcf, state = "hom", hom_mle = hom_p, het_mle = het_p)
+  HetVar <- getVar(df = vcf, state = "het", hom_mle = hom_p, het_mle = het_p)
+  HomRate <- nrow(vcf[(vcf$AF > homcut),]) / nrow(vcf)
+  HighRate <- nrow(vcf[((vcf$AF > highcut) & (vcf$AF < homcut)),]) / nrow(vcf)
+  HetRate <- nrow(vcf[((vcf$AF > hetcut) & (vcf$AF < highcut)),]) / nrow(vcf)
+  LowRate <- nrow(vcf[(vcf$AF < hetcut),]) / nrow(vcf)
+  AvgLL <- getAvgLL(df = vcf, hom_mle = hom_p, het_mle = het_p, hom_rho = hom_rho, het_rho = het_rho)
+
+  res_df <- data.frame(Name, Mixture, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL)
+  return(res_df)
+}