Skip to content

Commit

Permalink
version 1.0.0
Browse files Browse the repository at this point in the history
  • Loading branch information
lethargy608 authored and cran-robot committed Sep 5, 2018
0 parents commit 7a9d817
Show file tree
Hide file tree
Showing 44 changed files with 1,501 additions and 0 deletions.
26 changes: 26 additions & 0 deletions DESCRIPTION
@@ -0,0 +1,26 @@
Package: vanquish
Title: Variant Quality Investigation Helper
Version: 1.0.0
Authors@R: c(person("Tao", "Jiang", role = c("aut", "cre"),
email = "tjiang8@ncsu.edu"))
Description: Imports Variant Calling Format file into R. It can detect
whether a sample contains contaminant from the same species.
In the first stage of the approach, a change-point detection
method is used to identify copy number variations for filtering.
Next, features are extracted from the data for a support vector
machine model. For log-likelihood calculation, the deviation
parameter is estimated by maximum likelihood method. Using
a radial basis function kernel support vector machine, the
contamination of a sample can be detected.
Depends: R (>= 3.4.0)
Imports: changepoint, e1071, ggplot2, stats, VGAM
License: GPL-2
Encoding: UTF-8
LazyData: true
RoxygenNote: 6.0.1
NeedsCompilation: no
Packaged: 2018-07-17 19:05:46 UTC; tjiang8
Author: Tao Jiang [aut, cre]
Maintainer: Tao Jiang <tjiang8@ncsu.edu>
Repository: CRAN
Date/Publication: 2018-09-05 14:50:04 UTC
43 changes: 43 additions & 0 deletions MD5
@@ -0,0 +1,43 @@
e402d07979542bdf757d93ef844d310f *DESCRIPTION
ba5a0cc2f985c043ffd1f5d7f4340497 *NAMESPACE
391f931190fba071b5820e4f5fa0c1a4 *R/data.R
c772c7e2ac31b679ea4ff01cdc85f97e *R/defcon.R
411ebfedd51550cc95b79ad46053c96a *R/generate_feature.R
41c09d885cb307a6b4fa1afefce6d411 *R/read_vcf.R
4a14d7a1cef3060a7fb13e99d517e720 *R/rho_est.R
efafc7cb893b4e61e08d3e5ccd0eaaee *R/summary_vcf.R
882fab2726081d7ecea656da3af3d820 *R/train_ct.R
281160af53f0f77843eef77f7aef710b *R/update_vcf.R
ee076e00c62a477c8e4e3a6f593c47cd *README.md
e92adfd5499610f98cf0e14589ba7b81 *data/config_df.RData
db8fb3929d667b29bae48adb0e7cb54f *data/svm_class_model.RData
1796991ca93485b0d5551a2635b9646b *data/svm_regression_model.RData
fa6bce5797e399cf80207a4252a842e8 *data/vcf_example.RData
c6ff521654dc587b74a7032ee2072908 *inst/extdata/example.vcf.gz
9b08f14d1b6b50afd7f423c1053d97da *man/config_df.Rd
243c3bb00208d8122aba341621c96fcd *man/defcon.Rd
43b202ad720a691b2eb4ec9c972cb255 *man/generate_feature.Rd
9673c3128074dcd0a74dc08f3b04b269 *man/getAlt2.Rd
514de8a7759c14d86e7fec59dbba0f3a *man/getAnnoRate.Rd
34e83ac31c6496e13e3caf4e4c90f711 *man/getAvgLL.Rd
c938e8539b6d2cc4af2db4db977b8346 *man/getLowDepth.Rd
75c5ba75056e0612bf8551e155b2bb20 *man/getRatio.Rd
78589eb934005080951ff81245ed2058 *man/getSNVRate.Rd
5e67f00d4d97c8031e8fb8d971d12771 *man/getSkewness.Rd
7d18db03a68849384f7fb4bb80d35aab *man/getVar.Rd
245705c066024826f5ee5ead2f84af12 *man/locateFile.Rd
782a63604a4aa879b94ab443d8d1155a *man/negll.Rd
bddd2e2e5610364338bd32ede4c043c8 *man/readGATK.Rd
e47455dccced75f344d51beca9f5ccba *man/readStrelka.Rd
3e11bcd0a262ead483d9664ec1c2c267 *man/readVarDict.Rd
979b92610d1c38541eb0dadb0f429ace *man/readVarPROWL.Rd
4dc04935c108cb0d8492bd69419212c1 *man/read_vcf.Rd
25faa94ed4aa68498ea0d972be4e2d3b *man/rho_est.Rd
c585497cceb85ef326b8c1bde6dba015 *man/rmCNVinVCF.Rd
28c11247f869f6e636bc58dd46f8c857 *man/rmChangePoint.Rd
c2dcfa7b81cfe57ec1974456f9899c28 *man/summary_vcf.Rd
1ea419482528030b444a9d1f38956e1e *man/svm_class_model.Rd
cbf29d17ad187b1e6d1963bfaf1a790e *man/svm_regression_model.Rd
0d52eed0c8e672f2d87e78dfce5588b4 *man/train_ct.Rd
0d808161cdc090b9becad479ce8cad68 *man/update_vcf.Rd
ff0baec20cf95332e5f8f905961e2793 *man/vcf_example.Rd
18 changes: 18 additions & 0 deletions NAMESPACE
@@ -0,0 +1,18 @@
# Generated by roxygen2: do not edit by hand

export(defcon)
export(generate_feature)
export(getAlt2)
export(read_vcf)
export(rho_est)
export(summary_vcf)
export(train_ct)
export(update_vcf)
import(e1071)
import(ggplot2)
import(stats)
import(utils)
importFrom(VGAM,dbetabinom)
importFrom(changepoint,cpt.var)
importFrom(stats,optim)
importFrom(utils,read.table)
45 changes: 45 additions & 0 deletions R/data.R
@@ -0,0 +1,45 @@
#' Default parameters of config.
#'
#' A dataframe containing default parameters.
#'
#' @format A data frame with 12 variables:
#' \describe{
#' \item{\code{threshold}}{Threshold for allele frequency}
#' \item{\code{skew}}{Skewness for allele frequency}
#' \item{\code{lower}}{Lower bound for allele frequency region}
#' \item{\code{upper}}{Upper bound for allele frequency region}
#' \item{\code{ldpthred}}{Threshold to determine low depth}
#' \item{\code{hom_mle}}{Hom MLE of p in Beta-Binomial model}
#' \item{\code{het_mle}}{Het MLE of p in Beta-Binomial model}
#' \item{\code{Hom_thred}}{Threshold between hom and high}
#' \item{\code{High_thred}}{Threshold between high and het}
#' \item{\code{Het_thred}}{Threshold between het and low}
#' \item{\code{hom_rho}}{Hom MLE of rho in Beta-Binomial model}
#' \item{\code{het_rho}}{Het MLE of rho in Beta-Binomial model}
#' }
#' @source Created by Tao Jiang
"config_df"

#' Default svm classification model.
#'
#' An svm object containing default svm classification model.
#'
#' @format An svm object:
#' @source Created by Tao Jiang
"svm_class_model"

#' Default svm regression model.
#'
#' An svm object containing default svm regression model.
#'
#' @format An svm object:
#' @source Created by Tao Jiang
"svm_regression_model"

#' VCF example file.
#'
#' An example containing a list of 4 data frames.
#'
#' @format A list of 4 data frames:
#' @source Created by Tao Jiang
"vcf_example"
60 changes: 60 additions & 0 deletions R/defcon.R
@@ -0,0 +1,60 @@
#' @title DEtection of Frequency CONtamination
#' @description Detects whether a sample is contaminated another sample of its same species. The input file should be in vcf format.
#'
#' @param file VCF input object
#' @param rmCNV Remove CNV regions, default is FALSE
#' @param cnvobj CNV object, default is NULL
#' @param config config information of parameters. A default set is generated as part of the model and is included in a model object, which contains
#' @param class_model An SVM classification model
#' @param regression_model An SVM regression model
#'
#' @return A list containing (1) stat: a data frame with all statistics for contamination estimation; (2) result: contamination estimation (Class = 0, pure; Class = 1, contaminated)
#'
#' @export
#' @import e1071
#' @import utils
#'
#' @examples
#' data(vcf_example)
#' result <- defcon(file = vcf_example)
defcon <- function(file, rmCNV = FALSE, cnvobj = NULL, config = NULL, class_model = NULL, regression_model = NULL) {
config_df <- svm_class_model <- svm_regression_model <- NULL
data("config_df", envir = environment())
data("svm_class_model", envir = environment())
data("svm_regression_model", envir = environment())

if (is.null(config)) {
config <- config_df #config_df contains default parameters.
}
vcf <- file$VCF
vcf <- update_vcf(rmCNV = rmCNV, vcf = vcf, cnvobj = cnvobj, config$threshold, config$skew, config$lower, config$upper)
Name <- file$file_sample_name[1]

LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",])
HomVar <- getVar(df = vcf, state = "hom", config$hom_mle, config$het_mle)
HetVar <- getVar(df = vcf, state = "het", config$hom_mle, config$het_mle)
HomRate <- nrow(vcf[(vcf$AF > config$Hom_thred),]) / nrow(vcf)
HighRate <- nrow(vcf[((vcf$AF > config$High_thred) & (vcf$AF < config$Hom_thred)),]) / nrow(vcf)
HetRate <- nrow(vcf[((vcf$AF > config$Het_thred) & (vcf$AF < config$High_thred)),]) / nrow(vcf)
LowRate <- nrow(vcf[(vcf$AF < config$Het_thred),]) / nrow(vcf)
AvgLL <- getAvgLL(vcf, config$hom_mle, config$het_mle, config$hom_rho, config$het_rho)

res_df <- data.frame(Name, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL)

if (is.null(class_model)) {
Class <- predict(object = svm_class_model, newdata = res_df) # svm_class_model is the default classification model.
} else {
Class <- predict(object = class_model, newdata = res_df)
}

if (is.null(regression_model)) {
Regression <- predict(object = svm_regression_model, newdata = res_df) # svm_regression_model is the default regression model.
} else {
Regression <- predict(object = regression_model, newdata = res_df)
}

res <- data.frame(Name, Class, Regression)

res_list <- list("stat" = res_df, "result" = res)
return(res_list)
}
117 changes: 117 additions & 0 deletions R/generate_feature.R
@@ -0,0 +1,117 @@
#' Second alternative allele percentage
#' @param f Input raw file
#' @return Percent of the second alternative allele
#' @export
getAlt2 <- function(f) {
tmp <- f[grep(",", f$Alt), ]
percentage <- nrow(tmp)/nrow(f)
return(percentage)
}

#' Annotation rate
#' @param f Input raw file
#' @return Percentage of annotation locus
getAnnoRate <- function(f) {
tmp <- f[ f$dbID !=".", ]
percentage <- nrow(tmp)/nrow(f)
return(percentage)
}

#' Low depth percentage
#' @param f Input raw file
#' @param ldpthred Threshold to determine low depth, default is 20
#' @return Percentage of low depth
getLowDepth <- function(f, ldpthred) {
tmp <- f[which(f$DP < ldpthred), ]
percentage <- nrow(tmp)/nrow(f)
return(percentage)
}

#' SNV percentage
#' @param df Input raw file
#' @return Percentage of SNV
getSNVRate <- function(df) {
tmp_df <- df[nchar(df$Ref)==1 & nchar(df$Alt)==1, ]
return (nrow(tmp_df)/nrow(df))
}

#' Calculate zygosity variable
#' @param df Input modified file
#' @param state Zygosity state
#' @param hom_mle MLE in hom model
#' @param het_mle MLE in het model
#' @return Zygosity variable
getVar <- function(df, state, hom_mle, het_mle) {
df_sub <- df[df$ZG == state,]
if (state == "hom") {
expected <- hom_mle
} else if (state == "het") {
expected <- het_mle
}
df_sub <- df_sub[(abs(df_sub$AF - expected) < 0.49),]
tmp <- sum(((df_sub$AF - expected)^2) * df_sub$DP) / sum(df_sub$DP)
return(tmp)
}

#' Calculate average log-likelihood
#' @param df Input modified file
#' @param hom_mle Hom MLE of p in Beta-Binomial model, default is 0.9981416 from NA12878_1_L5
#' @param het_mle Het MLE of p in Beta-Binomial model, default is 0.4737897 from NA12878_1_L5
#' @param hom_rho Hom MLE of rho in Beta-Binomial model, default is 0.04570275 from NA12878_1_L5
#' @param het_rho Het MLE of rho in Beta-Binomial model, default is 0.02224098 from NA12878_1_L5
#' @importFrom VGAM dbetabinom
#' @return meanLL
getAvgLL <- function(df, hom_mle, het_mle, hom_rho, het_rho) {
df_hom <- df[df$ZG == "hom",]
df_het <- df[df$ZG == "het",]
df_hom$LL <- dbetabinom(x = df_hom$AC,
size = df_hom$DP,
prob = hom_mle,
rho = hom_rho,
log = TRUE)
df_het$LL <- dbetabinom(x = df_het$AC,
size = df_het$DP,
prob = het_mle,
rho = het_rho,
log = TRUE)
df <- rbind(df_hom, df_het)
meanLL <- mean(df$LL)
return (meanLL)
}

#' @title Feature Generation for Contamination Detection Model
#' @description Generates features from each pair of input VCF objects for training contamination detection model.
#'
#' @param file VCF input object
#' @param hom_p The initial value for p in Homozygous Beta-Binomial model, default is 0.999
#' @param het_p The initial value for p in Heterozygous Beta-Binomial model, default is 0.5
#' @param hom_rho The initial value for rho in Homozygous Beta-Binomial model, default is 0.005
#' @param het_rho The initial value for rho in Heterozygous Beta-Binomial model, default is 0.1
#' @param mixture A vector of whether the sample is contaminated: 0 for pure; 1 for contaminated
#' @param homcut Cutoff allele frequency value between hom and high, default is 0.99
#' @param highcut Cutoff allele frequency value between high and het, default is 0.7
#' @param hetcut Cutoff allele frequency value between het and low, default is 0.3
#'
#' @return A data frame with all features for training model of contamination detection
#' @export
generate_feature <- function(file, hom_p = 0.999, het_p = 0.5, hom_rho = 0.005, het_rho = 0.1,
mixture, homcut = 0.99, highcut = 0.7, hetcut = 0.3) {
Name <- file$file_sample_name[1]
if ((mixture != 0) & (mixture != 1)) {
stop("Use 0 and 1 for mixture!")
}
Mixture <- mixture
vcf <- file$VCF

LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",])
HomVar <- getVar(df = vcf, state = "hom", hom_mle = hom_p, het_mle = het_p)
HetVar <- getVar(df = vcf, state = "het", hom_mle = hom_p, het_mle = het_p)
HomRate <- nrow(vcf[(vcf$AF > homcut),]) / nrow(vcf)
HighRate <- nrow(vcf[((vcf$AF > highcut) & (vcf$AF < homcut)),]) / nrow(vcf)
HetRate <- nrow(vcf[((vcf$AF > hetcut) & (vcf$AF < highcut)),]) / nrow(vcf)
LowRate <- nrow(vcf[(vcf$AF < hetcut),]) / nrow(vcf)
AvgLL <- getAvgLL(df = vcf, hom_mle = hom_p, het_mle = het_p, hom_rho = hom_rho, het_rho = het_rho)

res_df <- data.frame(Name, Mixture, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL)
return(res_df)
}

0 comments on commit 7a9d817

Please sign in to comment.