Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 7a9d817
Showing
44 changed files
with
1,501 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
Package: vanquish | ||
Title: Variant Quality Investigation Helper | ||
Version: 1.0.0 | ||
Authors@R: c(person("Tao", "Jiang", role = c("aut", "cre"), | ||
email = "tjiang8@ncsu.edu")) | ||
Description: Imports Variant Calling Format file into R. It can detect | ||
whether a sample contains contaminant from the same species. | ||
In the first stage of the approach, a change-point detection | ||
method is used to identify copy number variations for filtering. | ||
Next, features are extracted from the data for a support vector | ||
machine model. For log-likelihood calculation, the deviation | ||
parameter is estimated by maximum likelihood method. Using | ||
a radial basis function kernel support vector machine, the | ||
contamination of a sample can be detected. | ||
Depends: R (>= 3.4.0) | ||
Imports: changepoint, e1071, ggplot2, stats, VGAM | ||
License: GPL-2 | ||
Encoding: UTF-8 | ||
LazyData: true | ||
RoxygenNote: 6.0.1 | ||
NeedsCompilation: no | ||
Packaged: 2018-07-17 19:05:46 UTC; tjiang8 | ||
Author: Tao Jiang [aut, cre] | ||
Maintainer: Tao Jiang <tjiang8@ncsu.edu> | ||
Repository: CRAN | ||
Date/Publication: 2018-09-05 14:50:04 UTC |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,43 @@ | ||
e402d07979542bdf757d93ef844d310f *DESCRIPTION | ||
ba5a0cc2f985c043ffd1f5d7f4340497 *NAMESPACE | ||
391f931190fba071b5820e4f5fa0c1a4 *R/data.R | ||
c772c7e2ac31b679ea4ff01cdc85f97e *R/defcon.R | ||
411ebfedd51550cc95b79ad46053c96a *R/generate_feature.R | ||
41c09d885cb307a6b4fa1afefce6d411 *R/read_vcf.R | ||
4a14d7a1cef3060a7fb13e99d517e720 *R/rho_est.R | ||
efafc7cb893b4e61e08d3e5ccd0eaaee *R/summary_vcf.R | ||
882fab2726081d7ecea656da3af3d820 *R/train_ct.R | ||
281160af53f0f77843eef77f7aef710b *R/update_vcf.R | ||
ee076e00c62a477c8e4e3a6f593c47cd *README.md | ||
e92adfd5499610f98cf0e14589ba7b81 *data/config_df.RData | ||
db8fb3929d667b29bae48adb0e7cb54f *data/svm_class_model.RData | ||
1796991ca93485b0d5551a2635b9646b *data/svm_regression_model.RData | ||
fa6bce5797e399cf80207a4252a842e8 *data/vcf_example.RData | ||
c6ff521654dc587b74a7032ee2072908 *inst/extdata/example.vcf.gz | ||
9b08f14d1b6b50afd7f423c1053d97da *man/config_df.Rd | ||
243c3bb00208d8122aba341621c96fcd *man/defcon.Rd | ||
43b202ad720a691b2eb4ec9c972cb255 *man/generate_feature.Rd | ||
9673c3128074dcd0a74dc08f3b04b269 *man/getAlt2.Rd | ||
514de8a7759c14d86e7fec59dbba0f3a *man/getAnnoRate.Rd | ||
34e83ac31c6496e13e3caf4e4c90f711 *man/getAvgLL.Rd | ||
c938e8539b6d2cc4af2db4db977b8346 *man/getLowDepth.Rd | ||
75c5ba75056e0612bf8551e155b2bb20 *man/getRatio.Rd | ||
78589eb934005080951ff81245ed2058 *man/getSNVRate.Rd | ||
5e67f00d4d97c8031e8fb8d971d12771 *man/getSkewness.Rd | ||
7d18db03a68849384f7fb4bb80d35aab *man/getVar.Rd | ||
245705c066024826f5ee5ead2f84af12 *man/locateFile.Rd | ||
782a63604a4aa879b94ab443d8d1155a *man/negll.Rd | ||
bddd2e2e5610364338bd32ede4c043c8 *man/readGATK.Rd | ||
e47455dccced75f344d51beca9f5ccba *man/readStrelka.Rd | ||
3e11bcd0a262ead483d9664ec1c2c267 *man/readVarDict.Rd | ||
979b92610d1c38541eb0dadb0f429ace *man/readVarPROWL.Rd | ||
4dc04935c108cb0d8492bd69419212c1 *man/read_vcf.Rd | ||
25faa94ed4aa68498ea0d972be4e2d3b *man/rho_est.Rd | ||
c585497cceb85ef326b8c1bde6dba015 *man/rmCNVinVCF.Rd | ||
28c11247f869f6e636bc58dd46f8c857 *man/rmChangePoint.Rd | ||
c2dcfa7b81cfe57ec1974456f9899c28 *man/summary_vcf.Rd | ||
1ea419482528030b444a9d1f38956e1e *man/svm_class_model.Rd | ||
cbf29d17ad187b1e6d1963bfaf1a790e *man/svm_regression_model.Rd | ||
0d52eed0c8e672f2d87e78dfce5588b4 *man/train_ct.Rd | ||
0d808161cdc090b9becad479ce8cad68 *man/update_vcf.Rd | ||
ff0baec20cf95332e5f8f905961e2793 *man/vcf_example.Rd |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
# Generated by roxygen2: do not edit by hand | ||
|
||
export(defcon) | ||
export(generate_feature) | ||
export(getAlt2) | ||
export(read_vcf) | ||
export(rho_est) | ||
export(summary_vcf) | ||
export(train_ct) | ||
export(update_vcf) | ||
import(e1071) | ||
import(ggplot2) | ||
import(stats) | ||
import(utils) | ||
importFrom(VGAM,dbetabinom) | ||
importFrom(changepoint,cpt.var) | ||
importFrom(stats,optim) | ||
importFrom(utils,read.table) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
#' Default parameters of config. | ||
#' | ||
#' A dataframe containing default parameters. | ||
#' | ||
#' @format A data frame with 12 variables: | ||
#' \describe{ | ||
#' \item{\code{threshold}}{Threshold for allele frequency} | ||
#' \item{\code{skew}}{Skewness for allele frequency} | ||
#' \item{\code{lower}}{Lower bound for allele frequency region} | ||
#' \item{\code{upper}}{Upper bound for allele frequency region} | ||
#' \item{\code{ldpthred}}{Threshold to determine low depth} | ||
#' \item{\code{hom_mle}}{Hom MLE of p in Beta-Binomial model} | ||
#' \item{\code{het_mle}}{Het MLE of p in Beta-Binomial model} | ||
#' \item{\code{Hom_thred}}{Threshold between hom and high} | ||
#' \item{\code{High_thred}}{Threshold between high and het} | ||
#' \item{\code{Het_thred}}{Threshold between het and low} | ||
#' \item{\code{hom_rho}}{Hom MLE of rho in Beta-Binomial model} | ||
#' \item{\code{het_rho}}{Het MLE of rho in Beta-Binomial model} | ||
#' } | ||
#' @source Created by Tao Jiang | ||
"config_df" | ||
|
||
#' Default svm classification model. | ||
#' | ||
#' An svm object containing default svm classification model. | ||
#' | ||
#' @format An svm object: | ||
#' @source Created by Tao Jiang | ||
"svm_class_model" | ||
|
||
#' Default svm regression model. | ||
#' | ||
#' An svm object containing default svm regression model. | ||
#' | ||
#' @format An svm object: | ||
#' @source Created by Tao Jiang | ||
"svm_regression_model" | ||
|
||
#' VCF example file. | ||
#' | ||
#' An example containing a list of 4 data frames. | ||
#' | ||
#' @format A list of 4 data frames: | ||
#' @source Created by Tao Jiang | ||
"vcf_example" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
#' @title DEtection of Frequency CONtamination | ||
#' @description Detects whether a sample is contaminated another sample of its same species. The input file should be in vcf format. | ||
#' | ||
#' @param file VCF input object | ||
#' @param rmCNV Remove CNV regions, default is FALSE | ||
#' @param cnvobj CNV object, default is NULL | ||
#' @param config config information of parameters. A default set is generated as part of the model and is included in a model object, which contains | ||
#' @param class_model An SVM classification model | ||
#' @param regression_model An SVM regression model | ||
#' | ||
#' @return A list containing (1) stat: a data frame with all statistics for contamination estimation; (2) result: contamination estimation (Class = 0, pure; Class = 1, contaminated) | ||
#' | ||
#' @export | ||
#' @import e1071 | ||
#' @import utils | ||
#' | ||
#' @examples | ||
#' data(vcf_example) | ||
#' result <- defcon(file = vcf_example) | ||
defcon <- function(file, rmCNV = FALSE, cnvobj = NULL, config = NULL, class_model = NULL, regression_model = NULL) { | ||
config_df <- svm_class_model <- svm_regression_model <- NULL | ||
data("config_df", envir = environment()) | ||
data("svm_class_model", envir = environment()) | ||
data("svm_regression_model", envir = environment()) | ||
|
||
if (is.null(config)) { | ||
config <- config_df #config_df contains default parameters. | ||
} | ||
vcf <- file$VCF | ||
vcf <- update_vcf(rmCNV = rmCNV, vcf = vcf, cnvobj = cnvobj, config$threshold, config$skew, config$lower, config$upper) | ||
Name <- file$file_sample_name[1] | ||
|
||
LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",]) | ||
HomVar <- getVar(df = vcf, state = "hom", config$hom_mle, config$het_mle) | ||
HetVar <- getVar(df = vcf, state = "het", config$hom_mle, config$het_mle) | ||
HomRate <- nrow(vcf[(vcf$AF > config$Hom_thred),]) / nrow(vcf) | ||
HighRate <- nrow(vcf[((vcf$AF > config$High_thred) & (vcf$AF < config$Hom_thred)),]) / nrow(vcf) | ||
HetRate <- nrow(vcf[((vcf$AF > config$Het_thred) & (vcf$AF < config$High_thred)),]) / nrow(vcf) | ||
LowRate <- nrow(vcf[(vcf$AF < config$Het_thred),]) / nrow(vcf) | ||
AvgLL <- getAvgLL(vcf, config$hom_mle, config$het_mle, config$hom_rho, config$het_rho) | ||
|
||
res_df <- data.frame(Name, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL) | ||
|
||
if (is.null(class_model)) { | ||
Class <- predict(object = svm_class_model, newdata = res_df) # svm_class_model is the default classification model. | ||
} else { | ||
Class <- predict(object = class_model, newdata = res_df) | ||
} | ||
|
||
if (is.null(regression_model)) { | ||
Regression <- predict(object = svm_regression_model, newdata = res_df) # svm_regression_model is the default regression model. | ||
} else { | ||
Regression <- predict(object = regression_model, newdata = res_df) | ||
} | ||
|
||
res <- data.frame(Name, Class, Regression) | ||
|
||
res_list <- list("stat" = res_df, "result" = res) | ||
return(res_list) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
#' Second alternative allele percentage | ||
#' @param f Input raw file | ||
#' @return Percent of the second alternative allele | ||
#' @export | ||
getAlt2 <- function(f) { | ||
tmp <- f[grep(",", f$Alt), ] | ||
percentage <- nrow(tmp)/nrow(f) | ||
return(percentage) | ||
} | ||
|
||
#' Annotation rate | ||
#' @param f Input raw file | ||
#' @return Percentage of annotation locus | ||
getAnnoRate <- function(f) { | ||
tmp <- f[ f$dbID !=".", ] | ||
percentage <- nrow(tmp)/nrow(f) | ||
return(percentage) | ||
} | ||
|
||
#' Low depth percentage | ||
#' @param f Input raw file | ||
#' @param ldpthred Threshold to determine low depth, default is 20 | ||
#' @return Percentage of low depth | ||
getLowDepth <- function(f, ldpthred) { | ||
tmp <- f[which(f$DP < ldpthred), ] | ||
percentage <- nrow(tmp)/nrow(f) | ||
return(percentage) | ||
} | ||
|
||
#' SNV percentage | ||
#' @param df Input raw file | ||
#' @return Percentage of SNV | ||
getSNVRate <- function(df) { | ||
tmp_df <- df[nchar(df$Ref)==1 & nchar(df$Alt)==1, ] | ||
return (nrow(tmp_df)/nrow(df)) | ||
} | ||
|
||
#' Calculate zygosity variable | ||
#' @param df Input modified file | ||
#' @param state Zygosity state | ||
#' @param hom_mle MLE in hom model | ||
#' @param het_mle MLE in het model | ||
#' @return Zygosity variable | ||
getVar <- function(df, state, hom_mle, het_mle) { | ||
df_sub <- df[df$ZG == state,] | ||
if (state == "hom") { | ||
expected <- hom_mle | ||
} else if (state == "het") { | ||
expected <- het_mle | ||
} | ||
df_sub <- df_sub[(abs(df_sub$AF - expected) < 0.49),] | ||
tmp <- sum(((df_sub$AF - expected)^2) * df_sub$DP) / sum(df_sub$DP) | ||
return(tmp) | ||
} | ||
|
||
#' Calculate average log-likelihood | ||
#' @param df Input modified file | ||
#' @param hom_mle Hom MLE of p in Beta-Binomial model, default is 0.9981416 from NA12878_1_L5 | ||
#' @param het_mle Het MLE of p in Beta-Binomial model, default is 0.4737897 from NA12878_1_L5 | ||
#' @param hom_rho Hom MLE of rho in Beta-Binomial model, default is 0.04570275 from NA12878_1_L5 | ||
#' @param het_rho Het MLE of rho in Beta-Binomial model, default is 0.02224098 from NA12878_1_L5 | ||
#' @importFrom VGAM dbetabinom | ||
#' @return meanLL | ||
getAvgLL <- function(df, hom_mle, het_mle, hom_rho, het_rho) { | ||
df_hom <- df[df$ZG == "hom",] | ||
df_het <- df[df$ZG == "het",] | ||
df_hom$LL <- dbetabinom(x = df_hom$AC, | ||
size = df_hom$DP, | ||
prob = hom_mle, | ||
rho = hom_rho, | ||
log = TRUE) | ||
df_het$LL <- dbetabinom(x = df_het$AC, | ||
size = df_het$DP, | ||
prob = het_mle, | ||
rho = het_rho, | ||
log = TRUE) | ||
df <- rbind(df_hom, df_het) | ||
meanLL <- mean(df$LL) | ||
return (meanLL) | ||
} | ||
|
||
#' @title Feature Generation for Contamination Detection Model | ||
#' @description Generates features from each pair of input VCF objects for training contamination detection model. | ||
#' | ||
#' @param file VCF input object | ||
#' @param hom_p The initial value for p in Homozygous Beta-Binomial model, default is 0.999 | ||
#' @param het_p The initial value for p in Heterozygous Beta-Binomial model, default is 0.5 | ||
#' @param hom_rho The initial value for rho in Homozygous Beta-Binomial model, default is 0.005 | ||
#' @param het_rho The initial value for rho in Heterozygous Beta-Binomial model, default is 0.1 | ||
#' @param mixture A vector of whether the sample is contaminated: 0 for pure; 1 for contaminated | ||
#' @param homcut Cutoff allele frequency value between hom and high, default is 0.99 | ||
#' @param highcut Cutoff allele frequency value between high and het, default is 0.7 | ||
#' @param hetcut Cutoff allele frequency value between het and low, default is 0.3 | ||
#' | ||
#' @return A data frame with all features for training model of contamination detection | ||
#' @export | ||
generate_feature <- function(file, hom_p = 0.999, het_p = 0.5, hom_rho = 0.005, het_rho = 0.1, | ||
mixture, homcut = 0.99, highcut = 0.7, hetcut = 0.3) { | ||
Name <- file$file_sample_name[1] | ||
if ((mixture != 0) & (mixture != 1)) { | ||
stop("Use 0 and 1 for mixture!") | ||
} | ||
Mixture <- mixture | ||
vcf <- file$VCF | ||
|
||
LOH <- nrow(vcf[vcf$ZG == "het",])/nrow(vcf[vcf$ZG == "hom",]) | ||
HomVar <- getVar(df = vcf, state = "hom", hom_mle = hom_p, het_mle = het_p) | ||
HetVar <- getVar(df = vcf, state = "het", hom_mle = hom_p, het_mle = het_p) | ||
HomRate <- nrow(vcf[(vcf$AF > homcut),]) / nrow(vcf) | ||
HighRate <- nrow(vcf[((vcf$AF > highcut) & (vcf$AF < homcut)),]) / nrow(vcf) | ||
HetRate <- nrow(vcf[((vcf$AF > hetcut) & (vcf$AF < highcut)),]) / nrow(vcf) | ||
LowRate <- nrow(vcf[(vcf$AF < hetcut),]) / nrow(vcf) | ||
AvgLL <- getAvgLL(df = vcf, hom_mle = hom_p, het_mle = het_p, hom_rho = hom_rho, het_rho = het_rho) | ||
|
||
res_df <- data.frame(Name, Mixture, LOH, HomVar, HetVar, HomRate, HighRate, HetRate, LowRate, AvgLL) | ||
return(res_df) | ||
} |
Oops, something went wrong.