# Exploratory analysis of a toy CNV dataset in VCF format

## In [None]:
## Install saasCNV once per machine; DNAcopy has to be installed first.
## source("https://bioconductor.org/biocLite.R")
## biocLite("DNAcopy")
## install.packages("saasCNV")
library(saasCNV)

## First convert the VCF file to a data frame.
vcf_table <- vcf2txt(vcf.file="WES_example.vcf", normal.col = 10, tumor.col = 11)

## Not used: read.delim() is meant for the pre-converted .txt.gz table, not
## for a raw VCF, so the table produced by vcf2txt() above is used instead.
## vcf_table <- read.delim(file="WES_example.vcf", as.is=TRUE)

## Construct the data frame for CNV inference with NGS data.
seq.data <- cnv.data(vcf=vcf_table, min.chr.probe=100, verbose=FALSE)

## See what seq.data looks like.
## data(seq.data) is deliberately NOT called afterwards: it would replace the
## object just computed from the VCF with the example dataset of the same
## name that is loaded from the package, discarding the result above.
head(seq.data)

## Load the merged segments named seq.segs.merge via data() as input for the
## call below; no segmentation has been run yet at this point of the script.
data(seq.segs.merge)

## CNV calling from sequencing data.
## The seed is the same value this script passes to NGS.CNV()/SNP.CNV().
## NOTE(review): cnv.call() is given N=1000 replicates, which presumably
## involves random resampling; fixing the seed is meant to make the calls
## repeatable between runs -- confirm against the saasCNV documentation.
set.seed(123456789)
seq.cnv <- cnv.call(data=seq.data, sample.id="PT116", segs.stat=seq.segs.merge, maxL=2000, N=1000, pvalue.cutoff=0.05)

## The last column holds the CNV call.
seq.cnv[, ncol(seq.cnv)]
table(seq.cnv[, ncol(seq.cnv)])
## Output recorded from an earlier run:
## gain       LOH      loss    normal undecided 
##   14        14        35         4         7 
head(seq.cnv)
## data(seq.cnv) is deliberately NOT called here: it would overwrite the calls
## computed above, and the plots and annotation further down would then use
## the packaged object instead of this result.
## Chromosome labels shared by both plots: the distinct values of seq.cnv$chr
## with a leading "chr" stripped.
chrs.to.plot <- sub("^chr","",unique(seq.cnv$chr))

## Genome-wide SCNA profile as a 2D cluster plot
diagnosis.cluster.plot(
  segs = seq.cnv,
  chrs = chrs.to.plot,
  min.snps = 10,
  max.cex = 3,
  ref.num.probe = 1000
)

## Genome-wide SCNA profile along the chromosomes
genome.wide.plot(
  data = seq.data,
  segs = seq.cnv,
  sample.id = "PT116",
  chrs = chrs.to.plot,
  cex = 0.3
)

## Joint segmentation on the log2ratio and log2mBAF dimensions, for chr 1-22
seq.segs <- joint.segmentation(data=seq.data, min.snps=10, global.pval.cutoff=1e-4, max.chpts=30, verbose=TRUE)

## Merge adjacent segments.
## The seed is the same value this script passes to NGS.CNV()/SNP.CNV().
## NOTE(review): merging.segments() is given N=1000 replicates, which
## presumably involves random resampling; fixing the seed is meant to make
## the merge repeatable between runs -- confirm against the saasCNV docs.
set.seed(123456789)
seq.segs.merge <- merging.segments(data=seq.data, segs.stat=seq.segs, use.null.data=TRUE, N=1000, maxL=2000, merge.pvalue.cutoff=0.05, verbose=TRUE)

## Inspect the merged segments.
## data(seq.segs.merge) is deliberately NOT called afterwards: it would
## overwrite the segments computed above with the dataset of the same name
## loaded from the package.
head(seq.segs.merge)

## CNV analysis pipeline for WGS and WES data.
## One NGS.CNV() call receives the VCF table together with the segmentation,
## merging, CNV-calling, plotting and gene-annotation settings; output goes
## to "test_saasCNV" under the current working directory.
sample.id <- "WES_0116"
output.dir <- file.path(getwd(), "test_saasCNV")
NGS.CNV(
  vcf = vcf_table,
  output.dir = output.dir,
  sample.id = sample.id,
  min.chr.probe = 100,
  min.snps = 10,
  joint.segmentation.pvalue.cutoff = 1e-4,
  max.chpts = 30,
  do.merge = TRUE,
  use.null.data = TRUE,
  num.perm = 1000,
  maxL = 2000,
  merge.pvalue.cutoff = 0.05,
  do.cnvcall.on.merge = TRUE,
  cnvcall.pvalue.cutoff = 0.05,
  do.plot = TRUE,
  cex = 0.3,
  ref.num.probe = 1000,
  do.gene.anno = TRUE,
  gene.anno.file = "refGene_hg19.txt.gz",
  seed = 123456789,
  verbose = TRUE
)

## Gene annotation: read the refGene table, then annotate the segments in
## seq.cnv with it.
gene.anno <- read.delim(
  file = "refGene_hg19.txt.gz",
  as.is = TRUE,
  comment.char = ""  # no comment character: "#" in the file is read as data
)
seq.cnv.anno <- reannotate.CNV.res(
  res = seq.cnv,
  gene = gene.anno,
  only.CNV = TRUE
)

## CNV analysis pipeline for SNP array data
## download snp_table.txt.gz
## url <- "https://zhangz05.u.hpc.mssm.edu/saasCNV/data/snp_table.txt.gz"
snp_table <- read.delim(file="snp_table.txt.gz", as.is=TRUE)

## Use a dedicated sample ID for the SNP array run. Reusing the WES run's
## sample.id ("WES_0116") with the same output.dir would label the SNP array
## results as WES.
## NOTE(review): output files are presumably named after sample.id, in which
## case reusing it would also overwrite the WES output -- confirm.
snp.sample.id <- "SNP_0116"
SNP.CNV(snp=snp_table, output.dir=output.dir, sample.id=snp.sample.id, min.chr.probe=100, min.snps=10, joint.segmentation.pvalue.cutoff=1e-4, max.chpts=30, do.merge=TRUE, use.null.data=TRUE, num.perm=1000, maxL=5000, merge.pvalue.cutoff=0.05, do.cnvcall.on.merge=TRUE, cnvcall.pvalue.cutoff=0.05, do.boundary.refine=TRUE, do.plot=TRUE, cex=0.3, ref.num.probe=5000, do.gene.anno=TRUE, gene.anno.file="refGene_hg19.txt.gz", seed=123456789, verbose=TRUE)

## Build the data frame for CNV inference from the SNP array table
snp.data <- snp.cnv.data(
  snp = snp_table,
  min.chr.probe = 100,
  verbose = TRUE
)
head(snp.data)

## Refine segment boundaries; the segments come from the snp.cnv dataset
## loaded via data(), not from a computation in this script.
data(snp.cnv)
snp.cnv.refine <- snp.refine.boundary(
  data = snp.data,
  segs.stat = snp.cnv
)
head(snp.cnv.refine)