<a href="https://colab.research.google.com/github/gilsonauerswald/Bioinformatic_Projects/blob/main/R_06_Advanced_analysis_of_VCF_files_Quality_control%2C_Filter_and_visualize_the_genomic_variants.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install Necessary Packages**

In [None]:
# Install only the packages that are not already available, so re-running
# this cell does not reinstall everything. ggplot2 (attached below) and
# RColorBrewer (attached later in this notebook) are included because they
# are loaded by the notebook but were never installed explicitly.
required_pkgs <- c("vcfR", "reshape2", "ggplot2", "RColorBrewer")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
# Load packages
library("vcfR")
library("reshape2")
library("ggplot2")

**Load the Sample Data**

In [None]:
# Read the TP53 example VCF from its remote location into a vcfR object
sample_vcf_tp53 <- read.vcfR(
  file = "https://raw.githubusercontent.com/pine-bio-support/Merge-VCF-files/main/aneuploid_samples_freebayes_tp53.vcf"
)

In [None]:
# Print the vcfR object (auto-printing the value shows its summary)
sample_vcf_tp53

**The vcfR Object**

In [None]:
# Display the first few lines of each slot of the vcfR object
head(sample_vcf_tp53)

**Querying the Meta Data**

In [None]:
# Query the meta data for the DP element.
# NOTE(review): 'FORMAT.+DP' looks like a regular expression that selects the
# FORMAT-level DP definition rather than an INFO-level one - confirm against
# the queryMETA documentation.
queryMETA(sample_vcf_tp53, element = 'FORMAT.+DP')

**Extract depth (DP)**

In [None]:
# Preview the top-left corner of the genotype (gt) slot: at most the first
# five rows (variants) and the first five columns. The index ranges are
# clamped to the matrix dimensions so the preview also works for files with
# fewer than five variants or columns instead of failing with
# "subscript out of bounds"; drop = FALSE keeps the result a matrix.
# NOTE(review): the first gt column is presumably FORMAT, not a sample - confirm.
gt_preview_rows <- seq_len(min(5, nrow(sample_vcf_tp53@gt)))
gt_preview_cols <- seq_len(min(5, ncol(sample_vcf_tp53@gt)))
sample_vcf_tp53@gt[gt_preview_rows, gt_preview_cols, drop = FALSE]

In [None]:
# Pull the DP (depth) element out of the genotype data as a numeric matrix
dp <- extract.gt(
  x = sample_vcf_tp53,
  element = "DP",
  as.numeric = TRUE
)
# Preview the first rows of the depth matrix
head(dp)

**Depth Plot**

In [None]:
# Enlarge the bottom margin so the rotated column labels fit
par(mar = c(12, 4, 4, 2))
# One box per column of the depth matrix; las = 3 draws the axis labels
# vertically. 2:ncol(dp) supplies the colour indices, which boxplot recycles
# if there are fewer colours than boxes.
boxplot(x = dp, las = 3, col = 2:ncol(dp))
# Label the y axis
title(ylab = "Depth (DP)")

**Log Transformation**

In [None]:
# Enlarge the bottom margin so the rotated column labels fit
par(mar = c(12, 4, 4, 2))
# Same depth box plot as before, but with a logarithmic y axis
boxplot(x = dp, las = 3, log = "y", col = 2:ncol(dp))
# Label the y axis
title(ylab = "Depth (DP)")

**Filtering on sequence depth.**

In [None]:
# Work on a copy so the raw depth matrix stays available
dp_filt <- dp
# Per-column 5% and 95% depth quantiles: row 1 holds the lower bound and
# row 2 the upper bound for each column
sums <- apply(
  dp_filt,
  MARGIN = 2,
  quantile,
  probs = c(0.05, 0.95),
  na.rm = TRUE
)
# Subtract each column's lower bound; negative differences lie below it
dp2 <- sweep(dp_filt, MARGIN = 2, STATS = sums[1, ], FUN = "-")
dp_filt[dp2 < 0] <- NA
# Subtract each column's upper bound; positive differences lie above it
dp2 <- sweep(dp_filt, MARGIN = 2, STATS = sums[2, ], FUN = "-")
dp_filt[dp2 > 0] <- NA
# Finally apply an absolute minimum depth of 4
dp_filt[dp_filt < 4] <- NA

In [None]:
# Enlarge the bottom margin so the rotated column labels fit
par(mar = c(12, 4, 4, 2))
# Box plot of the filtered depth matrix on a logarithmic y axis
boxplot(x = dp_filt, las = 3, log = "y", col = 2:ncol(dp))
# Label the y axis
title(ylab = "Depth (DP)")

In [None]:
# Print the original (unfiltered) vcfR object
sample_vcf_tp53

In [None]:
# Copy the vcfR object and blank out every genotype whose depth was removed
# by the depth filter. The first gt column is skipped (gt[, -1]) so the
# remaining columns line up with the columns of dp_filt.
sample_vcf_tp53_filt <- sample_vcf_tp53
sample_vcf_tp53_filt@gt[, -1][is.na(dp_filt)] <- NA
# Print the filtered object
sample_vcf_tp53_filt

# **Missing data**

## **Quantifying missing values across all samples**

In [None]:
# Print the vcfR object.
# NOTE(review): its printed summary presumably reports the overall share of
# missing data - confirm against the vcfR show() output.
sample_vcf_tp53

## **Quantifying missing values in one sample**

In [None]:
# Quantifying missing values in one sample:
# count the variants whose depth is missing (NA) in the first sample,
# i.e. the first column of the depth matrix dp
sum(is.na(dp[, 1]))

## **Quantifying missing values across all samples**

In [None]:
# Count the missing (NA) depth values in each sample (column of dp)
myMiss <- colSums(is.na(dp))
# Convert the counts to the proportion of variants that are missing
myMiss <- myMiss / nrow(sample_vcf_tp53)


# Use an extended colour palette
library(RColorBrewer)
palette(brewer.pal(n = 12, name = "Set3"))


# Enlarge the bottom margin so the rotated sample labels fit
par(mar = c(12, 4, 4, 2))


# Bar plot of the missing values per sample. myMiss holds proportions
# (0-1), so it is scaled by 100 here to match the percent axis label.
barplot(100 * myMiss, las = 2, col = 1:12)
title(ylab = "Missing Variants (%)")

In [None]:
# Count the missing (NA) depth values for each variant (row of dp)
myMiss <- rowSums(is.na(dp))
# Convert the counts to the proportion of samples that are missing.
# ncol(dp) is used because gt[, -1] collapses to a plain vector when there is
# only one sample, which would make ncol() return NULL.
myMiss <- myMiss / ncol(dp)


# Histogram of missingness across variants. myMiss holds proportions (0-1),
# so it is scaled by 100 to match the percent axis label; main is set
# explicitly to keep the title the plot had before the scaling was added.
hist(
  100 * myMiss,
  col = "#8DD3C7",
  xlab = "Missing Variants (%)",
  breaks = 5,
  main = "Histogram of myMiss"
)