# Plots accompanying initial variant filtering

    author: Gekkonid Consulting
    date: 2021-10-24

This notebook complements `01_initial-filtering.sh`, plotting various
statistics and histograms of the inputs and outputs of that script.

In [None]:
library(tidyverse)
library(foreach)
# Make the black-and-white theme the default for every plot in this notebook
ggplot2::theme_set(ggplot2::theme_bw())

## Raw data

These are the variant calls directly out of Acanthophis. We generated some
stats with htshax, which we plot here.

In [None]:
# Summary tables for the raw (unfiltered) calls. Below, `samphist` is used for
# its `missing_prop` column and `bcfhist` for `metric`, `percent` and `nsnp`.
samphist = "data/1_filtered/raw_variants_samphist.tsv" %>% read_tsv()
bcfhist = "data/1_filtered/raw_variants_bcfhist.tsv" %>% read_tsv()

In [None]:
# Density of `percent` weighted by SNP count, one facet per metric with
# independent x-axes.
# `after_stat(density)` is the supported spelling of the computed-variable
# mapping; the `..density..` notation is deprecated since ggplot2 3.4.0.
ggplot(bcfhist, aes(x=percent, y=after_stat(density), weight=nsnp)) +
    geom_density() +
    facet_wrap(~metric, ncol=1, scales="free_x")

In [None]:
# SNP count at each `percent` value, drawn as bars; one facet per metric,
# each with its own x and y scales.
bcfhist %>%
    ggplot() +
    geom_bar(aes(x=percent, y=nsnp), stat="identity") +
    facet_wrap(~metric, ncol=1, scales="free")

In [None]:
# Histogram of the per-sample `missing_prop` values (default binning)
samphist %>%
    ggplot(aes(x=missing_prop)) +
    geom_histogram()

In [None]:
# SNP counts for the "dp" metric, with the x-axis zoomed to 0-251.
# coord_cartesian() only zooms the view. Scale limits (lims()/xlim()) would
# instead convert out-of-range values to NA, which removes any bar whose
# edges extend past the limits (e.g. the bar centred on 0).
bcfhist %>%
    filter(metric=="dp") %>%
    ggplot(aes(x=percent, y=nsnp)) +
    geom_bar(stat="identity") +
    coord_cartesian(xlim=c(0, 251)) +
    labs(x="Fold Coverage", y="N. SNPs")

## Post-filtering

These are the same stats again, but after we have filtered the data. 

In [None]:
# Same two summary tables, now for the filtered call set. These overwrite the
# `samphist` / `bcfhist` objects loaded for the raw data.
samphist = "data/1_filtered/cuckoo_q50_dp10_maf3_mis80_samphist.tsv" %>%
    read_tsv()
bcfhist = "data/1_filtered/cuckoo_q50_dp10_maf3_mis80_bcfhist.tsv" %>%
    read_tsv()

In [None]:
# Density of `percent` weighted by SNP count for the filtered calls, one facet
# per metric with independent x-axes.
# `after_stat(density)` is the supported spelling of the computed-variable
# mapping; the `..density..` notation is deprecated since ggplot2 3.4.0.
ggplot(bcfhist, aes(x=percent, y=after_stat(density), weight=nsnp)) +
    geom_density() +
    facet_wrap(~metric, ncol=1, scales="free_x")

In [None]:
# SNP count at each `percent` value for the filtered calls; one facet per
# metric, each with its own x and y scales.
bcfhist %>%
    ggplot() +
    geom_bar(aes(x=percent, y=nsnp), stat="identity") +
    facet_wrap(~metric, ncol=1, scales="free")

In [None]:
# Histogram of the per-sample `missing_prop` values after filtering
samphist %>%
    ggplot(aes(x=missing_prop)) +
    geom_histogram()

In [None]:
# SNP counts for the "miss" metric, with the x-axis zoomed to 0-25.
# coord_cartesian() only zooms the view. Scale limits (lims()/xlim()) would
# instead convert out-of-range values to NA, which removes any bar whose
# edges extend past the limits (here the bars centred on 0 and on 25).
bcfhist %>%
    filter(metric=="miss") %>%
    ggplot(aes(x=percent, y=nsnp)) +
        geom_bar(stat="identity") +
        coord_cartesian(xlim=c(0, 25))

In [None]:
# Total SNP count across "miss" rows whose `percent` is strictly below 25
summarise(
    filter(bcfhist, metric=="miss" & percent<25),
    nsnp=sum(nsnp)
)

# RAD depth summaries


In [None]:
# Read one coverage histogram per RAD library and stack them into a single
# long table, tagging each row with the library it came from.
rad.samps = c("RAD_MD022", "RAD_MD028", "RAD_MD033", "RAD_MD034")
all.covhist = lapply(rad.samps, function(samp) {
    covhist.f = sprintf("data/radcover/%s_covhist.tsv", samp)
    # Column names are supplied here, so the first line of each file is
    # read as data rather than as a header.
    read_tsv(covhist.f, col_names=c("coverage", "nbases")) %>%
        mutate(sample=samp)
}) %>%
    # Bind once at the end instead of rbind-ing pairwise while iterating
    bind_rows()

In [None]:
# Bases per coverage value for each RAD library on log-log axes, one facet per
# sample. Only coverages strictly between 0 and 1e4 are kept; excluding zero
# also keeps every x value finite on the log10 axis.
all.covhist %>%
    filter(coverage > 0 & coverage < 1e4) %>%
    ggplot() +
    geom_bar(aes(x=coverage, y=nbases), stat="identity") +
    scale_x_log10() +
    scale_y_log10() +
    facet_wrap(~sample, scales="free", ncol=1)

# RAD-locus filtering


In [None]:
# Summary tables for the RAD-locus filtered call set. These overwrite the
# `samphist` / `bcfhist` objects loaded earlier in the notebook.
samphist = "data/1_filtered/cuckoo_q50_dp10_maf3_mis80_radloci_samphist.tsv" %>%
    read_tsv()
bcfhist = "data/1_filtered/cuckoo_q50_dp10_maf3_mis80_radloci_bcfhist.tsv" %>%
    read_tsv()

In [None]:
# Sample metadata table; later joined to the per-sample stats on `Library_id`
meta = "../rawdata/cuckoo_metadata_oct2021.csv" %>% read_csv()

In [None]:
# SNP count at each `percent` value for the RAD-locus filtered calls; one
# facet per metric, each with its own x and y scales.
bcfhist %>%
    ggplot() +
    geom_bar(aes(x=percent, y=nsnp), stat="identity") +
    facet_wrap(~metric, ncol=1, scales="free") +
    labs(title="RAD-locus Filtered SNP Stats")

In [None]:
# Histogram of the per-sample `missing_prop` values after RAD-locus filtering
samphist %>%
    ggplot(aes(x=missing_prop)) +
    geom_histogram() +
    labs(title="RAD-locus Filtered Sample Missingness")

In [None]:
# Print the metadata structure, then attach the per-sample stats to each
# metadata row by matching `Library_id` to samphist's `sample`. As a left
# join, metadata rows without a match are kept with NA stats.
str(meta)
col.plot = left_join(meta, samphist, by=c("Library_id"="sample"))

In [None]:
# Per-sample missingness histogram, with bar fill and outline both mapped to
# `Sample_type`
col.plot %>%
    ggplot(aes(x=missing_prop, fill=Sample_type, colour=Sample_type)) +
    geom_histogram() +
    labs(title="RAD-locus Filtered Sample Missingness")