In [None]:
library(tidyverse)
library(ggpubr)

In [None]:
# Load the per-variant consequence table. Column names are supplied
# explicitly, so the first line of the file is read as data, and "." is
# parsed as a missing value.
csq = read_tsv(
    file = "output/freebayes_at9852_ath_filt-q100-ac10-dp100-200k-only1k1g-csq_allgenes_csqtab.tsv",
    col_names = c("chrom", "pos", "pos2", "alt", "qual", "af", "bcsq"),
    na = "."
)

In [None]:
# Inspect column types and the first few values of the raw table.
csq %>% glimpse()

In [None]:
# Reduce the raw table to one consequence type per variant record and add a
# relative within-gene position. The result is returned ungrouped so that
# later cells do not silently inherit the per-gene grouping.
csq2 = csq %>%
    # Drop records without an annotation and those whose annotation starts
    # with "@".
    # NOTE(review): "@" presumably marks a pointer to a consequence recorded
    # at another position (bcftools csq convention) -- confirm.
    filter(!is.na(bcsq), !grepl("^@", bcsq)) %>%
    # Keep only the first comma-separated annotation of each record.
    mutate(bcsq = sub(",.*", "", bcsq)) %>%
    # Split the pipe-delimited annotation into fields. "debug" mode keeps rows
    # with an unexpected number of fields instead of raising an error.
    separate_wider_delim(
        bcsq, "|",
        names = c("csqtype", "gene_id", "transcript", "genetype", "strand", "aa", "cds"),
        too_few = "debug", too_many = "debug"
    ) %>%
    glimpse() %>%
    select(gene_id, chrom, pos, alt, qual, af, csqtype) %>%
    # Keep only the first of several "&"-joined consequence types.
    mutate(csqtype = sub("&.*", "", csqtype)) %>%
    # Discard consequence types that begin with "*".
    filter(!grepl("^\\*", csqtype)) %>%
    group_by(gene_id) %>%
    # Position rescaled to the span of variant positions seen in the gene
    # (0 = smallest pos, 1 = largest pos). Genes whose variants all share one
    # position yield NaN (0 / 0).
    mutate(genepos = (pos - min(pos)) / (max(pos) - min(pos))) %>%
    ungroup()

In [None]:
# Per gene, sum the allele frequencies of variants with af > 0.005,
# separately for severe and non-severe consequence types. The result has one
# row per (gene_id, severe) combination and is returned ungrouped.
sumaf = csq2 %>%
    filter(af > 0.005) %>%
    # Flag variants whose consequence type is in the severe set.
    mutate(
        severe = csqtype %in% c(
            "frameshift", "stop_gained", "feature_elongation",
            "start_lost", "stop_lost"
        )
    ) %>%
    glimpse() %>%
    group_by(gene_id, severe) %>%
    # .groups = "drop" matches the alltypes summary and avoids leaving the
    # result grouped by gene_id.
    summarise(sum_af = sum(af), .groups = "drop") %>%
    glimpse()

In [None]:
# Load the gene annotation table (it provides the id and nlr_tracker_type
# columns used in the joins below) and show its structure.
genes = read_tsv(file = "../paper_plots//orthogroup_stats//input//nlr_cleaned.tsv")
glimpse(genes)

In [None]:
# Attach the NLR annotation to the per-gene sums. Genes with no match in the
# annotation table (nlrtype is NA after the join) are labelled "Non-NLR".
pdat = sumaf %>%
    left_join(
        select(genes, gene_id = id, nlrtype = nlr_tracker_type),
        by = "gene_id"
    ) %>%
    mutate(is_nlr = if_else(!is.na(nlrtype), "NLR", "Non-NLR"))

In [None]:
# Sum of allele frequencies per gene and consequence type (variants with
# af > 0.005 only), annotated with NLR status.
alltypes = csq2 %>%
    filter(af > 0.005) %>%
    group_by(gene_id, csqtype) %>%
    summarise(sum_af = sum(af), .groups = "drop") %>%
    left_join(
        select(genes, gene_id = id, nlrtype = nlr_tracker_type),
        by = "gene_id"
    ) %>%
    mutate(
        # Genes missing from the annotation table count as non-NLR.
        is_nlr = ifelse(is.na(nlrtype), "Non-NLR", "NLR"),
        # Order consequence types by their total summed allele frequency.
        csqtype = fct_reorder(csqtype, sum_af, .fun = sum)
    ) %>%
    glimpse()

In [None]:
# Boxplots of per-gene summed allele frequency for every consequence type,
# split by NLR status.
p = alltypes %>%
    ggplot(aes(y = csqtype, x = sum_af, colour = is_nlr, fill = is_nlr)) +
        # Outlier points are hidden; the view is restricted below instead.
        geom_boxplot(outlier.shape = NA) +
        scale_fill_brewer(palette = "Set1", name = "Gene Type", aesthetics = c("fill", "colour")) +
        # Zoom with the coordinate system rather than with scale limits:
        # xlim() would discard values outside 0..50 before the boxplot
        # statistics are computed and so distort the boxes.
        coord_cartesian(xlim = c(0, 50)) +
        labs(y = NULL, x = "sum(Allele Freq)") +
        theme_classic() +
        theme(
            # Legend inside the panel, at relative coordinates.
            # NOTE(review): a numeric legend.position is deprecated from
            # ggplot2 3.5.0 (use legend.position.inside) -- update once the
            # minimum ggplot2 version allows.
            legend.position = c(.7, .16),
            axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5)
        )
print(p)
# Pass the plot explicitly so the saved figure never depends on last_plot().
ggsave("output/all_types_sumaf_boxplot.png", plot = p, width = 3, height = 4)
ggsave("output/all_types_sumaf_boxplot.svg", plot = p, width = 3, height = 4)
saveRDS(p, "output/all_types_sumaf_boxplot.rds")

In [None]:
# Boxplots of per-gene summed allele frequency, restricted to the severe
# consequence types and split by NLR status.
p = alltypes %>%
    filter(
        csqtype %in% c(
            "frameshift", "stop_gained", "feature_elongation",
            "start_lost", "stop_lost"
        )
    ) %>%
    # Fix the level order so that "Non-NLR" comes first.
    mutate(is_nlr = factor(is_nlr, levels = c("Non-NLR", "NLR"))) %>%
    ggplot(aes(y = csqtype, x = sum_af, fill = is_nlr)) +
        # Outlier points are hidden; the view is restricted below instead.
        geom_boxplot(outlier.shape = NA) +
        scale_fill_brewer(palette = "Set1", name = "Gene Type", aesthetics = c("fill", "colour")) +
        # Zoom with the coordinate system rather than with scale limits:
        # xlim() would discard values above 0.5 before the boxplot
        # statistics are computed and so distort the boxes.
        coord_cartesian(xlim = c(0, .5)) +
        labs(y = NULL, x = "sum(Allele Freq)") +
        theme_classic() +
        theme(
            # Legend inside the panel, at relative coordinates.
            # NOTE(review): a numeric legend.position is deprecated from
            # ggplot2 3.5.0 (use legend.position.inside) -- update once the
            # minimum ggplot2 version allows.
            legend.position = c(.7, .16),
            axis.text.y = element_text(angle = 0, hjust = 1, vjust = 0.5)
        )
print(p)
# Pass the plot explicitly so the saved figure never depends on last_plot().
ggsave("output/severe_types_sumaf_boxplot.png", plot = p, width = 3, height = 4)
ggsave("output/severe_types_sumaf_boxplot.svg", plot = p, width = 3, height = 4)
saveRDS(p, "output/severe_types_sumaf_boxplot.rds")

In [None]:
# Notched boxplot comparing the summed allele frequency of severe variants
# between NLR and non-NLR genes, with a significance bracket.
p = pdat %>%
    filter(severe) %>%
    ggboxplot(y = "sum_af", x = "is_nlr", fill = "is_nlr", outlier.shape = NA, notch = TRUE) +
        stat_compare_means(comparisons = list(c("Non-NLR", "NLR")), label = "p.signif", label.y = 0.40) +
        scale_fill_brewer(palette = "Set1", name = "Gene Type") +
        # Zoom with the coordinate system rather than with scale limits:
        # limits on the y scale would censor values above 0.6 before the
        # boxplot statistics and the group comparison are computed.
        coord_cartesian(ylim = c(0, 0.6)) +
        labs(x = "Gene Type", y = "sum(Allele Freq)", title = "Severe SNVs") +
        theme(legend.position = "none")
print(p)
# The "-boxplot" suffix gives this figure its own files: the bare
# "sumaf-normalvsnlr-onlysevere" names are also written by the violin
# version of this figure later in the notebook, which would overwrite them.
saveRDS(p, "output/sumaf-normalvsnlr-onlysevere-boxplot.rds")
ggsave("output/sumaf-normalvsnlr-onlysevere-boxplot.png", plot = p, width = 3, height = 3, dpi = 600)
ggsave("output/sumaf-normalvsnlr-onlysevere-boxplot.svg", plot = p, width = 3, height = 3)

In [None]:
# Violin plot comparing the summed allele frequency of severe variants
# between NLR and non-NLR genes, with a significance bracket.
p = ggviolin(filter(pdat, severe), y = "sum_af", x = "is_nlr", fill = "is_nlr") +
    stat_compare_means(
        comparisons = list(c("Non-NLR", "NLR")),
        label = "p.signif"
    ) +
    scale_fill_brewer(palette = "Set1", name = "Gene Type") +
    # A fixed y range is currently disabled:
    # scale_y_continuous(limits = c(0, 3.2)) +
    labs(x = "Gene Type", y = "sum(Allele Freq)", title = "Severe SNVs") +
    theme(legend.position = "none")
print(p)
# Store the plot object, then a high-resolution PNG and an SVG.
saveRDS(p, "output/sumaf-normalvsnlr-onlysevere.rds")
ggsave("output/sumaf-normalvsnlr-onlysevere.png", plot = p, width = 2, height = 3, dpi = 600)
ggsave("output/sumaf-normalvsnlr-onlysevere.svg", plot = p, width = 2, height = 3)

In [None]:
# Violin plot comparing the summed allele frequency of all variants between
# NLR and non-NLR genes, with a significance bracket.
p = pdat %>%
    # pdat holds one row per (gene_id, severe) combination. Collapse it to a
    # single total per gene so that each gene contributes exactly one value
    # to the plot and to the test, instead of separate severe and non-severe
    # partial sums.
    group_by(gene_id, is_nlr) %>%
    summarise(sum_af = sum(sum_af), .groups = "drop") %>%
    ggviolin(y = "sum_af", x = "is_nlr", fill = "is_nlr") +
        stat_compare_means(comparisons = list(c("Non-NLR", "NLR")), label = "p.signif", label.y.npc = 0.9, bracket.size = 1.2) +
        scale_fill_brewer(palette = "Set1", name = "Gene Type") +
        # Anchor the axis at zero; the upper end is chosen from the data.
        scale_y_continuous(limits = c(0, NA)) +
        labs(x = "Gene Type", y = "sum(Allele Freq)", title = "All SNVs") +
        theme(legend.position = "none")
print(p)
saveRDS(p, "output/sumaf-normalvsnlr-allvariants.rds")
# Pass the plot explicitly so the saved figure never depends on last_plot().
ggsave("output/sumaf-normalvsnlr-allvariants.png", plot = p, width = 2, height = 3, dpi = 600)
ggsave("output/sumaf-normalvsnlr-allvariants.svg", plot = p, width = 2, height = 3)