In [None]:
# Input locations used by the cells below.
# BED file of the regions to draw (read into `regions`).
region_bed <- "input//gal_fbox_AT5G56370.bed"
# Gene models in GFF3 format (read into `genes`).
gene_annotation <- "../final_annotation/latest/ALL_ACCESSIONS.hodgepodgemerged.gff3"
# Transposable-element annotation in GFF3 format (read into `te`).
te_annotation <- "../TE_annotation/TE_annotation/output//final_TE_annotation//pangenome_TEannotation.gff3"
# NOTE(review): `colour_grouping` is not used in the visible cells, and
# `colourscheme` is reassigned to a tibble further down — confirm both are still needed.
colour_grouping <- "allgenes_colours.tsv"
colourscheme <- "colourscheme.tsv"

In [None]:
library(stringr)
library(tidyverse)
library(gggenes)
# Size and resolution of plots rendered inline by the notebook kernel.
options(
    repr.plot.width = 8,
    repr.plot.height = 6,
    repr.plot.res = 320
)

In [None]:
# Regions of interest; column names are supplied, so the first line of the file is read as data.
regions <- read_tsv(
    region_bed,
    col_names = c("chrom", "start", "end", "region_name")
) %>%
    mutate(
        # Keep the "at<digits>" token and drop everything from the following underscore onwards.
        accession = sub("(at\\d+)_.*", "\\1", chrom),
        # Plotting origin: 100 bp before the region start, but never below 1.
        nh_offset = pmax(1, start - 100)
    )

In [None]:
# Gene-level features from the GFF3 annotation, with the feature ID pulled out of the attributes column.
genes <- read_tsv(
    gene_annotation,
    col_names = c("chrom", "source", "type", "start", "end", "score", "strand", "phase", "attrs")
) %>%
    filter(
        type %in% c(
            "gene",
            "pseudogene",
            # Correct spelling of the feature type; the original filter only listed the
            # misspelt form, so properly labelled pseudogenic regions were dropped.
            "pseudogenic_region",
            # Misspelt form kept in case an annotation file actually carries it.
            "pseuogenic_region",
            "transposable_element_gene"
        )
    ) %>%
    # Everything between "ID=" and the next ";" becomes the gene identifier.
    mutate(geneid = sub("ID=([^;]+).*", "\\1", attrs, perl = TRUE))

In [None]:
# InterProScan hits table; column names are supplied, so the first line of the file is read as data.
ipr <- read_tsv(
    "../domainly/output/all_genic_iprscan.tsv",
    col_names = c(
        "seqname", "seqsum", "seqlen", "db", "hit_id", "hit_name",
        "hit_start", "hit_end", "evalue", "status", "rundate",
        "ipr_acc", "ipr_desc"
    )
) %>%
    # Gene ID = the "at<digits>_G<digits>" part of the sequence name; the suffix after it is dropped.
    mutate(geneid = sub("(at\\d+_G\\d+)_.*", "\\1", seqname))

In [None]:
# One row per (gene, sequence, length) with all non-MobiDBLite hit names joined by ";".
dumbipr <- ipr %>%
    filter(db != "MobiDBLite") %>%
    group_by(geneid, seqname, seqlen) %>%
    summarise(hits = paste(hit_name, collapse = ";"))

In [None]:
# First-pass classification: label each gene by feature type and by whether any
# of its hit names mentions an F-box (case-insensitive "f-box" / "fbox").
gene_classes <- genes %>%
    left_join(dumbipr, by = "geneid") %>%
    mutate(
        # grepl() returns FALSE for NA, so genes without hits count as "no F-box".
        has_fbox = grepl("f-?box", hits, ignore.case = TRUE),
        is_pseudo = grepl("pseudogen", type),
        class = case_when(
            grepl("transposable", type) ~ "TE gene",
            is_pseudo & has_fbox ~ "F-Box pseudogene",
            is_pseudo & !has_fbox ~ "pseudogene",
            type == "gene" & has_fbox ~ "F-Box",
            type == "gene" & !has_fbox ~ "gene"
        )
    ) %>%
    # The helper columns are dropped here; only the ID and its class are kept.
    select(geneid, class)

In [None]:
# Tally the rows in each class; table() leaves NA classes out by default.
table(gene_classes$class)

In [None]:
# Fill/outline colour for every class that can appear in the plots.
colourscheme <- tribble(
    ~class, ~colour,
    "gene", "#1F78B4",
    "F-Box",  "lawngreen",
    "F-Box;FBD",  "seagreen2",
    "F-Box;LRR",  "seagreen3",
    "F-Box;LRR;FBD", "seagreen4",
    "F-Box pseudogene", "#E31A1C",
    "pseudogene", "pink",
    "TE", "#555555",
    "TE gene", "#222222",
)
# Named vector (class -> colour), the form expected by manual ggplot2 scales.
pal.colour <- setNames(colourscheme$colour, colourscheme$class)

In [None]:
# Attach to every region the genes on the same chrom that lie entirely inside it
# (region start <= gene start and region end >= gene end).
nh_genes <- left_join(
    regions,
    genes,
    by = join_by(chrom, start <= start, end >= end),
    suffix = c(".region", ".gene"),
    relationship = "many-to-many"
) %>%
    mutate(accession = sub("(at\\d+)_.*", "\\1", chrom))

In [None]:
# TE annotation: keep only features whose attributes do not mention "Parent",
# extract their ID, and drop the columns that are not needed afterwards.
te <- read_tsv(
    te_annotation,
    col_names = c("chrom", "source", "type", "start", "end", "score", "strand", "phase", "attrs"),
    comment = "#"
) %>%
    filter(!grepl("Parent", attrs)) %>%
    mutate(teid = sub("ID=([^;]+).*", "\\1", attrs, perl = TRUE)) %>%
    select(-c(attrs, score, phase, source, type))

In [None]:
# Attach to every region the TEs on the same chrom that lie entirely inside it.
# The ".gene" suffix is reused so TE coordinates get the same column names as gene coordinates.
nh_te <- left_join(
    regions,
    te,
    by = join_by(chrom, start <= start, end >= end),
    suffix = c(".region", ".gene"),
    relationship = "many-to-many"
) %>%
    mutate(accession = sub("(at\\d+)_.*", "\\1", chrom))

In [None]:
# Candidate table restricted to rows that have a domain label; the `n` column is not needed.
lib <- read_tsv("candiates.tsv") %>%
    filter(!is.na(domain)) %>%
    select(-n)

In [None]:
# Show the filtered candidate table for a visual check.
lib

In [None]:
# Build one ordered domain-architecture string (e.g. "F-Box;LRR") per gene.
#
# Only genes inside one of the regions are considered. The original filtered on
# `nhg$geneid`, but `nhg` is only created inside the plotting loop further down,
# so this cell failed on a fresh kernel (or silently used the last region only).
# `nh_genes` holds the genes of every region and is defined before this cell.
fboxannot <- ipr %>%
    filter(geneid %in% nh_genes$geneid, !db %in% c("MobiDBLite", "Coils", "PANTHER")) %>%
    # Joins on whatever columns `ipr` and `lib` share — TODO(review): make `by` explicit.
    left_join(lib) %>%
    group_by(geneid, seqname, domain, seqlen) %>%
    arrange(geneid, domain, hit_start) %>%
    # Number clusters of overlapping hits: a new cluster starts whenever the next
    # hit begins after the furthest end seen so far within the group.
    mutate(indx = c(0, cumsum(as.numeric(lead(hit_start)) > cummax(as.numeric(hit_end)))[-n()])) %>%
    ungroup() %>%
    # Merge each cluster of overlapping hits into a single interval.
    group_by(geneid, seqname, domain, seqlen, indx) %>%
    summarise(start = min(hit_start), end = max(hit_end), .groups = "drop") %>%
    arrange(geneid, start) %>%
    group_by(geneid) %>%
    # Keep a single sequence per gene (the first one with the maximum length)
    # and drop intervals that have no domain label.
    filter(seqname == first(seqname[seqlen == max(seqlen)]), !is.na(domain)) %>%
    # Consecutive intervals with the same domain share one run id.
    mutate(group = consecutive_id(domain)) %>%
    group_by(geneid, domain, group) %>%
    summarise(start = min(start), end = max(end), .groups = "drop") %>%
    # Order by position so the pasted string follows the order along the sequence.
    arrange(start) %>%
    group_by(geneid) %>%
    summarise(domains = paste(domain, collapse = ";"))

In [None]:
# Final classification used for plotting: genes with annotated domains are
# labelled with their domain string, everything else by feature type.
gene_classes <- genes %>%
    left_join(fboxannot, by = "geneid") %>%
    mutate(
        is_pseudo = grepl("pseudogen", type),
        has_domains = !is.na(domains),
        class = case_when(
            grepl("transposable", type) ~ "TE gene",
            is_pseudo & !has_domains ~ "pseudogene",
            is_pseudo & has_domains ~ "F-Box pseudogene",
            type == "gene" & !has_domains ~ "gene",
            type == "gene" & has_domains ~ domains
        )
    ) %>%
    # The helper columns are dropped here; only the ID and its class are kept.
    select(geneid, class)

In [None]:
# Number of genes per class.
count(gene_classes, class)

In [None]:
# Draw one neighbourhood figure per region name (one track per accession) and
# save each figure as PNG and SVG under output/gal/.
dir.create("output/gal", recursive = TRUE, showWarnings = FALSE)
for (nh_name in unique(regions$region_name)) {
    # All coordinates are shifted by the region's `nh_offset` so tracks from
    # different accessions share a common local coordinate system.
    reg <- regions %>%
        filter(region_name == nh_name) %>%
        mutate(startO = start - nh_offset, endO = end - nh_offset)
    nhg <- nh_genes %>%
        filter(region_name == nh_name) %>%
        mutate(startO = start.gene - nh_offset, endO = end.gene - nh_offset) %>%
        left_join(gene_classes, by = join_by(geneid))
    # TE coordinates also live in the ".gene" columns (suffix chosen when `nh_te` was built).
    nht <- nh_te %>%
        filter(region_name == nh_name) %>%
        mutate(startO = start.gene - nh_offset, endO = end.gene - nh_offset, class = "TE")

    p <- ggplot(nhg, aes(xmin = startO, xmax = endO, y = accession)) +
        # Grey backbone spanning each accession's region.
        annotate("segment", x = reg$startO, xend = reg$endO, y = reg$accession, yend = reg$accession, colour = "grey") +
        # Genes.
        geom_gene_arrow(aes(forward = strand == "+", colour = class, fill = class), arrowhead_height = unit(1, "mm"), arrowhead_width = unit(.6, "mm"), alpha = 0.8) +
        # TEs, drawn thinner and more transparent than genes.
        geom_gene_arrow(aes(forward = strand == "+", colour = class, fill = class), arrowhead_height = unit(.5, "mm"), arrowhead_width = unit(.6, "mm"), arrow_body_height = unit(1.4, "mm"), data = nht, alpha = 0.6) +
        scale_fill_manual(values = pal.colour, guide = guide_legend(nrow = 1), name = "Gene Type", aesthetics = c("fill", "colour")) +
        theme_genes() +
        labs(y = NULL, x = "Neighbourhood Coordinate (bp)", title = nh_name) +
        theme(
            legend.position = "bottom",
            panel.grid.major.y = element_blank()
        )

    print(p)
    # Pass the plot explicitly rather than relying on ggsave()'s last-plot default.
    for (ext in c("png", "svg")) {
        ggsave(sprintf("output/gal/%s_series.%s", nh_name, ext), plot = p, width = 7, height = 5)
    }
}