In [5]:
## Variables

In [217]:
# Run configuration for the notebook.
# Several variables below are assigned twice in a row; the second assignment
# overrides the first, so only the LAST value of each variable is in effect.
# The first values are presumably earlier presets kept for reference.

# Sample accessions to analyse (the two-sample vector is overridden).
srr_names <- c("SRR5090597", "SRR5090599")
srr_names <- c("SRR12091998", "SRR12091997", "SRR12091996", "SRR12091995", "SRR12091994", "SRR12091993")
# Donor label; the effective value is "sars-cov-2".
donor_name <- "hpv16"
donor_name <- "sars-cov-2"
# Recipient label; it is part of the "<SRR>-to-<recipient>" BED file prefix
# built in the loading loop.
recipient_name <- "USCShg38"
# Folder holding the pipeline BED outputs. File names are appended by plain
# string concatenation, so the value has to end with "/".
inputs_folder <- "../../wallaby/workflows/cromwell-final-outputs/"
inputs_folder <- "../../wallaby/workflows/datasets/sars-cov-2/"
# Reference genome FASTA paths.
# NOTE(review): donor_ref_genome still points at the HPV16 FASTA although
# donor_name is "sars-cov-2" - confirm whether this path should be updated.
donor_ref_genome <- "../../wallaby/data/ref_genomes/hpv/HPV16.fasta"
recipient_ref_genome <- "../../wallaby/data/ref_genomes/human/USCS.hg38.fasta"

In [None]:
, "SRR12091992", "SRR12091991", 
"SRR12091990", "SRR12507513", "SRR12507514", "SRR12507515", "SRR12507516", "SRR12507517", "SRR12507518", "SRR12507519", 
"SRR12507520", "SRR12507521", "SRR12507522", "SRR12507523", "SRR12507524", "SRR12507525", "SRR12507526", "SRR12507527", 
"SRR12507528", "SRR12507529", "SRR12507530", "SRR12507531", "SRR12507532", "SRR12507533", "SRR12507534", "SRR12507535", 
"SRR12507536", "SRR12507537", "SRR12507538", "SRR12507539", "SRR12507540", "SRR12507541", "SRR12507542", "SRR12507543", 
"SRR12507544", "SRR12507545", "SRR12507546", "SRR12507547", "SRR12507548", "SRR12507549", "SRR12507550", "SRR12507551", 
"SRR12507552", "SRR12134528", "SRR12134529", "SRR12134530", "SRR12134531", "SRR12134532", "SRR12134533", "SRR12134534", 
"SRR12134535", "SRR12134536", "SRR12134537", "SRR12134538", "SRR12134539", "SRR12134540", "SRR12134541", "SRR12134542", 
"SRR12134543", "SRR12134544", "SRR12134545"

# Tertiary analysis of a Donor-to-Recipient pipeline

In [689]:
## Setup Environment

In [8]:
# Enable multithreading when possible (library dependent)
# detectCores() may return NA when the number of cores cannot be determined;
# fall back to a single thread so that neither the option nor the environment
# variables below end up as NA / "NA".
n_cores <- parallel::detectCores()
if (is.na(n_cores) || n_cores < 1L) {
    n_cores <- 1L
}

# Number of parallel processes install.packages() may use.
options(Ncpus = n_cores)

# OpenMP limits. Environment variables are strings, so convert explicitly and
# set each variable exactly once.
Sys.setenv(OMP_NUM_THREADS = as.character(n_cores),
           OMP_THREAD_LIMIT = as.character(n_cores))

# Install tricky packages
# BSgenome.Hsapiens.UCSC.hg38 is installed from a local source tarball. Only do
# so when the package is not available yet: rebuilding it from source on every
# run is slow and needlessly requires the tarball to be present.
if (!requireNamespace("BSgenome.Hsapiens.UCSC.hg38", quietly = TRUE)) {
    suppressMessages(install.packages("../../BSgenome.Hsapiens.UCSC.hg38_1.4.3.tar.gz", repos = NULL, type = "source"))
}
suppressPackageStartupMessages(library(BSgenome.Hsapiens.UCSC.hg38))

## Load or install and load all libraries
# pacman has to be installed before p_load() can be used for everything else.
if (!requireNamespace("pacman", quietly = TRUE)) {
    install.packages("pacman")
}
suppressPackageStartupMessages(library("pacman", character.only = TRUE))

# List of CRAN packages to either Load, or Install and Load
# (the order is kept as-is because attach order decides which package masks which)
pacman::p_load(dplyr, 
               ggplot2, shiny, shinyLP, DT,  ggrepel,  tidyr, data.table, 
               kableExtra, knitr, IRdisplay)

# List of Bioconductor packages to either Load, or Install and Load
pacman::p_load(GenomicFeatures, GenomicAlignments,  Rsubread,  Rsamtools, bamsignals,  
               rtracklayer, GenomicRanges, org.Hs.eg.db, Organism.dplyr,
               TxDb.Hsapiens.UCSC.hg38.knownGene,  regioneR, karyoploteR,  seqinr, Repitools, Gviz, Biostrings)

In [691]:
## Helper Functions

In [692]:
## Loading pipeline output files
# Try to open all files - note that some will not exist, because that specific crossing bucket did not return any matches

In [209]:
# Load the pipeline's crossing BED files for every sample.
#
# Result: `srrs` is a list named by SRR accession; each element is a list of
# imported BED ranges named by crossing bucket. Only buckets whose BED file
# exists are included, so a missing file can neither raise an "object not
# found" error nor silently reuse the ranges loaded for a previous sample.
srrs <- list()
beds_donor <- list()
beds_recipient <- list()

# Crossing buckets written by the pipeline, one "<prefix>_<bucket>.bed" each.
crossing_labels <- c('MMd_MUr', 'MMd_UMr', 'MUd_MMr', 'MUd_UMr', 'UMd_MMr', 'UMd_MUr')

for (srr_name in srr_names) {
    # inputs_folder is concatenated verbatim, exactly as the paths were built before.
    file_prefix <- paste0(inputs_folder, srr_name, '-to-', recipient_name)

    beds <- list()
    for (label in crossing_labels) {
        file <- paste0(file_prefix, "_", label, ".bed")
        if (file.exists(file)) {
            beds[[label]] <- import(file)
        }
    }

    if (length(beds) == 0L) {
        warning("No crossing BED files found for ", srr_name, " in ", inputs_folder,
                call. = FALSE)
    }

    # Assign via `[<-` with a wrapped list so that an empty `beds` is stored
    # as an empty list element.
    srrs[srr_name] <- list(beds)
}

In [28]:
# Organism source built from the hg38 knownGene TxDb package; summary_table()
# queries it through transcripts() to look up gene symbols.
src <- suppressMessages(
    Organism.dplyr::src_organism("TxDb.Hsapiens.UCSC.hg38.knownGene")
)

"Ignoring remaining part of query: CREATE INDEX IF NOT EXISTS entrez_accession ON id_accession (entrez);"
"Ignoring remaining part of query: CREATE INDEX IF NOT EXISTS entrez_protein ON id_protein (entrez);"


In [210]:
# Summarise the crossing reads of one sample into merged genomic regions.
#
# Every element of `granges` is converted with annoGR2DF(), tagged with its
# label and pooled. Reads that overlap on the same chromosome are merged into
# one region; regions supported by too few reads or too few distinct crossing
# buckets are dropped; the survivors are annotated with overlapping gene
# symbols and with the reference sequence of the region.
#
# Arguments:
#   srr               Sample identifier stored in the `srr` column.
#   granges           List of ranges objects, one per crossing bucket.
#   granges_labels    Names for the elements of `granges` (bucket labels).
#   min_num_crossings Minimum number of distinct buckets a region needs.
#   min_num_reads     Minimum number of reads a region needs.
#
# Returns a data.table with the columns chr, start, stop, num_crossings,
# unique_crossings (list column), num_reads, gene_name, srr and sequence,
# sorted by decreasing num_reads. When there are no input reads, or no region
# passes the thresholds, a zero-row table with the same columns is returned.
#
# Depends on objects from the surrounding session: `src` (queried through
# transcripts()) and BSgenome.Hsapiens.UCSC.hg38 (queried through getSeq()).
summary_table <- function(srr = "number", granges, granges_labels, min_num_crossings = 3, min_num_reads = 3){
    # Zero-row result with the same columns, in the same order, as a populated one.
    empty_result <- function() {
        data.table(chr = character(), start = integer(), stop = integer(),
                   num_crossings = integer(), unique_crossings = list(),
                   num_reads = integer(), gene_name = character(),
                   srr = character(), sequence = character())
    }

    granges_df <- lapply(granges, annoGR2DF)
    names(granges_df) <- granges_labels
    merged_dt <- as.data.table(bind_rows(granges_df, .id = "crossing"))

    # Without any read there is no `chr` column to group by.
    if (nrow(merged_dt) == 0L) {
        return(empty_result())
    }

    ## Interval per chromosome
    # Reads that overlap each other (per chromosome) get the same group id.
    merged_dt[, group := {
        ir <- IRanges(start, end)
        subjectHits(findOverlaps(ir, reduce(ir)))
    }, by = chr]

    merged_final <- merged_dt[, list(start = min(start),
                                     stop = max(end),
                                     num_crossings = length(unique(crossing)),
                                     unique_crossings = list(unique(crossing)),
                                     # One row per read, so count rows instead of
                                     # relying on an optional `name` column.
                                     num_reads = .N
                                     ), by = list(group, chr)]
    merged_final <- merged_final[num_reads >= min_num_reads &
                                 num_crossings >= min_num_crossings]

    # Nothing survived the thresholds: return early instead of running the
    # per-region annotation on an empty table.
    if (nrow(merged_final) == 0L) {
        return(empty_result())
    }

    Hsapiens <- BSgenome.Hsapiens.UCSC.hg38

    # Plain typed vectors: iterating over these avoids apply(), which coerces
    # every row of the table to character.
    region_idx <- seq_len(nrow(merged_final))
    region_chr <- as.character(merged_final$chr)
    region_start <- as.integer(merged_final$start)
    region_stop <- as.integer(merged_final$stop)

    # Comma-separated unique gene symbols of transcripts overlapping each region.
    gene_names <- vapply(region_idx, function(i) {
        region <- paste0(region_chr[i], ":", region_start[i], "-", region_stop[i])
        hits <- suppressWarnings(annoGR2DF(
            transcripts(src,
                        filter = ~(GRangesFilter(GenomicRanges::GRanges(region))),
                        columns = c("symbol"))))
        toString(unique(unlist(hits$symbol)))
    }, character(1))

    # Reference sequence spanned by each region.
    sequences <- vapply(region_idx, function(i) {
        toString(getSeq(Hsapiens,
                        region_chr[i],
                        start = region_start[i],
                        end = region_stop[i]))
    }, character(1))

    # Column order matters to callers: gene_name, srr, sequence.
    merged_final$gene_name <- gene_names
    merged_final$srr <- srr
    merged_final$sequence <- sequences

    merged_final <- merged_final[, !"group"]
    merged_final[order(-num_reads)]
}

In [None]:
# Build one summary table per sample, keyed by accession, then stack them.
srrs_list <- list()

for (srr_name in srr_names) {
    srr_table <- summary_table(
        srr = srr_name,
        granges = srrs[srr_name][[1]],
        granges_labels = names(srrs[srr_name][[1]]),
        min_num_crossings = 3,
        min_num_reads = 1
    )
    # Tag the rows with the accession before storing the table.
    srr_table$srr_name <- srr_name
    srrs_list[[srr_name]] <- srr_table
}

# Combined table of all samples, without the helper `srr_name` column.
srrs_table <- do.call(rbind, srrs_list)
srrs_table <- srrs_table[, !"srr_name"]

In [None]:
# Render the combined table as styled HTML in the notebook output.
# The `sequence` column is located by name rather than by a fixed position,
# so the highlight stays on the right column if the column layout changes.
srrs_table %>%
    kable("html") %>%
    kable_styling(bootstrap_options = "striped", full_width = FALSE, position = "left") %>%
    kable_paper(full_width = FALSE) %>%
    column_spec(match("sequence", names(srrs_table)),
                width = "30em", width_max = "30em", background = "green") %>%
    as.character() %>%
    display_html()