In [9]:
## Load packages
library(readxl)
library(stringr)
library(rentrez)
library(dplyr)
library(tidyr)
library(jsonlite)

In [5]:
## Adjust PATH
wd <- getwd()
subdir <- "/bin" # should be 'scripts' if location is ./scripts/thisfile
if (endsWith(wd, subdir)) {
    wd <- str_remove(wd, subdir)
    setwd(wd)
}
getwd()

In [6]:
## Functions

#' Get list entry and use NA for missing information
#'
#' Get entries from list entries and ignore missing information (e.g. NULL)
#' as well as errors (e.g. missing columns in data.frame)
#' 
get_list_entry <- function(x) {

    x <- tryCatch(
        error = function(cnd) NA, x
    )

    if (is.null(x)) {x <- NA}

    return(x)
}

In [11]:
# Define variables

## Input files 
in_tables <- 'docs/supplementary-tables.xlsx'

## Data storage
path <- "data/phages/"
dir.create(path, recursive = TRUE)
genomes <- list(
    accession = "accession.txt",
    overview = "overview.csv",
    annotation = "annotation.gtf",
    zip = "genomes.zip",
    archive = "genomes/ncbi_dataset/data/"
)
for (i in names(genomes)) {genomes[[i]] <- paste0(path, genomes[[i]])}

"'data\phages' already exists"


In [14]:
# Read data
tables <- list()
for (i in excel_sheets(in_tables)) {
    print(paste('Reading table', i))
    tables[[i]] <- read_excel(in_tables, sheet = i)
    }

[1] "Reading table S4_phages"
[1] "Reading table S7_phage-genomes"
[1] "Reading table S1_bacteria"
[1] "Reading table S5_TS-proteins"
[1] "Reading table S6_ara-hC-transferases"
[1] "Reading table S8_glucosylation-enzymes"
[1] "Reading table S9_DNA-modification-enzymes"


In [24]:
# Select phage genomes
data <- tables$`S7_phage-genomes`

## Check for missing IDs
paste('Missing accession IDs:', any(is.na(data$Accession)))

In [27]:
## Download phage genomes

# Write accession numbers to file
writeLines(data$Accession, genomes$accession)

# Query genomes using NCBI datasets CLI
message(paste("Downloading", length(data$Accession), "virus genomes"))
cli_call <- paste0("datasets download virus genome accession")
cli_call <- paste(c(cli_call, "--inputfile", genomes$accession, "--filename", genomes$zip, 
                    "--include annotation,biosample,cds,genome,protein"), collapse = " ")
system(cli_call)

Downloading 35 virus genomes

"'CreateProcess' failed to run 'C:\Users\dieol22p\ANACON~1\envs\ARABIN~1\bin\datasets.exe download virus genome accession --inputfile data/phages/accession.txt --filename data/phages/genomes.zip --include annotation,biosample,cds,genome,protein'"


In [None]:
## Extract data
unzip(genomes$zip, exdir = str_remove(genomes$zip, ".zip"))
ncbi <- list()
for (i in list.files(genomes$archive)) {
    j <- str_split(i, "\\.")[[1]][1]
    ncbi[[j]] <- paste0(genomes$archive, i)
}
ncbi

In [None]:
## Investigate data report

# Read data report
report <- as.list(readLines(ncbi$data_report, skipNul = TRUE))
report <- lapply(report, fromJSON)

# Re-format
for (n in 1:length(report)) {
    x <- report[[n]]
    report[[n]] <- data.frame(
    accession = get_list_entry(x[["accession"]]),
    virusName = get_list_entry(x[["virus"]][["organismName"]]),
    virusClass = get_list_entry(x[["virus"]][["lineage"]][["name"]][[5]]),
    virusGenus = get_list_entry(x[["virus"]][["lineage"]][["name"]][[6]]),
    completeness = get_list_entry(x[["completeness"]]),
    geneCount = get_list_entry(x[["geneCount"]]),
    genomeSize = get_list_entry(x[["length"]]),
    geoLocation = get_list_entry(x[["location"]][["geographicLocation"]]),
    geoRegion = get_list_entry(x[["location"]][["geographicRegion"]]),
    labHost = get_list_entry(x[["labHost"]])
    )
}
report <- bind_rows(report)

# Check accession numbers
all(query$accession %in% report$accession)
all(report$accession %in% query$accession)

In [None]:
## Add additional information

# Read genomes
genome <- readDNAStringSet(ncbi$genomic)
names(genome) <- str_split(names(genome), ',',simplify=TRUE)[,1]

# Mutate report
report$key <- paste(report$accession, report$virusName)
report$genomePresent <- names(genome) %in% report$key
report$genomeIndex <- match(names(genome), report$key)

In [None]:
## Investigate report

# Dimensions
x <- table(report$genomePresent)
message(paste0("Genomes present \n", "True: ", x[[2]], ", False: ", x[[1]]))
message(paste(ncol(report), "Annotations"))

# View
rbind(head(report,3), tail(report,3))

# Check absence of NAs in important columns
message(paste("Any missing accession:", any(is.na(report$accession))))
message(paste("Any missing virusName:", any(is.na(report$virusName))))

In [None]:
## Save report
write.table(report, genomes$overview, sep = ",")

In [None]:
## Format annotation to GTF
gtf_fields <- c("accession","gene-cds-name", "gene-cds-nuc-fasta-title","gene-cds-nuc-fasta-seq-id",
                "gene-cds-nuc-fasta-range-start","gene-cds-nuc-fasta-range-stop",
                "gene-cds-protein-fasta-accession","gene-cds-protein-fasta-seq-id","gene-cds-protein-fasta-title"
               )
gtf_fields <- paste0(gtf_fields, collapse = ",")
system_call <- paste("dataformat tsv virus-annotation --fields",gtf_fields,"--inputfile", ncbi$annotation_report, ">", genomes$annotation)
message(system_call)
system(system_call)

In [None]:
sessionInfo()