In [1]:
## Load packages
library(readxl)
library(stringr)
library(rentrez)
library(dplyr)
library(tidyr)
library(jsonlite)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union


Loading required package: BiocGenerics


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:dplyr’:

    combine, intersect, setdiff, union


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min


Loading required package: S4Vectors

Loading required package: stats4


Attaching 

In [2]:
## Adjust PATH
# When the session starts inside the script sub-directory, move the working
# directory one level up so that the relative paths used below resolve.
wd <- getwd()
subdir <- "/bin" # use "/scripts" if this file lives in ./scripts/
if (endsWith(wd, subdir)) {
    # Strip only the trailing `subdir`. A pattern-based removal would treat
    # `subdir` as a regular expression and delete its first occurrence, which
    # corrupts paths such as "/home/bin/project/bin".
    wd <- substr(wd, 1L, nchar(wd) - nchar(subdir))
    # If the working directory was exactly `subdir`, fall back to the root.
    if (!nzchar(wd)) {
        wd <- "/"
    }
    setwd(wd)
}
getwd()

In [3]:
## Functions

#' Get list entry and use NA for missing information
#'
#' Evaluates `x` (typically a nested extraction such as
#' `lst[["a"]][["b"]][[3]]`) and returns its value, substituting `NA` whenever
#' the information is not available.
#'
#' The argument is evaluated lazily inside `tryCatch()`, so an error raised
#' while evaluating the expression passed as `x` (e.g. "subscript out of
#' bounds") is caught here instead of propagating to the caller.
#'
#' @param x An expression yielding the entry to extract.
#' @return The value of `x`, or `NA` if evaluating `x` raised an error or
#'   produced `NULL` or any other zero-length value.
get_list_entry <- function(x) {

    # Forcing the promise inside tryCatch() is what lets extraction errors
    # be turned into NA.
    value <- tryCatch(x, error = function(cnd) NA)

    # length(NULL) is 0, so this covers NULL as well as empty vectors, which
    # would otherwise produce zero-row columns in data.frame().
    if (length(value) == 0L) {
        return(NA)
    }

    value
}

In [4]:
# Define variables

## Input files
in_tables <- "docs/supplementary-tables.xlsx"

## Data storage
path <- "data/phages/"
# Only create the directory when it is missing, so that re-running the cell
# does not emit an "already exists" warning while real failures still warn.
if (!dir.exists(path)) {
    dir.create(path, recursive = TRUE)
}
genomes <- list(
    accession = "accession.txt",
    overview = "overview.csv",
    annotation = "annotation.gtf",
    zip = "genomes.zip",
    archive = "genomes/ncbi_dataset/data/"
)
# Prefix every entry with the storage directory (`path` ends with "/").
genomes <- lapply(genomes, function(file) paste0(path, file))

“'data/phages' already exists”


In [5]:
# Read data
# Load every sheet of the supplementary workbook into a list keyed by the
# sheet name, announcing each sheet before it is read.
sheet_names <- excel_sheets(in_tables)
tables <- lapply(sheet_names, function(sheet_name) {
    print(paste("Reading table", sheet_name))
    read_excel(in_tables, sheet = sheet_name)
})
names(tables) <- sheet_names

[1] "Reading table S4_phages"
[1] "Reading table S7_phage-genomes"
[1] "Reading table S1_bacteria"
[1] "Reading table S5_TS-proteins"
[1] "Reading table S6_ara-hC-transferases"
[1] "Reading table S8_glucosylation-enzymes"
[1] "Reading table S9_DNA-modification-enzymes"


In [6]:
# Select phage genomes
# The sheet name contains a hyphen, so it is looked up as a string.
data <- tables[["S7_phage-genomes"]]

## Check for missing IDs
# Evaluates to a single string reporting TRUE/FALSE for missing accessions.
paste("Missing accession IDs:", anyNA(data$accession))

In [7]:
## Download phage genomes

# Write accession numbers to file (one per line) as input for the CLI
writeLines(data$accession, genomes$accession)

# Query genomes using NCBI datasets CLI
message("Downloading ", length(data$accession), " virus genomes")
# File paths are shell-quoted so that spaces or shell metacharacters in the
# paths cannot break or alter the command.
cli_call <- paste(
    "datasets download virus genome accession",
    "--inputfile", shQuote(genomes$accession),
    "--filename", shQuote(genomes$zip),
    "--include annotation,biosample,cds,genome,protein"
)
# system() returns the exit status of the command; fail here rather than
# letting a missing or partial archive surface as an obscure error later.
status <- system(cli_call)
if (status != 0L) {
    stop("NCBI datasets download failed with exit status ", status,
         call. = FALSE)
}

Downloading 34 virus genomes



In [8]:
## Extract data
# Unpack into a directory named after the archive without its ".zip" suffix.
# The pattern is escaped and anchored so that only a literal trailing ".zip"
# is removed (an unescaped "." would match any character anywhere in the path).
unzip(genomes$zip, exdir = sub("\\.zip$", "", genomes$zip))

# Index the extracted files by the part of the file name before the first dot.
ncbi <- list()
for (file_name in list.files(genomes$archive)) {
    key <- sub("\\..*$", "", file_name)
    ncbi[[key]] <- paste0(genomes$archive, file_name)
}
ncbi

In [9]:
## Investigate data report

# Read data report: every non-empty line is parsed as one JSON record
report_lines <- readLines(ncbi$data_report, skipNul = TRUE)
# Blank lines cannot be parsed by fromJSON(), so drop them first.
report_lines <- report_lines[nzchar(trimws(report_lines))]
report <- lapply(report_lines, fromJSON)

# Re-format: one data.frame per record; fields that are absent become NA.
# lapply() is used instead of `1:length(report)`, which would iterate over
# c(1, 0) and fail on an empty report.
report <- lapply(report, function(x) {
    data.frame(
        accession = get_list_entry(x[["accession"]]),
        virusName = get_list_entry(x[["virus"]][["organismName"]]),
        # Lineage entries are picked by fixed position (5th and 6th name).
        # NOTE(review): the 6th entry is stored as `virusGenus` but the values
        # shown in the report output look like family names - confirm naming.
        virusClass = get_list_entry(x[["virus"]][["lineage"]][["name"]][[5]]),
        virusGenus = get_list_entry(x[["virus"]][["lineage"]][["name"]][[6]]),
        completeness = get_list_entry(x[["completeness"]]),
        geneCount = get_list_entry(x[["geneCount"]]),
        genomeSize = get_list_entry(x[["length"]]),
        geoLocation = get_list_entry(x[["location"]][["geographicLocation"]]),
        geoRegion = get_list_entry(x[["location"]][["geographicRegion"]]),
        labHost = get_list_entry(x[["labHost"]])
    )
})
report <- bind_rows(report)

# Check accession numbers: requested vs. returned, in both directions
all(data$accession %in% report$accession)
all(report$accession %in% data$accession)

In [10]:
# Display the re-formatted data report
report

accession,virusName,virusClass,virusGenus,completeness,geneCount,genomeSize,geoLocation,geoRegion,labHost
<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>
AF158101.6,Escherichia phage T4,Caudoviricetes,Straboviridae,COMPLETE,278,168903,,,
MK639187.1,Shigella phage SSE1,Caudoviricetes,Straboviridae,COMPLETE,266,169744,China,Asia,Shigella dysenteriae
MK962754.1,Shigella phage JK36,Caudoviricetes,Straboviridae,COMPLETE,280,168893,Ireland,Europe,Escherichia coli
MN334766.1,Serratia phage PCH45,Caudoviricetes,unclassified Caudoviricetes,COMPLETE,225,212807,New Zealand: Dunedin,Oceania,
MN505213.1,Serratia phage JS26,Caudoviricetes,Casjensviridae,COMPLETE,84,63971,,,
MN662249.1,Acinetobacter phage Stupor,Caudoviricetes,Straboviridae,COMPLETE,245,164309,Russia,Europe,
MN850579.1,Escherichia phage mogra,Caudoviricetes,Straboviridae,COMPLETE,266,168724,Denmark,Europe,
MN850590.1,Escherichia phage moha,Caudoviricetes,Straboviridae,COMPLETE,270,168676,Denmark,Europe,
MN850622.1,Escherichia phage mobillu,Caudoviricetes,Straboviridae,COMPLETE,260,163063,Denmark,Europe,
MT385367.1,Acinetobacter phage Abraxas,Caudoviricetes,Straboviridae,COMPLETE,249,166559,Russia,Europe,


In [11]:
## Add additional information

# Read genomes
genome <- readDNAStringSet(ncbi$genomic)
# Keep only the part of each sequence name before the first comma
names(genome) <- str_split(names(genome), ",", simplify = TRUE)[, 1]

# Mutate report
# The key is "<accession> <virusName>", compared against the trimmed
# sequence names above.
report$key <- paste(report$accession, report$virusName)
# Both columns describe report rows, so the lookup must run from the report
# keys into the genome names. The reverse orientation yields one value per
# genome, which is misaligned with the report rows whenever the order or the
# number of genomes differs from the report.
report$genomePresent <- report$key %in% names(genome)
report$genomeIndex <- match(report$key, names(genome))

In [12]:
## Investigate report
# Flag, for every requested accession, whether it appears in the report.
# The variable keeps the name `x` because table() uses it as the header.
x <- is.element(data$accession, report$accession)
table(x)
# List requested accessions that are absent from the report
data$accession[which(!x)]

x
TRUE 
  34 

In [13]:
## Investigate report

# Dimensions
# Count directly instead of indexing table() by position: when every genome
# is present (or every genome is missing) the table has a single cell, so
# positional access either errors or reports the count under the wrong label.
n_present <- sum(report$genomePresent, na.rm = TRUE)
n_absent <- sum(!report$genomePresent, na.rm = TRUE)
message(paste0("Genomes present \n", "True: ", n_present, ", False: ", n_absent))
message(paste(ncol(report), "Annotations"))

# View first and last three rows
rbind(head(report, 3), tail(report, 3))

# Check absence of NAs in important columns
message(paste("Any missing accession:", any(is.na(report$accession))))
message(paste("Any missing virusName:", any(is.na(report$virusName))))

Genomes present 
True: 30, False: 4

13 Annotations



Unnamed: 0_level_0,accession,virusName,virusClass,virusGenus,completeness,geneCount,genomeSize,geoLocation,geoRegion,labHost,key,genomePresent,genomeIndex
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<lgl>,<int>
1,AF158101.6,Escherichia phage T4,Caudoviricetes,Straboviridae,COMPLETE,278,168903,,,,AF158101.6 Escherichia phage T4,True,1
2,MK639187.1,Shigella phage SSE1,Caudoviricetes,Straboviridae,COMPLETE,266,169744,China,Asia,Shigella dysenteriae,MK639187.1 Shigella phage SSE1,True,2
3,MK962754.1,Shigella phage JK36,Caudoviricetes,Straboviridae,COMPLETE,280,168893,Ireland,Europe,Escherichia coli,MK962754.1 Shigella phage JK36,True,3
32,OP617331.1,Serratia phage vB_SspM_LC53,Caudoviricetes,Straboviridae,COMPLETE,274,172075,New Zealand: Dunedin,Oceania,,OP617331.1 Serratia phage vB_SspM_LC53,True,32
33,OQ703618.1,Escherichia phage GADS24,Caudoviricetes,Straboviridae,COMPLETE,265,168896,Saudi Arabia,Asia,,OQ703618.1 Escherichia phage GADS24,True,33
34,OR088902.1,Serratia phage 92A1,Caudoviricetes,Straboviridae,COMPLETE,300,174432,,,,OR088902.1 Serratia phage 92A1,True,34


Any missing accession: FALSE

Any missing virusName: FALSE



In [17]:
## Save report
# Writes the report as comma-separated text including row names.
# NOTE(review): with these defaults the header line has no field for the
# row-name column, so it is one field shorter than the data lines - confirm
# that downstream readers expect this layout.
write.table(
    x = report,
    file = genomes$overview,
    sep = ","
)

In [18]:
## Format annotation to GTF
# NOTE(review): `dataformat` is invoked in `tsv` mode, so the output file
# holds tab-separated values although its name ends in ".gtf".
gtf_fields <- c(
    "accession", "gene-cds-name", "gene-cds-nuc-fasta-title",
    "gene-cds-nuc-fasta-seq-id",
    "gene-cds-nuc-fasta-range-start", "gene-cds-nuc-fasta-range-stop",
    "gene-cds-protein-fasta-accession", "gene-cds-protein-fasta-seq-id",
    "gene-cds-protein-fasta-title"
)
gtf_fields <- paste0(gtf_fields, collapse = ",")
# File paths are shell-quoted so that spaces or shell metacharacters in the
# paths cannot break the command or the output redirection.
system_call <- paste(
    "dataformat tsv virus-annotation",
    "--fields", gtf_fields,
    "--inputfile", shQuote(ncbi$annotation_report),
    ">", shQuote(genomes$annotation)
)
message(system_call)
# system() returns the exit status; stop instead of silently leaving an
# empty or truncated annotation file behind.
status <- system(system_call)
if (status != 0L) {
    stop("dataformat failed with exit status ", status, call. = FALSE)
}

dataformat tsv virus-annotation --fields accession,gene-cds-name,gene-cds-nuc-fasta-title,gene-cds-nuc-fasta-seq-id,gene-cds-nuc-fasta-range-start,gene-cds-nuc-fasta-range-stop,gene-cds-protein-fasta-accession,gene-cds-protein-fasta-seq-id,gene-cds-protein-fasta-title --inputfile data/phages/genomes/ncbi_dataset/data/annotation_report.jsonl > data/phages/annotation.gtf



In [19]:
# Record the R version, platform and attached packages of this session
sessionInfo()

R version 4.3.3 (2024-02-29)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Rocky Linux 9.2 (Blue Onyx)

Matrix products: default
BLAS/LAPACK: /home/dieol22p/miniconda3/envs/arabinosylation-anti-crispr/lib/libopenblasp-r0.3.28.so;  LAPACK version 3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Pacific/Auckland
tzcode source: system (glibc)

attached base packages:
[1] stats4    stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] Biostrings_2.70.1   GenomeInfoDb_1.38.1 XVector_0.42.0     
 [4] IRanges_2.36.0      S4Vectors_0.40.2    BiocGenerics_0.48.1
 [7] jsonlite_1.8.9      tidyr_1.3.1         