Commit 6a1f545: version 0.10.6
Scott Sherrill-Mix authored and cran-robot committed Dec 16, 2023
1 parent a81e0d3 commit 6a1f545
Showing 14 changed files with 198 additions and 70 deletions.
14 changes: 7 additions & 7 deletions DESCRIPTION
@@ -1,23 +1,23 @@
 Package: taxonomizr
-Maintainer: Scott Sherrill-Mix <shescott@upenn.edu>
+Maintainer: Scott Sherrill-Mix <ssm@msu.edu>
 License: GPL (>= 2) | file LICENSE
 Title: Functions to Work with NCBI Accessions and Taxonomy
 Type: Package
 LazyLoad: yes
 Author: Scott Sherrill-Mix [aut, cre]
 Authors@R: c(person("Scott", "Sherrill-Mix", role = c("aut", "cre"),
-    email = "shescott@upenn.edu"))
+    email = "ssm@msu.edu"))
 BugReports: https://github.com/sherrillmix/taxonomizr/issues
 Description: Functions for assigning taxonomy to NCBI accession numbers and taxon IDs based on NCBI's accession2taxid and taxdump files. This package allows the user to download NCBI data dumps and create a local database for fast and local taxonomic assignment.
-Version: 0.10.2
-Date: 2023-01-31
+Version: 0.10.6
+Date: 2023-12-15
 Suggests: testthat, knitr, rmarkdown
 Depends: R (>= 3.0.0)
 Imports: RSQLite, R.utils, data.table, curl (>= 5.0.0)
 Encoding: UTF-8
-RoxygenNote: 7.2.1
 VignetteBuilder: knitr
+RoxygenNote: 7.2.3
 NeedsCompilation: yes
-Packaged: 2023-01-31 15:39:24 UTC; scott
+Packaged: 2023-12-15 14:53:02 UTC; scott
 Repository: CRAN
-Date/Publication: 2023-01-31 17:20:02 UTC
+Date/Publication: 2023-12-15 21:50:02 UTC
26 changes: 13 additions & 13 deletions MD5
@@ -1,42 +1,42 @@
-3d825676dc902e4f70908698b113fdd3 *DESCRIPTION
+c399d242e4750b526ae148923243aee1 *DESCRIPTION
 b234ee4d69f5fce4486a80fdaf4a4263 *LICENSE
 3a83463b2cd3aedc428c8903411180da *NAMESPACE
-9abbbb65d0dd89cd7daa858be7ebef3f *R/taxa.R
-dca186a14d1cc35dea7f8ea3ec1080bb *README.md
-acd055c71e59076b6e68c53f9796abac *build/vignette.rds
+36bfe80431c61d354b77372c25511cae *R/taxa.R
+6f2f4174ea66a7b5a38536eb577d6dc2 *README.md
+1c7313ff57202a366d27202e53c930e0 *build/vignette.rds
 229358c617531cd15b11bed9c1154021 *inst/doc/usage.R
-b93379f32724abf91dd4ba6de98f7ca9 *inst/doc/usage.Rmd
-080c9b763f744c356bf2ca855b5670a6 *inst/doc/usage.html
+01797b8e4445dd523476138ba8105e60 *inst/doc/usage.Rmd
+57f9b12089c57ef5b2844e90c7fbb415 *inst/doc/usage.html
 2c6938a3b6c0303311c55f0e328ff746 *inst/testdata/fakeNamesNodes.tar.gz
 6f928eac975decb053a812bf58c4a3c1 *man/accessionToTaxa.Rd
 fc39f17d296a65ca038d98b5b3a69d28 *man/condenseTaxa.Rd
-a1a6b3e0bed93b12a0548972f8cfa6a7 *man/getAccession2taxid.Rd
+d89dee9a58941c615c5110824d9b639d *man/getAccession2taxid.Rd
 c43ce37f96837e90311d5cee78c678e8 *man/getAccessions.Rd
 aec3d9c390611bae680e4f63f097dd5b *man/getCommon.Rd
 6e4096c392df47fe7026d08fdd10a9f3 *man/getDescendants.Rd
 e8e9c3495ab479bf781e676b888e4308 *man/getId.Rd
 f540636f46822a6c1447da68507bdd93 *man/getId2.Rd
-bfc067a38ec4cb9a0c4f9c733b3bbcdd *man/getNamesAndNodes.Rd
+9954bf6c88730b34c53be6b95ff7282f *man/getNamesAndNodes.Rd
 ef633603d85a981a74348b694673eaa1 *man/getRawTaxonomy.Rd
 0a170ee794b9879f60d055ff642d5268 *man/getTaxonomy.Rd
 939e0aeca4cdde8f33aa0604be8f61a5 *man/getTaxonomy2.Rd
 8420a5306860a41886c0da14e1bf47b0 *man/lastNotNa.Rd
 5e6d614d7a1e327ce0644ec86139c190 *man/makeNewick.Rd
 c35329a2ac1d0fe5ef11fd893165f33c *man/normalizeTaxa.Rd
-3857a6f1ac3435a4029d26f0bed29313 *man/prepareDatabase.Rd
+8b6c9102c1b74c79e3448ba3c632e985 *man/prepareDatabase.Rd
 c9a16d46548f76fc3419089efaf2f60e *man/read.accession2taxid.Rd
 a13a799708b180b7ac44cc8b82706f73 *man/read.names.Rd
 b8acad587cc16040aefe6a463d0b6638 *man/read.names.sql.Rd
 6b4ff51c8ec932c67904cff0c8db0eaf *man/read.nodes.Rd
 07cb545a316213b200ccd2350c108532 *man/read.nodes.sql.Rd
-39c859e4c831e66d8a2254c7fc31d5eb *man/resumableDownload.Rd
+b1bbcbf5b0ee230580b0e7169b6d3305 *man/resumableDownload.Rd
 1027698cdbf5027647038674dcb2103e *man/streamingRead.Rd
-d2ff52d475e7882a0c26b2533a6ffe1f *man/taxonomizr-package.Rd
+2d32fc94c19fa281c72c98bd2268e718 *man/taxonomizr-package.Rd
 fa6879ee01dac9fd5708c4f29dae6431 *man/taxonomizrSwitch.Rd
 378caa88f6c8dd64326f2a35e247f271 *man/topoSort.Rd
 d47fc7ed590032dbfa141b6952d998d3 *man/trimTaxa.Rd
 c6f8208857cc84cc3694a2f4a566745f *src/taxaTrim.c
 1b509a76cc18244a3847a2ab27ab9f77 *src/taxonomizr-init.c
 09570ff5fc4fe1c8f81b47295a3faafb *tests/testthat.R
-fd91eff2cdf264fa4338cd0fd563014b *tests/testthat/test_taxa.R
-b93379f32724abf91dd4ba6de98f7ca9 *vignettes/usage.Rmd
+63ce5b91f14a8a4adedb17057dab169d *tests/testthat/test_taxa.R
+01797b8e4445dd523476138ba8105e60 *vignettes/usage.Rmd
45 changes: 31 additions & 14 deletions R/taxa.R
@@ -468,13 +468,14 @@ getParentNodes<-function(ids,sqlFile='nameNode.sqlite',getDescendants=FALSE){
 checkDownloadMd5<-function(url,file,errorIfNoMd5=FALSE){
   md5<-sprintf('%s.md5',url)
   tmp<-tempfile()
-  check<-tryCatch(curl::curl_download(md5,tmp,mode='wb',quiet=FALSE),warning=function(xx)FALSE,error=function(xx)FALSE)
+  check<-tryCatch(curl::curl_download(md5,tmp,mode='wb',quiet=TRUE),warning=function(xx)FALSE,error=function(xx)FALSE)
   if(check==FALSE){
     if(errorIfNoMd5)stop("Problem downloading md5 ",md5)
     else return(list('result'=TRUE,'remote'=as.character(NA),'local'=as.character(NA)))
   }
   hash<-strsplit(readLines(tmp),' ')[[1]][1]
-  return(hash==tools::md5sum(file))
+  localHash<-tools::md5sum(file)
+  return(list('result'=unname(hash==localHash),'remote'=hash,'local'=unname(localHash)))
 }
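
For context, a minimal sketch of how the reworked return value might be consumed. `checkDownloadMd5` appears to be an internal helper (hence `:::`), and the URL and file name here are illustrative:

```r
library(taxonomizr)
# Illustrative inputs: a URL with a matching .md5 file plus a local copy
url <- 'https://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz'
file <- 'taxdump.tar.gz'
# The helper now returns a list rather than a bare logical
check <- taxonomizr:::checkDownloadMd5(url, file)
if (!check[['result']]) {
  # Both hashes are now exposed, which makes network problems easier to debug
  message('MD5 mismatch. Local: ', check[['local']],
          ' Remote: ', check[['remote']])
}
```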


@@ -810,12 +811,13 @@ accessionToTaxa<-function(accessions,sqlFile,version=c('version','base')){
   tmpDb <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=tmp)
   on.exit(if(file.exists(tmp))file.remove(tmp))
   on.exit(RSQLite::dbDisconnect(tmpDb),add=TRUE)
-  RSQLite::dbWriteTable(tmpDb,'query',data.frame('accession'=accessions,stringsAsFactors=FALSE),overwrite=TRUE)
+  RSQLite::dbWriteTable(tmpDb,'query',data.frame('accession'=as.character(accessions),stringsAsFactors=FALSE),overwrite=TRUE)
   #load the big sql
   db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=sqlFile)
   on.exit(RSQLite::dbDisconnect(db),add=TRUE)
   #attach the temp table
   RSQLite::dbExecute(db, sprintf("ATTACH '%s' AS tmp",tmp))
+  #hangs on next if accessions are numeric
   taxaDf<-RSQLite::dbGetQuery(db,sprintf('SELECT tmp.query.accession, taxa FROM tmp.query LEFT OUTER JOIN accessionTaxa ON tmp.query.accession=accessionTaxa.%s',version))
   RSQLite::dbExecute(db,'DROP TABLE tmp.query')
   RSQLite::dbExecute(db,'DETACH tmp')
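
A short sketch of the numeric-input pitfall this hunk addresses, assuming an `accessionTaxa.sql` database already built with `prepareDatabase()`; the accession values are illustrative:

```r
library(taxonomizr)
# Character accessions are the normal case:
accessionToTaxa(c('LN847353.1', 'AL079352.3'), 'accessionTaxa.sql')
# Numeric input previously hung the SQLite join; it is now coerced with
# as.character() before being written to the temporary query table:
accessionToTaxa(c(123456, 654321), 'accessionTaxa.sql')
```
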
@@ -884,6 +886,7 @@ condenseTaxa<-function(taxaTable,groupings=rep(1,nrow(taxaTable))){
 #' @param url the url where taxdump.tar.gz is located
 #' @param fileNames the filenames desired from the tar.gz file
 #' @param protocol the protocol to be used for downloading. Probably either \code{'http'} or \code{'ftp'}. Overridden if \code{url} is provided directly
+#' @param resume if TRUE attempt to resume downloading an interrupted file without starting over from the beginning
 #' @return a vector of file path strings of the locations of the output files
 #' @seealso \code{\link{read.nodes.sql}}, \code{\link{read.names.sql}}
 #' @references \url{https://ftp.ncbi.nih.gov/pub/taxonomy/}, \url{https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/}
@@ -892,7 +895,7 @@ condenseTaxa<-function(taxaTable,groupings=rep(1,nrow(taxaTable))){
 #' \dontrun{
 #' getNamesAndNodes()
 #' }
-getNamesAndNodes<-function(outDir='.',url=sprintf('%s://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',protocol),fileNames=c('names.dmp','nodes.dmp'),protocol='ftp'){
+getNamesAndNodes<-function(outDir='.',url=sprintf('%s://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',protocol),fileNames=c('names.dmp','nodes.dmp'),protocol='ftp',resume=TRUE){
   outFiles<-file.path(outDir,fileNames)
   if(all(file.exists(outFiles))){
     message(paste(outFiles,collapse=', '),' already exist. Delete to redownload')
@@ -902,8 +905,9 @@ getNamesAndNodes<-function(outDir='.',url=sprintf('%s://ftp.ncbi.nih.gov/pub/tax
   tmpDir<-tempfile()
   dir.create(tmpDir)
   tarFile<-file.path(tempdir(),base)
-  resumableDownload(url,tarFile,quiet=FALSE)
-  if(!checkDownloadMd5(url,tarFile))stop('Downloaded file does not match ',url,' File corrupted or download ended early?')
+  resumableDownload(url,tarFile,resume=resume)
+  check<-checkDownloadMd5(url,tarFile)
+  if(!check[['result']])stop('Downloaded file does not match ',url,' File corrupted or download ended early?\nLocal: ',check[['local']],'\nRemote: ',check[['remote']])
   utils::untar(tarFile,fileNames,exdir=tmpDir,tar='internal')
   tmpFiles<-file.path(tmpDir,fileNames)
   if(!all(file.exists(tmpFiles)))stop("Problem finding files ",paste(tmpFiles[!file.exists(tmpFiles)],collapse=', '))
@@ -921,6 +925,7 @@ getNamesAndNodes<-function(outDir='.',url=sprintf('%s://ftp.ncbi.nih.gov/pub/tax
 #' @param baseUrl the url of the directory where accession2taxid.gz files are located
 #' @param types the types if accession2taxid.gz files desired where type is the prefix of xxx.accession2taxid.gz. The default is to download all nucl_ accessions. For protein accessions, try \code{types=c('prot')}.
 #' @param protocol the protocol to be used for downloading. Probably either \code{'http'} or \code{'ftp'}. Overridden if \code{baseUrl} is provided directly
+#' @param resume if TRUE attempt to resume downloading an interrupted file without starting over from the beginning
 #' @return a vector of file path strings of the locations of the output files
 #' @seealso \code{\link{read.accession2taxid}}
 #' @references \url{https://ftp.ncbi.nih.gov/pub/taxonomy/}, \url{https://www.ncbi.nlm.nih.gov/genbank/acc_prefix/}
@@ -935,19 +940,20 @@ getNamesAndNodes<-function(outDir='.',url=sprintf('%s://ftp.ncbi.nih.gov/pub/tax
 #'
 #' getAccession2taxid()
 #' }
-getAccession2taxid<-function(outDir='.',baseUrl=sprintf('%s://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',protocol),types=c('nucl_gb','nucl_wgs'),protocol='ftp'){
-  message('This can be a big (several gigabytes) download. Please be patient and use a fast connection.')
+getAccession2taxid<-function(outDir='.',baseUrl=sprintf('%s://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',protocol),types=c('nucl_gb','nucl_wgs'),protocol='ftp',resume=TRUE){
   fileNames<-sprintf('%s.accession2taxid.gz',types)
   outFiles<-file.path(outDir,fileNames)
   if(all(file.exists(outFiles))){
     message(paste(outFiles,collapse=', '),' already exist. Delete to redownload')
     return(outFiles)
   }
+  message('This can be a big (several gigabytes) download. Please be patient and use a fast connection.')
   if(!substring(baseUrl,nchar(baseUrl)) %in% c('/','\\'))baseUrl<-sprintf('%s/',baseUrl)
   urls<-paste(baseUrl,fileNames,sep='')
   mapply(function(xx,yy){
-    resumableDownload(xx,yy)
-    if(!checkDownloadMd5(xx,yy))stop('Downloaded file does not match ',xx,' File corrupted or download ended early?')
+    resumableDownload(xx,yy,resume=resume)
+    check<-checkDownloadMd5(xx,yy)
+    if(!check[['result']])stop('Downloaded file does not match ',xx,' File corrupted or download ended early?\nLocal: ',check[['local']],'\nRemote: ',check[['remote']])
   },urls,outFiles)
   return(outFiles)
 }
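
A usage sketch for the new `resume` argument on the download functions (output locations illustrative):

```r
library(taxonomizr)
# With the default resume=TRUE, an interrupted download leaves a partial
# temp file and rerunning the same call picks up where it left off:
getNamesAndNodes(outDir = '.', resume = TRUE)
getAccession2taxid(outDir = '.', types = 'nucl_gb', resume = TRUE)
# Set resume=FALSE to discard any partial file and start from scratch:
# getNamesAndNodes(resume = FALSE)
```
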
@@ -1374,25 +1380,36 @@ topoSort<-function(vectors,maxIter=1000,errorIfAmbiguous=FALSE){
 #' @param quiet If TRUE show the progress reported by \code{multi_download}
 #' @param resume If TRUE try to resume interrupted downloads using intermediate file \code{tmpFile}. Otherwise delete \code{tempFile} on error
 #' @param ... Additional arguments to \code{multi_download}
-#' @return invisibly return the output frmo multi_download
+#' @return invisibly return the output from multi_download
 #' @seealso \code{\link[curl]{multi_download}}
 #' @examples
 #' \dontrun{
 #' url<-'https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.1.gz'
 #' resumableDownload(url,'downloadedFile.gz')
 #' }
 resumableDownload<-function(url,outFile=basename(url),tmpFile=sprintf('%s.__TMP__',outFile),quiet=FALSE,resume=TRUE,...){
+  minTmpFileSize<-10000
   if(!resume) on.exit(unlink(tmpFile))
   out<-curl::multi_download(url,tmpFile,progress=!quiet,resume=resume,...)
+  if(out$status_code >399){ #could also use not %in% c(0,200,206) here but assuming outside 200/300 range = error
+    out$success<-FALSE
+    if(!'error' %in% colnames(out) || is.na(out$error))out$error<-sprintf('Error status code %d returned',out$status_code)
+  }
   if(is.na(out$success)||!out$success){
     if(length(out$error)>0&&!is.na(out$error))extraError<-sprintf(' with error: "%s"',out$error)
     else extraError<-''
-    if(resume&&file.exists(tmpFile)&&file.size(tmpFile)>0){
-      extraError<-sprintf('%s. Progress is saved in %s and continued download can be attempted by repeating the previous command.\nDelete %s or set resume=FALSE to start from scratch',extraError,tmpFile,tmpFile)
+    if(resume&&file.exists(tmpFile)){
+      if(file.size(tmpFile)>minTmpFileSize){
+        extraError<-sprintf('%s. Progress is saved in %s and continued download can be attempted by repeating the previous command.\nDelete %s or set resume=FALSE to start from scratch',extraError,tmpFile,tmpFile)
+      }else{
+        #too small to be useful so clear
+        unlink(tmpFile)
+      }
     }
     stop('Download failed',extraError,'.')
   }
   file.rename(tmpFile,outFile)
+  if(!quiet)message('Downloaded file: ',out$url,'\nModified: ',out$modified,'\nStatus: ',out$status_code)
   invisible(out)
 }
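
A sketch of calling the reworked function directly, mirroring the example in its docs; the output file name is arbitrary:

```r
library(taxonomizr)
url <- 'https://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/prot.accession2taxid.FULL.1.gz'
# Progress accumulates in 'downloadedFile.gz.__TMP__'. If the transfer drops,
# repeating the call resumes from that partial file, except when it is under
# the new 10 kb threshold, in which case it is deleted as not worth keeping.
resumableDownload(url, 'downloadedFile.gz', resume = TRUE)
```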

34 changes: 23 additions & 11 deletions README.md
@@ -1,18 +1,8 @@
 # Convert accession numbers to taxonomy
 
-[![Build Status](https://travis-ci.org/sherrillmix/taxonomizr.svg?branch=master)](https://travis-ci.org/sherrillmix/taxonomizr)
 [![codecov](https://codecov.io/gh/sherrillmix/taxonomizr/branch/master/graph/badge.svg)](https://app.codecov.io/gh/sherrillmix/taxonomizr)
 [![CRAN_Status_Badge](http://www.r-pkg.org/badges/version/taxonomizr)](https://cran.r-project.org/package=taxonomizr)
 
-## Note: NCBI Name changes in early 2023
-Please note that the [NCBI is planning to change their naming of several major prokaryote phylums](https://ncbiinsights.ncbi.nlm.nih.gov/2022/11/14/prokaryotic-phylum-name-changes/) e.g. [Firmicutes will become Bacillota](https://ftp.ncbi.nih.gov/pub/taxonomy/Major_phylum_updates_for_prokaryotes_2023.txt). The exact date that this transition will percolate into the taxonomy downloads used for this package is not precisely defined but it seems likely to be sometime early in 2023.
-
-Please watch out for any problems that could arise. For example:
-* names of assigned taxonomy may shift after updating a database to a post-change version
-* comparisons of old analyses performed pre-change to new analyses performed post-change will need to be done with care
-
-If I understand things correctly, then the actual taxonomy ID will not change so it might be wise to retain the taxonomy ID for all analyses. Then on final analysis, the taxonomic names can be assigned based on whatever naming scheme is in use at that time.
-
 ## Introduction
 
 `taxonomizr` provides some simple functions to parse NCBI taxonomy files and accession dumps and efficiently use them to assign [taxonomy](https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/) to accession numbers or taxonomic IDs. This is useful for example to assign taxonomy to BLAST results. This is all done locally after downloading the appropriate files from NCBI using included functions (see [below](#preparation)).
@@ -408,7 +398,7 @@ taxonomizr::read.names.sql('names.dmp','nameNode.sqlite',overwrite=TRUE)
 ```
 
 
-### Condensing taxonomy
+### Condensing taxonomy a.k.a. lowest common ancestor LCA
 You can use the `condenseTaxa` function to find the agreements among taxonomic hits. For example to condense the taxonomy from the previous section to the lowest taxonomic rank shared by all three taxa:
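
As a self-contained sketch of that idea (taxonomy values illustrative), `condenseTaxa` reduces a set of lineages to their lowest common ancestor:

```r
library(taxonomizr)
# One row per hit, columns ordered from broadest to narrowest rank:
taxa <- matrix(c(
    'Eukaryota', 'Chordata', 'Mammalia',    'Homo sapiens',
    'Eukaryota', 'Chordata', 'Mammalia',    'Mus musculus',
    'Eukaryota', 'Chordata', 'Actinopteri', 'Danio rerio'
  ), nrow = 3, byrow = TRUE,
  dimnames = list(NULL, c('superkingdom', 'phylum', 'class', 'species')))
# All three agree down to phylum, so ranks below Chordata come back NA:
condenseTaxa(taxa)
```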


@@ -667,9 +657,31 @@ makeNewick(taxa,excludeTerminalNAs=TRUE)
 
 Note that taxa may be the most specific taxon for a given taxa in the taxonomy matrix but will not be a leaf in the resulting tree if it appears in other taxonomy e.g. Chordata in this example.
 
+## Note: NCBI name changes in early 2023
+
+Please note that the [NCBI change their naming of several major prokaryote phylums](https://ncbiinsights.ncbi.nlm.nih.gov/2022/11/14/prokaryotic-phylum-name-changes/) e.g. [Firmicutes became Bacillota](https://ftp.ncbi.nih.gov/pub/taxonomy/Major_taxonomic_updates_2023.txt) in early 2023.
+Please watch out for any problems that could arise. For example:
+* names of assigned taxonomy may shift after updating a database to a post-change version
+* comparisons of old analyses performed pre-change to new analyses performed post-change will need to be done with care
+
+If I understand things correctly, then the actual taxonomy ID will not change so it might be wise to retain the taxonomy ID for all analyses. Then on final analysis, the taxonomic names can be assigned based on whatever naming scheme is in use at that time.
+
+
+
 ## Changelog
 
+### v0.10.5
+* Catch 404 errors and report as errors
+* Add resume argument to download functions
+* Don't retain temp files for downloads if less than 10kb
+* README touchups
+
+### v0.10.4
+* Minor improvement to output md5 and modification date for downloads to aid in debugging network issues
+
+### v0.10.3
+* Minor fix to prevent `accessionToTaxa` from hanging when given numeric inputs
+
 ### v0.10.2
 * Behind the scenes switch to `multi_download` function from `curl` package to allow download resumption on interrupted downloads. This adds a dependency that `curl` package be >=5.0.0.
 * Add `protocol` option to choose between FTP and HTTP protocols for downloading. The two protocols should perform similarly and the relative speeds of NCBI's ftp and http servers seem to vary so probably not a whole lot of reason to choose one over the other unless a firewall is blocking FTP ports.
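
A hedged sketch of the taxid-retention workflow suggested in the note above, assuming an existing `accessionTaxa.sql` database; accession values and file names are illustrative:

```r
library(taxonomizr)
# Store the stable numeric taxonomy IDs alongside analysis results:
taxaId <- accessionToTaxa(c('LN847353.1', 'AL079352.3'), 'accessionTaxa.sql')
saveRDS(taxaId, 'analysisTaxaIds.rds')
# At final reporting, resolve names under whatever scheme the current
# NCBI dump uses (e.g. Bacillota rather than Firmicutes):
getTaxonomy(readRDS('analysisTaxaIds.rds'), 'accessionTaxa.sql')
```
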
Binary file modified build/vignette.rds
24 changes: 23 additions & 1 deletion inst/doc/usage.Rmd
@@ -384,7 +384,7 @@ taxonomizr::read.names.sql('names.dmp','nameNode.sqlite',overwrite=TRUE)
 ```
 
 
-### Condensing taxonomy
+### Condensing taxonomy a.k.a. lowest common ancestor LCA
 You can use the `condenseTaxa` function to find the agreements among taxonomic hits. For example to condense the taxonomy from the previous section to the lowest taxonomic rank shared by all three taxa:
 
 ```{r,eval=FALSE}
@@ -599,9 +599,31 @@ makeNewick(taxa,excludeTerminalNAs=TRUE)
 
 Note that taxa may be the most specific taxon for a given taxa in the taxonomy matrix but will not be a leaf in the resulting tree if it appears in other taxonomy e.g. Chordata in this example.
 
+## Note: NCBI name changes in early 2023
+
+Please note that the [NCBI change their naming of several major prokaryote phylums](https://ncbiinsights.ncbi.nlm.nih.gov/2022/11/14/prokaryotic-phylum-name-changes/) e.g. [Firmicutes became Bacillota](https://ftp.ncbi.nih.gov/pub/taxonomy/Major_taxonomic_updates_2023.txt) in early 2023.
+Please watch out for any problems that could arise. For example:
+* names of assigned taxonomy may shift after updating a database to a post-change version
+* comparisons of old analyses performed pre-change to new analyses performed post-change will need to be done with care
+
+If I understand things correctly, then the actual taxonomy ID will not change so it might be wise to retain the taxonomy ID for all analyses. Then on final analysis, the taxonomic names can be assigned based on whatever naming scheme is in use at that time.
+
+
+
 ## Changelog
 
+### v0.10.5
+* Catch 404 errors and report as errors
+* Add resume argument to download functions
+* Don't retain temp files for downloads if less than 10kb
+* README touchups
+
+### v0.10.4
+* Minor improvement to output md5 and modification date for downloads to aid in debugging network issues
+
+### v0.10.3
+* Minor fix to prevent `accessionToTaxa` from hanging when given numeric inputs
+
 ### v0.10.2
 * Behind the scenes switch to `multi_download` function from `curl` package to allow download resumption on interrupted downloads. This adds a dependency that `curl` package be >=5.0.0.
 * Add `protocol` option to choose between FTP and HTTP protocols for downloading. The two protocols should perform similarly and the relative speeds of NCBI's ftp and http servers seem to vary so probably not a whole lot of reason to choose one over the other unless a firewall is blocking FTP ports.
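
For the `protocol` option, a brief sketch; whether `prepareDatabase` also forwards the argument is an assumption based on this changelog note:

```r
library(taxonomizr)
# The default protocol is ftp; http may help where a firewall blocks FTP ports:
getNamesAndNodes(protocol = 'http')
getAccession2taxid(types = c('nucl_gb', 'nucl_wgs'), protocol = 'http')
# Assumed, per the changelog, to be available from the top-level helper too:
# prepareDatabase('accessionTaxa.sql', protocol = 'http')
```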
