version 0.6.0

cran · Feb 8, 2021 · 5742b9d · 5742b9d
1 parent 126bf23
commit 5742b9d
Show file tree

Hide file tree

Showing 28 changed files with 1,005 additions and 745 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,14 +9,14 @@ Authors@R: c(person("Scott", "Sherrill-Mix", role = c("aut", "cre"),
                      email = "shescott@upenn.edu"))
 BugReports: https://github.com/sherrillmix/taxonomizr/issues
 Description: Functions for assigning taxonomy to NCBI accession numbers and taxon IDs based on NCBI's accession2taxid and taxdump files. This package allows the user to downloads NCBI data dumps and create a local database for fast and local taxonomic assignment.
-Version: 0.5.3
-Date: 2019-04-22
+Version: 0.6.0
+Date: 2021-02-08
 Suggests: testthat, knitr, rmarkdown
 Depends: R (>= 3.0.0)
 Imports: RSQLite, R.utils, data.table
-RoxygenNote: 6.1.1
+RoxygenNote: 7.1.1
 VignetteBuilder: knitr
 NeedsCompilation: yes
-Packaged: 2019-04-22 14:48:47 UTC; scott
+Packaged: 2021-02-08 22:11:23 UTC; scott
 Repository: CRAN
-Date/Publication: 2019-04-22 15:40:02 UTC
+Date/Publication: 2021-02-09 00:40:03 UTC
diff --git a/MD5 b/MD5
@@ -1,35 +1,36 @@
-8d4b277c8f846f24093d281e44649400 *DESCRIPTION
+d0bd0dc8e7ff64cc33174c28810d9a3e *DESCRIPTION
 b234ee4d69f5fce4486a80fdaf4a4263 *LICENSE
-7bd5a2024023fcdc3e56246fede69169 *NAMESPACE
-3a009b285ae5300163dbea924ba788c2 *R/taxa.R
-a1094152fa9b1854590563f64c7ea0d6 *README.md
-fcf3c6caf0d0dd0a9b926ed36c95e470 *build/vignette.rds
-f39228a24c29b08f85dce533c316d7f3 *inst/doc/usage.R
-9cd9056c62e0a87181b7c92929d848ae *inst/doc/usage.Rmd
-3b87d5428958babc38177d8072d20140 *inst/doc/usage.html
-68387ebc183536fcefb27616ef5de18e *man/accessionToTaxa.Rd
+84962c558e130b52da658da0b1fc726e *NAMESPACE
+7df4f150b9d4acb1f70cf2ba9f1a84ad *R/taxa.R
+16ce04c81eabb9fc6238234bd0813c1a *README.md
+a8a1058a87a44a7503f21f3c8d631c62 *build/vignette.rds
+5e114353b5f3d1db8a6f4bf4f7d38c5c *inst/doc/usage.R
+455a026c98530af9982e4dcd7baf76cb *inst/doc/usage.Rmd
+dfd65f421d965a3f2b45683eb767742d *inst/doc/usage.html
+78ef5645c9c3719c5b9630c65fc46d62 *man/accessionToTaxa.Rd
 fc39f17d296a65ca038d98b5b3a69d28 *man/condenseTaxa.Rd
-169e36eec9af8ef8aef31c04202fc968 *man/getAccession2taxid.Rd
-36d76252ba95662414a6036833d6b34d *man/getAccessions.Rd
-0e28b5f644daca47d88b18d505948669 *man/getId.Rd
-552082d7d53e19e9754c99d666fcaa17 *man/getId2.Rd
-c86cadc2fddc68e701044bf9bf97d0e5 *man/getNamesAndNodes.Rd
-00605228ae8f11848491341529906238 *man/getTaxonomy.Rd
-2ef6bcd32c52cf48367e7f1a068516fa *man/getTaxonomy2.Rd
+5abd188f372981c90221c3f1b8caf705 *man/getAccession2taxid.Rd
+c43ce37f96837e90311d5cee78c678e8 *man/getAccessions.Rd
+af0074232ee13d06861fd9aac069d226 *man/getId.Rd
+f540636f46822a6c1447da68507bdd93 *man/getId2.Rd
+4dbf0c375b0cb150abae5982b9c30712 *man/getNamesAndNodes.Rd
+5324e8df544a6af8f1e0dfd874ce9610 *man/getTaxonomy.Rd
+939e0aeca4cdde8f33aa0604be8f61a5 *man/getTaxonomy2.Rd
 8420a5306860a41886c0da14e1bf47b0 *man/lastNotNa.Rd
-3a249b5c8bfe45e767e5996c6dfdf114 *man/prepareDatabase.Rd
-50eafeef4158011645d22df58638cd2f *man/read.accession2taxid.Rd
-1e155828aed61000b228f43577e00846 *man/read.names.Rd
-00411138edcc308a64c9e2b9794744d5 *man/read.names.sql.Rd
-bcc485c9bb1dee0463a07ab9f8472ad5 *man/read.nodes.Rd
-bce1eeb9d00f788e027f2e195a41926f *man/read.nodes.sql.Rd
-65d64cd604ca4cd8bdc667d97e4b79e2 *man/streamingRead.Rd
+18ba1fd63285dd13a9b307a319046eb8 *man/makeNewick.Rd
+fc4b09e78de2f046cc848e543e6baa6a *man/prepareDatabase.Rd
+0fad56e840faf6c80a7ef5736dd0181a *man/read.accession2taxid.Rd
+e75135f81ee83f197b8c3713c7fc2cc3 *man/read.names.Rd
+aac448f7a807f719ecac3de0b269de3b *man/read.names.sql.Rd
+60aeeb75f42af4f90c83cd8660ba71ca *man/read.nodes.Rd
+e8f9e9d197cc6309398088c6663ad9c9 *man/read.nodes.sql.Rd
+d6b0b3e8d6df05c1ef28d04b48d3aaba *man/streamingRead.Rd
 5e3d2fb08aa1d61618a136e8e56c69c7 *man/taxonomizr-package.Rd
-f9f540122f8ba96b91d56226eebc9026 *man/taxonomizrSwitch.Rd
+669eaa9ec667e196cdb5ddd61250354f *man/taxonomizrSwitch.Rd
 d47fc7ed590032dbfa141b6952d998d3 *man/trimTaxa.Rd
 973c222136ffbf4008ff747d643bc6a3 *src/taxaTrim.c
 1b509a76cc18244a3847a2ab27ab9f77 *src/taxonomizr-init.c
 09570ff5fc4fe1c8f81b47295a3faafb *tests/testthat.R
 2c6938a3b6c0303311c55f0e328ff746 *tests/testthat/fakeNamesNodes.tar.gz
-3a3f946ad3a46b159a383366405fe63f *tests/testthat/test_taxa.R
-9cd9056c62e0a87181b7c92929d848ae *vignettes/usage.Rmd
+d3f7947be1e53c61d4ccce8d01a79dd5 *tests/testthat/test_taxa.R
+455a026c98530af9982e4dcd7baf76cb *vignettes/usage.Rmd
diff --git a/NAMESPACE b/NAMESPACE
@@ -10,6 +10,7 @@ export(getNamesAndNodes)
 export(getTaxonomy)
 export(getTaxonomy2)
 export(lastNotNa)
+export(makeNewick)
 export(prepareDatabase)
 export(read.accession2taxid)
 export(read.names)

diff --git a/R/taxa.R b/R/taxa.R
@@ -83,7 +83,8 @@ read.names<-function(nameFile,onlyScientific=TRUE){
 #' )
 #' tmpFile<-tempfile()
 #' writeLines(namesText,tmpFile)
-#' read.names.sql(tmpFile)
+#' sqlFile<-tempfile()
+#' read.names.sql(tmpFile,sqlFile)
 read.names.sql<-function(nameFile,sqlFile='nameNode.sqlite',overwrite=FALSE){
   if(file.exists(sqlFile)){
     dbTest <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=sqlFile)
@@ -159,9 +160,9 @@ read.nodes<-function(nodeFile){
 #'  "9\t|\t32199\t|\tspecies\t|\tBA\t|\t0\t|\t1\t|\t11\t|\t1\t|\t0\t|\t1\t|\t1\t|\t0\t|\t\t|"
 #' )
 #' tmpFile<-tempfile()
-#' outFile<-tempfile()
+#' sqlFile<-tempfile()
 #' writeLines(nodes,tmpFile)
-#' read.nodes.sql(tmpFile,outFile)
+#' read.nodes.sql(tmpFile,sqlFile)
 read.nodes.sql<-function(nodeFile,sqlFile='nameNode.sqlite',overwrite=FALSE){
   if(file.exists(sqlFile)){
     dbTest <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=sqlFile)
@@ -288,10 +289,10 @@ trimTaxa<-function(inFile,outFile,desiredCols=c(2,3)){
 #'   "Z17430\tZ17430.1\t3702\t16572"
 #' )
 #' inFile<-tempfile()
-#' outFile<-tempfile()
+#' sqlFile<-tempfile()
 #' writeLines(taxa,inFile)
-#' read.accession2taxid(inFile,outFile,vocal=FALSE)
-#' db<-RSQLite::dbConnect(RSQLite::SQLite(),dbname=outFile)
+#' read.accession2taxid(inFile,sqlFile,vocal=FALSE)
+#' db<-RSQLite::dbConnect(RSQLite::SQLite(),dbname=sqlFile)
 #' RSQLite::dbGetQuery(db,'SELECT * FROM accessionTaxa')
 #' RSQLite::dbDisconnect(db)
 read.accession2taxid<-function(taxaFiles,sqlFile,vocal=TRUE,extraSqlCommand='',indexTaxa=FALSE,overwrite=FALSE){
@@ -314,7 +315,7 @@ read.accession2taxid<-function(taxaFiles,sqlFile,vocal=TRUE,extraSqlCommand='',i
     trimTaxa(ii,tmp,1:3)
   }
   db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=sqlFile)
-  on.exit(RSQLite::dbDisconnect(db))
+  on.exit(RSQLite::dbDisconnect(db),add=TRUE)
   if(extraSqlCommand!='')RSQLite::dbExecute(db,extraSqlCommand)
   if(vocal)message('Reading in values. This may take a while.')
   RSQLite::dbWriteTable(conn = db, name = "accessionTaxa", value =tmp, row.names = FALSE, header = TRUE,sep='\t')
@@ -452,7 +453,7 @@ getParentNodes<-function(ids,sqlFile='nameNode.sqlite'){
   on.exit(RSQLite::dbDisconnect(db),add=TRUE)
   RSQLite::dbExecute(db, sprintf("ATTACH '%s' AS tmp",tmp))
   taxaDf<-RSQLite::dbGetQuery(db,'SELECT tmp.query.id, name,parent, rank FROM tmp.query LEFT OUTER JOIN nodes ON tmp.query.id=nodes.id LEFT OUTER JOIN names ON tmp.query.id=names.id WHERE names.scientific=1 OR names.scientific IS NULL')
-  if(!identical(taxaDf$id,ids))stop(simpleError('Problem finding ids'))
+  if(!identical(taxaDf$id,unname(ids)))stop(simpleError('Problem finding ids')) #don't actually need the unname here since as.numeric strips names but good to be safe
   return(taxaDf[,c('name','parent','rank')])
 }
 
@@ -605,7 +606,7 @@ accessionToTaxa<-function(accessions,sqlFile,version=c('version','base')){
   RSQLite::dbExecute(db,'DROP TABLE tmp.query')
   RSQLite::dbExecute(db,'DETACH tmp')
   file.remove(tmp)
-  if(!identical(taxaDf$accession,accessions))stop(simpleError('Query and SQL mismatch'))
+  if(!identical(taxaDf$accession,unname(accessions)))stop(simpleError('Query and SQL mismatch'))
   return(taxaDf$taxa)
 }
 
@@ -668,6 +669,7 @@ condenseTaxa<-function(taxaTable,groupings=rep(1,nrow(taxaTable))){
 #' @param outDir the directory to put names.dmp and nodes.dmp in
 #' @param url the url where taxdump.tar.gz is located
 #' @param fileNames the filenames desired from the tar.gz file
+#' @param timeout time in seconds for the download to time out
 #' @return a vector of file path strings of the locations of the output files
 #' @seealso \code{\link{read.nodes.sql}}, \code{\link{read.names.sql}}
 #' @references \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/}, \url{https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/}
@@ -676,7 +678,10 @@ condenseTaxa<-function(taxaTable,groupings=rep(1,nrow(taxaTable))){
 #' \dontrun{
 #'   getNamesAndNodes()
 #' }
-getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',fileNames=c('names.dmp','nodes.dmp')){
+getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump.tar.gz',fileNames=c('names.dmp','nodes.dmp'),timeout=36000){
+  oldTimeout<-getOption('timeout')
+  on.exit(options('timeout'=oldTimeout))
+  options('timeout'=timeout)
   outFiles<-file.path(outDir,fileNames)
   if(all(file.exists(outFiles))){
     message(paste(outFiles,collapse=', '),' already exist. Delete to redownload')
@@ -687,7 +692,7 @@ getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/t
   dir.create(tmp)
   tarFile<-file.path(tmp,base)
   utils::download.file(url,tarFile,mode='wb')
-  utils::untar(tarFile,fileNames,exdir=tmp,tar='internal',compressed='gzip')
+  utils::untar(tarFile,fileNames,exdir=tmp,tar='internal')
   tmpFiles<-file.path(tmp,fileNames)
   if(!all(file.exists(tmpFiles)))stop("Problem finding files ",paste(tmpFiles[!file.exists(tmpFiles)],collapse=', '))
   mapply(file.copy,tmpFiles,outFiles)
@@ -703,6 +708,7 @@ getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/t
 #' @param outDir the directory to put the accession2taxid.gz files in
 #' @param baseUrl the url of the directory where accession2taxid.gz files are located
 #' @param types the types if accession2taxid.gz files desired where type is the prefix of xxx.accession2taxid.gz. The default is to download all nucl_ accessions. For protein accessions, try \code{types=c('prot')}.
+#' @param timeout time in seconds for the download to time out
 #' @return a vector of file path strings of the locations of the output files
 #' @seealso \code{\link{read.accession2taxid}}
 #' @references \url{ftp://ftp.ncbi.nih.gov/pub/taxonomy/}, \url{https://www.ncbi.nlm.nih.gov/Sequin/acc.html}
@@ -717,7 +723,10 @@ getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/t
 #'
 #'   getAccession2taxid()
 #' }
-getAccession2taxid<-function(outDir='.',baseUrl='ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',types=c('nucl_gb','nucl_wgs')){
+getAccession2taxid<-function(outDir='.',baseUrl='ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',types=c('nucl_gb','nucl_wgs'),timeout=36000){
+  oldTimeout<-getOption('timeout')
+  on.exit(options('timeout'=oldTimeout))
+  options('timeout'=timeout)
   message('This can be a big (several gigabytes) download. Please be patient and use a fast connection.')
   fileNames<-sprintf('%s.accession2taxid.gz',types)
   outFiles<-file.path(outDir,fileNames)
@@ -793,10 +802,11 @@ getId2<-function(taxa,taxaNames){
 #' )
 #' tmpFile<-tempfile()
 #' writeLines(namesText,tmpFile)
-#' names<-read.names.sql(tmpFile)
-#' getId('Bacteria',names)
-#' getId('Not a real name',names)
-#' getId('Multi',names)
+#' sqlFile<-tempfile()
+#' read.names.sql(tmpFile,sqlFile)
+#' getId('Bacteria',sqlFile)
+#' getId('Not a real name',sqlFile)
+#' getId('Multi',sqlFile)
 getId<-function(taxa,sqlFile='nameNode.sqlite',onlyScientific=TRUE){
   if('data.table' %in% class(sqlFile))return(getId2(taxa,sqlFile))
   tmp<-tempfile()
@@ -883,10 +893,10 @@ prepareDatabase<-function(sqlFile='nameNode.sqlite',tmpDir='.',vocal=TRUE,...){
 #'   "Z17430\tZ17430.1\t3702\t16572"
 #' )
 #' inFile<-tempfile()
-#' outFile<-tempfile()
+#' sqlFile<-tempfile()
 #' writeLines(taxa,inFile)
-#' read.accession2taxid(inFile,outFile)
-#' getAccessions(3702,outFile)
+#' read.accession2taxid(inFile,sqlFile,vocal=FALSE)
+#' getAccessions(3702,sqlFile)
 getAccessions<-function(taxaId,sqlFile,version=c('version','base'),limit=NULL){
   version<-match.arg(version)
   if(version=='version')version<-'accession'
@@ -896,21 +906,40 @@ getAccessions<-function(taxaId,sqlFile,version=c('version','base'),limit=NULL){
   #set up a new table of accessions in a temp db (avoiding concurrency issues)
   #some trouble with dbWriteTable writing to "tmp.xxx" in the main database if we do this inside the attach
   tmpDb <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=tmp)
+  on.exit(RSQLite::dbDisconnect(tmpDb))
   RSQLite::dbWriteTable(tmpDb,'query',data.frame('taxa'=taxaId,stringsAsFactors=FALSE),overwrite=TRUE)
-  RSQLite::dbDisconnect(tmpDb)
   #load the big sql
   db <- RSQLite::dbConnect(RSQLite::SQLite(), dbname=sqlFile)
+  on.exit(RSQLite::dbDisconnect(db),add=TRUE)
   #attach the temp table
   RSQLite::dbExecute(db, sprintf("ATTACH '%s' AS tmp",tmp))
   taxaDf<-RSQLite::dbGetQuery(db,sprintf('SELECT tmp.query.taxa, %s FROM tmp.query LEFT OUTER JOIN accessionTaxa ON tmp.query.taxa=accessionTaxa.taxa%s',version,ifelse(!is.null(limit),sprintf(' LIMIT %s',limit),'')))
   RSQLite::dbExecute(db,'DROP TABLE tmp.query')
   RSQLite::dbExecute(db,'DETACH tmp')
-  RSQLite::dbDisconnect(db)
   file.remove(tmp)
   colnames(taxaDf)<-c('taxa','accession')
   return(taxaDf)
 }
 
+#' Create a Newick tree from taxonomy
+#'
+#' Create a Newick formatted tree from a data.frame of taxonomic assignments
+#' @param taxa a matrix with a row for each leaf of the tree and a column for each taxonomic classification e.g. the output from getTaxonomy
+#' @param naSub a character string to substitute in place of NAs in the taxonomy
+#' @seealso \code{\link{getTaxonomy}}
+#' @export
+#' @examples
+#' taxa<-matrix(c('A','A','A','B','B','C','D','D','E','F','G','H'),nrow=3)
+#' makeNewick(taxa)
+makeNewick<-function(taxa,naSub='_'){
+  if(!is.null(naSub))taxa[is.na(taxa)]<-naSub
+  if(ncol(taxa)==0)return('')
+  bases<-unique(taxa[,1])
+  innerTree<-sapply(bases,function(ii)makeNewick(taxa[taxa[,1]==ii,-1,drop=FALSE]))
+  out<-sprintf('(%s)',paste(sprintf('%s%s',innerTree,bases),collapse=','))
+  return(out)
+}
+
 #' Switch from data.table to SQLite
 #'
 #' In version 0.5.0, taxonomizr switched from data.table to SQLite name and node lookups. See below for more details.

diff --git a/README.md b/README.md
@@ -16,6 +16,7 @@ The major functions are:
 More specialized functions are:
  * `getId`: convert a biological name to taxonomic ID
  * `getAccessions`: find accessions for a given taxonomic ID
+ * `makeNewick`: generate a Newick formatted tree from taxonomic output
 
 And a simple use case might look like (see below for more details):
 
@@ -38,7 +39,7 @@ The package is on CRAN, so it should install with a simple:
 ```r
 install.packages("taxonomizr")
 ```
-If you want the development version directly from github, use the [<code>devtools</code>](https://github.com/hadley/devtools) library and run:
+If you want the development version directly from github, use the [<code>devtools</code>](https://github.com/r-lib/devtools) library and run:
 
 ```r
 devtools::install_github("sherrillmix/taxonomizr")
@@ -125,9 +126,29 @@ prepareDatabase('accessionTaxa.sql')
 
 ## Assigning taxonomy
 
-### Finding taxonomy for NCBI accession numbers
+### Producing accession numbers
+
+NCBI accession numbers are often obtained when doing a BLAST search (usually the second column of output from blastn, blastx, blastp, ...). For example the output might look like:
+
+```
+read1   gi|326539903|gb|CP002582.1|     69.68   1745    448     69      3       1702    3517898 3519606 3e-169  608
+read2   gi|160426828|gb|CP000885.1|     68.46   1763    452     82      3       1711    1790367 1788655 4e-140  511
+...
+```
+
+So to identify a taxon for a given sequence you would blast it against e.g. the NCBI nt database and load the results into R. For NCBI databases, the accession number is often the 4th item in the `|` (pipe) separated reference field (often the second column in a tab separated result). For example, the `CP002582.1` in the gi|326539903|gb|**CP002582.1**| above.
+
+So just as an example, reading in blast results might look something like:
+
 
-NCBI accession numbers are often obtained when doing a BLAST search (usually the second column of output from blastn, blastx, blastp, ...). So to identify a taxon for a given sequence you would blast it against e.g. the NCBI nt database and load the results into R.
+```r
+blastResults<-read.table('XXXX.blast',header=FALSE,stringsAsFactors=FALSE)
+#grab the 4th |-separated field from the reference name in the second column
+accessions<-sapply(strsplit(blastResults[,2],'\\|'),'[',4)
+```
+
+
+### Finding taxonomy for NCBI accession numbers
 
 Now we are ready to convert NCBI accession numbers to taxonomic IDs. For example, to find the taxonomic IDs associated with NCBI accession numbers "LN847353.1" and "AL079352.3":
 
@@ -292,9 +313,41 @@ getAccessions(3702,'accessionTaxa.sql',limit=10)
 ## 10 3702  X52320.1
 ```
 
+### Convert taxonomy to Newick tree
+
+This is probably only useful in a few specific cases but a convenience function `makeNewick` to convert taxonomy into a Newick tree is included. The function takes a matrix giving with columns corresponding to taxonomic categories and rows different to taxonomic assignments, e.g. the output from `condenseTaxa` or `getTaxonomy` and reduces it to a Newick formatted tree. For example:
+
+
+
+
+```r
+taxa
+```
+
+```
+##      [,1]        [,2]       [,3]       [,4]       [,5]        [,6]   
+## [1,] "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" 
+## [2,] "Eukaryota" "Chordata" "Mammalia" "Primates" "Hominidae" "Pan"  
+## [3,] "Eukaryota" "Chordata" "Mammalia" NA         "Cervidae"  "Alces"
+```
+
+```r
+makeNewick(taxa)
+```
+
+```
+## [1] "((((((Homo,Pan)Hominidae)Primates,((Alces)Cervidae)_)Mammalia)Chordata)Eukaryota)"
+```
+
+
 ## Changelog
 
-### v0.5.2
+### v0.5.3
+  * Fix named vector bug in `accessionToTaxa`
+  * Add `makeNewick` function
+  * Deal with default 60 second timeout for downloads in R
+
+### v0.5.3
   * Remove `nucl_est` and `nucl_gss` from defaults since NCBI folded them into `nucl_gb` and removed
   * Squash R:devel bug
 

diff --git a/build/vignette.rds b/build/vignette.rds