version 0.5.3

cran · Apr 22, 2019 · 126bf23 · 126bf23
1 parent 997b7c3
commit 126bf23
Show file tree

Hide file tree

Showing 18 changed files with 726 additions and 503 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -9,14 +9,14 @@ Authors@R: c(person("Scott", "Sherrill-Mix", role = c("aut", "cre"),
                      email = "shescott@upenn.edu"))
 BugReports: https://github.com/sherrillmix/taxonomizr/issues
 Description: Functions for assigning taxonomy to NCBI accession numbers and taxon IDs based on NCBI's accession2taxid and taxdump files. This package allows the user to download NCBI data dumps and create a local database for fast and local taxonomic assignment.
-Version: 0.5.1
-Date: 2018-08-31
+Version: 0.5.3
+Date: 2019-04-22
 Suggests: testthat, knitr, rmarkdown
 Depends: R (>= 3.0.0)
-Imports: parallel, RSQLite, data.table, R.utils
-RoxygenNote: 6.0.1
+Imports: RSQLite, R.utils, data.table
+RoxygenNote: 6.1.1
 VignetteBuilder: knitr
 NeedsCompilation: yes
-Packaged: 2018-08-31 20:05:46 UTC; scott
+Packaged: 2019-04-22 14:48:47 UTC; scott
 Repository: CRAN
-Date/Publication: 2018-08-31 20:50:03 UTC
+Date/Publication: 2019-04-22 15:40:02 UTC
diff --git a/MD5 b/MD5
@@ -1,36 +1,35 @@
-0cb81087ee9a9c9e13f93ee1f8f6455f *DESCRIPTION
+8d4b277c8f846f24093d281e44649400 *DESCRIPTION
 b234ee4d69f5fce4486a80fdaf4a4263 *LICENSE
 7bd5a2024023fcdc3e56246fede69169 *NAMESPACE
-a4dc51bda75b22027162408f28cc41e4 *R/taxa.R
-19f81903072e403d63f69f7a613c5dc2 *README.md
+3a009b285ae5300163dbea924ba788c2 *R/taxa.R
+a1094152fa9b1854590563f64c7ea0d6 *README.md
 fcf3c6caf0d0dd0a9b926ed36c95e470 *build/vignette.rds
-14b6e1db194ff18f1fadabaf830c04a0 *inst/doc/usage.R
-a25b8ed785ad9a3819c6a439ba6c5639 *inst/doc/usage.Rmd
-01a9e61092c47c18c0d7f94ce5f2626e *inst/doc/usage.html
+f39228a24c29b08f85dce533c316d7f3 *inst/doc/usage.R
+9cd9056c62e0a87181b7c92929d848ae *inst/doc/usage.Rmd
+3b87d5428958babc38177d8072d20140 *inst/doc/usage.html
 68387ebc183536fcefb27616ef5de18e *man/accessionToTaxa.Rd
 fc39f17d296a65ca038d98b5b3a69d28 *man/condenseTaxa.Rd
-e05a27da96d9eaeec0a3ab74437418c5 *man/getAccession2taxid.Rd
-f45d1ede2cc22a697cab44ea01b21f7b *man/getAccessions.Rd
+169e36eec9af8ef8aef31c04202fc968 *man/getAccession2taxid.Rd
+36d76252ba95662414a6036833d6b34d *man/getAccessions.Rd
 0e28b5f644daca47d88b18d505948669 *man/getId.Rd
 552082d7d53e19e9754c99d666fcaa17 *man/getId2.Rd
 c86cadc2fddc68e701044bf9bf97d0e5 *man/getNamesAndNodes.Rd
 00605228ae8f11848491341529906238 *man/getTaxonomy.Rd
-69eb3ce54dce2802545d337794564c0c *man/getTaxonomy2.Rd
+2ef6bcd32c52cf48367e7f1a068516fa *man/getTaxonomy2.Rd
 8420a5306860a41886c0da14e1bf47b0 *man/lastNotNa.Rd
-47b86f075854e98ec479a4fbfa4fe073 *man/prepareDatabase.Rd
-dbf8bf8dbcebfb85405709e7af6e7385 *man/read.accession2taxid.Rd
+3a249b5c8bfe45e767e5996c6dfdf114 *man/prepareDatabase.Rd
+50eafeef4158011645d22df58638cd2f *man/read.accession2taxid.Rd
 1e155828aed61000b228f43577e00846 *man/read.names.Rd
-507d3c41836c5b6c6efebec94862d88c *man/read.names.sql.Rd
+00411138edcc308a64c9e2b9794744d5 *man/read.names.sql.Rd
 bcc485c9bb1dee0463a07ab9f8472ad5 *man/read.nodes.Rd
-8746dd71ef08324d519db790f61f4f7e *man/read.nodes.sql.Rd
-5d8ea142c3bfff486547b8479f2d52a3 *man/streamingRead.Rd
+bce1eeb9d00f788e027f2e195a41926f *man/read.nodes.sql.Rd
+65d64cd604ca4cd8bdc667d97e4b79e2 *man/streamingRead.Rd
 5e3d2fb08aa1d61618a136e8e56c69c7 *man/taxonomizr-package.Rd
 f9f540122f8ba96b91d56226eebc9026 *man/taxonomizrSwitch.Rd
 d47fc7ed590032dbfa141b6952d998d3 *man/trimTaxa.Rd
 973c222136ffbf4008ff747d643bc6a3 *src/taxaTrim.c
 1b509a76cc18244a3847a2ab27ab9f77 *src/taxonomizr-init.c
 09570ff5fc4fe1c8f81b47295a3faafb *tests/testthat.R
 2c6938a3b6c0303311c55f0e328ff746 *tests/testthat/fakeNamesNodes.tar.gz
-ee7e47ac573f20970c95c7d71522bbcf *tests/testthat/test_taxa.R
-a25b8ed785ad9a3819c6a439ba6c5639 *vignettes/usage.Rmd
-0cd93bfeaacc55b489c2238496ae6ddc *vignettes/usage.md
+3a3f946ad3a46b159a383366405fe63f *tests/testthat/test_taxa.R
+9cd9056c62e0a87181b7c92929d848ae *vignettes/usage.Rmd
diff --git a/R/taxa.R b/R/taxa.R
@@ -333,7 +333,7 @@ read.accession2taxid<-function(taxaFiles,sqlFile,vocal=TRUE,extraSqlCommand='',i
 #' @param taxaNodes a nodes data.table from \code{\link{read.nodes}}
 #' @param taxaNames a names data.table from \code{\link{read.names}}
 #' @param desiredTaxa a vector of strings giving the desired taxa levels
-#' @param mc.cores the number of cores to use when processing
+#' @param mc.cores DEPRECATED the number of cores to use when processing. Note this option is now deprecated and has no effect. Please switch to \code{\link{getTaxonomy}} (see \link{taxonomizrSwitch}) for much faster processing without requiring multiple cores.
 #' @param debug if TRUE output node and name vectors with dput for each id (probably useful only for development)
 #' @return a matrix of taxonomic strings with a row for each id and a column for each desiredTaxa rank
 #' @import data.table
@@ -410,7 +410,7 @@ getTaxonomy2<-function(ids,taxaNodes ,taxaNames, desiredTaxa=c('superkingdom','p
   ids<-as.numeric(ids)
   if(length(ids)==0)return(NULL)
   uniqIds<-unique(ids)
-  taxa<-do.call(rbind,parallel::mclapply(uniqIds,function(id){
+  taxa<-do.call(rbind,lapply(uniqIds,function(id){
       out<-structure(rep(as.character(NA),length(desiredTaxa)),names=desiredTaxa)
       if(is.na(id))return(out)
       thisId<-id
@@ -433,7 +433,7 @@ getTaxonomy2<-function(ids,taxaNodes ,taxaNames, desiredTaxa=c('superkingdom','p
         dput(tmp2)
       }
       return(out)
-  },mc.cores=mc.cores))
+  }))
   rownames(taxa)<-format(uniqIds,scientific=FALSE)
   out<-taxa[format(ids,scientific=FALSE),,drop=FALSE]
   return(out)
@@ -605,7 +605,7 @@ accessionToTaxa<-function(accessions,sqlFile,version=c('version','base')){
   RSQLite::dbExecute(db,'DROP TABLE tmp.query')
   RSQLite::dbExecute(db,'DETACH tmp')
   file.remove(tmp)
-  if(any(taxaDf$accession!=accessions))stop(simpleError('Query and SQL mismatch'))
+  if(!identical(taxaDf$accession,accessions))stop(simpleError('Query and SQL mismatch'))
   return(taxaDf$taxa)
 }
 
@@ -717,7 +717,7 @@ getNamesAndNodes<-function(outDir='.',url='ftp://ftp.ncbi.nih.gov/pub/taxonomy/t
 #'
 #'   getAccession2taxid()
 #' }
-getAccession2taxid<-function(outDir='.',baseUrl='ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',types=c('nucl_gb','nucl_est','nucl_gss','nucl_wgs')){
+getAccession2taxid<-function(outDir='.',baseUrl='ftp://ftp.ncbi.nih.gov/pub/taxonomy/accession2taxid/',types=c('nucl_gb','nucl_wgs')){
   message('This can be a big (several gigabytes) download. Please be patient and use a fast connection.')
   fileNames<-sprintf('%s.accession2taxid.gz',types)
   outFiles<-file.path(outDir,fileNames)
@@ -764,7 +764,7 @@ getId2<-function(taxa,taxaNames){
   multiHits<-sapply(out,length)>1
   if(any(multiHits)){
     warning('Multiple taxa ids found for ',paste(taxa[multiHits],collapse=', '),'. Collapsing with commas')
-    out<-sapply(out,function(xx)ifelse(is.na(xx)||is.null(xx),NA,paste(xx,collapse=',')))
+    out<-sapply(out,function(xx)ifelse(all(is.na(xx))||is.null(xx),NA,paste(xx,collapse=',')))
   }
   out<-as.character(unlist(out))
   names(out)<-uniqTaxa

diff --git a/README.md b/README.md
@@ -86,14 +86,6 @@ prepareDatabase('accessionTaxa.sql')
 ## Reading ./nucl_gb.accession2taxid.gz.
 ```
 
-```
-## Reading ./nucl_est.accession2taxid.gz.
-```
-
-```
-## Reading ./nucl_gss.accession2taxid.gz.
-```
-
 ```
 ## Reading ./nucl_wgs.accession2taxid.gz.
 ```
@@ -110,6 +102,7 @@ prepareDatabase('accessionTaxa.sql')
 ## [1] "accessionTaxa.sql"
 ```
 
+
 If everything works then that should have prepared a SQLite database ready for use. You can skip the "Manual preparation" steps below.
 
 All files are cached locally and so the preparation is only required once (delete/rename the SQLite database and call the function again to regenerate the database). It is not necessary to manually check for the presence of the database since the function checks to see if the SQLite database is present and if so skips downloading/processing. For example, running the command again produces:
@@ -129,12 +122,16 @@ prepareDatabase('accessionTaxa.sql')
 
 
 
+
 ## Assigning taxonomy
 
 ### Finding taxonomy for NCBI accession numbers
 
+NCBI accession numbers are often obtained when doing a BLAST search (usually the second column of output from blastn, blastx, blastp, ...). So to identify a taxon for a given sequence you would blast it against e.g. the NCBI nt database and load the results into R.
+
 Now we are ready to convert NCBI accession numbers to taxonomic IDs. For example, to find the taxonomic IDs associated with NCBI accession numbers "LN847353.1" and "AL079352.3":
 
+
 ```r
 taxaId<-accessionToTaxa(c("LN847353.1","AL079352.3"),"accessionTaxa.sql")
 print(taxaId)
@@ -174,6 +171,7 @@ print(taxaId)
 ```
 
 
+
 ```r
 taxaId<-accessionToTaxa(c("LN847353","AL079352"),"accessionTaxa.sql",version='base')
 print(taxaId)
@@ -184,7 +182,6 @@ print(taxaId)
 ```
 
 
-
 ### Finding taxonomy for taxonomic names
 
 If you'd like to find IDs for taxonomic names then you can do something like:
@@ -205,8 +202,8 @@ And again to get the taxonomy for those IDs use `getTaxonomy`:
 taxa<-getTaxonomy(taxaId,'accessionTaxa.sql')
 print(taxa)
 ```
-
 ```
+
 ##      superkingdom phylum     class      order      family      genus  
 ## 9606 "Eukaryota"  "Chordata" "Mammalia" "Primates" "Hominidae" "Homo" 
 ## 9913 "Eukaryota"  "Chordata" "Mammalia" NA         "Bovidae"   "Bos"  
@@ -255,7 +252,23 @@ To find all the accessions for a given taxonomic ID, you can use the `getAccessi
 
 
 ```r
-read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql',indexTaxa=TRUE)
+read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql',indexTaxa=TRUE,overwrite=TRUE)
+```
+
+```
+## Reading nucl_gb.accession2taxid.gz.
+```
+
+```
+## Reading nucl_wgs.accession2taxid.gz.
+```
+
+```
+## Reading in values. This may take a while.
+```
+
+```
+## Adding index. This may also take a while.
 ```
 
 Then you can get the accessions for taxa 3702 with a command like (note that the limit argument is used here in order to preserve space):
@@ -267,29 +280,28 @@ getAccessions(3702,'accessionTaxa.sql',limit=10)
 
 ```
 ##    taxa accession
-## 1  3702  Z17427.1
-## 2  3702  Z17428.1
-## 3  3702  Z17429.1
-## 4  3702  Z17430.1
-## 5  3702  Z17431.1
-## 6  3702  Z17432.1
-## 7  3702  Z17433.1
-## 8  3702  Z17434.1
-## 9  3702  Z17435.1
-## 10 3702  Z17436.1
+## 1  3702  X58148.1
+## 2  3702  X66414.1
+## 3  3702  X60045.1
+## 4  3702  X07376.1
+## 5  3702  X54927.1
+## 6  3702  X54926.1
+## 7  3702  X54928.1
+## 8  3702  X54930.1
+## 9  3702  X54929.1
+## 10 3702  X52320.1
 ```
 
-## Switch from data.table to SQLite
-Version 0.5.0 marked a change for name and node lookups from using data.table to using SQLite. This was necessary to increase performance (10-100x speedup for `getTaxonomy`) and create a simpler interface (a single SQLite database contains all necessary data). Unfortunately, this switch requires a couple breaking changes: 
-  * `getTaxonomy` changes from `getTaxonomy(ids,namesDT,nodesDT)` to `getTaxonomy(ids,sqlFile)`
-  * `getId` changes from  `getId(taxa,namesDT)` to `getId(taxa,sqlFile)`
-  * `read.names` is deprecated, instead use `read.names.sql`. For example, instead of calling `names<-read.names('names.dmp')` in every session, simply call `read.names.sql('names.dmp','accessionTaxa.sql')` once (or use the convenient `prepareDatabase` as <a href='#preparation'>above</a>)).
-  * `read.nodes` is deprecated, instead use `read.names.sql`. For example. instead of calling `nodes<-read.names('nodes.dmp')` in every session, simply call `read.nodes.sql('nodes.dmp','accessionTaxa.sql')` once (or use the convenient `prepareDatabase` as <a href='#preparation'>above</a>).
+## Changelog
 
-  I've tried to ease any problems with this by overloading `getTaxonomy` and `getId` to still function (with a warning) if passed a data.table names and nodes argument and providing a simpler `prepareDatabase` function for completing all setup steps (hopefully avoiding direct calls to `read.names` and `read.nodes` for most users). 
+### v0.5.2
+  * Remove `nucl_est` and `nucl_gss` from defaults since NCBI folded them into `nucl_gb` and removed them
+  * Squash R:devel bug
 
-I plan to eventually remove data.table functionality to avoid a split codebase so please switch to the new SQLite format in all new code.
-
+### v0.5.0
+  * Transitioned from data.table to SQLite
+  * Added convenience `prepareDatabase()` function
+  * Squashed Windows testing errors
 
 ## Manual preparation of database (usually not necessary)
 **Note:** Since version 0.5.0, it is usually not necessary to run the following manually, the function `prepareDatabase()` should do most of this automatically for you (see <a href='#preparation'>above</a>).
@@ -319,17 +331,25 @@ getAccession2taxid()
 ```
 
 ```
-## [1] "./nucl_gb.accession2taxid.gz"  "./nucl_est.accession2taxid.gz"
-## [3] "./nucl_gss.accession2taxid.gz" "./nucl_wgs.accession2taxid.gz"
+## This can be a big (several gigabytes) download. Please be patient and use a fast connection.
 ```
 
+```
+## [1] "./nucl_gb.accession2taxid.gz"  "./nucl_wgs.accession2taxid.gz"
+```
+
+
 If you would also like to identify protein accession numbers, also download the prot file from NCBI (again this is a _big_ download):
 
 ```r
 #this is a big download
 getAccession2taxid(types='prot')
 ```
 
+```
+## This can be a big (several gigabytes) download. Please be patient and use a fast connection.
+```
+
 ```
 ## [1] "./prot.accession2taxid.gz"
 ```
@@ -350,20 +370,16 @@ Next process the downloaded accession files into the same database (this one cou
 read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql')
 ```
 
-```
-## Reading nucl_est.accession2taxid.gz.
-```
-
 ```
 ## Reading nucl_gb.accession2taxid.gz.
 ```
 
 ```
-## Reading nucl_gss.accession2taxid.gz.
+## Reading nucl_wgs.accession2taxid.gz.
 ```
 
 ```
-## Reading nucl_wgs.accession2taxid.gz.
+## Reading prot.accession2taxid.gz.
 ```
 
 ```
@@ -374,8 +390,21 @@ read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql')
 ## Adding index. This may also take a while.
 ```
 
+
 Now everything should be ready for processing. All files are cached locally and so the preparation is only required once (or whenever you would like to update the data). It is not necessary to manually check for the presence of these files since the functions automatically check to see if their output is present and if so skip downloading/processing. Delete the local files if you would like to redownload or reprocess them.
 
 
 
+## Switch from data.table to SQLite
+Version 0.5.0 marked a change for name and node lookups from using data.table to using SQLite. This was necessary to increase performance (10-100x speedup for `getTaxonomy`) and create a simpler interface (a single SQLite database contains all necessary data). Unfortunately, this switch requires a couple breaking changes: 
+  * `getTaxonomy` changes from `getTaxonomy(ids,namesDT,nodesDT)` to `getTaxonomy(ids,sqlFile)`
+  * `getId` changes from  `getId(taxa,namesDT)` to `getId(taxa,sqlFile)`
+  * `read.names` is deprecated, instead use `read.names.sql`. For example, instead of calling `names<-read.names('names.dmp')` in every session, simply call `read.names.sql('names.dmp','accessionTaxa.sql')` once (or use the convenient `prepareDatabase` as <a href='#preparation'>above</a>).
+  * `read.nodes` is deprecated, instead use `read.nodes.sql`. For example, instead of calling `nodes<-read.nodes('nodes.dmp')` in every session, simply call `read.nodes.sql('nodes.dmp','accessionTaxa.sql')` once (or use the convenient `prepareDatabase` as <a href='#preparation'>above</a>).
+
+  I've tried to ease any problems with this by overloading `getTaxonomy` and `getId` to still function (with a warning) if passed a data.table names and nodes argument and providing a simpler `prepareDatabase` function for completing all setup steps (hopefully avoiding direct calls to `read.names` and `read.nodes` for most users). 
+
+I plan to eventually remove data.table functionality to avoid a split codebase so please switch to the new SQLite format in all new code.
+
+
 
diff --git a/inst/doc/usage.R b/inst/doc/usage.R
@@ -52,7 +52,7 @@ library(taxonomizr)
 #  condenseTaxa(taxa,groupings)
 
 ## ----eval=FALSE----------------------------------------------------------
-#  read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql',indexTaxa=TRUE)
+#  read.accession2taxid(list.files('.','accession2taxid.gz$'),'accessionTaxa.sql',indexTaxa=TRUE,overwrite=TRUE)
 
 ## ----eval=FALSE----------------------------------------------------------
 #  getAccessions(3702,'accessionTaxa.sql',limit=10)