
Commit bbbdc48

version 1.2.0
kbenoit authored and cran-robot committed Apr 15, 2018
1 parent 9e2fd89 commit bbbdc48
Showing 123 changed files with 3,117 additions and 1,583 deletions.
17 changes: 8 additions & 9 deletions DESCRIPTION
@@ -1,5 +1,5 @@
Package: quanteda
Version: 1.1.1
Version: 1.2.0
Title: Quantitative Analysis of Textual Data
Description: A fast, flexible, and comprehensive framework for
quantitative text analysis in R. Provides functionality for corpus management,
@@ -20,7 +20,7 @@ Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
License: GPL-3
Depends: R (>= 3.1.0), methods
Imports: extrafont, Matrix (>= 1.2), data.table (>= 1.9.6), SnowballC,
sna, network, ggrepel, Rcpp (>= 0.12.12), RcppParallel,
sna, ggrepel, network, Rcpp (>= 0.12.12), RcppParallel,
RSpectra, stringi, fastmatch, ggplot2 (>= 2.2.0), XML, yaml,
lubridate, magrittr, spacyr, stopwords
LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0)
@@ -42,10 +42,10 @@ Collate: 'RcppExports.R' 'View.R' 'bootstrap_dfm.R'
'dfm_compress.R' 'dfm_group.R' 'dfm_lookup.R' 'dfm_replace.R'
'dfm_sample.R' 'dfm_select.R' 'dfm_sort.R' 'dfm_subset.R'
'dfm_trim.R' 'dfm_weight.R' 'dictionaries.R' 'docnames.R'
'docvars.R' 'fcm-methods.R' 'fcm.R' 'kwic.R' 'nfunctions.R'
'nscrabble.R' 'nsyllable.R' 'phrases.R'
'quanteda-documentation.R' 'quanteda_options.R'
'readtext-methods.R' 'regex2fixed.R' 'settings.R'
'docvars.R' 'fcm-classes.R' 'fcm-methods.R' 'fcm-subsetting.R'
'fcm.R' 'kwic.R' 'nfunctions.R' 'nscrabble.R' 'nsyllable.R'
'pattern2fixed.R' 'phrases.R' 'quanteda-documentation.R'
'quanteda_options.R' 'readtext-methods.R' 'settings.R'
'spacyr-methods.R' 'stopwords.R' 'textmodel-methods.R'
'textmodel_affinity.R' 'textmodel_ca.R' 'textmodel_lsa.R'
'textmodel_nb.R' 'textmodel_wordfish.R'
@@ -59,11 +59,10 @@ Collate: 'RcppExports.R' 'View.R' 'bootstrap_dfm.R'
'tokens_ngrams.R' 'tokens_replace.R' 'tokens_segment.R'
'tokens_select.R' 'tokens_subset.R' 'utils.R' 'wordstem.R'
'zzz.R'
RcppModules: ngramMaker
RoxygenNote: 6.0.1
SystemRequirements: C++11
NeedsCompilation: yes
Packaged: 2018-03-06 20:11:48 UTC; kbenoit
Packaged: 2018-04-15 17:51:45 UTC; kbenoit
Author: Kenneth Benoit [aut, cre, cph],
Kohei Watanabe [ctb],
Paul Nulty [ctb],
@@ -74,4 +73,4 @@ Author: Kenneth Benoit [aut, cre, cph],
Will Lowe [ctb]
Maintainer: Kenneth Benoit <kbenoit@lse.ac.uk>
Repository: CRAN
Date/Publication: 2018-03-07 10:03:16 UTC
Date/Publication: 2018-04-15 19:13:13 UTC
220 changes: 120 additions & 100 deletions MD5

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions NAMESPACE
@@ -57,6 +57,7 @@ S3method(as.list,tokens)
S3method(as.matrix,dfm)
S3method(as.matrix,dist_selection)
S3method(as.matrix,simil)
S3method(as.network,default)
S3method(as.network,fcm)
S3method(as.statistics_textmodel,data.frame)
S3method(as.statistics_textmodel,matrix)
@@ -218,6 +219,7 @@ S3method(nscrabble,default)
S3method(nsentence,character)
S3method(nsentence,corpus)
S3method(nsentence,default)
S3method(nsentence,spacyr_parsed)
S3method(nsentence,tokens)
S3method(nsyllable,character)
S3method(nsyllable,default)
@@ -276,6 +278,7 @@ S3method(spacy_parse,corpus)
S3method(sparsity,default)
S3method(sparsity,dfm)
S3method(str,corpus)
S3method(summary,character)
S3method(summary,corpus)
S3method(summary,influence.predict.textmodel_affinity)
S3method(summary,textmodel_nb)
@@ -435,6 +438,7 @@ export(fcm_sort)
export(fcm_tolower)
export(fcm_toupper)
export(featnames)
export(index_types)
export(is.collocations)
export(is.corpus)
export(is.corpuszip)
@@ -455,6 +459,8 @@ export(nsentence)
export(nsyllable)
export(ntoken)
export(ntype)
export(pattern2fixed)
export(pattern2id)
export(phrase)
export(quanteda_options)
export(scrabble)
@@ -518,9 +524,7 @@ exportMethods(t)
import(Matrix)
import(data.table)
import(ggplot2)
import(ggrepel)
import(methods)
import(network)
import(stopwords)
import(stringi)
importFrom(Rcpp,evalCpp)
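
Among the new exports above are the pattern-matching helpers `pattern2fixed()` and `pattern2id()` (their source file replaces `regex2fixed.R` in the DESCRIPTION Collate list). A minimal sketch of the former; the argument names are an assumption based on the package documentation, so check `?pattern2fixed` for the exact signature:

```r
library(quanteda)

# token types to match against
types <- c("Immigration", "immigrant", "immigrate", "migration", "tax")

# which fixed types does the glob pattern "immigr*" match?
pattern2fixed("immigr*", types = types, valuetype = "glob",
              case_insensitive = TRUE)
```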
22 changes: 21 additions & 1 deletion NEWS.md
@@ -1,4 +1,23 @@
# quanteda v1.1.0
# quanteda v1.2.0

### New Features

* Added an `nsentence()` method for **spacyr** parsed objects. (#1289)

### Bug fixes and stability enhancements

* Fix a bug in `nsyllable()` that incorrectly handled cased words and returned wrong names with `use.names = TRUE`. (#1282)
* Fix the overwriting of `summary.character()` caused by previous import of the **network** package namespace. (#1285)
* `dfm_smooth()` now correctly sets the smooth value in the dfm (#1274). Arithmetic operations on dfm objects are now much more consistent and do not drop attributes of the dfm, as sometimes happened with earlier versions.

### Behaviour changes

* `tokens_toupper()` and `tokens_tolower()` no longer remove unused token types. Solves #1278.
* `dfm_trim()` now takes more options, and these are implemented more consistently. `min_termfreq` and `max_termfreq` have replaced `min_count` and `max_count`, and these can be modified using a `termfreq_type` argument. (Similar options are implemented for `docfreq_type`.) Solves #1253, #1254.
* `textstat_simil()` and `textstat_dist()` now accept, for the `selection` argument, valid dfm indexes on the relevant margin. Previously, this could also be a direct vector or matrix for comparison, but this is no longer allowed. Solves #1266.
* Improved performance for `dfm_group()` (#1295).

# quanteda v1.1.1

### New Features

@@ -8,6 +27,7 @@

### Bug fixes and stability enhancements

* Fixed a problem in the examples for `textplot_scale1d()` by adjusting the refscores for `data_corpus_irishbudget2010`.
* Eliminated unnecessary dependency on the **digest** package.
* Updated the vignette title to be less generic.
* Improved the robustness of `dfm_trim()` and `dfm_weight()` for previously weighted dfm objects and when supplied thresholds are proportions instead of counts. (#1237)
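
The `dfm_trim()` behaviour change listed above introduces `min_termfreq`/`max_termfreq` plus `termfreq_type` and `docfreq_type`. A hedged usage sketch, using the built-in `data_corpus_inaugural` corpus for illustration; the exact threshold semantics should be checked against the v1.2.0 help page:

```r
library(quanteda)

dfmat <- dfm(data_corpus_inaugural)

# keep features with a total frequency of at least 10 (the old min_count)
dfm_trim(dfmat, min_termfreq = 10, termfreq_type = "count")

# keep features at or above the 95th percentile of total frequency,
# appearing in at least 2 documents
dfm_trim(dfmat, min_termfreq = 0.95, termfreq_type = "quantile",
         min_docfreq = 2, docfreq_type = "count")
```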
12 changes: 8 additions & 4 deletions R/RcppExports.R
@@ -61,8 +61,8 @@ qatd_cpp_tokens_ngrams <- function(texts_, types_, delim_, ns_, skips_) {
.Call(`_quanteda_qatd_cpp_tokens_ngrams`, texts_, types_, delim_, ns_, skips_)
}

qatd_cpp_tokens_recompile <- function(texts_, types_) {
.Call(`_quanteda_qatd_cpp_tokens_recompile`, texts_, types_)
qatd_cpp_tokens_recompile <- function(texts_, types_, gap = TRUE, dup = TRUE) {
.Call(`_quanteda_qatd_cpp_tokens_recompile`, texts_, types_, gap, dup)
}

qatd_cpp_tokens_segment <- function(texts_, types_, patterns_, remove, position) {
@@ -73,8 +73,12 @@ qatd_cpp_tokens_select <- function(texts_, types_, words_, mode, padding, window
.Call(`_quanteda_qatd_cpp_tokens_select`, texts_, types_, words_, mode, padding, window_left, window_right)
}

qatd_cpp_chars_remove <- function(input_, char_remove) {
.Call(`_quanteda_qatd_cpp_chars_remove`, input_, char_remove)
qatd_cpp_is_grouped_numeric <- function(values_, groups_) {
.Call(`_quanteda_qatd_cpp_is_grouped_numeric`, values_, groups_)
}

qatd_cpp_is_grouped_character <- function(values_, groups_) {
.Call(`_quanteda_qatd_cpp_is_grouped_character`, values_, groups_)
}

qatd_cpp_tbb_enabled <- function() {
8 changes: 4 additions & 4 deletions R/bootstrap_dfm.R
@@ -78,12 +78,12 @@ bootstrap_dfm.dfm <- function(x, n = 10, ..., verbose = quanteda_options("verbos
result[['dfm_0']] <- dfm_group(x, groups = docvars(x, '_document'))

# randomly resample dfm
docID <- index <- NULL
id <- index <- NULL
for (i in seq_len(n)) {
if (verbose) message(", ", i, appendLF = FALSE)
dt <- data.table(index = seq_len(ndoc(x)), docID = docvars(x, "_document"))
dt[, temp := sample(1:.N, replace = TRUE), by = docID]
dt[, sample_index := index[temp], by = docID]
dt <- data.table(index = seq_len(ndoc(x)), id = docvars(x, "_document"))
dt[, temp := sample(1:.N, replace = TRUE), by = id]
dt[, sample_index := index[temp], by = id]
sample_index <- dt[, sample_index]
temp <- x[sample_index, ]
temp <- dfm_group(temp, groups = docvars(temp, '_document'))
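
Read in isolation, the resampling step above is a grouped bootstrap of row indexes. A standalone sketch of that **data.table** idiom, using a toy table rather than the package's internal objects:

```r
library(data.table)

# toy index table: 6 rows belonging to 2 documents
dt <- data.table(index = 1:6, id = c("d1", "d1", "d1", "d2", "d2", "d2"))

# within each document, draw row positions with replacement ...
dt[, temp := sample(.N, replace = TRUE), by = id]
# ... and map them back onto the original row indexes
dt[, sample_index := index[temp], by = id]

dt[, sample_index]   # resampled row indexes, grouped by document
```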
4 changes: 2 additions & 2 deletions R/casechange-functions.R
@@ -22,7 +22,7 @@ tokens_tolower.default <- function(x, keep_acronyms = FALSE, ...) {
#' @export
tokens_tolower.tokens <- function(x, keep_acronyms = FALSE, ...) {
types(x) <- lowercase_types(types(x), keep_acronyms)
tokens_recompile(x)
tokens_recompile(x, gap = FALSE, dup = TRUE)
}

lowercase_types <- function(type, keep_acronyms) {
@@ -51,7 +51,7 @@ tokens_toupper.default <- function(x, ...) {
#' @export
tokens_toupper.tokens <- function(x, ...) {
types(x) <- char_toupper(types(x), ...)
tokens_recompile(x)
tokens_recompile(x, gap = FALSE, dup = TRUE)
}


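The case-changing methods above now call the recompiler with `gap = FALSE, dup = TRUE`, so unused token types are kept (see the v1.2.0 behaviour changes). A brief usage sketch with an invented example text:

```r
library(quanteda)

toks <- tokens("NATO and the EU Issued a Joint Statement.")

tokens_tolower(toks)                        # every type lowercased
tokens_tolower(toks, keep_acronyms = TRUE)  # "NATO" and "EU" preserved
tokens_toupper(toks)                        # every type uppercased
```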
17 changes: 7 additions & 10 deletions R/character-methods.R
@@ -1,22 +1,21 @@
#' Summary statistics on a character vector
#'
#' Internal-only function to compute summary statistics on a character object.
#' @method summary character
#' @inheritParams summary.corpus
#' @keywords internal
#' @keywords char internal
#' @examples
#' # summarize texts
#' summary(c("Testing this text. Second sentence.", "And this one."))
#' summary(data_char_ukimmig2010)
#' myTextSummaryDF <- summary(data_char_ukimmig2010)
#' head(myTextSummaryDF)
summary.character <- function(object, n = 100, tolower = FALSE, ...) {
#' quanteda:::summary_character(c("Testing this text. Second sentence.", "And this one."))
#' quanteda:::summary_character(data_char_ukimmig2010)
#' mysummary_ukimmig2010 <- quanteda:::summary_character(data_char_ukimmig2010)
#' head(mysummary_ukimmig2010)
summary_character <- function(object, n = 100, tolower = FALSE, ...) {

# trap the verbose argument and ignore
thecall <- as.list(match.call())[-1]
if (!is.na(verbose_index <- match("verbose", names(thecall)))) {
warning("verbose argument is defunct")
return(do.call(summary.character, thecall[-verbose_index]))
return(do.call(summary_character, thecall[-verbose_index]))
}

object <- object[1 : min(c(n, length(object)))]
@@ -35,5 +34,3 @@ summary.character <- function(object, n = 100, tolower = FALSE, ...) {
row.names = NULL)
results
}


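Because `summary.character()` is now the internal `summary_character()`, end users reach these statistics through `summary()` on a corpus (or via `:::`, as in the examples above). A short sketch with invented texts:

```r
library(quanteda)

corp <- corpus(c(doc1 = "Testing this text. Second sentence.",
                 doc2 = "And this one."))
summary(corp)   # Types, Tokens, Sentences per document

# the internal helper can still be reached directly if needed
quanteda:::summary_character(texts(corp))
```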
9 changes: 5 additions & 4 deletions R/corpus-methods-base.R
@@ -73,7 +73,7 @@ summary.corpus <- function(object, n = 100, showmeta = FALSE, tolower = FALSE, .

n_all <- ndoc(object)
object <- head(object, n)
result <- data.frame(summary(texts(object), n = n, tolower = tolower, ...))
result <- data.frame(summary_character(texts(object), n = n, tolower = tolower, ...))
dvars <- docvars_internal(object)
if (!is.null(dvars)) {
if (showmeta) {
@@ -208,9 +208,10 @@ tail.corpus <- function(x, n = 6L, ...) {
paste(metacorpus(c1, field), metacorpus(c2, field))
}

#rowname <- c(rownames(c1$documents), rownames(c2$documents))
c1$documents <- rbind(c1$documents, c2$documents)

r_names <- make.unique(c(rownames(c1$documents), rownames(c2$documents)), sep='')
c1$documents <- data.table::setDF(data.table::rbindlist(list(c1$documents, c2$documents), fill = TRUE))
rownames(c1$documents) <- r_names

# Put rownames back in because the hadleyverse discards them
#rownames(c1$documents) <- make.unique(rowname, sep='')

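The corpus-combining method now binds the internal document tables with `data.table::rbindlist(fill = TRUE)` and de-duplicates document names with `make.unique()`. A self-contained sketch of that pattern on toy tables (the column names are illustrative only, not the package's internal schema):

```r
library(data.table)

docs1 <- data.frame(texts = "First text.", year = 2017L,
                    row.names = "text1", stringsAsFactors = FALSE)
docs2 <- data.frame(texts = "Second text.", party = "A",
                    row.names = "text1", stringsAsFactors = FALSE)

# fill = TRUE keeps the union of docvar columns, padding the gaps with NA
combined <- setDF(rbindlist(list(docs1, docs2), fill = TRUE))

# restore document names, de-duplicating clashes as the method above does
rownames(combined) <- make.unique(c(rownames(docs1), rownames(docs2)), sep = "")
combined
```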
63 changes: 37 additions & 26 deletions R/corpus.R
@@ -242,13 +242,14 @@ corpus.character <- function(x, docnames = NULL,
}

#' @rdname corpus
#' @param docid_field optional column index of a document identifier; if
#'   \code{NULL}, the constructor will use the row.names of the data.frame (if
#'   found)
#' @param docid_field optional column index of a document identifier; defaults
#'   to "doc_id", but if this is not found, then it will use the rownames of
#'   the data.frame; if the rownames are not set, it will use the default
#'   sequence based on \code{\link{quanteda_options}("base_docname")}.
#' @keywords corpus
#' @method corpus data.frame
#' @export
corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
corpus.data.frame <- function(x, docid_field = "doc_id", text_field = "text",
metacorpus = NULL, compress = FALSE, ...) {

if (length(addedArgs <- list(...)))
Expand All @@ -260,7 +261,7 @@ corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
# coerce data.frame variants to data.frame - for #1232
x <- as.data.frame(x)

# text field
# text_field handling ---------
if (length(text_field) != 1)
stop("text_field must refer to a single column")
if (is.numeric(text_field)) {
@@ -277,33 +278,43 @@ corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
if (!is.character(x[[text_field]]))
stop("text_field must refer to a character mode column")

# docname field
if (is.null(docid_field)) {
if (identical(row.names(x), as.character(seq_len(nrow(x))))) {
docname <- paste0(quanteda_options("base_docname"), row.names(x))
} else {
docname <- row.names(x)
}
# docid_field handling --------
# start by using quanteda defaults
docname_source <- "default"

# use docid_field if supplied
if (missing(docid_field)) {
# if not supplied, use the default docid_field if it exists, else leave default
if (docid_field %in% names(x)) docname_source <- "docid_field"
} else {
if (length(docid_field) != 1)
stop("docid_field must refer to a single column")
if (is.numeric(docid_field)) {
if (1 <= docid_field && docid_field <= length(x)) {
docid_field <- names(x)[docid_field]
} else {
stop("docid_field index refers to an invalid column")
}
# if supplied, throw error if column does not exist
if (docid_field %in% names(x) || docid_field %in% seq_len(ncol(x))) {
docname_source <- "docid_field"
} else {
stop("docid_field column not found or invalid")
}
if (!docid_field %in% names(x))
stop("column name ", docid_field, " not found")
if (!is.character(x[[docid_field]]))
stop("docid_field must refer to a character mode column")
docname <- x[[docid_field]]
}


# try using row.names if docid_field not already set
if (docname_source == "default" &&
!identical(row.names(x), as.character(seq_len(nrow(x))))) {
docname_source <- "row.names"
}

docname <-
switch(docname_source,
docid_field = as.character(x[[docid_field]]),
row.names = row.names(x),
default = paste0(quanteda_options("base_docname"), seq_len(nrow(x))))

# to make the exclusion below work using match()
if (docname_source != "docid_field") docid_field <- NULL

corpus(x[[text_field]],
docvars = x[, match(c(text_field, docid_field),
names(x)) * -1, drop = FALSE],
docvars = x[, match(c(text_field, docid_field), names(x)) * -1,
drop = FALSE],
docnames = docname,
metacorpus = metacorpus, compress = compress)
}
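
A usage sketch of the docname resolution order implemented above (a `doc_id` column first, then row names, then the default `quanteda_options("base_docname")` sequence); the data frame here is invented for illustration:

```r
library(quanteda)

dat <- data.frame(doc_id = c("speech_a", "speech_b"),
                  text   = c("First speech text.", "Second speech text."),
                  party  = c("Ind", "Lab"),
                  stringsAsFactors = FALSE)

corp <- corpus(dat)      # docid_field defaults to "doc_id"
docnames(corp)           # "speech_a" "speech_b"
docvars(corp)            # remaining columns ("party") become docvars
```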
