
Commit bbbdc48

version 1.2.0
kbenoit authored and cran-robot committed Apr 15, 2018
1 parent 9e2fd89 commit bbbdc48
Showing 123 changed files with 3,117 additions and 1,583 deletions.
17 changes: 8 additions & 9 deletions DESCRIPTION
@@ -1,5 +1,5 @@
Package: quanteda
Version: 1.1.1
Version: 1.2.0
Title: Quantitative Analysis of Textual Data
Description: A fast, flexible, and comprehensive framework for
quantitative text analysis in R. Provides functionality for corpus management,
@@ -20,7 +20,7 @@ Authors@R: c( person("Kenneth", "Benoit", email = "kbenoit@lse.ac.uk", role =
License: GPL-3
Depends: R (>= 3.1.0), methods
Imports: extrafont, Matrix (>= 1.2), data.table (>= 1.9.6), SnowballC,
sna, network, ggrepel, Rcpp (>= 0.12.12), RcppParallel,
sna, ggrepel, network, Rcpp (>= 0.12.12), RcppParallel,
RSpectra, stringi, fastmatch, ggplot2 (>= 2.2.0), XML, yaml,
lubridate, magrittr, spacyr, stopwords
LinkingTo: Rcpp, RcppParallel, RcppArmadillo (>= 0.7.600.1.0)
@@ -42,10 +42,10 @@ Collate: 'RcppExports.R' 'View.R' 'bootstrap_dfm.R'
'dfm_compress.R' 'dfm_group.R' 'dfm_lookup.R' 'dfm_replace.R'
'dfm_sample.R' 'dfm_select.R' 'dfm_sort.R' 'dfm_subset.R'
'dfm_trim.R' 'dfm_weight.R' 'dictionaries.R' 'docnames.R'
'docvars.R' 'fcm-methods.R' 'fcm.R' 'kwic.R' 'nfunctions.R'
'nscrabble.R' 'nsyllable.R' 'phrases.R'
'quanteda-documentation.R' 'quanteda_options.R'
'readtext-methods.R' 'regex2fixed.R' 'settings.R'
'docvars.R' 'fcm-classes.R' 'fcm-methods.R' 'fcm-subsetting.R'
'fcm.R' 'kwic.R' 'nfunctions.R' 'nscrabble.R' 'nsyllable.R'
'pattern2fixed.R' 'phrases.R' 'quanteda-documentation.R'
'quanteda_options.R' 'readtext-methods.R' 'settings.R'
'spacyr-methods.R' 'stopwords.R' 'textmodel-methods.R'
'textmodel_affinity.R' 'textmodel_ca.R' 'textmodel_lsa.R'
'textmodel_nb.R' 'textmodel_wordfish.R'
@@ -59,11 +59,10 @@ Collate: 'RcppExports.R' 'View.R' 'bootstrap_dfm.R'
'tokens_ngrams.R' 'tokens_replace.R' 'tokens_segment.R'
'tokens_select.R' 'tokens_subset.R' 'utils.R' 'wordstem.R'
'zzz.R'
RcppModules: ngramMaker
RoxygenNote: 6.0.1
SystemRequirements: C++11
NeedsCompilation: yes
Packaged: 2018-03-06 20:11:48 UTC; kbenoit
Packaged: 2018-04-15 17:51:45 UTC; kbenoit
Author: Kenneth Benoit [aut, cre, cph],
Kohei Watanabe [ctb],
Paul Nulty [ctb],
@@ -74,4 +73,4 @@ Author: Kenneth Benoit [aut, cre, cph],
Will Lowe [ctb]
Maintainer: Kenneth Benoit <kbenoit@lse.ac.uk>
Repository: CRAN
Date/Publication: 2018-03-07 10:03:16 UTC
Date/Publication: 2018-04-15 19:13:13 UTC
220 changes: 120 additions & 100 deletions MD5

Large diffs are not rendered by default.

8 changes: 6 additions & 2 deletions NAMESPACE
@@ -57,6 +57,7 @@ S3method(as.list,tokens)
S3method(as.matrix,dfm)
S3method(as.matrix,dist_selection)
S3method(as.matrix,simil)
S3method(as.network,default)
S3method(as.network,fcm)
S3method(as.statistics_textmodel,data.frame)
S3method(as.statistics_textmodel,matrix)
@@ -218,6 +219,7 @@ S3method(nscrabble,default)
S3method(nsentence,character)
S3method(nsentence,corpus)
S3method(nsentence,default)
S3method(nsentence,spacyr_parsed)
S3method(nsentence,tokens)
S3method(nsyllable,character)
S3method(nsyllable,default)
@@ -276,6 +278,7 @@ S3method(spacy_parse,corpus)
S3method(sparsity,default)
S3method(sparsity,dfm)
S3method(str,corpus)
S3method(summary,character)
S3method(summary,corpus)
S3method(summary,influence.predict.textmodel_affinity)
S3method(summary,textmodel_nb)
@@ -435,6 +438,7 @@ export(fcm_sort)
export(fcm_tolower)
export(fcm_toupper)
export(featnames)
export(index_types)
export(is.collocations)
export(is.corpus)
export(is.corpuszip)
@@ -455,6 +459,8 @@ export(nsentence)
export(nsyllable)
export(ntoken)
export(ntype)
export(pattern2fixed)
export(pattern2id)
export(phrase)
export(quanteda_options)
export(scrabble)
@@ -518,9 +524,7 @@ exportMethods(t)
import(Matrix)
import(data.table)
import(ggplot2)
import(ggrepel)
import(methods)
import(network)
import(stopwords)
import(stringi)
importFrom(Rcpp,evalCpp)
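
Among the new exports above are the pattern-matching helpers `pattern2fixed()` and `pattern2id()` (their source file replaces `regex2fixed.R` in the DESCRIPTION Collate list). A minimal sketch of the former; the argument names are an assumption based on the package documentation, so check `?pattern2fixed` for the exact signature:

```r
library(quanteda)

# token types to match against
types <- c("Immigration", "immigrant", "immigrate", "migration", "tax")

# which fixed types does the glob pattern "immigr*" match?
pattern2fixed("immigr*", types = types, valuetype = "glob",
              case_insensitive = TRUE)
```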
22 changes: 21 additions & 1 deletion NEWS.md
@@ -1,4 +1,23 @@
# quanteda v1.1.0
# quanteda v1.2.0

### New Features

* Added an `nsentence()` method for **spacyr** parsed objects. (#1289)

### Bug fixes and stability enhancements

* Fix a bug in `nsyllable()` that incorrectly handled cased words and returned wrong names with `use.names = TRUE`. (#1282)
* Fix the overwriting of `summary.character()` caused by previous import of the **network** package namespace. (#1285)
* `dfm_smooth()` now correctly sets the smooth value in the dfm (#1274). Arithmetic operations on dfm objects are now much more consistent and do not drop attributes of the dfm, as sometimes happened with earlier versions.

### Behaviour changes

* `tokens_toupper()` and `tokens_tolower()` no longer remove unused token types. Solves #1278.
* `dfm_trim()` now takes more options, and these are implemented more consistently. `min_termfreq` and `max_termfreq` have replaced `min_count` and `max_count`, and these can be modified using a `termfreq_type` argument. (Similar options are implemented for `docfreq_type`.) Solves #1253, #1254.
* `textstat_simil()` and `textstat_dist()` now accept, for the `selection` argument, valid dfm indexes on the relevant margin. Previously, this could also be a direct vector or matrix for comparison, but this is no longer allowed. Solves #1266.
* Improved performance for `dfm_group()` (#1295).

# quanteda v1.1.1

### New Features

@@ -8,6 +27,7 @@

### Bug fixes and stability enhancements

* Fixed a problem in the examples for `textplot_scale1d()` by adjusting the refscores for `data_corpus_irishbudget2010`.
* Eliminated unnecessary dependency on the **digest** package.
* Updated the vignette title to be less generic.
* Improved the robustness of `dfm_trim()` and `dfm_weight()` for previously weighted dfm objects and when supplied thresholds are proportions instead of counts. (#1237)
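
The `dfm_trim()` behaviour change listed above introduces `min_termfreq`/`max_termfreq` plus `termfreq_type` and `docfreq_type`. A hedged usage sketch, using the built-in `data_corpus_inaugural` corpus for illustration; the exact threshold semantics should be checked against the v1.2.0 help page:

```r
library(quanteda)

dfmat <- dfm(data_corpus_inaugural)

# keep features with a total frequency of at least 10 (the old min_count)
dfm_trim(dfmat, min_termfreq = 10, termfreq_type = "count")

# keep features at or above the 95th percentile of total frequency,
# appearing in at least 2 documents
dfm_trim(dfmat, min_termfreq = 0.95, termfreq_type = "quantile",
         min_docfreq = 2, docfreq_type = "count")
```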
12 changes: 8 additions & 4 deletions R/RcppExports.R
@@ -61,8 +61,8 @@ qatd_cpp_tokens_ngrams <- function(texts_, types_, delim_, ns_, skips_) {
.Call(`_quanteda_qatd_cpp_tokens_ngrams`, texts_, types_, delim_, ns_, skips_)
}

qatd_cpp_tokens_recompile <- function(texts_, types_) {
.Call(`_quanteda_qatd_cpp_tokens_recompile`, texts_, types_)
qatd_cpp_tokens_recompile <- function(texts_, types_, gap = TRUE, dup = TRUE) {
.Call(`_quanteda_qatd_cpp_tokens_recompile`, texts_, types_, gap, dup)
}

qatd_cpp_tokens_segment <- function(texts_, types_, patterns_, remove, position) {
@@ -73,8 +73,12 @@ qatd_cpp_tokens_select <- function(texts_, types_, words_, mode, padding, window
.Call(`_quanteda_qatd_cpp_tokens_select`, texts_, types_, words_, mode, padding, window_left, window_right)
}

qatd_cpp_chars_remove <- function(input_, char_remove) {
.Call(`_quanteda_qatd_cpp_chars_remove`, input_, char_remove)
qatd_cpp_is_grouped_numeric <- function(values_, groups_) {
.Call(`_quanteda_qatd_cpp_is_grouped_numeric`, values_, groups_)
}

qatd_cpp_is_grouped_character <- function(values_, groups_) {
.Call(`_quanteda_qatd_cpp_is_grouped_character`, values_, groups_)
}

qatd_cpp_tbb_enabled <- function() {
8 changes: 4 additions & 4 deletions R/bootstrap_dfm.R
@@ -78,12 +78,12 @@ bootstrap_dfm.dfm <- function(x, n = 10, ..., verbose = quanteda_options("verbos
result[['dfm_0']] <- dfm_group(x, groups = docvars(x, '_document'))

# randomly resample dfm
docID <- index <- NULL
id <- index <- NULL
for (i in seq_len(n)) {
if (verbose) message(", ", i, appendLF = FALSE)
dt <- data.table(index = seq_len(ndoc(x)), docID = docvars(x, "_document"))
dt[, temp := sample(1:.N, replace = TRUE), by = docID]
dt[, sample_index := index[temp], by = docID]
dt <- data.table(index = seq_len(ndoc(x)), id = docvars(x, "_document"))
dt[, temp := sample(1:.N, replace = TRUE), by = id]
dt[, sample_index := index[temp], by = id]
sample_index <- dt[, sample_index]
temp <- x[sample_index, ]
temp <- dfm_group(temp, groups = docvars(temp, '_document'))
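
Read in isolation, the resampling step above is a grouped bootstrap of row indexes. A standalone sketch of that **data.table** idiom, using a toy table rather than the package's internal objects:

```r
library(data.table)

# toy index table: 6 rows belonging to 2 documents
dt <- data.table(index = 1:6, id = c("d1", "d1", "d1", "d2", "d2", "d2"))

# within each document, draw row positions with replacement ...
dt[, temp := sample(.N, replace = TRUE), by = id]
# ... and map them back onto the original row indexes
dt[, sample_index := index[temp], by = id]

dt[, sample_index]   # resampled row indexes, grouped by document
```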
4 changes: 2 additions & 2 deletions R/casechange-functions.R
@@ -22,7 +22,7 @@ tokens_tolower.default <- function(x, keep_acronyms = FALSE, ...) {
#' @export
tokens_tolower.tokens <- function(x, keep_acronyms = FALSE, ...) {
types(x) <- lowercase_types(types(x), keep_acronyms)
tokens_recompile(x)
tokens_recompile(x, gap = FALSE, dup = TRUE)
}

lowercase_types <- function(type, keep_acronyms) {
@@ -51,7 +51,7 @@ tokens_toupper.default <- function(x, ...) {
#' @export
tokens_toupper.tokens <- function(x, ...) {
types(x) <- char_toupper(types(x), ...)
tokens_recompile(x)
tokens_recompile(x, gap = FALSE, dup = TRUE)
}


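The case-changing methods above now call the recompiler with `gap = FALSE, dup = TRUE`, so unused token types are kept (see the v1.2.0 behaviour changes). A brief usage sketch with an invented example text:

```r
library(quanteda)

toks <- tokens("NATO and the EU Issued a Joint Statement.")

tokens_tolower(toks)                        # every type lowercased
tokens_tolower(toks, keep_acronyms = TRUE)  # "NATO" and "EU" preserved
tokens_toupper(toks)                        # every type uppercased
```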
17 changes: 7 additions & 10 deletions R/character-methods.R
@@ -1,22 +1,21 @@
#' Summary statistics on a character vector
#'
#' Internal-only function to compute summary statistics on a character object.
#' @method summary character
#' @inheritParams summary.corpus
#' @keywords internal
#' @keywords char internal
#' @examples
#' # summarize texts
#' summary(c("Testing this text. Second sentence.", "And this one."))
#' summary(data_char_ukimmig2010)
#' myTextSummaryDF <- summary(data_char_ukimmig2010)
#' head(myTextSummaryDF)
summary.character <- function(object, n = 100, tolower = FALSE, ...) {
#' quanteda:::summary_character(c("Testing this text. Second sentence.", "And this one."))
#' quanteda:::summary_character(data_char_ukimmig2010)
#' mysummary_ukimmig2010 <- quanteda:::summary_character(data_char_ukimmig2010)
#' head(mysummary_ukimmig2010)
summary_character <- function(object, n = 100, tolower = FALSE, ...) {

# trap the verbose argument and ignore
thecall <- as.list(match.call())[-1]
if (!is.na(verbose_index <- match("verbose", names(thecall)))) {
warning("verbose argument is defunct")
return(do.call(summary.character, thecall[-verbose_index]))
return(do.call(summary_character, thecall[-verbose_index]))
}

object <- object[1 : min(c(n, length(object)))]
@@ -35,5 +34,3 @@ summary.character <- function(object, n = 100, tolower = FALSE, ...) {
row.names = NULL)
results
}


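Because `summary.character()` is now the internal `summary_character()`, end users reach these statistics through `summary()` on a corpus (or via `:::`, as in the examples above). A short sketch with invented texts:

```r
library(quanteda)

corp <- corpus(c(doc1 = "Testing this text. Second sentence.",
                 doc2 = "And this one."))
summary(corp)   # Types, Tokens, Sentences per document

# the internal helper can still be reached directly if needed
quanteda:::summary_character(texts(corp))
```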
9 changes: 5 additions & 4 deletions R/corpus-methods-base.R
@@ -73,7 +73,7 @@ summary.corpus <- function(object, n = 100, showmeta = FALSE, tolower = FALSE, .

n_all <- ndoc(object)
object <- head(object, n)
result <- data.frame(summary(texts(object), n = n, tolower = tolower, ...))
result <- data.frame(summary_character(texts(object), n = n, tolower = tolower, ...))
dvars <- docvars_internal(object)
if (!is.null(dvars)) {
if (showmeta) {
@@ -208,9 +208,10 @@ tail.corpus <- function(x, n = 6L, ...) {
paste(metacorpus(c1, field), metacorpus(c2, field))
}

#rowname <- c(rownames(c1$documents), rownames(c2$documents))
c1$documents <- rbind(c1$documents, c2$documents)

r_names <- make.unique(c(rownames(c1$documents), rownames(c2$documents)), sep='')
c1$documents <- data.table::setDF(data.table::rbindlist(list(c1$documents, c2$documents), fill = TRUE))
rownames(c1$documents) <- r_names

# Put rownames back in because the hadleyverse discards them
#rownames(c1$documents) <- make.unique(rowname, sep='')

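The corpus-combining method now binds the internal document tables with `data.table::rbindlist(fill = TRUE)` and de-duplicates document names with `make.unique()`. A self-contained sketch of that pattern on toy tables (the column names are illustrative only, not the package's internal schema):

```r
library(data.table)

docs1 <- data.frame(texts = "First text.", year = 2017L,
                    row.names = "text1", stringsAsFactors = FALSE)
docs2 <- data.frame(texts = "Second text.", party = "A",
                    row.names = "text1", stringsAsFactors = FALSE)

# fill = TRUE keeps the union of docvar columns, padding the gaps with NA
combined <- setDF(rbindlist(list(docs1, docs2), fill = TRUE))

# restore document names, de-duplicating clashes as the method above does
rownames(combined) <- make.unique(c(rownames(docs1), rownames(docs2)), sep = "")
combined
```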
63 changes: 37 additions & 26 deletions R/corpus.R
@@ -242,13 +242,14 @@ corpus.character <- function(x, docnames = NULL,
}

#' @rdname corpus
#' @param docid_field optional column index of a document identifier; if
#'   \code{NULL}, the constructor will use the row.names of the data.frame (if
#'   found)
#' @param docid_field optional column index of a document identifier; defaults
#'   to "doc_id", but if this is not found, then it will use the rownames of
#'   the data.frame; if the rownames are not set, it will use the default
#'   sequence based on \code{\link{quanteda_options}("base_docname")}.
#' @keywords corpus
#' @method corpus data.frame
#' @export
corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
corpus.data.frame <- function(x, docid_field = "doc_id", text_field = "text",
metacorpus = NULL, compress = FALSE, ...) {

if (length(addedArgs <- list(...)))
Expand All @@ -260,7 +261,7 @@ corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
# coerce data.frame variants to data.frame - for #1232
x <- as.data.frame(x)

# text field
# text_field handling ---------
if (length(text_field) != 1)
stop("text_field must refer to a single column")
if (is.numeric(text_field)) {
@@ -277,33 +278,43 @@ corpus.data.frame <- function(x, docid_field = NULL, text_field = "text",
if (!is.character(x[[text_field]]))
stop("text_field must refer to a character mode column")

# docname field
if (is.null(docid_field)) {
if (identical(row.names(x), as.character(seq_len(nrow(x))))) {
docname <- paste0(quanteda_options("base_docname"), row.names(x))
} else {
docname <- row.names(x)
}
# docid_field handling --------
# start by using quanteda defaults
docname_source <- "default"

# use docid_field if supplied
if (missing(docid_field)) {
# if not supplied, use the default docid_field if it exists, else leave default
if (docid_field %in% names(x)) docname_source <- "docid_field"
} else {
if (length(docid_field) != 1)
stop("docid_field must refer to a single column")
if (is.numeric(docid_field)) {
if (1 <= docid_field && docid_field <= length(x)) {
docid_field <- names(x)[docid_field]
} else {
stop("docid_field index refers to an invalid column")
}
# if supplied, throw error if column does not exist
if (docid_field %in% names(x) || docid_field %in% seq_len(ncol(x))) {
docname_source <- "docid_field"
} else {
stop("docid_field column not found or invalid")
}
if (!docid_field %in% names(x))
stop("column name ", docid_field, " not found")
if (!is.character(x[[docid_field]]))
stop("docid_field must refer to a character mode column")
docname <- x[[docid_field]]
}


# try using row.names if docid_field not already set
if (docname_source == "default" &&
!identical(row.names(x), as.character(seq_len(nrow(x))))) {
docname_source <- "row.names"
}

docname <-
switch(docname_source,
docid_field = as.character(x[[docid_field]]),
row.names = row.names(x),
default = paste0(quanteda_options("base_docname"), seq_len(nrow(x))))

# to make the exclusion below work using match()
if (docname_source != "docid_field") docid_field <- NULL

corpus(x[[text_field]],
docvars = x[, match(c(text_field, docid_field),
names(x)) * -1, drop = FALSE],
docvars = x[, match(c(text_field, docid_field), names(x)) * -1,
drop = FALSE],
docnames = docname,
metacorpus = metacorpus, compress = compress)
}
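
A usage sketch of the docname resolution order implemented above (a `doc_id` column first, then row names, then the default `quanteda_options("base_docname")` sequence); the data frame here is invented for illustration:

```r
library(quanteda)

dat <- data.frame(doc_id = c("speech_a", "speech_b"),
                  text   = c("First speech text.", "Second speech text."),
                  party  = c("Ind", "Lab"),
                  stringsAsFactors = FALSE)

corp <- corpus(dat)      # docid_field defaults to "doc_id"
docnames(corp)           # "speech_a" "speech_b"
docvars(corp)            # remaining columns ("party") become docvars
```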
