version 1.3.6

cran · Feb 1, 2012 · fc769a6 · fc769a6
1 parent 551c96c
commit fc769a6
Show file tree

Hide file tree

Showing 5 changed files with 37 additions and 17 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,11 @@
+2012-02-01  Timothy P. Jurka  <tpjurka@ucdavis.edu>
+
+	* DESCRIPTION: Release 1.3.6
+	* Fixed create_matrix() calls to tm API
+	* Changed default ngramLength from 0 to 1
+	* Added maxDocFreq and maxWordLength parameters
+	* Updated DESCRIPTION file
+
 2012-01-16  Timothy P. Jurka  <tpjurka@ucdavis.edu>
 
 	* DESCRIPTION: Release 1.3.5

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: RTextTools
 Type: Package
 Title: Automatic Text Classification via Supervised Learning
-Version: 1.3.5
-Date: 2012-1-15
+Version: 1.3.6
+Date: 2012-02-01
 Author: Timothy P. Jurka, Loren Collingwood, Amber E. Boydstun,
         Emiliano Grossman, Wouter van Atteveldt
 Maintainer: Timothy P. Jurka <tpjurka@ucdavis.edu>
@@ -21,6 +21,6 @@ Description: RTextTools is a machine learning package for automatic
 License: GPL-3
 URL: http://www.rtexttools.com/
 LazyLoad: yes
-Packaged: 2012-01-15 23:11:46 UTC; timjurka
+Packaged: 2012-01-31 23:27:11 UTC; timjurka
 Repository: CRAN
-Date/Publication: 2012-01-16 07:32:43
+Date/Publication: 2012-02-01 08:13:24
diff --git a/MD5 b/MD5
@@ -1,12 +1,12 @@
-4cfcc82b32e1403d6f382b963f9600a9 *ChangeLog
-54f1d8bed0edc799a4c084aab7526b5d *DESCRIPTION
+f32308053a2553ac8ed56d0d7248dceb *ChangeLog
+23c9fff8b6e990010b035e3692aeceba *DESCRIPTION
 29e2e3a360bfabed5c9a84fc6cbfc8da *NAMESPACE
 a49dc723b11b7404042fe813bfcab414 *R/classify_model.R
 84d7336bc6a8d866288d526a5cc17669 *R/classify_models.R
 91a927a7c8e93cd755d30a6656537e94 *R/create_analytics.R
 4ca843c6c36cc85e5065f696cc99f671 *R/create_corpus.R
 856e5c7c49b9a2ad0a7db41bb0cb3313 *R/create_ensembleSummary.R
-6c86fffc720497ac82ef2c09f9ed8fcd *R/create_matrix.R
+40ac3b50ca0a4c896286f215b8bf595b *R/create_matrix.R
 01205062b6f6c78ed1995933a0b25a9f *R/create_precisionRecallSummary.R
 6f286a02f218365eb3422b73486511e0 *R/create_scoreSummary.R
 9ac6d88431a51ddabba87a237914a9f1 *R/cross_validate.R
@@ -36,7 +36,7 @@ c9b7d3f49cc7cc7301e5327835ab9a97 *man/analytics_container_virgin-class.Rd
 b2c696dc556bdab67ebf218488058103 *man/create_analytics.Rd
 29c7e24bf0f05b38d5ebdc862534a685 *man/create_corpus.Rd
 c000e977a000a00f1294c3b25ed6a22e *man/create_ensembleSummary.Rd
-96614a7eebce97a8ad54ea4863ced344 *man/create_matrix.Rd
+11cdad6719778f880568cfe3130e3a3e *man/create_matrix.Rd
 5e7e3946d5a3dee69e5b94ad4bbb9e45 *man/create_precisionRecallSummary.Rd
 09ec4756277affce1272d9075504b81f *man/create_scoreSummary.Rd
 de703794691e874765fb51109f613fe9 *man/cross_validate.Rd

diff --git a/R/create_matrix.R b/R/create_matrix.R
@@ -1,4 +1,4 @@
-create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, ngramLength=0, originalMatrix=NULL, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE,  stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {
+create_matrix <- function(textColumns, language="english", minDocFreq=1, maxDocFreq=Inf, minWordLength=3, maxWordLength=Inf, ngramLength=1, originalMatrix=NULL, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE,  stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {
 
     stem_words <- function(x) {
         split <- strsplit(x," ")
@@ -7,10 +7,15 @@ create_matrix <- function(textColumns, language="english", minDocFreq=1, minWord
 
     tokenize_ngrams <- function(x, n=ngramLength) return(rownames(as.data.frame(unclass(textcnt(x,method="string",n=n)))))
 
-	control <- list(language=language,tolower=toLower,removeNumbers=removeNumbers,removePunctuation=removePunctuation,stripWhitespace=stripWhitespace,minWordLength=minWordLength,stopwords=removeStopwords,minDocFreq=minDocFreq,weighting=weighting)
+	control <- list(bounds=list(local=c(minDocFreq,maxDocFreq)),language=language,tolower=toLower,removeNumbers=removeNumbers,removePunctuation=removePunctuation,stopwords=removeStopwords,stripWhitespace=stripWhitespace,wordLengths=c(minWordLength,maxWordLength),weighting=weighting)
+
+    if (ngramLength > 1) { 
+    	control <- append(control,list(tokenize=tokenize_ngrams),after=7)
+    } else {
+    	control <- append(control,list(tokenize=scan_tokenizer),after=4)
+    }
 
-    if (ngramLength > 0) control <- append(control,list(tokenize=tokenize_ngrams),after=6)
-    if (stemWords == TRUE) control <- append(control,list(stemming=stem_words),after=6)
+    if (stemWords == TRUE) control <- append(control,list(stemming=stem_words),after=7)
 
     trainingColumn <- apply(as.matrix(textColumns),1,paste,collapse=" ")
     trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),iconv,to="UTF8",sub="byte")

diff --git a/man/create_matrix.Rd b/man/create_matrix.Rd
@@ -8,10 +8,11 @@ creates a document-term matrix to be passed into create_corpus().
 Creates an object of class \code{DocumentTermMatrix} from \pkg{tm} that can be used in the \code{\link{create_corpus}} function.
 }
 \usage{
-create_matrix(textColumns, language = "english", minDocFreq = 1, 
-minWordLength = 3, ngramLength = 0, originalMatrix=NULL, removeNumbers = FALSE, 
-removePunctuation = TRUE, removeSparseTerms = 0, removeStopwords = TRUE, 
-stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE, weighting = weightTf)
+create_matrix(textColumns, language="english", minDocFreq=1, maxDocFreq=Inf, 
+minWordLength=3, maxWordLength=Inf, ngramLength=1, originalMatrix=NULL, 
+removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, 
+removeStopwords=TRUE,  stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, 
+weighting=weightTf)
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -23,9 +24,15 @@ The language to be used for stemming the text data.
 }
   \item{minDocFreq}{
 The minimum number of times a word should appear in a document for it to be included in the matrix. See package \pkg{tm} for more details.
+}
+  \item{maxDocFreq}{
+The maximum number of times a word may appear in a document for it to be included in the matrix; words occurring more often are dropped. See package \pkg{tm} for more details.
 }
   \item{minWordLength}{
-The minimum number of letters a word should contain to be included in the matrix. See package \pkg{tm} for more details.
+The minimum number of letters a word or n-gram should contain to be included in the matrix. See package \pkg{tm} for more details.
+}
+  \item{maxWordLength}{
+The maximum number of letters a word or n-gram may contain to be included in the matrix; longer terms are dropped. See package \pkg{tm} for more details.
 }
   \item{ngramLength}{
 The number of words to include per n-gram for the document-term matrix.