Skip to content

Commit

Permalink
version 1.3.6
Browse files Browse the repository at this point in the history
  • Loading branch information
Timothy P. Jurka authored and gaborcsardi committed Feb 1, 2012
1 parent 551c96c commit fc769a6
Show file tree
Hide file tree
Showing 5 changed files with 37 additions and 17 deletions.
8 changes: 8 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
2012-02-01 Timothy P. Jurka <tpjurka@ucdavis.edu>

* DESCRIPTION: Release 1.3.6
* Fixed create_matrix() calls to tm API
* Changed default ngramLength to 1 (was 0)
* Added maxDocFreq and maxWordLength parameters
* Updated DESCRIPTION file

2012-01-16 Timothy P. Jurka <tpjurka@ucdavis.edu>

* DESCRIPTION: Release 1.3.5
Expand Down
8 changes: 4 additions & 4 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: RTextTools
Type: Package
Title: Automatic Text Classification via Supervised Learning
Version: 1.3.5
Date: 2012-1-15
Version: 1.3.6
Date: 2012-02-01
Author: Timothy P. Jurka, Loren Collingwood, Amber E. Boydstun,
Emiliano Grossman, Wouter van Atteveldt
Maintainer: Timothy P. Jurka <tpjurka@ucdavis.edu>
Expand All @@ -21,6 +21,6 @@ Description: RTextTools is a machine learning package for automatic
License: GPL-3
URL: http://www.rtexttools.com/
LazyLoad: yes
Packaged: 2012-01-15 23:11:46 UTC; timjurka
Packaged: 2012-01-31 23:27:11 UTC; timjurka
Repository: CRAN
Date/Publication: 2012-01-16 07:32:43
Date/Publication: 2012-02-01 08:13:24
8 changes: 4 additions & 4 deletions MD5
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
4cfcc82b32e1403d6f382b963f9600a9 *ChangeLog
54f1d8bed0edc799a4c084aab7526b5d *DESCRIPTION
f32308053a2553ac8ed56d0d7248dceb *ChangeLog
23c9fff8b6e990010b035e3692aeceba *DESCRIPTION
29e2e3a360bfabed5c9a84fc6cbfc8da *NAMESPACE
a49dc723b11b7404042fe813bfcab414 *R/classify_model.R
84d7336bc6a8d866288d526a5cc17669 *R/classify_models.R
91a927a7c8e93cd755d30a6656537e94 *R/create_analytics.R
4ca843c6c36cc85e5065f696cc99f671 *R/create_corpus.R
856e5c7c49b9a2ad0a7db41bb0cb3313 *R/create_ensembleSummary.R
6c86fffc720497ac82ef2c09f9ed8fcd *R/create_matrix.R
40ac3b50ca0a4c896286f215b8bf595b *R/create_matrix.R
01205062b6f6c78ed1995933a0b25a9f *R/create_precisionRecallSummary.R
6f286a02f218365eb3422b73486511e0 *R/create_scoreSummary.R
9ac6d88431a51ddabba87a237914a9f1 *R/cross_validate.R
Expand Down Expand Up @@ -36,7 +36,7 @@ c9b7d3f49cc7cc7301e5327835ab9a97 *man/analytics_container_virgin-class.Rd
b2c696dc556bdab67ebf218488058103 *man/create_analytics.Rd
29c7e24bf0f05b38d5ebdc862534a685 *man/create_corpus.Rd
c000e977a000a00f1294c3b25ed6a22e *man/create_ensembleSummary.Rd
96614a7eebce97a8ad54ea4863ced344 *man/create_matrix.Rd
11cdad6719778f880568cfe3130e3a3e *man/create_matrix.Rd
5e7e3946d5a3dee69e5b94ad4bbb9e45 *man/create_precisionRecallSummary.Rd
09ec4756277affce1272d9075504b81f *man/create_scoreSummary.Rd
de703794691e874765fb51109f613fe9 *man/cross_validate.Rd
Expand Down
13 changes: 9 additions & 4 deletions R/create_matrix.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, ngramLength=0, originalMatrix=NULL, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {
create_matrix <- function(textColumns, language="english", minDocFreq=1, maxDocFreq=Inf, minWordLength=3, maxWordLength=Inf, ngramLength=1, originalMatrix=NULL, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {

stem_words <- function(x) {
split <- strsplit(x," ")
Expand All @@ -7,10 +7,15 @@ create_matrix <- function(textColumns, language="english", minDocFreq=1, minWord

tokenize_ngrams <- function(x, n=ngramLength) return(rownames(as.data.frame(unclass(textcnt(x,method="string",n=n)))))

control <- list(language=language,tolower=toLower,removeNumbers=removeNumbers,removePunctuation=removePunctuation,stripWhitespace=stripWhitespace,minWordLength=minWordLength,stopwords=removeStopwords,minDocFreq=minDocFreq,weighting=weighting)
control <- list(bounds=list(local=c(minDocFreq,maxDocFreq)),language=language,tolower=toLower,removeNumbers=removeNumbers,removePunctuation=removePunctuation,stopwords=removeStopwords,stripWhitespace=stripWhitespace,wordLengths=c(minWordLength,maxWordLength),weighting=weighting)

if (ngramLength > 1) {
control <- append(control,list(tokenize=tokenize_ngrams),after=7)
} else {
control <- append(control,list(tokenize=scan_tokenizer),after=4)
}

if (ngramLength > 0) control <- append(control,list(tokenize=tokenize_ngrams),after=6)
if (stemWords == TRUE) control <- append(control,list(stemming=stem_words),after=6)
if (stemWords == TRUE) control <- append(control,list(stemming=stem_words),after=7)

trainingColumn <- apply(as.matrix(textColumns),1,paste,collapse=" ")
trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),iconv,to="UTF8",sub="byte")
Expand Down
17 changes: 12 additions & 5 deletions man/create_matrix.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,11 @@ creates a document-term matrix to be passed into create_corpus().
Creates an object of class \code{DocumentTermMatrix} from \pkg{tm} that can be used in the \code{\link{create_corpus}} function.
}
\usage{
create_matrix(textColumns, language = "english", minDocFreq = 1,
minWordLength = 3, ngramLength = 0, originalMatrix=NULL, removeNumbers = FALSE,
removePunctuation = TRUE, removeSparseTerms = 0, removeStopwords = TRUE,
stemWords = FALSE, stripWhitespace = TRUE, toLower = TRUE, weighting = weightTf)
create_matrix(textColumns, language="english", minDocFreq=1, maxDocFreq=Inf,
minWordLength=3, maxWordLength=Inf, ngramLength=1, originalMatrix=NULL,
removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0,
removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE,
weighting=weightTf)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
Expand All @@ -23,9 +24,15 @@ The language to be used for stemming the text data.
}
\item{minDocFreq}{
The minimum number of times a word should appear in a document for it to be included in the matrix. See package \pkg{tm} for more details.
}
\item{maxDocFreq}{
The maximum number of times a word may appear in a document for it to be included in the matrix. See package \pkg{tm} for more details.
}
\item{minWordLength}{
The minimum number of letters a word should contain to be included in the matrix. See package \pkg{tm} for more details.
The minimum number of letters a word or n-gram should contain to be included in the matrix. See package \pkg{tm} for more details.
}
\item{maxWordLength}{
The maximum number of letters a word or n-gram may contain to be included in the matrix. See package \pkg{tm} for more details.
}
\item{ngramLength}{
The number of words to include per n-gram for the document-term matrix.
Expand Down

0 comments on commit fc769a6

Please sign in to comment.