Skip to content

Commit

Permalink
version 1.3.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Timothy P. Jurka authored and gaborcsardi committed Jan 5, 2012
1 parent dadd36b commit 006ef72
Show file tree
Hide file tree
Showing 12 changed files with 96 additions and 285 deletions.
11 changes: 10 additions & 1 deletion ChangeLog
Original file line number Diff line number Diff line change
@@ -1,4 +1,13 @@
2011-10-01 Timothy P. Jurka <tpjurka@ucdavis.edu>
2012-01-05 Timothy P. Jurka <tpjurka@ucdavis.edu>

* DESCRIPTION: Release 1.3.3
* Optimized create_analytics() function
* Optimized create_matrix() function
* Optimized create_ensembleSummary() function
* Fixed bugs related to overlapping corpus indices
* Updated DESCRIPTION file

2011-12-05 Timothy P. Jurka <tpjurka@ucdavis.edu>

* DESCRIPTION: Release 1.3.2
* Added CITATION
Expand Down
10 changes: 5 additions & 5 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
Package: RTextTools
Type: Package
Title: Automatic Text Classification via Supervised Learning
Version: 1.3.2
Date: 2011-12-05
Version: 1.3.3
Date: 2012-01-05
Author: Timothy P. Jurka, Loren Collingwood, Amber E. Boydstun, Emiliano Grossman, Wouter van Atteveldt
Maintainer: Timothy P. Jurka <tpjurka@ucdavis.edu>
Depends: R (>= 2.13.0), methods, SparseM, randomForest, tree, nnet, tm,
Depends: R (>= 2.14.0), methods, SparseM, randomForest, tree, nnet, tm,
e1071, ipred, caTools, maxent, glmnet, Rstem, tau
Suggests: RODBC
Description: RTextTools is a machine learning package for automatic text classification that makes it simple for novice users to get started with machine learning, while allowing experienced users to easily experiment with different settings and algorithm combinations. The package includes nine algorithms for ensemble classification (svm, slda, boosting, bagging, random forests, glmnet, decision trees, neural networks, maximum entropy), comprehensive analytics, and thorough documentation.
License: GPL-3
URL: http://www.rtexttools.com/
LazyLoad: yes
Packaged: 2011-12-05 00:45:25 UTC; timjurka
Packaged: 2012-01-03 05:21:35 UTC; timjurka
Repository: CRAN
Date/Publication: 2011-12-05 07:54:16
Date/Publication: 2012-01-08 14:36:08
21 changes: 10 additions & 11 deletions MD5
Original file line number Diff line number Diff line change
@@ -1,18 +1,18 @@
c3a26bf1af10d11934bfaf9016b36ee4 *ChangeLog
ddcb22bc243b90225a67d22bab9fd4a2 *DESCRIPTION
932524dcfd0b46968d41540c57b36dd1 *ChangeLog
5491a2bd2661ac525f6de1a0101c297a *DESCRIPTION
36f1b71474f676f265d9506a3e047d52 *NAMESPACE
3cd43a0a43c3b2bfa4a353146724f0c2 *R/classify_model.R
84d7336bc6a8d866288d526a5cc17669 *R/classify_models.R
e347d703dcd45ee398c797bc581349d6 *R/create_analytics.R
245be5a0fbd19012ac719d16a62c14d0 *R/create_corpus.R
8fc37bf5345dacb9cd1bf00508a090fd *R/create_ensembleSummary.R
ab268cd6d3983f39ce2e20dbe6468575 *R/create_matrix.R
e76c3325472713564a322f55ee471cd7 *R/create_analytics.R
dc78aa0bb79dc2abfbf7cd6c0e4ad331 *R/create_corpus.R
856e5c7c49b9a2ad0a7db41bb0cb3313 *R/create_ensembleSummary.R
261d5d4be20b15118957af6594223518 *R/create_matrix.R
01205062b6f6c78ed1995933a0b25a9f *R/create_precisionRecallSummary.R
6f286a02f218365eb3422b73486511e0 *R/create_scoreSummary.R
9ac6d88431a51ddabba87a237914a9f1 *R/cross_validate.R
20b8c8c6735ef589c873f3112748c684 *R/print_algorithms.R
30c216943a7aae46fa0c5dad76c78be6 *R/read_data.R
1ef9dc06d6bac896944ef0c5b9057459 *R/recall_accuracy.R
69c085a36061266fc57d64560ddccdba *R/recall_accuracy.R
16459d6f214110976e54a24ffd0c0389 *R/train_model.R
1cd5ad901d90775613ccc96121e8dbad *R/train_models.R
942b84479ba6c5ed3ddbe6313bfd6f6c *R/wizard_read_data.R
Expand All @@ -27,16 +27,15 @@ b65d2c93e0ea86ad2f4b7168bf1c1b32 *inst/examples/conference_demo.R
90e025248904b57b5fce0ed6c3d14896 *inst/examples/normal_demo.R
1f49dc90fb2db56d6edce7dbd06821bb *inst/examples/simple_demo.R
196d8d41427ad2a26b04b08963c04143 *man/NYTimes.Rd
334cc7a16a9c729dd9fc884e10f35a4e *man/RTextTools-package.Rd
9d4540420663ce569ac4929d3a171670 *man/USCongress.Rd
bd67b7a512697151fcf54076e066e0ae *man/analytics_container-class.Rd
c9b7d3f49cc7cc7301e5327835ab9a97 *man/analytics_container_virgin-class.Rd
4573775596ab40e89d3bb06ef0e3b3cf *man/classify_model.Rd
6b7a1827febffe47081c3792ed379d71 *man/classify_models.Rd
1dd0091b7dabd38f402c86b6e7765f45 *man/create_analytics.Rd
b2c696dc556bdab67ebf218488058103 *man/create_analytics.Rd
8a0a88d768577ff9902b9d9f1666d0f8 *man/create_corpus.Rd
ebee4e93ea0f3d297b9d755a3ab1bdf0 *man/create_ensembleSummary.Rd
9360a42f553c4ea34083a4fed512bbb8 *man/create_matrix.Rd
c000e977a000a00f1294c3b25ed6a22e *man/create_ensembleSummary.Rd
4067b5b19f5291229e11f3c038c8d46b *man/create_matrix.Rd
5e7e3946d5a3dee69e5b94ad4bbb9e45 *man/create_precisionRecallSummary.Rd
09ec4756277affce1272d9075504b81f *man/create_scoreSummary.Rd
de703794691e874765fb51109f613fe9 *man/cross_validate.Rd
Expand Down
107 changes: 24 additions & 83 deletions R/create_analytics.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
create_analytics <- function(corpus,classification_results,b=1,threshold=NULL) {
create_analytics <- function(corpus,classification_results,b=1) {
create_documentSummary <- function(container, scores) {
return(cbind(MANUAL_CODE=as.numeric(as.vector(container@testing_codes)),CONSENSUS_CODE=scores$BEST_LABEL,CONSENSUS_AGREE=scores$NUM_AGREE,CONSENSUS_INCORRECT=container@testing_codes!=scores$BEST_LABEL,PROBABILITY_CODE=scores$BEST_PROB,PROBABILITY_INCORRECT=container@testing_codes!=scores$BEST_PROB))
}
Expand Down Expand Up @@ -40,100 +40,41 @@ create_analytics <- function(corpus,classification_results,b=1,threshold=NULL) {
create_algorithmSummary <- function(container, scores) {
topic_codes <- unique(container@testing_codes)

bagging_accuracy <- c()
slda_accuracy <- c()
logitboost_accuracy <- c()
svm_accuracy <- c()
forests_accuracy <- c()
glmnet_accuracy <- c()
tree_accuracy <- c()
nnetwork_accuracy <- c()
maxentropy_accuracy <- c()
accuracies <- list()

algorithm_summary <- cbind(TOPIC_CODE=as.numeric(as.vector(topic_codes)))
columns <- colnames(scores)
labels <- c()

for (code in topic_codes) {
if (pmatch("SVM_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$SVM_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
svm_accuracy <- append(svm_accuracy,pct_correct)
for (i in seq(1,length(columns)-3)) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores[,i]]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual

accuracies[[columns[i]]] <- append(accuracies[[columns[i]]],pct_correct)
}

if (pmatch("SLDA_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$SLDA_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
slda_accuracy <- append(slda_accuracy,pct_correct)
}

if (pmatch("LOGITBOOST_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$LOGITBOOST_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
logitboost_accuracy <- append(logitboost_accuracy,pct_correct)
}

if (pmatch("BAGGING_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$BAGGING_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
bagging_accuracy <- append(bagging_accuracy,pct_correct)
}

if (pmatch("FORESTS_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$FORESTS_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
forests_accuracy <- append(forests_accuracy,pct_correct)
}

if (pmatch("GLMNET_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$GLMNET_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
glmnet_accuracy <- append(glmnet_accuracy,pct_correct)
}

if (pmatch("TREE_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$TREE_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
tre_accuracy <- append(tree_accuracy,pct_correct)
}

if (pmatch("NNETWORK_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$NNETWORK_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
nnetwork_accuracy <- append(nnetwork_accuracy,pct_correct)
}

if (pmatch("MAXENTROPY_LABEL",columns,nomatch=0) > 0) {
num_manual <- length(container@testing_codes[container@testing_codes==code])
pct_analysis <- container@testing_codes[container@testing_codes==scores$MAXENTROPY_LABEL]==code
pct_correct <- length(pct_analysis[pct_analysis == TRUE])/num_manual
maxentropy_accuracy <- append(maxentropy_accuracy,pct_correct)
}

}

if (length(bagging_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,BAGGING_ACCURACY=bagging_accuracy*100)
if (length(slda_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,SLDA_ACCURACY=slda_accuracy*100)
if (length(logitboost_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,LOGITBOOST_ACCURACY=logitboost_accuracy*100)
if (length(svm_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,SVM_ACCURACY=svm_accuracy*100)
if (length(forests_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,FORESTS_ACCURACY=forests_accuracy*100)
if (length(glmnet_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,GLMNET_ACCURACY=glmnet_accuracy*100)
if (length(tree_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,TREE_ACCURACY=tree_accuracy*100)
if (length(nnetwork_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,NNETWORK_ACCURACY=nnetwork_accuracy*100)
if (length(maxentropy_accuracy) > 0) algorithm_summary <- cbind(algorithm_summary,MAXENTROPY_ACCURACY=maxentropy_accuracy*100)
for (i in seq(1,length(columns)-3)) {
algorithm_summary <- cbind(algorithm_summary,accuracies[[i]]*100)

label <- paste(strsplit(columns[i],"_")[[1]][1],"_ACCURACY",sep="")
labels <- append(labels,label)
}

colnames(algorithm_summary) <- c("TOPIC_CODE", labels)

return(algorithm_summary)
}

if (is.null(threshold)) threshold <- (ncol(classification_results)/2)
if (corpus@virgin == FALSE) {
#print(system.time(score_summary <- create_scoreSummary(corpus, classification_results)))
#print(system.time(document_summary <- create_documentSummary(corpus, score_summary)))
#print(system.time(topic_summary <- as.data.frame(create_topicSummary(corpus, score_summary))))
#print(system.time(algorithm_summary <- as.data.frame(create_algorithmSummary(corpus, score_summary))))
#print(system.time(statistics_summary <- as.data.frame(create_precisionRecallSummary(corpus, classification_results, b))))

score_summary <- create_scoreSummary(corpus, classification_results)
document_summary <- create_documentSummary(corpus, score_summary)
topic_summary <- as.data.frame(create_topicSummary(corpus, score_summary))
Expand All @@ -148,7 +89,7 @@ create_analytics <- function(corpus,classification_results,b=1,threshold=NULL) {
algorithm_summary <- cbind(statistics_summary, algorithm_summary)
algorithm_summary <- algorithm_summary[,(-ncol(statistics_summary)-1)]

ensemble_summary <- create_ensembleSummary(as.data.frame(raw_summary),threshold=threshold)
ensemble_summary <- create_ensembleSummary(as.data.frame(raw_summary))

container <- new("analytics_container", label_summary=as.data.frame(topic_summary)[,-1], document_summary=as.data.frame(raw_summary), algorithm_summary=as.data.frame(algorithm_summary), ensemble_summary=ensemble_summary)
} else {
Expand Down
2 changes: 1 addition & 1 deletion R/create_corpus.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
create_corpus <- function(matrix,labels,trainSize,testSize,virgin) {
totalSize <- sort(append(trainSize,testSize))
totalSize <- sort(unique(append(trainSize,testSize)))
column_names <- colnames(matrix)
data_matrix <- as.compressed.matrix(matrix[totalSize])

Expand Down
25 changes: 20 additions & 5 deletions R/create_ensembleSummary.R
Original file line number Diff line number Diff line change
@@ -1,6 +1,21 @@
create_ensembleSummary <- function(document_summary, threshold) {
algorithms <- document_summary[document_summary$CONSENSUS_AGREE>=threshold,]
coverage <- paste("Minimum",threshold,"ensemble agreement coverage is",round(dim(algorithms)[1]/dim(document_summary)[1],2))
recall <- paste("Minimum",threshold,"ensemble recall accuracy is", round(recall_accuracy(algorithms$MANUAL_CODE,algorithms$CONSENSUS_CODE),2))
return(rbind(coverage,recall))
create_ensembleSummary <- function(document_summary) {
	# Summarize ensemble performance at each agreement threshold n:
	# coverage (share of documents on which at least n algorithms agree on
	# a label) and recall accuracy within that covered subset.
	#
	# Args:
	#   document_summary: data frame with columns CONSENSUS_AGREE,
	#     MANUAL_CODE, and CONSENSUS_CODE (as produced by create_analytics).
	#
	# Returns:
	#   A numeric matrix with one row per threshold (labeled "n >= k") and
	#   columns "n-ENSEMBLE COVERAGE" and "n-ENSEMBLE RECALL".

	# Hoist loop-invariant quantities out of the loop.
	max_agree <- max(document_summary$CONSENSUS_AGREE)
	total_docs <- nrow(document_summary)

	# Preallocate the result matrix instead of growing a vector with
	# append() on every iteration; seq_len() is also safe if max_agree
	# were ever 0 (1:max_agree would yield c(1, 0)).
	summary <- matrix(NA_real_, nrow = max_agree, ncol = 2,
		dimnames = list(paste("n >=", seq_len(max_agree)),
			c("n-ENSEMBLE COVERAGE", "n-ENSEMBLE RECALL")))

	for (threshold in seq_len(max_agree)) {
		covered <- document_summary[document_summary$CONSENSUS_AGREE >= threshold, ]
		summary[threshold, 1] <- round(nrow(covered) / total_docs, 2)
		summary[threshold, 2] <- round(recall_accuracy(covered$MANUAL_CODE,
			covered$CONSENSUS_CODE), 2)
	}

	return(summary)
}
55 changes: 15 additions & 40 deletions R/create_matrix.R
Original file line number Diff line number Diff line change
@@ -1,48 +1,23 @@
create_matrix <- function(textColumns, language="en", minDocFreq=1, minWordLength=3, ngramLength=0, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, selectFreqTerms=0, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {

stem_words <- function(x, language) {
corpus <- Corpus(VectorSource(x),readerControl=list(language=language))
matrix <- DocumentTermMatrix(corpus,control=control)
tokens <- colnames(matrix)
tokens <- substr(tokens,1,255)
stemmed <- wordStem(tokens,language=language)
return(iconv(paste(stemmed,collapse=" "),to="UTF8",sub="byte"))
}
create_matrix <- function(textColumns, language="english", minDocFreq=1, minWordLength=3, ngramLength=0, removeNumbers=FALSE, removePunctuation=TRUE, removeSparseTerms=0, removeStopwords=TRUE, stemWords=FALSE, stripWhitespace=TRUE, toLower=TRUE, weighting=weightTf) {

select_topFreq <- function(x, language, cutoff, control) {
corpus <- Corpus(VectorSource(x),readerControl=list(language=language))
matrix <- as.matrix(DocumentTermMatrix(corpus,control=control))
termCol <- cbind(colnames(matrix),matrix[1,])
wordDist <- sort(termCol[,2],decreasing=TRUE)
topWords <- rownames(as.matrix(wordDist))[0:cutoff]
if (length(topWords) == 0) return("")
return(iconv(paste(topWords[!is.na(topWords)],collapse=" "),to="UTF8",sub="byte"))
}

tokenize_ngrams <- function(x, n=ngramLength) {
return(rownames(as.data.frame(unclass(textcnt(x,method="string",n=n)))))
stem_words <- function(x) {
split <- strsplit(x," ")
return(wordStem(split[[1]],language=language))
}

tokenize_ngrams <- function(x, n=ngramLength) return(rownames(as.data.frame(unclass(textcnt(x,method="string",n=n)))))

if (class(textColumns) == "character") {
trainingColumn <- textColumns
} else if (class(textColumns) == "matrix") {
trainingColumn <- c()
for (i in 1:ncol(textColumns)) trainingColumn <- paste(trainingColumn,textColumns[,i])
}

if (ngramLength > 0) {
control <- list(weighting=weighting,language=language,tolower=toLower,stopwords=removeStopwords,removePunctuation=removePunctuation,removeNumbers=removeNumbers, stripWhitespace=TRUE, minWordLength=minWordLength , minDocFreq=minDocFreq, tokenize=tokenize_ngrams)
} else {
control <- list(weighting=weighting,language=language,tolower=toLower,stopwords=removeStopwords,removePunctuation=removePunctuation,removeNumbers=removeNumbers, stripWhitespace=TRUE, minWordLength=minWordLength , minDocFreq=minDocFreq)
}
trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),iconv,to="UTF8",sub="byte")

if (stemWords == TRUE) trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),stem_words,language=language)
if (selectFreqTerms > 0) trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),select_topFreq,language=language,cutoff=selectFreqTerms,control=control)
control <- list(language=language,tolower=toLower,removeNumbers=removeNumbers,removePunctuation=removePunctuation,stripWhitespace=stripWhitespace,minWordLength=minWordLength,stopwords=removeStopwords,minDocFreq=minDocFreq,weighting=weighting)

if (ngramLength > 0) control <- append(control,list(tokenize=tokenize_ngrams),after=6)
if (stemWords == TRUE) control <- append(control,list(stemming=stem_words),after=6)

trainingColumn <- apply(as.matrix(textColumns),1,paste,collapse=" ")
trainingColumn <- sapply(as.vector(trainingColumn,mode="character"),iconv,to="UTF8",sub="byte")

corpus <- Corpus(VectorSource(as.vector(trainingColumn,mode="character")),readerControl=list(language=language))
corpus <- Corpus(VectorSource(trainingColumn),readerControl=list(language=language))
matrix <- DocumentTermMatrix(corpus,control=control);
if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix,removeSparseTerms) # Advisable value for removeSparseTerms: 0.9998
if (removeSparseTerms > 0) matrix <- removeSparseTerms(matrix,removeSparseTerms)

gc()
return(matrix)
Expand Down
34 changes: 6 additions & 28 deletions R/recall_accuracy.R
Original file line number Diff line number Diff line change
@@ -1,31 +1,9 @@
recall_accuracy <- function(true_labels, predicted_labels)
{
	# Proportion of documents whose predicted label matches the manually
	# assigned (true) label.
	#
	# Args:
	#   true_labels: vector (or factor) of manually assigned labels.
	#   predicted_labels: vector (or factor) of predicted labels; coerced
	#     to the same mode as true_labels before comparison.
	#
	# Returns:
	#   A single numeric accuracy in [0, 1].
	true_labels <- as.vector(true_labels)
	predicted_labels <- as.vector(predicted_labels, mode = class(true_labels))
	matches <- predicted_labels == true_labels

	# Count only definite matches: sum(..., na.rm = TRUE) treats NA
	# comparisons (e.g. an NA prediction) as misses. The previous
	# length(matches[matches == TRUE]) formulation silently kept NA
	# elements in the subset and counted them as correct.
	accuracy <- sum(matches, na.rm = TRUE) / length(true_labels)
	return(accuracy)
}

0 comments on commit 006ef72

Please sign in to comment.