In [5]:
require(jsonlite)
require(dplyr)
require(tm)

# import commits from JSON
comments <- fromJSON("http://pyro.primeprocessor.com:8888/files/comments.json")

# create corpus
comments <- VCorpus(VectorSource(comments$comment))

# remove puncutation, stopwords, numbers, whitespace
mod <- tm_map(comments, tolower)
mod <- tm_map(mod, removePunctuation)
mod <- tm_map(mod, removeNumbers)
mod <- tm_map(mod, removeWords, stopwords("english"))
mod <- tm_map(mod, stripWhitespace)

mod <- tm_map(mod, PlainTextDocument)
dtm <- DocumentTermMatrix(mod)

In [10]:
dim(dtm)
inspect(dtm[1,1:5])

<<DocumentTermMatrix (documents: 1, terms: 5)>>
Non-/sparse entries: 0/5
Sparsity           : 100%
Maximal term length: 17
Weighting          : term frequency (tf)

              Terms
Docs           aadfabebeacaaeade aaeaeecebfb aaebcffcdafccdbf aaron
  character(0)                 0           0                0     0
              Terms
Docs           abacaacfbbccfb
  character(0)              0


The term-frequency-inverse-document-frequency matrix assigns weights to words in a document according to their frequency of appearance.  To account for the fact that some words appear more frequently in a given corpus (a collection of documents), each term frequency is multiplied by the inverse document frequency.  This tampers the weights.  

In [12]:
dtm_tfxidf <- weightTfIdf(dtm)

In weightTfIdf(dtm): empty document(s): character(0)

In [None]:
term_wt <- apply(dtm_tfxidf,2,sum)
length(term_wt[which(term_wt>2.5)])

dtm_tfxidf <- dtm_tfxidf[,term_wt>2.5]

# Determine optimal number of clusters
tss <- c()
for (i in 1:50) {
  fit <- kmeans(dtm_tfxidf, i)
  tss[i] <- sum(fit$tot.withinss)
}

# no elbow in chart, pick k = 10
plot(tss, main="Optimal K's", ylab="Total SS Within", xlab="# of ks")

fit <- kmeans(dtm_tfxidf, 10)
clusters <- fit$cluster

clust.df <- as.matrix(dtm_tfxidf)
clust.df<- as.data.frame(clust.df)
all <- cbind(fit$cluster,clust.df)

names(all)[1] <- "cluster"

# Find top 5 words by weight for each cluster
freq_terms <- list()
for (i in 1:10) {
  selection <- all %>% filter(cluster==i)
  vector <- colSums(selection[,-1]) %>% sort(., decreasing=T)
  freq_terms[[i]] <- vector[1:5]
}

