# Natural Language Processing

# Import Dataset & Libraries

In [13]:
# Install only the packages that are not already available, then attach them.
# Installing unconditionally re-downloads every package on each run, and
# randomForest was attached without ever being installed, which fails on a
# fresh machine.
required_packages <- c("tm", "SnowballC", "caTools", "randomForest", "e1071")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages, repos = "http://cran.us.r-project.org")
}
library(tm)
library(SnowballC)
library(caTools)
library(randomForest)
library(e1071)

# Load the tab-delimited review file. Quoting is disabled so quote characters
# inside the review text are read literally, and text stays as character.
dataset_original <- read.delim(
  "Google_Reviewsss.txt",
  quote = "",
  stringsAsFactors = FALSE
)

# Create The Corpus

In [14]:
# Build a volatile corpus from the review text and run it through the cleaning
# steps in order: lower-case, drop digits, drop punctuation, drop stop words,
# stem, and collapse repeated whitespace.
corpus <- Reduce(
  function(docs, clean) clean(docs),
  list(
    function(docs) tm_map(docs, content_transformer(tolower)),
    function(docs) tm_map(docs, removeNumbers),
    function(docs) tm_map(docs, removePunctuation),
    function(docs) tm_map(docs, removeWords, stopwords()),
    function(docs) tm_map(docs, stemDocument),
    function(docs) tm_map(docs, stripWhitespace)
  ),
  VCorpus(VectorSource(dataset_original$Content))
)

# Create the Document Term Matrix

In [15]:
# Build the document-term matrix and drop terms whose sparsity exceeds the
# 0.999 threshold passed to removeSparseTerms.
dtm <- removeSparseTerms(DocumentTermMatrix(corpus), 0.999)

# Convert to a dense data frame (one column per remaining term) and append
# the label column taken from the raw data.
term_counts <- as.matrix(dtm)
dataset <- as.data.frame(term_counts)
dataset$Thumbs <- dataset_original$Thumbs

# Split Dataset into Training & Test

In [62]:
# Encode the label as a two-level factor and split the rows into training and
# test sets.
dataset$Thumbs <- factor(dataset$Thumbs, levels = c(0, 1))

# Fixed seed so the random split is reproducible.
set.seed(123)

# Rows flagged TRUE go to the training set.
# NOTE(review): SplitRatio = .15 sends only 15% of rows to training and 85% to
# test -- confirm this is intended rather than the reverse.
split <- sample.split(dataset$Thumbs, SplitRatio = .15)

# Index with the logical vector directly. subset() evaluates its condition
# inside the data frame, so a term column that happens to be named "split"
# would silently shadow this vector and select the wrong rows.
training_set <- dataset[split, , drop = FALSE]
test_set <- dataset[!split, , drop = FALSE]

# Implement NLP Model

In [63]:
# Fit a naive Bayes classifier on the term columns and predict the test set.
# The label column is excluded by name rather than by a hard-coded position:
# a fixed index silently goes stale when the number of retained terms changes,
# and an out-of-range negative index would leave the label in as a feature.
feature_columns <- setdiff(names(training_set), "Thumbs")
classifier <- naiveBayes(
  x = training_set[feature_columns],
  y = training_set$Thumbs
)
y_pred <- predict(classifier, newdata = test_set[feature_columns])

# Evaluate Test Predictions (Confusion Matrix)

In [64]:
# Confusion matrix: actual labels in rows, predicted labels in columns.
# The label is read by name; a hard-coded column position can point at a
# term-count column instead of Thumbs when the number of terms changes.
cm <- table(test_set$Thumbs, y_pred)
print(cm)