# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
import csv

# The reviews are stored tab-separated because the review text itself
# contains commas, so a comma could not serve as the field delimiter.
# csv.QUOTE_NONE (the named form of the literal 3) makes pandas treat
# quotation marks as ordinary characters instead of field quoting.
dataset = pd.read_csv("Restaurant_Reviews.tsv", delimiter="\t", quoting=csv.QUOTE_NONE)



## Cleaning the texts

In [None]:
import re
# regex package
import nltk
# a package of NLP tools
nltk.download("stopwords")
from nltk.corpus import stopwords
# stopwords are words that do not benefit bag of words analysis like "a", "the", "it" etc.
from nltk.stem.porter import PorterStemmer
# a class for getting rid of inflectional affixes so each stem gets just one entry in the sparse matrix

# Everything below is identical for every review, so it is built once
# up front instead of being recreated on each loop iteration.
ps = PorterStemmer()
all_stopwords = stopwords.words("english")
# "not" carries sentiment ("not good"), so it must survive stopword removal
all_stopwords.remove("not")
# a set gives constant-time membership tests for the per-word filter
stopword_set = set(all_stopwords)
non_letters = re.compile(r"[^a-zA-Z]")

corpus = []
# will contain the cleaned dataset
# Iterate over every row rather than a hard-coded count, so the corpus
# always has exactly one entry per row of the dataset.
for raw_review in dataset["Review"]:
    # replace all non-alphabet characters with a space, lower-case the
    # text, then split it into individual words
    words = non_letters.sub(" ", raw_review).lower().split()
    # stem every word that is not an English stopword and join the
    # stems back together with single spaces
    review = " ".join(ps.stem(word) for word in words if word not in stopword_set)
    corpus.append(review)

In [None]:
# Show the cleaned reviews (one stemmed, stopword-free string per review)
# as a sanity check of the cleaning step.
print(corpus)

## Creating the Bag of Words model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Author's note: the full vocabulary reportedly has 1566 tokens, so a cap of
# 1500 drops only a few of them; a much smaller cap (around 500) was observed
# to be the sweet spot for this data.
cv = CountVectorizer(max_features=1500)

# Feature matrix: one row per review, one column per kept token,
# converted from the sparse result to a dense array.
X = cv.fit_transform(corpus).toarray()

# Labels: the last column of the dataset.
y = dataset.iloc[:, -1].to_numpy()


In [None]:
# Number of columns in the feature matrix (features per review)
X.shape[1]

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing (250 of the 1000 reviews);
# random_state is fixed so the split is reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

## Training the Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian Naive Bayes classifier with default settings
classifier = GaussianNB()
# Fit it on the training split only; the test split stays unseen
classifier.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
y_pred = classifier.predict(X_test)
# Two-column view for eyeballing results:
# predictions on the left, real labels on the right
side_by_side = np.column_stack((y_pred, y_test))
print(side_by_side)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix: rows are the true labels, columns the predicted labels
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Fraction of test reviews classified correctly (shown as the cell output)
accuracy_score(y_test, y_pred)