# Natural Language Processing

In [0]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importing the dataset
dataset = pd.read_csv('drive/My Drive/ML - AZ Course/Data/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

# Cleaning the texts

In [2]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []
for i in range(0, 1000):
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    review = review.lower()
    review = review.split()
    ps = PorterStemmer()
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Creating the Bag of Words model

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values

## Splitting the dataset into the Training set and Test set
### Training the model:
1. Naive Bayes
2. Random Forest Classifier
3. KNeighbors Classifier

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

In [5]:
# Generating the Confusion Matrix for NB
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[55 42]
 [12 91]]


In [0]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)

y_pred_rfc = rfc.predict(X_test)

In [12]:
#Confusion matrix for RFC
cm_rfc = confusion_matrix(y_test, y_pred_rfc)
print(cm_rfc)

[[86 11]
 [51 52]]


In [0]:
from sklearn.neighbors import KNeighborsClassifier

knc = KNeighborsClassifier()
knc.fit(X_train, y_train)

y_pred_knc = knc.predict(X_test)

In [17]:
#Confusion matrix for KNV
cm_knc = confusion_matrix(y_test, y_pred_knc)
print(cm_knc)

[[74 23]
 [55 48]]


# Evaluate the performance of each model.

* Accuracy = (TP + TN) / (TP + TN + FP + FN)

* Precision = TP / (TP + FP)

* Recall = TP / (TP + FN)

* F1 Score = 2 * Precision * Recall / (Precision + Recall)

In [9]:
# Naive Bayes
nb_acc = (55+91) / 200
nb_pre = 55 / (55+42)
nb_rec = 55 / (55+12)
nb_f1 = 2 * nb_pre * nb_rec / (nb_pre+nb_rec)

print(f"Naive Bayes performance: \n Accuracy: {nb_acc} \n Precision: {nb_pre} \n Recall: {nb_rec} \n F1_score: {nb_f1}")

Naive Bayes performance: 
 Accuracy: 0.73 
 Precision: 0.5670103092783505 
 Recall: 0.8208955223880597 
 F1_score: 0.6707317073170731


In [13]:
# Random Forest
nb_acc_rfc = (86+52) / 200
nb_pre_rfc = 86 / (86+11)
nb_rec_rfc = 86 / (86+51)
nb_f1_rfc = 2 * nb_pre_rfc * nb_rec_rfc / (nb_pre_rfc+nb_rec_rfc)

print(f"Random Forest performance: \n Accuracy: {nb_acc_rfc} \n Precision: {nb_pre_rfc} \n Recall: {nb_rec_rfc} \n F1_score: {nb_f1_rfc}")

Random Forest performance: 
 Accuracy: 0.69 
 Precision: 0.8865979381443299 
 Recall: 0.6277372262773723 
 F1_score: 0.7350427350427351


In [18]:
# KNeighbors
nb_acc_knc = (74+48) / 200
nb_pre_knc = 74 / (74+23)
nb_rec_knc = 74 / (74+55)
nb_f1_knc = 2 * nb_pre_knc * nb_rec_knc / (nb_pre_knc+nb_rec_knc)

print(f"KNeighbors performance: \n Accuracy: {nb_acc_knc} \n Precision: {nb_pre_knc} \n Recall: {nb_rec_knc} \n F1_score: {nb_f1_knc}")

KNeighbors performance: 
 Accuracy: 0.61 
 Precision: 0.7628865979381443 
 Recall: 0.5736434108527132 
 F1_score: 0.654867256637168
