# Importing modules 

In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# Loading the dataset

In [2]:
# Tab-separated reviews file. quoting=3 is csv.QUOTE_NONE, so quote
# characters inside a review are read as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

# Cleaning the data

In [10]:
# Negation words are kept out of the stop-word list: dropping "not" or
# "never" would erase the signal that separates positive from negative reviews.
# NOTE(review): the cleaning loop below replaces apostrophes with spaces before
# tokenising, so the contraction entries here ("isn't", "don't", ...) can never
# match a token. If their fragments ("isn", "don", "t") are in NLTK's list they
# are still filtered out -- confirm whether that is intended.
stopwords_to_remove = [
    'no', 'not', 'nor', 'never', 'none', 'nothing', 'nowhere', 'nobody',
    "isn't", "aren't", "wasn't", "weren't",
    "haven't", "hasn't", "hadn't",
    "won't", "wouldn't", "shouldn't",
    "can't", "cannot", "couldn't",
    "don't", "doesn't", "didn't",
    "mustn't"
]
negations = set(stopwords_to_remove)
all_stopwords = [sw for sw in stopwords.words('english') if sw not in negations]

# Built once: membership tests against a set are O(1), and neither the set nor
# the stemmer depends on the review being processed.
stopword_set = set(all_stopwords)
ps = PorterStemmer()

corpus = []
# Iterate over every row rather than a hard-coded 1000 so that the corpus
# always has one entry per row of the frame the labels are read from.
for raw_review in dataset['Review']:
    # Keep ASCII letters only. The range must be A-Z: 'A-z' also spans the
    # punctuation between 'Z' and 'a' ([ \ ] ^ _ `), which would survive.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

# Creating the Bag of Words model

In [16]:
# Labels are the last column of the loaded frame.
y = dataset.iloc[:, -1].values

# Fit the vectorizer on the cleaned corpus, then convert the resulting
# document-term counts to a dense array for the models below.
cv = CountVectorizer()
doc_term_counts = cv.fit_transform(corpus)
X = doc_term_counts.toarray()

In [25]:
# Sanity check: show the count vector of the first review.
first_vector = X[0]
print(first_vector)

[0 0 0 ... 0 0 0]


# Defining the training and evaluation helper

In [39]:
def modeling(model, X, y, test_size=0.2, random_state=42):
    """Fit ``model`` on a train split of ``(X, y)`` and print test-set metrics.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place on the training split.
    X : array-like
        Feature matrix, split row-wise by ``train_test_split``.
    y : array-like
        Labels aligned with the rows of ``X``. Precision, recall and F1 are
        computed with scikit-learn's default settings -- presumably binary
        labels; confirm before using with multi-class data.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Seed forwarded to ``train_test_split`` so every model is scored on
        the same held-out rows.

    Returns
    -------
    None
        Accuracy, precision, recall, F1 and the confusion matrix are printed.
    """
    # train-test splitting
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # training
    model.fit(X_train, y_train)

    # predicting
    y_pred = model.predict(X_test)

    print('Results:')
    print('1. Accuracy -->', accuracy_score(y_test, y_pred))
    print('2. Precision -->', precision_score(y_test, y_pred))
    print('3. Recall -->', recall_score(y_test, y_pred))
    print('4. F1-score -->', f1_score(y_test, y_pred))
    print('--------------------------------')
    print('Confusion Matrix:\n', confusion_matrix(y_test, y_pred))

# Training and evaluating the classifiers

In [40]:
# Gaussian Naive Bayes on the bag-of-words counts.
model = GaussianNB()
modeling(model=model, X=X, y=y)

Results:
1. Accuracy --> 0.67
2. Precision --> 0.6417910447761194
3. Recall --> 0.8269230769230769
4. F1-score --> 0.7226890756302521
--------------------------------
Confusion Matrix:
 [[48 48]
 [18 86]]


In [42]:
# Multinomial Naive Bayes on the bag-of-words counts.
model = MultinomialNB()
modeling(model=model, X=X, y=y)

Results:
1. Accuracy --> 0.76
2. Precision --> 0.8043478260869565
3. Recall --> 0.7115384615384616
4. F1-score --> 0.7551020408163265
--------------------------------
Confusion Matrix:
 [[78 18]
 [30 74]]


In [45]:
# Decision tree with default settings (no random_state is passed here).
model = DecisionTreeClassifier()
modeling(model=model, X=X, y=y)

Results:
1. Accuracy --> 0.75
2. Precision --> 0.7872340425531915
3. Recall --> 0.7115384615384616
4. F1-score --> 0.7474747474747475
--------------------------------
Confusion Matrix:
 [[76 20]
 [30 74]]


In [53]:
# Support vector classifier with a linear kernel.
model = SVC(kernel='linear')
modeling(model=model, X=X, y=y)

Results:
1. Accuracy --> 0.76
2. Precision --> 0.8043478260869565
3. Recall --> 0.7115384615384616
4. F1-score --> 0.7551020408163265
--------------------------------
Confusion Matrix:
 [[78 18]
 [30 74]]


In [55]:
# Logistic regression with a fixed random_state.
model = LogisticRegression(random_state=42)
modeling(model=model, X=X, y=y)

Results:
1. Accuracy --> 0.78
2. Precision --> 0.8488372093023255
3. Recall --> 0.7019230769230769
4. F1-score --> 0.7684210526315789
--------------------------------
Confusion Matrix:
 [[83 13]
 [31 73]]
