### Importing the libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Importing the dataset

In [2]:
# Tab-separated input. quoting=3 is csv.QUOTE_NONE, so quote characters inside
# a review are kept as literal text instead of being parsed as field quoting.
df = pd.read_csv(
    'Data/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Cleaning the text

In [3]:
import re
import nltk
# Fetch the NLTK stop-word corpus; stopwords.words('english') in the cleaning
# code below reads from it.
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Elvin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [7]:
%%time
corpus = []

# The stemmer and the stop-word set do not change between reviews, so build
# them once instead of once per review (and, for the set, once per word).
ps = PorterStemmer()
# 'not' is dropped from the stop words so negations such as "not good" survive
# cleaning. A set gives constant-time membership tests in the filter below.
all_stopwords = set(stopwords.words('english')) - {'not'}

# Iterate over the column itself rather than range(1000), so the loop follows
# the actual number of rows in df.
for raw_review in df['Review']:
    # Replace every non-letter with a space, then lower-case and tokenise.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower().split()
    # Drop stop words and reduce the remaining words to their Porter stems.
    review = [ps.stem(word) for word in review if word not in all_stopwords]
    review = ' '.join(review)
    corpus.append(review)

Wall time: 697 ms


In [43]:
# Peek at the first five cleaned reviews.
corpus[:5]

['wow love place',
 'crust not good',
 'not tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [4]:
# my cleaning function
def clean_text(text):
    """Normalise one raw review into a space-joined string of stemmed words.

    Steps: replace every non-letter character with a space, lower-case,
    split on whitespace, drop English stop words (keeping 'not'), and
    Porter-stem what remains.

    Parameters
    ----------
    text : str
        The raw review text.

    Returns
    -------
    str
        The cleaned words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    ps = PorterStemmer()
    # Build the stop-word set once per call; the original rebuilt it for every
    # word. 'not' is excluded so negations are kept in the cleaned text.
    all_stopwords = set(stopwords.words('english')) - {'not'}
    words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
    return ' '.join(ps.stem(word) for word in words if word not in all_stopwords)

In [5]:
%%time
# Run clean_text on every review; the resulting Series is only displayed here,
# it is not assigned to anything.
df['Review'].apply(clean_text)

Wall time: 579 ms


0                                         wow love place
1                                         crust not good
2                                 not tasti textur nasti
3      stop late may bank holiday rick steve recommen...
4                                select menu great price
                             ...                        
995                        think food flavor textur lack
996                               appetit instantli gone
997                 overal not impress would not go back
998    whole experi underwhelm think go ninja sushi n...
999    wast enough life pour salt wound draw time too...
Name: Review, Length: 1000, dtype: object

### Creating the Bag of Words model

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

# Cap the vocabulary at 1500 terms and turn the cleaned reviews into a dense
# count matrix.
# NOTE(review): the vocabulary is fitted on the whole corpus, i.e. before the
# train/test split further down.
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
# The label is taken from the last column of df.
y = df.iloc[:, -1].to_numpy()
(X.shape, y.shape)

((1000, 1500), (1000,))

### Splitting the dataset into the Training set and Test set

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 1500), (200, 1500), (800,), (200,))

### Training the Naive Bayes model on the Training set

In [123]:
from sklearn.naive_bayes import GaussianNB

# fit() returns the estimator itself, so construction and training chain into
# one assignment; the bare name on the last line displays the fitted model.
clf = GaussianNB().fit(X_train, y_train)
clf

GaussianNB(priors=None, var_smoothing=1e-09)

### Predicting the Test results

In [124]:
# Predict labels for the held-out test rows with the model currently bound to clf.
y_pred = clf.predict(X_test)

### Making the Confusion Matrix

In [11]:
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)
from sklearn.model_selection import cross_val_score

# Report hold-out metrics for whichever model clf / y_pred currently refer to.
print(confusion_matrix(y_test, y_pred))
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))
# 5-fold cross-validation runs over the full X, y, not just the training split.
print('Cross Val:', cross_val_score(clf, X, y, cv=5).mean())

[[89  8]
 [36 67]]
Accuracy: 0.78
Precision: 0.8933333333333333
Recall: 0.6504854368932039
F1 Score: 0.752808988764045
Cross Val: 0.8019999999999999


In [131]:
from sklearn.ensemble import RandomForestClassifier

# Train a 110-tree random forest (seeded for repeatability) and predict the
# held-out rows. This rebinds clf and y_pred.
clf = RandomForestClassifier(n_estimators=110, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))
# 5-fold cross-validation runs over the full X, y, not just the training split.
print('Cross Val:', cross_val_score(clf, X, y, cv=5).mean())

[[91  6]
 [37 66]]
Accuracy: 0.785
Precision: 0.9166666666666666
Recall: 0.6407766990291263
F1 Score: 0.7542857142857143
Cross Val: 0.788


In [12]:
from sklearn.svm import SVC

# Train a support-vector classifier with default settings and predict the
# held-out rows. This rebinds clf and y_pred.
clf = SVC().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))
# 5-fold cross-validation runs over the full X, y, not just the training split.
print('Cross Val:', cross_val_score(clf, X, y, cv=5).mean())

[[89  8]
 [36 67]]
Accuracy: 0.78
Precision: 0.8933333333333333
Recall: 0.6504854368932039
F1 Score: 0.752808988764045
Cross Val: 0.8019999999999999


In [133]:
from sklearn.tree import DecisionTreeClassifier

# Train a seeded decision tree and predict the held-out rows. This rebinds clf
# and y_pred, so the cells below use this model when run top to bottom.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(confusion_matrix(y_test, y_pred))
for label, metric in (
    ('Accuracy:', accuracy_score),
    ('Precision:', precision_score),
    ('Recall:', recall_score),
    ('F1 Score:', f1_score),
):
    print(label, metric(y_test, y_pred))
# 5-fold cross-validation runs over the full X, y, not just the training split.
print('Cross Val:', cross_val_score(clf, X, y, cv=5).mean())

[[77 20]
 [33 70]]
Accuracy: 0.735
Precision: 0.7777777777777778
Recall: 0.6796116504854369
F1 Score: 0.7253886010362696
Cross Val: 0.756


### Predicting if a single review is positive or negative

In [120]:
# my implementation
# Each sample is written once and reused for both the model input and the
# printed label, so the two can no longer drift apart.
# NOTE(review): clf is whichever model was fitted most recently in the kernel.
samples = ['just fantastic, absolutely fantastic',
           'hate it, absolutely hate it']
# New text must go through the same cleaning and the already-fitted vectorizer
# (transform, not fit_transform) that produced the training features.
my_y_pred = clf.predict(cv.transform([clean_text(s) for s in samples]).toarray())
for sample, label in zip(samples, my_y_pred):
    print(f'{sample}: {label}')

just fantastic, absolutely fantastic: 1
hate it, absolutely hate it: 0


In [122]:
# my implementation
# Each sample is written once and reused for both the model input and the
# printed label, so the two can no longer drift apart.
# NOTE(review): clf is whichever model was fitted most recently in the kernel.
samples = ['I love this restaurant so much',
           'I hate this restaurant so much']
# New text must go through the same cleaning and the already-fitted vectorizer
# (transform, not fit_transform) that produced the training features.
my_y_pred = clf.predict(cv.transform([clean_text(s) for s in samples]).toarray())
for sample, label in zip(samples, my_y_pred):
    print(f'{sample}: {label}')

I love this restaurant so much: 1
I hate this restaurant so much: 0


In [13]:
# Clean the two ad-hoc reviews, vectorise them with the fitted CountVectorizer
# and display the predicted labels.
cleaned_reviews = [
    clean_text(raw)
    for raw in ('food are delicious',
                'i do not like the arangment of the tables')
]
clf.predict(cv.transform(cleaned_reviews).toarray())

array([1, 0], dtype=int64)