## Loading Dataset

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


## Cleaning The Texts

In [4]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

corpus = []

for i in range(0,1000):
    review = re.sub('[^a-zA-Z]', ' ', data['Review'][i])
    review = review.lower()
    review = review.split()
    all_stopwords = stopwords.words('english')
    all_stopwords.remove('not')
    review = [ps.stem(word) for word in review]
    review = ' '.join(review)
    corpus.append(review)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/graciaeangelica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
print(corpus)

['wow love thi place', 'crust is not good', 'not tasti and the textur wa just nasti', 'stop by dure the late may bank holiday off rick steve recommend and love it', 'the select on the menu wa great and so were the price', 'now i am get angri and i want my damn pho', 'honeslti it didn t tast that fresh', 'the potato were like rubber and you could tell they had been made up ahead of time be kept under a warmer', 'the fri were great too', 'a great touch', 'servic wa veri prompt', 'would not go back', 'the cashier had no care what so ever on what i had to say it still end up be wayyy overpr', 'i tri the cape cod ravoli chicken with cranberri mmmm', 'i wa disgust becaus i wa pretti sure that wa human hair', 'i wa shock becaus no sign indic cash onli', 'highli recommend', 'waitress wa a littl slow in servic', 'thi place is not worth your time let alon vega', 'did not like at all', 'the burritto blah', 'the food amaz', 'servic is also cute', 'i could care less the interior is just beauti', 's

## Creating the Bag of Words

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x = cv.fit_transform(corpus).toarray()
y = data.iloc[:, -1].values

## Splitting dataset into training and test set

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.2, random_state = 0)

## Applying Machine Learning Algorithm

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline

In [10]:
pipeline_lr = Pipeline([('LogisticRegression', LogisticRegression())])
pipeline_svc = Pipeline([('SVC', SVC())])
pipeline_knn = Pipeline([('KNN', KNeighborsClassifier())])
pipeline_nb = Pipeline([('NaiveBayes', GaussianNB())])
pipeline_dt = Pipeline([('DecisionTree', DecisionTreeClassifier(criterion = 'entropy', random_state=0))])
pipeline_rf = Pipeline([('RandomForest', RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=0))])
pipeline_cat = Pipeline([('CatBoost', CatBoostClassifier())])

In [11]:
pipelines = [pipeline_lr, pipeline_svc, pipeline_knn, pipeline_nb, pipeline_dt, pipeline_rf, pipeline_cat]

In [12]:
best_score = 0.0
best_classifier = 0
best_pipeline = ""

In [13]:
pipe_dict = {0:'LogisticRegression', 1:'SVC', 2:'KNN', 3:'NaiveBayes', 4:'DecisionTree', 5: 'RandomForest', 6:'CatBoost'}

for pipe in pipelines:
    pipe.fit(x_train,y_train)

Learning rate set to 0.009366
0:	learn: 0.6914601	total: 62.1ms	remaining: 1m 2s
1:	learn: 0.6893549	total: 69.9ms	remaining: 34.9s
2:	learn: 0.6873765	total: 78.8ms	remaining: 26.2s
3:	learn: 0.6852659	total: 86.7ms	remaining: 21.6s
4:	learn: 0.6831246	total: 94.6ms	remaining: 18.8s
5:	learn: 0.6810082	total: 103ms	remaining: 17s
6:	learn: 0.6791623	total: 111ms	remaining: 15.7s
7:	learn: 0.6767232	total: 122ms	remaining: 15.1s
8:	learn: 0.6749497	total: 140ms	remaining: 15.4s
9:	learn: 0.6731558	total: 160ms	remaining: 15.9s
10:	learn: 0.6719143	total: 207ms	remaining: 18.6s
11:	learn: 0.6707482	total: 263ms	remaining: 21.6s
12:	learn: 0.6683824	total: 294ms	remaining: 22.3s
13:	learn: 0.6664571	total: 313ms	remaining: 22.1s
14:	learn: 0.6646795	total: 329ms	remaining: 21.6s
15:	learn: 0.6633934	total: 372ms	remaining: 22.9s
16:	learn: 0.6619967	total: 416ms	remaining: 24s
17:	learn: 0.6605733	total: 455ms	remaining: 24.8s
18:	learn: 0.6594338	total: 480ms	remaining: 24.8s
19:	learn:

161:	learn: 0.5562415	total: 2.46s	remaining: 12.7s
162:	learn: 0.5557299	total: 2.47s	remaining: 12.7s
163:	learn: 0.5553557	total: 2.48s	remaining: 12.6s
164:	learn: 0.5550261	total: 2.49s	remaining: 12.6s
165:	learn: 0.5545758	total: 2.5s	remaining: 12.6s
166:	learn: 0.5544015	total: 2.51s	remaining: 12.5s
167:	learn: 0.5539432	total: 2.52s	remaining: 12.5s
168:	learn: 0.5534952	total: 2.54s	remaining: 12.5s
169:	learn: 0.5532741	total: 2.55s	remaining: 12.4s
170:	learn: 0.5529420	total: 2.56s	remaining: 12.4s
171:	learn: 0.5526655	total: 2.57s	remaining: 12.4s
172:	learn: 0.5524148	total: 2.58s	remaining: 12.3s
173:	learn: 0.5516301	total: 2.59s	remaining: 12.3s
174:	learn: 0.5512748	total: 2.6s	remaining: 12.2s
175:	learn: 0.5504474	total: 2.6s	remaining: 12.2s
176:	learn: 0.5501508	total: 2.62s	remaining: 12.2s
177:	learn: 0.5496317	total: 2.62s	remaining: 12.1s
178:	learn: 0.5491448	total: 2.63s	remaining: 12.1s
179:	learn: 0.5480311	total: 2.64s	remaining: 12s
180:	learn: 0.547

332:	learn: 0.4977576	total: 4.25s	remaining: 8.51s
333:	learn: 0.4975399	total: 4.26s	remaining: 8.5s
334:	learn: 0.4973355	total: 4.28s	remaining: 8.49s
335:	learn: 0.4970786	total: 4.29s	remaining: 8.48s
336:	learn: 0.4968154	total: 4.3s	remaining: 8.45s
337:	learn: 0.4965860	total: 4.31s	remaining: 8.43s
338:	learn: 0.4962193	total: 4.31s	remaining: 8.41s
339:	learn: 0.4960266	total: 4.33s	remaining: 8.4s
340:	learn: 0.4954190	total: 4.34s	remaining: 8.39s
341:	learn: 0.4952225	total: 4.35s	remaining: 8.36s
342:	learn: 0.4949642	total: 4.36s	remaining: 8.35s
343:	learn: 0.4948023	total: 4.37s	remaining: 8.32s
344:	learn: 0.4945893	total: 4.38s	remaining: 8.31s
345:	learn: 0.4943079	total: 4.38s	remaining: 8.29s
346:	learn: 0.4940778	total: 4.39s	remaining: 8.26s
347:	learn: 0.4934896	total: 4.4s	remaining: 8.24s
348:	learn: 0.4931538	total: 4.41s	remaining: 8.22s
349:	learn: 0.4929734	total: 4.42s	remaining: 8.2s
350:	learn: 0.4927741	total: 4.42s	remaining: 8.18s
351:	learn: 0.492

499:	learn: 0.4581489	total: 5.65s	remaining: 5.65s
500:	learn: 0.4579663	total: 5.66s	remaining: 5.64s
501:	learn: 0.4576345	total: 5.67s	remaining: 5.63s
502:	learn: 0.4574286	total: 5.68s	remaining: 5.61s
503:	learn: 0.4573173	total: 5.69s	remaining: 5.6s
504:	learn: 0.4571053	total: 5.69s	remaining: 5.58s
505:	learn: 0.4569538	total: 5.7s	remaining: 5.57s
506:	learn: 0.4567209	total: 5.71s	remaining: 5.55s
507:	learn: 0.4565759	total: 5.72s	remaining: 5.54s
508:	learn: 0.4563175	total: 5.72s	remaining: 5.52s
509:	learn: 0.4561914	total: 5.73s	remaining: 5.51s
510:	learn: 0.4558751	total: 5.74s	remaining: 5.49s
511:	learn: 0.4555892	total: 5.75s	remaining: 5.48s
512:	learn: 0.4553226	total: 5.75s	remaining: 5.46s
513:	learn: 0.4549907	total: 5.76s	remaining: 5.45s
514:	learn: 0.4548354	total: 5.77s	remaining: 5.43s
515:	learn: 0.4546840	total: 5.78s	remaining: 5.42s
516:	learn: 0.4545484	total: 5.79s	remaining: 5.41s
517:	learn: 0.4543051	total: 5.79s	remaining: 5.39s
518:	learn: 0.

676:	learn: 0.4208851	total: 7.04s	remaining: 3.36s
677:	learn: 0.4207143	total: 7.05s	remaining: 3.35s
678:	learn: 0.4205637	total: 7.07s	remaining: 3.34s
679:	learn: 0.4203676	total: 7.07s	remaining: 3.33s
680:	learn: 0.4200805	total: 7.08s	remaining: 3.32s
681:	learn: 0.4199069	total: 7.09s	remaining: 3.31s
682:	learn: 0.4196426	total: 7.1s	remaining: 3.3s
683:	learn: 0.4193917	total: 7.11s	remaining: 3.29s
684:	learn: 0.4192533	total: 7.12s	remaining: 3.28s
685:	learn: 0.4190192	total: 7.13s	remaining: 3.26s
686:	learn: 0.4187475	total: 7.14s	remaining: 3.25s
687:	learn: 0.4185733	total: 7.15s	remaining: 3.24s
688:	learn: 0.4184664	total: 7.15s	remaining: 3.23s
689:	learn: 0.4183151	total: 7.16s	remaining: 3.22s
690:	learn: 0.4181355	total: 7.17s	remaining: 3.21s
691:	learn: 0.4179997	total: 7.18s	remaining: 3.19s
692:	learn: 0.4178556	total: 7.18s	remaining: 3.18s
693:	learn: 0.4177196	total: 7.19s	remaining: 3.17s
694:	learn: 0.4173441	total: 7.2s	remaining: 3.16s
695:	learn: 0.4

852:	learn: 0.3799772	total: 8.43s	remaining: 1.45s
853:	learn: 0.3798074	total: 8.44s	remaining: 1.44s
854:	learn: 0.3795830	total: 8.45s	remaining: 1.43s
855:	learn: 0.3792810	total: 8.46s	remaining: 1.42s
856:	learn: 0.3790055	total: 8.47s	remaining: 1.41s
857:	learn: 0.3786388	total: 8.47s	remaining: 1.4s
858:	learn: 0.3784095	total: 8.48s	remaining: 1.39s
859:	learn: 0.3779668	total: 8.49s	remaining: 1.38s
860:	learn: 0.3777121	total: 8.5s	remaining: 1.37s
861:	learn: 0.3775793	total: 8.5s	remaining: 1.36s
862:	learn: 0.3773469	total: 8.51s	remaining: 1.35s
863:	learn: 0.3771755	total: 8.52s	remaining: 1.34s
864:	learn: 0.3769159	total: 8.53s	remaining: 1.33s
865:	learn: 0.3767360	total: 8.53s	remaining: 1.32s
866:	learn: 0.3766153	total: 8.54s	remaining: 1.31s
867:	learn: 0.3764218	total: 8.55s	remaining: 1.3s
868:	learn: 0.3762660	total: 8.55s	remaining: 1.29s
869:	learn: 0.3760219	total: 8.56s	remaining: 1.28s
870:	learn: 0.3757841	total: 8.57s	remaining: 1.27s
871:	learn: 0.37

In [14]:
for i,model in enumerate (pipelines):
    print("{}, Accuracy Score: {}".format(pipe_dict[i],model.score(x_test,y_test)))

LogisticRegression, Accuracy Score: 0.825
SVC, Accuracy Score: 0.81
KNN, Accuracy Score: 0.645
NaiveBayes, Accuracy Score: 0.73
DecisionTree, Accuracy Score: 0.71
RandomForest, Accuracy Score: 0.83
CatBoost, Accuracy Score: 0.805


In [15]:
for i, model in enumerate(pipelines):
    if model.score(x_test,y_test) > best_score:
        best_score = model.score(x_test,y_test)
        best_pipeline = model
        best_classifier = i

print('Classifier with best score: {}'.format(pipe_dict[best_classifier]))

Classifier with best score: RandomForest


In [16]:
classifier = RandomForestClassifier(n_estimators=500, criterion='entropy', random_state=0)
classifier.fit(x_train, y_train)

RandomForestClassifier(criterion='entropy', n_estimators=500, random_state=0)

In [17]:
y_pred = classifier.predict(x_test)

In [18]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy: {0:.2f}%'.format(100 * accuracy_score(y_test, y_pred)))
print(classification_report(y_test, y_pred))

[[86 11]
 [23 80]]
Accuracy: 83.00%
              precision    recall  f1-score   support

           0       0.79      0.89      0.83        97
           1       0.88      0.78      0.82       103

    accuracy                           0.83       200
   macro avg       0.83      0.83      0.83       200
weighted avg       0.84      0.83      0.83       200



## Predicting New Single Review

### Positive Review
**Review**: I love this restaurant so much 

In [19]:
new_review = 'I love this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[1]


### Negative Review
**Review**: I hate this restaurant so much 

In [21]:
new_review = 'I hate this restaurant so much'
new_review = re.sub('[^a-zA-Z]', ' ', new_review)
new_review = new_review.lower()
new_review = new_review.split()
ps = PorterStemmer()
all_stopwords = stopwords.words('english')
all_stopwords.remove('not')
new_review = [ps.stem(word) for word in new_review if not word in set(all_stopwords)]
new_review = ' '.join(new_review)
new_corpus = [new_review]
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = classifier.predict(new_X_test)
print(new_y_pred)

[0]
