# Restaurant Reviews Classification with NLTK

In [1]:
import os

# Location of the Restaurant_Reviews.tsv dataset (tab-separated, columns: Review, Liked).
# Set the RESTAURANT_REVIEWS_TSV environment variable to point at your own copy;
# when it is not set, the original author's local path is used as the default.
file_data = os.environ.get(
    'RESTAURANT_REVIEWS_TSV',
    '/home/al/Projects_My/NLP-russian-language/datasets/Restaurant_Reviews.tsv',
)

In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# Read the tab-separated reviews file into a DataFrame and print its first ten rows.
dataset = pd.read_csv(
    file_data,
    sep='\t',
)
print(dataset.head(n=10))

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
5     Now I am getting angry and I want my damn pho.      0
6              Honeslty it didn't taste THAT fresh.)      0
7  The potatoes were like rubber and you could te...      0
8                          The fries were great too.      1
9                                     A great touch.      1


In [4]:
# Class balance check: how many reviews carry each value of the Liked label.
dataset.Liked.value_counts()

1    500
0    500
Name: Liked, dtype: int64

Cleaning Text Data

In [5]:
import nltk
import re

In [6]:
# Fetch the NLTK stop-word corpus; NLTK reports the package as up to date
# when a current local copy already exists.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/al/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords

In [8]:
# Walk-through on the first review: replace every character that is not an
# ASCII letter (punctuation, digits, ...) with a single space.
review = re.compile('[^a-zA-Z]').sub(' ', dataset['Review'][0])

In [9]:
review

'Wow    Loved this place '

In [10]:
# Lowercase the text so its tokens can match the all-lowercase NLTK stop-word list.
review = review.lower()

In [11]:
# Split on whitespace, turning the string into a list of word tokens.
review = review.split()

In [12]:
review

['wow', 'loved', 'this', 'place']

In [13]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [14]:
# Load the English stop-word list once into a set. The previous form called
# stopwords.words('english') again for every token and searched the result
# as a list; a set gives the same filtering with constant-time lookups.
english_stopwords = set(stopwords.words('english'))
# Keep only the tokens that are not stop words.
review = [word for word in review if word not in english_stopwords]

In [15]:
review

['wow', 'loved', 'place']

In [16]:
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused by the stemming cells that follow.
ps = PorterStemmer()

In [17]:
# Reduce every remaining token to its Porter stem (e.g. 'loved' becomes 'love').
review = list(map(ps.stem, review))

In [18]:
review

['wow', 'love', 'place']

In [19]:
# Re-assemble the stemmed tokens into one space-separated string.
review = ' '.join(review)

In [20]:
review

'wow love place'

In [22]:
# Build the cleaned corpus: one normalised, stemmed string per review.
#
# The stop-word list is loaded once into a set. The previous loop called
# stopwords.words('english') for every token of every review and searched the
# returned list, which repeats the same work thousands of times.
# NOTE(review): NLTK's English stop-word list is believed to include negations
# such as 'not'; dropping them may hide sentiment cues - confirm whether they
# should be kept for this task.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly instead of dataset['Review'][i], so the
# loop does not depend on the DataFrame having a default 0..n-1 index.
for raw_text in dataset['Review']:
    # Replace everything except ASCII letters with spaces, lowercase, tokenise.
    tokens = re.sub('[^a-zA-Z]', ' ', raw_text).lower().split()
    # Drop stop words, stem what is left, and glue the stems back together.
    review = ' '.join(
        ps.stem(token) for token in tokens if token not in english_stopwords
    )
    corpus.append(review)

Bag-of-Words Model

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words encoding: the vocabulary is capped at 1500 terms, and the sparse
# count matrix is converted to a dense array with one row per review.
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus)
x = x.toarray()
print(x.shape)

(1000, 1500)


In [26]:
# Target vector: the Liked label, which is the second column of the dataset.
y = dataset['Liked'].values

Apply Naive Bayes Algorithm

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; random_state is fixed so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [31]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes model on the training split, then predict labels
# for the held-out reviews.
# NOTE(review): the features are word counts; MultinomialNB is the variant
# usually suggested for count data - consider comparing the two.
classifier = GaussianNB().fit(x_train, y_train)
y_pred = classifier.predict(x_test)

In [32]:
from sklearn.metrics import accuracy_score

In [33]:
# Fraction of held-out reviews whose predicted label equals the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73