In [25]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk
import re

# NLTK section
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# sklearn section
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


In [2]:
# Absolute location of the tab-separated reviews file (under /content/drive).
# NOTE(review): presumably this requires Google Drive to be mounted in Colab;
# no mount step is visible in this notebook - confirm before a fresh run.
REVIEWS_TSV_PATH = "/content/drive/MyDrive/Colab Notebooks/Restaurant_Reviews.tsv"

# Read the reviews into a DataFrame; columns in the file are tab-delimited.
df = pd.read_csv(REVIEWS_TSV_PATH, sep='\t')

In [3]:
# Display the loaded frame (pandas truncates the middle rows in the rendering).
df

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [4]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [5]:
# Count rows per label value to see how balanced the two classes are.
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

## Data Cleaning

In [6]:
# Pre-create an empty 'clean_text' column; the cleaning cell below fills it in.
df['clean_text'] = None

In [7]:
# Preview the first rows; 'clean_text' is still empty at this point.
df.head()

Unnamed: 0,Review,Liked,clean_text
0,Wow... Loved this place.,1,
1,Crust is not good.,0,
2,Not tasty and the texture was just nasty.,0,
3,Stopped by during the late May bank holiday of...,1,
4,The selection on the menu was great and so wer...,1,


In [8]:
# Fetch the NLTK 'stopwords' corpus used by the cleaning step.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [9]:
# Single Porter stemmer instance, reused for every token in the cleaning step.
ps = PorterStemmer()

In [10]:
# Build the stop-word lookup once. stopwords.words('english') returns a fresh
# list on every call, so calling it inside the per-token filter repeated that
# work (plus a linear list scan) for every word of every review. A set gives
# constant-time membership tests and is created a single time.
english_stopwords = set(stopwords.words('english'))


def clean_review_text(raw_text):
  """Normalise one review into a space-joined string of stemmed tokens.

  Steps, in order:
    1. replace every character that is not an ASCII letter with a space,
    2. lower-case the text and split it on whitespace,
    3. drop tokens found in NLTK's English stop-word list,
    4. reduce each remaining token to its Porter stem.

  Parameters
  ----------
  raw_text : str
      The original review text.

  Returns
  -------
  str
      The cleaned tokens joined by single spaces (empty string if no token
      survives the filtering).
  """
  letters_only = re.sub('[^a-zA-Z]', ' ', raw_text)
  tokens = letters_only.lower().split()
  kept_stems = [ps.stem(token) for token in tokens if token not in english_stopwords]
  return " ".join(kept_stems)


# NOTE(review): the English stop-word list contains negations such as "not",
# so "Crust is not good." is reduced to "crust good". Consider keeping
# negation words if the classifier should see them.
# Vectorised assignment replaces the row-by-row iterrows()/df.at writes.
df['clean_text'] = df['Review'].map(clean_review_text)


In [11]:
# Preview the first rows again to check the populated 'clean_text' column.
df.head()

Unnamed: 0,Review,Liked,clean_text
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust good
2,Not tasty and the texture was just nasty.,0,tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price


## Bag of Words

In [12]:
# Collect the cleaned reviews into a plain Python list for the vectorizer.
corpus = df['clean_text'].tolist()

In [13]:
# Spot-check a single cleaned review.
corpus[10]

'servic prompt'

In [14]:
# Bag-of-words vectorizer; max_features caps the vocabulary at 1500 terms.
cv = CountVectorizer(max_features=1500)

In [15]:
# Learn the vocabulary from the cleaned reviews and count term occurrences.
# NOTE(review): the vectorizer is fitted on every review, before any
# train/test split, so the vocabulary is informed by the held-out rows too.
term_counts = cv.fit_transform(corpus)

# Expand the result into a regular array, one row per review.
x = term_counts.toarray()

In [16]:
# Sanity check: (number of reviews, number of vocabulary terms).
x.shape

(1000, 1500)

In [17]:
# Target labels: the 'Liked' column (1 = liked, 0 = not liked).
# Selected by name rather than by position so that adding, removing or
# reordering columns cannot silently swap in the wrong column.
y = df['Liked'].values

In [18]:
# Sanity check: one label per review.
y.shape

(1000,)

In [20]:
# Display the label array (prints every value; a slice such as y[:20] is shorter).
y

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1,
       1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1,

## Naive Bayes Algorithm

In [22]:
# Hold out 20% of the reviews for evaluation. The fixed random_state makes
# the shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.20,
    random_state=0,
)

In [23]:
# Confirm the feature matrices were split into train and test partitions.
x_train.shape, x_test.shape

((800, 1500), (200, 1500))

In [24]:
# Confirm the label vectors line up with the feature partitions.
y_train.shape, y_test.shape

((800,), (200,))

In [27]:
# Naive Bayes classifier with default settings.
# NOTE(review): GaussianNB models each feature as normally distributed, while
# x holds term counts; MultinomialNB is the variant usually paired with count
# features - worth comparing before settling on this model.
classifier = GaussianNB()

In [28]:
# Fit the classifier on the training partition only.
classifier.fit(x_train, y_train)

GaussianNB()

In [29]:
# Predict labels for the held-out reviews.
y_pred = classifier.predict(x_test)

In [31]:
from sklearn.metrics import accuracy_score

In [32]:
# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

0.73