# Customer Review Classification

In [68]:
import numpy as np
import pandas as pd

In [69]:
# The file is tab-separated; quoting=3 is the numeric value of csv.QUOTE_NONE,
# so quote characters inside reviews are read as ordinary text.
data = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep='\t',
    quoting=3,
)
data

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [70]:
# Class-balance check: number of reviews per label in the Liked column.
data.loc[:, "Liked"].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

## Prepare dataset

In [71]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [72]:
# Filled on first use, so defining this cell never touches the NLTK corpus.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def clean_review(review, keep_words=()):
    """Normalise one review into a space-separated string of stemmed tokens.

    Steps: replace every character that is not an ASCII letter with a space,
    lower-case, split on whitespace, drop English stop words, Porter-stem the
    remaining tokens, and join them with single spaces.

    Parameters
    ----------
    review : str
        Raw review text.
    keep_words : iterable of str, optional
        Lower-case words to retain even when they are stop words. Defaults to
        empty, which reproduces the original behaviour exactly. The stop list
        removes negations (e.g. "Crust is not good." becomes "crust good"), so
        callers may pass something like ``("not", "no")`` to preserve them.

    Returns
    -------
    str
        The cleaned review; an empty string if no tokens survive.
    """
    ps = PorterStemmer()
    # Look the stop words up once per call (and load them once per session)
    # instead of calling stopwords.words('english') for every single token.
    stop_words = _english_stop_words()
    keep = frozenset(keep_words)

    tokens = re.sub('[^A-Za-z]', ' ', review).lower().split()
    return " ".join(
        ps.stem(word)
        for word in tokens
        if word not in stop_words or word in keep
    )

In [73]:
# Series.apply hands each review string straight to clean_review,
# so no lambda wrapper is needed.
data['cleanedReview'] = data['Review'].apply(clean_review)

In [74]:
# Display the frame again to compare the new cleanedReview column with the raw Review text.
data

Unnamed: 0,Review,Liked,cleanedReview
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust good
2,Not tasty and the texture was just nasty.,0,tasti textur nasti
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,select menu great price
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor textur lack
996,Appetite instantly gone.,0,appetit instantli gone
997,Overall I was not impressed and would not go b...,0,overal impress would go back
998,"The whole experience was underwhelming, and I ...",0,whole experi underwhelm think go ninja sushi n...


## Feature engineering using the bag-of-words model

In [75]:
# Convert text to count vectors
from sklearn.feature_extraction.text import CountVectorizer

In [76]:
# Bag-of-words features: the vocabulary is capped via max_features=1500.
# NOTE(review): the vocabulary is fitted on every row before the train/test
# split below, so test-set words influence the features — confirm this is acceptable.
cv = CountVectorizer(max_features=1500)
bow_counts = cv.fit_transform(data['cleanedReview'])

# Densify the matrix returned by fit_transform into a plain array for the model cells.
X = bow_counts.toarray()
y = data['Liked']

## Model training using Naive Bayes

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [78]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [79]:
# Sanity check: shapes of the four pieces produced by the split, in the same order.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((800, 1500), (200, 1500), (800,), (200,))

In [80]:
# fit() returns the estimator itself, so construction and training can be chained.
# NOTE(review): MultinomialNB is a common alternative for word-count features — worth comparing.
classifier = GaussianNB().fit(X_train, y_train)

# Score the held-out rows; the accuracy is the cell's displayed result.
y_pred = classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.73

## Testing

In [84]:
new_review = "This is a great restaurant! The food was exquisite."

# Apply the same cleaning as the training data, then encode with the already-fitted
# vectorizer (transform, not fit_transform) so the columns match the model's features.
cleaned_review = clean_review(new_review)
X_new = cv.transform([cleaned_review]).toarray()
prediction = classifier.predict(X_new)

# Exactly one sample was passed in, so the label is the first (only) element.
if prediction[0] == 1:
    sentiment = ['positive']
else:
    sentiment = ['negative']
print(sentiment)

['positive']


In [85]:
new_review = "The chocolate cake was dry."

# Apply the same cleaning as the training data, then encode with the already-fitted
# vectorizer (transform, not fit_transform) so the columns match the model's features.
cleaned_review = clean_review(new_review)
X_new = cv.transform([cleaned_review]).toarray()
prediction = classifier.predict(X_new)

# Exactly one sample was passed in, so the label is the first (only) element.
if prediction[0] == 1:
    sentiment = ['positive']
else:
    sentiment = ['negative']
print(sentiment)

['negative']
