In [100]:
# Load required libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [101]:
df=pd.read_csv('Restaurant_Reviews.tsv',delimiter="\t",quoting=3)

In [102]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [103]:
import re
import nltk
# nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

In [104]:
from nltk.corpus import stopwords

In [105]:
corpus=[]
for i in range(0,1000):
    review=re.sub("[^a-zA-Z]",' ',df['Review'][i])
    review=review.lower()
    review=review.split()
    ps=PorterStemmer()# lemmatization give root or base word call for calling,called
    review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]#stopword unwanted words
    if i==1:
        print(review)
    review="  ".join(review)
    corpus.append(review)

['crust', 'good']


In [106]:
corpus[0]

'wow  love  place'

In [107]:
# Machine Learning libraries
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1500)# Bag of words model
X=cv.fit_transform(corpus).toarray()
y=df.iloc[:,1]

In [108]:
# Split data in training and test dataset
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)


### Naive Bayes model to make Prediction

In [109]:
from sklearn.naive_bayes import GaussianNB
classifier=GaussianNB()
classifier.fit(X_train,y_train)

# predicting
pred_y=classifier.predict(X_test)

### Check the accuracy with Confusion Matrix

In [110]:
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_test,pred_y)
cm

array([[55, 42],
       [12, 91]], dtype=int64)

In [111]:
print(np.sum(cm))

200


In [112]:
print(cm[0,0]+cm[1,1])

146


In [113]:
def accuracyCM(cm):
    total=np.sum(cm)
    ap=cm[0,0]+cm[1,1]
    accuracy=(ap*100)/total
    return accuracy


In [114]:
accuracy=accuracyCM(cm)
print(f'The accuracy of Naive Bayes model is {accuracy:.2f}%')

The accuracy of Naive Bayes model is 73.00%


### Test with own review

In [116]:
test_review="The magito pizza was disgusting i 333 just hate.. that."

In [117]:
review=re.sub("[^a-zA-Z]",' ',test_review)
review

'The magito pizza was disgusting i     just hate   that '

In [118]:
review=review.lower()
type(review)

str

In [119]:
review=review.split()
review

['the', 'magito', 'pizza', 'was', 'disgusting', 'i', 'just', 'hate', 'that']

In [120]:
review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
review

['magito', 'pizza', 'disgust', 'hate']

In [121]:
review=" ".join(review)
corpus2=[]
corpus2.append(review)

In [122]:
cv2=CountVectorizer(max_features=1500)
test_x=cv2.fit_transform(corpus+corpus2).toarray()


In [123]:
test_x


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [130]:
my=test_x[-1].reshape(1,-1)
print(test_x[-1])
my

[0 0 0 ... 0 0 0]


array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [125]:
result=classifier.predict(my)
result

array([1], dtype=int64)

In [126]:
from sklearn.externals import joblib



### Save the model and corpus with joblib

In [127]:
joblib.dump([classifier, corpus], 'reviewModel.pkl')

['reviewModel.pkl']

In [None]:
review=re.sub("[^a-zA-Z]",' ',df['Review'][i])
review=review.lower()
review=review.split()
ps=PorterStemmer()# lemmatization give root or base word call for calling,called
review=[ps.stem(word) for word in review if not word in set(stopwords.words('english'))]#