In [1]:
import numpy as np 
import pandas as pd 
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.metrics import *
from sklearn.model_selection import *

from sklearn import metrics
from sklearn import preprocessing
from sklearn import model_selection
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, roc_auc_score,auc, accuracy_score

In [2]:
# The reviews ship as a TSV (tab-separated values) file, so the field separator
# is a tab instead of read_csv's default comma.
# quoting=3 is csv.QUOTE_NONE: quote characters inside a review are kept as
# ordinary text rather than being interpreted as field quoting.
df = pd.read_csv(
    "Downloads/Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


### Cleaning the texts

In [3]:
import re    # regular expressions, used below to replace every non-letter character
import nltk  # provides the English stop-word list

# Stop words ("the", "this", ...) carry little meaning for the model.
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer  # reduces each word to its stem

# The stemmer and the stop words are identical for every review, so they are
# built once here instead of on every loop iteration.
ps = PorterStemmer()
# "not" is deliberately kept out of the stop words so that negations such as
# "not good" survive the cleaning. A set gives constant-time membership tests.
all_stopwords = set(stopwords.words('english')) - {'not'}

corpus = []

# Iterate over every review actually present in the frame rather than assuming
# exactly 1000 rows, so the corpus always has one entry per label.
for raw_review in df['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # non-letters become spaces
    review = review.lower()                        # normalise case
    review = review.split()                        # tokenise on whitespace
    # Drop stop words and stem whatever remains.
    review = [ps.stem(word) for word in review if word not in all_stopwords]
    review = ' '.join(review)
    corpus.append(review)





[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Creating a bag of words

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words model: one column per token, capped at 1500 vocabulary entries.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()  # dense array: one row per review

# The labels are taken from the last column of the frame.
y = df.iloc[:, -1].values

# Number of feature columns in the matrix.
X.shape[1]

1500

### Splitting into the training set and test set

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed random_state makes the
# split repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)



### Implementing the XGBoost model

In [6]:
# XGBoost classifier created with the library's default hyperparameters and
# trained on the training portion of the bag-of-words matrix.
cl = XGBClassifier()
cl.fit(X=X_train, y=y_train)

In [7]:
# Predict a label for each held-out test review with the fitted classifier.
y_pred = cl.predict(X_test)
y_pred  # bare last expression so the notebook displays the predictions

array([0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0])

In [8]:
from sklearn.metrics import f1_score, accuracy_score, mean_squared_error, classification_report, confusion_matrix

In [9]:
# Evaluate the test-set predictions.
print("Accuracy Score: ", accuracy_score(y_test, y_pred) * 100)
# With the 0/1 labels used here, the mean squared error is simply the fraction
# of wrong predictions (1 - accuracy).
print("\n\nMean Squared Error: ", mean_squared_error(y_test, y_pred))
print("\n\nF1 Score: ", f1_score(y_test, y_pred))
# sep="" stops print() from inserting a space right after the newline, which
# would otherwise shift the first row of the multi-line report / matrix.
print("\n\nClassification Report\n", classification_report(y_test, y_pred), sep="")
print("\n\nConfusion Matrix\n", confusion_matrix(y_test, y_pred), sep="")



Accuracy Score:  80.0


Mean Squared Error:  0.2


F1 Score:  0.7752808988764044


Classification Report
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       108
           1       0.80      0.75      0.78        92

    accuracy                           0.80       200
   macro avg       0.80      0.80      0.80       200
weighted avg       0.80      0.80      0.80       200



Confusion Matris
 [[91 17]
 [23 69]]


In [10]:
# Predict whether the statement "I love this restaurant so much" is a positive
# review or a negative one.

def preprocess_review(text, stemmer, stop_words):
    """Clean one raw review with the same steps used to build the training corpus.

    Parameters
    ----------
    text : str
        The raw review text.
    stemmer : object
        Any object exposing ``stem(word) -> str`` (a ``PorterStemmer`` here).
    stop_words : collection of str
        Lower-case words to drop before stemming; a set keeps lookups cheap.

    Returns
    -------
    str
        The remaining stemmed words joined by single spaces.
    """
    letters_only = re.sub('[^a-zA-Z]', ' ', text)  # non-letters become spaces
    words = letters_only.lower().split()
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)


new_review = 'I love this restaurant so much'
ps = PorterStemmer()
# Build the stop-word set once; "not" is excluded so negations are preserved.
all_stopwords = set(stopwords.words('english')) - {'not'}
new_review = preprocess_review(new_review, ps, all_stopwords)
new_corpus = [new_review]
# transform (not fit_transform): reuse the vocabulary learned from the training corpus.
new_X_test = cv.transform(new_corpus).toarray()
new_y_pred = cl.predict(new_X_test)
print(new_y_pred)

[1]
