**Restaurant Reviews Sentiment Analysis using Stemming and Bag of Words**

# Importing the libraries and reading the TSV data file

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# The input file is tab-separated (TSV), so it is parsed with a tab separator.
# quoting=3 tells the parser to ignore quote characters and treat them as
# ordinary text while reading the reviews.
data = pd.read_csv(
    '../input/reviews/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

# Data Cleaning

In [None]:
# Text-cleaning setup: regular expressions plus the NLTK pieces used below.
import re

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Download the NLTK resources this notebook asks for: the stop-word lists and
# the WordNet data.
nltk.download('stopwords')
nltk.download('wordnet')

# One shared stemmer / lemmatizer instance for the whole notebook.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [None]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string per review.

# The stop-word list does not change between reviews, so it is built once here
# instead of being re-created for every review (and re-converted to a set for
# every single word, as the loop used to do).
all_stopwords = stopwords.words('english')
# Keep the negation 'not' in the text: it is the word that most directly flips
# the sentiment of a review ("good" vs "not good").
all_stopwords.remove('not')
stopword_set = set(all_stopwords)  # set -> O(1) membership checks

corpus = []
# Iterate over the column values directly rather than looking rows up by label
# with data['Review'][i], which only works when the index is 0..n-1.
for raw_review in data['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)  # replace every non-letter with a space
    review = review.lower()
    review = review.split()
    review = [stemmer.stem(word) for word in review if word not in stopword_set]
    review = ' '.join(review)
    corpus.append(review)

# Show a small preview instead of dumping every cleaned review into the output.
print(corpus[:5])

# Create a BOW Model

In [None]:
# Bag-of-Words features: each cleaned review becomes a vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# max_features=1500 caps the vocabulary size passed to the vectorizer.
cv = CountVectorizer(max_features=1500)

# Feature matrix, densified from the vectorizer output.
X = cv.fit_transform(corpus).toarray()

# Target vector: the values of the last column of the dataframe.
y = data.iloc[:, -1].values

# Train, Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; random_state=0 keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

# Classifier Model Training

In [None]:
# Model-selection utilities
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
)

# Candidate classifiers
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

# Evaluation helpers
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix

# Shared cross-validation splitter (12 stratified folds) passed as `cv=kfold`
# by the model cells in this notebook.
kfold = StratifiedKFold(n_splits=12)

1. Logistic Regression

In [None]:
# Logistic Regression: cross-validate on the training split, then fit on the
# whole training split and evaluate on the held-out test split.
lr = LogisticRegression(random_state=0)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(lr, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
# Arguments are passed in scikit-learn's (y_true, y_pred) order; accuracy is
# symmetric, so the reported value is unchanged.
print('The accuracy of the Logistic Regression is', metrics.accuracy_score(y_test, y_pred_lr) * 100)
cm = confusion_matrix(y_test, y_pred_lr)
print(cm)
# classification_report returns a multi-line string; print it so it renders as
# a table instead of a single escaped string repr.
print(classification_report(y_test, y_pred_lr))

2. Naive Bayes

In [None]:
# Gaussian Naive Bayes: cross-validate on the training split, then fit on the
# whole training split and evaluate on the held-out test split.
gnb = GaussianNB()

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(gnb, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

gnb.fit(X_train, y_train)
y_pred_gnb = gnb.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_test, y_pred_gnb) * 100)
cm = confusion_matrix(y_test, y_pred_gnb)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_gnb))

In [None]:
# Multinomial Naive Bayes: cross-validate on the training split, then fit on
# the whole training split and evaluate on the held-out test split.
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(mnb, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

mnb.fit(X_train, y_train)
y_pred_mnb = mnb.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_test, y_pred_mnb) * 100)
cm = confusion_matrix(y_test, y_pred_mnb)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_mnb))

In [None]:
# Bernoulli Naive Bayes: cross-validate on the training split, then fit on the
# whole training split and evaluate on the held-out test split.
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(bnb, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

# Fit and predict with `bnb` itself. The cell previously called mnb.fit /
# mnb.predict here, so the "Bernoulli" test metrics were really those of the
# MultinomialNB model from the previous cell.
bnb.fit(X_train, y_train)
y_pred_bnb = bnb.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Naive Bayes is', metrics.accuracy_score(y_test, y_pred_bnb) * 100)
cm = confusion_matrix(y_test, y_pred_bnb)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_bnb))

3. Random Forest Classifier

In [None]:
# Random Forest Classifier - baseline that sets only random_state.
# NOTE: this cell is disabled. Everything below is wrapped in a triple-quoted
# string, so none of it executes; the cell only evaluates to that string.
# The active Random Forest cell further down uses explicit hyperparameters.
'''rf = RandomForestClassifier(random_state=0)
rf.fit(X_train, y_train)
cv = cross_val_score(rf,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_rf = rf.predict(X_test)
print('The accuracy of the RandomForestClassifier is',metrics.accuracy_score(y_pred_rf,y_test)*100)
cm=confusion_matrix(y_test, y_pred_rf)
print(cm)
classification_report(y_test, y_pred_rf)'''

In [None]:
# Randomized hyperparameter search for the Random Forest.
# NOTE: this cell is disabled. Everything below is wrapped in a triple-quoted
# string, so none of it executes; the cell only evaluates to that string.
# NOTE(review), things to fix before re-enabling:
#   - 'bootstrap' appears twice in param_grid; in a dict literal the later
#     entry wins (both lists hold the same two values, so it is harmless here).
#   - 2500 is listed twice in 'n_estimators' (3500 may have been intended - confirm).
#   - 'min_samples_split' includes 1; scikit-learn appears to require an
#     integer >= 2 for this parameter - verify against the installed version.
#   - 'max_features': 'auto' is presumably no longer accepted by recent
#     scikit-learn releases - verify against the installed version.
'''#Hyperparameter Tuning
rf = RandomForestClassifier(random_state = 0)
param_grid =  {'n_estimators': [200,400,800,1200,1600,2000,2500,3000,2500,4000,5000,6000,7000,8000,9000], 
                                  'bootstrap': [True,False],
                                  'max_depth': [5,10,15,20,30,40,50,80,100,None],
                                  'max_features': [3,'auto','sqrt','log2'],
                                  'bootstrap': [False, True],
                                  'criterion': ['gini', 'entropy'],
                                  'min_samples_leaf': [1,2,3,4,5,8,10],
                                  'min_samples_split': [1,2,3,4,5,8,10]}
                                  
clf_rf_rnd = RandomizedSearchCV(rf, param_distributions = param_grid, n_iter = 200, 
cv = kfold, verbose = True, n_jobs = -1)
best_clf_rf_rnd = clf_rf_rnd.fit(X_train,y_train)
best_clf_rf_rnd.best_estimator_'''

In [None]:
# Random Forest Classifier with explicit hyperparameters: fit on the training
# split, cross-validate on it, then evaluate on the held-out test split.
rf = RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=30,
                            max_features='log2', min_samples_leaf=2,
                            n_estimators=500, random_state=0)
rf.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(rf, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_rf = rf.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the RandomForestClassifier is', metrics.accuracy_score(y_test, y_pred_rf) * 100)
cm = confusion_matrix(y_test, y_pred_rf)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_rf))

4. SVC

In [None]:
# Linear-kernel SVC: fit on the training split, cross-validate on it, then
# evaluate on the held-out test split.
# probability=True is set so the model can be used by the soft-voting ensemble
# cells later in the notebook.
svcl = SVC(kernel='linear', random_state=0, probability=True)
svcl.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(svcl, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_svcl = svcl.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Linear SVC is', metrics.accuracy_score(y_test, y_pred_svcl) * 100)
cm = confusion_matrix(y_test, y_pred_svcl)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_svcl))

In [None]:
# RBF-kernel SVC: fit on the training split, cross-validate on it, then
# evaluate on the held-out test split.
# probability=True is set so the model can be used by the soft-voting ensemble
# cells later in the notebook.
from sklearn.svm import SVC
svck = SVC(kernel='rbf', random_state=0, probability=True)
svck.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(svck, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_svck = svck.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Kernel SVC is', metrics.accuracy_score(y_test, y_pred_svck) * 100)
cm = confusion_matrix(y_test, y_pred_svck)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_svck))

5. Decision Tree Classifier

In [None]:
# Decision Tree Classifier: fit on the training split, cross-validate on it,
# then evaluate on the held-out test split.
dt = DecisionTreeClassifier(random_state=0)
dt.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(dt, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_dt = dt.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Decision Tree Classifier is', metrics.accuracy_score(y_test, y_pred_dt) * 100)
cm = confusion_matrix(y_test, y_pred_dt)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_dt))

6. KNN

In [None]:
# K-Nearest Neighbors - baseline built with the constructor defaults.
# NOTE: this cell is disabled. Everything below is wrapped in a triple-quoted
# string, so none of it executes; the cell only evaluates to that string.
# The active KNN cell that follows passes explicit hyperparameters.
'''knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
cv = cross_val_score(knn,X_train,y_train,cv=kfold)
print(cv)
print(cv.mean()*100)
y_pred_knn = knn.predict(X_test)
print('The accuracy of the K-Neighbors Classifier is',metrics.accuracy_score(y_pred_knn,y_test)*100)
cm=confusion_matrix(y_test, y_pred_knn)
print(cm)
classification_report(y_test, y_pred_knn)'''

In [None]:
# K-Nearest Neighbors with explicit hyperparameters (5 neighbours, Minkowski
# metric with p=2): fit on the training split, cross-validate on it, then
# evaluate on the held-out test split.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2, leaf_size=20)
knn.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(knn, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_knn = knn.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the K-Neighbors Classifier is', metrics.accuracy_score(y_test, y_pred_knn) * 100)
cm = confusion_matrix(y_test, y_pred_knn)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_knn))

Voting Classifier

In [None]:
# Soft-voting ensemble over all nine models defined in the earlier cells.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('gnb', gnb), ('bnb', bnb), ('mnb', mnb),
                ('knn', knn), ('dt', dt),
                ('rf', rf), ('svck', svck), ('svcl', svcl)],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_vclf = voting_clf.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Voting Classifier is', metrics.accuracy_score(y_test, y_pred_vclf) * 100)
cm = confusion_matrix(y_test, y_pred_vclf)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_vclf))

In [None]:
# Soft-voting ensemble over a six-model subset: GaussianNB, KNN and the
# decision tree are left out of this variant.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('bnb', bnb), ('mnb', mnb),
                ('rf', rf), ('svck', svck), ('svcl', svcl)],
    voting='soft',
)
voting_clf.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_vclf = voting_clf.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Voting Classifier is', metrics.accuracy_score(y_test, y_pred_vclf) * 100)
cm = confusion_matrix(y_test, y_pred_vclf)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_vclf))

In [None]:
# Hard-voting ensemble over all nine models defined in the earlier cells.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('gnb', gnb), ('bnb', bnb), ('mnb', mnb),
                ('knn', knn), ('dt', dt),
                ('rf', rf), ('svck', svck), ('svcl', svcl)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_vclf = voting_clf.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Voting Classifier is', metrics.accuracy_score(y_test, y_pred_vclf) * 100)
cm = confusion_matrix(y_test, y_pred_vclf)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_vclf))

In [None]:
# Hard-voting ensemble over a six-model subset: GaussianNB, KNN and the
# decision tree are left out of this variant.
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(
    estimators=[('lr', lr), ('bnb', bnb), ('mnb', mnb),
                ('rf', rf), ('svck', svck), ('svcl', svcl)],
    voting='hard',
)
voting_clf.fit(X_train, y_train)

# Named cv_scores (not `cv`) so the fitted CountVectorizer bound to `cv` in the
# Bag-of-Words cell is not overwritten.
cv_scores = cross_val_score(voting_clf, X_train, y_train, cv=kfold)
print(cv_scores)
print(cv_scores.mean() * 100)

y_pred_vclf = voting_clf.predict(X_test)
# (y_true, y_pred) order; accuracy is symmetric, so the value is unchanged.
print('The accuracy of the Voting Classifier is', metrics.accuracy_score(y_test, y_pred_vclf) * 100)
cm = confusion_matrix(y_test, y_pred_vclf)
print(cm)
# Print the report so it renders as a table instead of an escaped string repr.
print(classification_report(y_test, y_pred_vclf))