# ***Install Packages***

In [1]:
!pip install bnlp_toolkit
!pip install bnltk
!pip install -U bnlp_toolkit

# ***Import Libraries***

In [2]:
import re
from statistics import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bnlp.corpus import stopwords
from bnlp.corpus.util import remove_stopwords
from bnltk.stemmer import BanglaStemmer
from mlxtend.plotting import plot_confusion_matrix
from tabulate import tabulate

from sklearn import metrics

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_validate
from sklearn.model_selection import KFold

# ***Dataset Load***

In [3]:
# Read the restaurant-review CSV into a DataFrame and preview the first 20 rows.
DATASET_PATH = '../input/resturant-review-bangla7612/resturant_review(7612).csv'
data = pd.read_csv(DATASET_PATH)
data.head(20)

# ***Data Ratio***

In [4]:
# Class balance of the target column: print the share of each class and plot
# the absolute number of reviews per class.
target_vc = data["Sentiment"].value_counts(normalize=False)

# The "{:.2%}" format expects fractions in [0, 1]; feeding it raw counts prints
# nonsense such as "380000.00%". Derive the shares from the counts first.
target_share = target_vc / target_vc.sum()
print("Negative: {:.2%}, Positive: {:.2%}".format(target_share.get(0, 0.0),
                                                  target_share.get(1, 0.0)))

# The bar heights stay as raw counts, matching the "Number of data" axis label.
sns.barplot(x=target_vc.index, y=target_vc)
plt.title("Dataset Distribution")
plt.xlabel("0 = Negative, 1 = Positive")
plt.ylabel("Number of data")
plt.show()

In [5]:
# Show the dimensions of the loaded DataFrame as a (rows, columns) tuple.
data.shape

# ***Data Pre-processing***

In [6]:
# Clean the review texts.
# For every row of data['Text']: blank out punctuation, Latin letters, digits
# and emoji, run the stemmer, drop stop words, and store the re-joined string
# in `corpus` (one entry per review, in row order).

# Patterns and the stemmer are loop-invariant, so they are built once up front.
# "!" needs no escaping inside a character class; the former "\!" was an
# invalid string escape that newer Python versions warn about.
punctuation_pattern = re.compile('[!"#$%&()*,./:;<=>?@[\\]^`{|}~\t\n।+-]')
latin_alnum_pattern = re.compile('[a-zA-Z0-9]+')
extra_symbol_pattern = re.compile('[''````£|¢|/=।!“’<>‘॥”‰\']')
emoji_pattern = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       u"\u00C0-\u017F"          #latin
                       u"\u2000-\u206F"          #generalPunctuations 
                       u"\\U0001f90f" 
                       u"\\U0001f9cf"
                       u"\U0001fa78"  
                       "]+", flags=re.UNICODE)
bn_stemmer = BanglaStemmer()

corpus = []
# Iterate over the column itself instead of a hard-coded range(0, 7612), so the
# cell keeps working (and stays aligned with the labels) if the row count changes.
for raw_text in data['Text']:
    review = punctuation_pattern.sub(' ', raw_text)
    review = review.replace('_', ' ')
    review = review.replace('ঃ', ' ')
    review = latin_alnum_pattern.sub(' ', review)
    review = extra_symbol_pattern.sub(' ', review)
    review = emoji_pattern.sub(r'', review)
    # NOTE(review): stem() receives the whole review string rather than single
    # tokens - confirm BanglaStemmer handles multi-word input as intended.
    review = bn_stemmer.stem(review)
    # remove_stopwords yields the remaining tokens, which are re-joined with spaces.
    review = remove_stopwords(review, stopwords)
    review = ' '.join(review)
    corpus.append(review)

# Every review must yield exactly one cleaned document.
assert len(corpus) == len(data)

# Preview a few cleaned reviews instead of dumping the whole corpus as output.
corpus[:10]

# **Bag of Words**

In [7]:
# Bag of Words: unigram + bigram count features built from the cleaned corpus.
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(ngram_range=(1, 2))
bow_sparse = vectorizer.fit_transform(corpus)

# Dense feature matrix (one row per review) and the label vector from column 1.
BM = bow_sparse.toarray()
y = data.iloc[:, 1].values

# **Splitting data for BoW 80:20 Ratio**

In [8]:
# Hold out 20% of the BoW matrix as a test set; the split is stratified on the
# Sentiment column and seeded so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    BM,
    y,
    test_size=0.2,
    random_state=2,
    stratify=data['Sentiment'],
)
print(BM)

# ***Machine Learning Classifier using K-fold Cross Validation***

# **Result Table**

In [9]:
# Results table: the first row holds the column titles, and each classifier
# cell below appends one row of scores.
evalution_table = [
    ['Classifier Name', 'Precision', 'Recall', 'Accuracy', 'F1-Score'],
]

# **DT 5-fold cross validation using BoW**

In [10]:
# Decision Tree: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
classifier = DecisionTreeClassifier(criterion="entropy", random_state=5)

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_DT = cross_validate(classifier, BM, y,
                           scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_DT = scores_DT['test_accuracy']
re_DT = scores_DT['test_recall']
pr_DT = scores_DT['test_precision']
f1_DT = scores_DT['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_DT = cross_val_predict(classifier, BM, y, cv=cv)
cm_DT = confusion_matrix(y, y_pred_DT)

print("Confusion Matrix for Model 4 (DT)")
plot_confusion_matrix(conf_mat=cm_DT, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['DT', pr_DT.mean(), re_DT.mean(), acc_DT.mean(), f1_DT.mean()])

# **RF 5-fold cross validation using BoW**

In [11]:
# Random Forest: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
classifier = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=2)

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_RF = cross_validate(classifier, BM, y,
                           scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_RF = scores_RF['test_accuracy']
re_RF = scores_RF['test_recall']
pr_RF = scores_RF['test_precision']
f1_RF = scores_RF['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_RF = cross_val_predict(classifier, BM, y, cv=cv)
cm_RF = confusion_matrix(y, y_pred_RF)

print("Confusion Matrix for Model 4 (RF)")
plot_confusion_matrix(conf_mat=cm_RF, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['RF', pr_RF.mean(), re_RF.mean(), acc_RF.mean(), f1_RF.mean()])

# **SVM 5-fold cross validation using BoW**

In [12]:
# Linear SVM: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
classifier = SVC(kernel='linear', gamma='auto')

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_SVM = cross_validate(classifier, BM, y,
                            scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_SVM = scores_SVM['test_accuracy']
re_SVM = scores_SVM['test_recall']
pr_SVM = scores_SVM['test_precision']
f1_SVM = scores_SVM['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_SVM = cross_val_predict(classifier, BM, y, cv=cv)
cm_SVM = confusion_matrix(y, y_pred_SVM)

print("Confusion Matrix for Model 4 (SVM)")
plot_confusion_matrix(conf_mat=cm_SVM, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['SVM', pr_SVM.mean(), re_SVM.mean(), acc_SVM.mean(), f1_SVM.mean()])

# **KNN 5-fold cross validation using BoW**

In [13]:
# K-Nearest Neighbours: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
classifier = KNeighborsClassifier(n_neighbors=4, metric='minkowski', p=2)

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_KNN = cross_validate(classifier, BM, y,
                            scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_KNN = scores_KNN['test_accuracy']
re_KNN = scores_KNN['test_recall']
pr_KNN = scores_KNN['test_precision']
f1_KNN = scores_KNN['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_KNN = cross_val_predict(classifier, BM, y, cv=cv)
cm_KNN = confusion_matrix(y, y_pred_KNN)

print("Confusion Matrix for Model 4 (KNN)")
plot_confusion_matrix(conf_mat=cm_KNN, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['KNN', pr_KNN.mean(), re_KNN.mean(), acc_KNN.mean(), f1_KNN.mean()])

# **NB 5-fold cross validation using BoW**

In [14]:
# Gaussian Naive Bayes: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
classifier = GaussianNB()

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_NB = cross_validate(classifier, BM, y,
                           scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_NB = scores_NB['test_accuracy']
re_NB = scores_NB['test_recall']
pr_NB = scores_NB['test_precision']
f1_NB = scores_NB['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_NB = cross_val_predict(classifier, BM, y, cv=cv)
cm_NB = confusion_matrix(y, y_pred_NB)

print("Confusion Matrix for Model 4 (NB)")
plot_confusion_matrix(conf_mat=cm_NB, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['NB', pr_NB.mean(), re_NB.mean(), acc_NB.mean(), f1_NB.mean()])

# ***Boosting Classifier using K-fold Cross Validation***

# ***AdaB 5-fold cross validation using BoW***

In [15]:
# AdaBoost: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# random_state is fixed so the reported scores and the confusion matrix are
# reproducible from run to run.
classifier = AdaBoostClassifier(n_estimators=100, learning_rate=0.1, random_state=0)

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_adab = cross_validate(classifier, BM, y,
                             scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_adab = scores_adab['test_accuracy']
re_adab = scores_adab['test_recall']
pr_adab = scores_adab['test_precision']
f1_adab = scores_adab['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_adab = cross_val_predict(classifier, BM, y, cv=cv)
cm_adab = confusion_matrix(y, y_pred_adab)

print("Confusion Matrix for Model 4 (AdaB)")
plot_confusion_matrix(conf_mat=cm_adab, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['AdaB', pr_adab.mean(), re_adab.mean(), acc_adab.mean(), f1_adab.mean()])

# ***GB 5-fold cross validation using BoW***

In [16]:
# Gradient Boosting: 5-fold cross-validation on the Bag-of-Words features.
cv = KFold(n_splits=5, shuffle=True, random_state=0)
# random_state is fixed so the reported scores and the confusion matrix are
# reproducible from run to run.
classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0)

# Score all four metrics in one cross-validation pass: the model is fitted once
# per fold (not once per metric) and every metric comes from the same fits.
scores_gb = cross_validate(classifier, BM, y,
                           scoring=['accuracy', 'recall', 'precision', 'f1'], cv=cv)
acc_gb = scores_gb['test_accuracy']
re_gb = scores_gb['test_recall']
pr_gb = scores_gb['test_precision']
f1_gb = scores_gb['test_f1']

# Out-of-fold predictions for every sample, used for the confusion matrix.
y_pred_gb = cross_val_predict(classifier, BM, y, cv=cv)
cm_gb = confusion_matrix(y, y_pred_gb)

print("Confusion Matrix for Model 4 (GB)")
plot_confusion_matrix(conf_mat=cm_gb, show_absolute=True,
                      show_normed=True,
                      colorbar=True)
plt.show()

# Report the mean over the 5 folds for every metric. Previously only fold 0 was
# recorded for precision/recall/F1 and the raw per-fold array for accuracy.
evalution_table.append(['GB', pr_gb.mean(), re_gb.mean(), acc_gb.mean(), f1_gb.mean()])

# **Result Analysis after 5-fold cross validation for BoW**

In [17]:
# Summary of the cross-validated metrics collected for every classifier.
print("Results 5-fold Cross Validation Using BoW")
# The first entry of evalution_table holds the column titles, so render it as
# the table header instead of as an ordinary data row.
print(tabulate(evalution_table, headers="firstrow"))

# ***ROC Curve for 5-fold cross validation***

In [18]:
# ROC curves for all classifiers, built from their out-of-fold predictions.
# NOTE(review): the y_pred_* arrays come from cross_val_predict with its default
# method, i.e. predicted class labels rather than scores/probabilities, so each
# curve is presumably a coarse approximation - confirm whether score-based ROC
# curves are wanted here.
roc_inputs = [
    ("SVM", y_pred_SVM),
    ("DT", y_pred_DT),
    ("RF", y_pred_RF),
    ("KNN", y_pred_KNN),    # was mislabelled "SVM" in the legend
    ("NB", y_pred_NB),      # was mislabelled "DT" in the legend
    ("AdaB", y_pred_adab),  # was mislabelled "RF" in the legend
    ("GB", y_pred_gb),
]

for model_name, y_pred in roc_inputs:
    fpr, tpr, _ = metrics.roc_curve(y, y_pred)
    auc = round(metrics.roc_auc_score(y, y_pred), 4)
    plt.plot(fpr, tpr, label=model_name + ", AUC=" + str(auc))

plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve for 5-fold cross validation (BoW)")
plt.legend()
plt.show()