In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the raw Amazon review export into a DataFrame.
amznreviews = pd.read_csv(filepath_or_buffer="/content/amazon.csv")

In [None]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
amznreviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4915 entries, 0 to 4914
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Unnamed: 0            4915 non-null   int64  
 1   reviewerName          4914 non-null   object 
 2   overall               4915 non-null   int64  
 3   reviewText            4914 non-null   object 
 4   reviewTime            4915 non-null   object 
 5   day_diff              4915 non-null   int64  
 6   helpful_yes           4915 non-null   int64  
 7   helpful_no            4915 non-null   int64  
 8   total_vote            4915 non-null   int64  
 9   score_pos_neg_diff    4915 non-null   int64  
 10  score_average_rating  4915 non-null   float64
 11  wilson_lower_bound    4915 non-null   float64
dtypes: float64(2), int64(7), object(3)
memory usage: 460.9+ KB


In [None]:
# Peek at the first two rows to sanity-check the parsed columns.
amznreviews.head(n=2)

Unnamed: 0.1,Unnamed: 0,reviewerName,overall,reviewText,reviewTime,day_diff,helpful_yes,helpful_no,total_vote,score_pos_neg_diff,score_average_rating,wilson_lower_bound
0,0,,4,No issues.,23-07-2014,138,0,0,0,0,0.0,0.0
1,1,0mie,5,"Purchased this for my device, it worked as adv...",25-10-2013,409,0,0,0,0,0.0,0.0


In [None]:
# Keep only the review text (model input) and the `overall` rating (target).
reviewdf = amznreviews.loc[:, ['reviewText', 'overall']]

In [None]:
# Discard any row that is missing the review text or the rating.
reviewdf = reviewdf.dropna(axis=0, how='any')

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectoriser: vocabulary capped at 200 terms, English stop words
# removed, and tokens restricted to runs of letters (no digits or underscores).
DTM = CountVectorizer(
    max_features=200,
    stop_words='english',
    token_pattern=r"[^\W\d_]+",
)

In [None]:
# Learn the vocabulary from the review text and build the document-term matrix.
X_DTM = DTM.fit_transform(reviewdf.loc[:, 'reviewText'])

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Multiclass logistic regression on the word counts; max_iter is raised to 1000
# iterations for the solver.
logistic = (
    LogisticRegression(max_iter=1000)
    .fit(X_DTM, reviewdf['overall'])
)

In [None]:
# Accuracy on the same rows the model was fitted on (no held-out split),
# so treat it as an optimistic estimate.
logistic.score(X=X_DTM, y=reviewdf['overall'])

0.8353683353683353

In [None]:
# Predicted rating for every review in the training matrix.
logistic_predict = logistic.predict(X=X_DTM)

In [None]:
# Confusion matrix: actual rating (rows) vs. logistic-regression prediction (columns).
# colnames labels the column axis, which otherwise renders as the anonymous "col_0".
pd.crosstab(reviewdf['overall'], logistic_predict, colnames=['predicted'])

col_0,1,2,3,4,5
overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,135,1,1,4,103
2,11,29,2,0,38
3,15,1,29,4,93
4,10,2,6,31,478
5,19,1,3,17,3881


In [None]:
from sklearn.metrics import classification_report

In [None]:
# Predictions for ratings 1, 2, 3 and 4 are poor because the algorithm does not
# have enough examples of them to train on and learn their patterns.
# Recall and F1-score for ratings 1, 2, 3 and 4 are the worst because of the
# class imbalance in the data.
print(
    classification_report(
        y_true=reviewdf['overall'],
        y_pred=logistic_predict,
    )
)

              precision    recall  f1-score   support

           1       0.71      0.55      0.62       244
           2       0.85      0.36      0.51        80
           3       0.71      0.20      0.32       142
           4       0.55      0.06      0.11       527
           5       0.84      0.99      0.91      3921

    accuracy                           0.84      4914
   macro avg       0.73      0.43      0.49      4914
weighted avg       0.80      0.84      0.79      4914



In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Multinomial naive Bayes on the same word-count matrix, for comparison.
nbmodel = (
    MultinomialNB()
    .fit(X_DTM, reviewdf['overall'])
)

In [None]:
# Training-set accuracy of the naive Bayes model (same rows it was fitted on).
nbmodel.score(X=X_DTM, y=reviewdf['overall'])

0.8068783068783069

In [None]:
# Naive Bayes predicted rating for every review in the training matrix.
nb_predict = nbmodel.predict(X=X_DTM)

In [None]:
# Confusion matrix: actual rating (rows) vs. naive Bayes prediction (columns).
# colnames labels the column axis, which otherwise renders as the anonymous "col_0".
pd.crosstab(reviewdf['overall'], nb_predict, colnames=['predicted'])

col_0,1,2,3,4,5
overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,150,12,4,5,73
2,30,22,6,3,19
3,31,4,24,5,78
4,26,3,15,61,422
5,82,24,19,88,3708


In [None]:
# Per-class precision / recall / F1 for the naive Bayes predictions.
print(
    classification_report(
        y_true=reviewdf['overall'],
        y_pred=nb_predict,
    )
)

              precision    recall  f1-score   support

           1       0.47      0.61      0.53       244
           2       0.34      0.28      0.30        80
           3       0.35      0.17      0.23       142
           4       0.38      0.12      0.18       527
           5       0.86      0.95      0.90      3921

    accuracy                           0.81      4914
   macro avg       0.48      0.42      0.43      4914
weighted avg       0.77      0.81      0.78      4914



In [None]:
from imblearn.over_sampling import SMOTEN

In [None]:
# Oversample the minority ratings so every class matches the majority count.
# random_state pins the synthetic samples so the resampled set (and every score
# computed from it) is reproducible across kernel restarts.
# NOTE(review): SMOTEN is the SMOTE variant for nominal (categorical) features,
# while X_DTM holds token counts — confirm SMOTEN rather than SMOTE is intended.
X_smote, y_smote = SMOTEN(random_state=42).fit_resample(X_DTM, reviewdf['overall'])



In [None]:
print(pd.DataFrame(y_smote).value_counts()) # each of the 5 ratings ("overall") now has 3921 rows

# SMOTE (Synthetic Minority Over-sampling Technique) is an oversampling method: it
# creates new synthetic data points for the minority classes, raising their
# occurrence to match the majority class. The SMOTEN variant is what is applied here.

overall
1          3921
2          3921
3          3921
4          3921
5          3921
Name: count, dtype: int64


In [None]:
# Refit logistic regression, this time on the class-balanced (resampled) data.
logistic_smote = (
    LogisticRegression(max_iter=1000)
    .fit(X_smote, y_smote)
)

In [None]:
# Accuracy on the resampled data the model was fitted on (not a held-out set).
logistic_smote.score(X=X_smote, y=y_smote)

0.6186687069625095

In [None]:
# Predicted rating for every row of the resampled matrix.
logistic_smote_predict = logistic_smote.predict(X=X_smote)

In [None]:
# Confusion matrix on the resampled data: actual rating (rows) vs. prediction (columns).
# colnames labels the column axis, which otherwise renders as the anonymous "col_0".
pd.crosstab(y_smote, logistic_smote_predict, colnames=['predicted'])

col_0,1,2,3,4,5
overall,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1786,818,1170,89,58
2,213,2215,1412,59,22
3,137,573,3078,70,63
4,344,235,1180,1809,353
5,94,66,103,417,3241


In [None]:
# Refit multinomial naive Bayes on the class-balanced (resampled) data.
nbmodel_smote = (
    MultinomialNB()
    .fit(X_smote, y_smote)
)

In [None]:
# Accuracy on the resampled data the model was fitted on (not a held-out set).
nbmodel_smote.score(X=X_smote, y=y_smote)

0.5526651364447845

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree fitted on the resampled data. random_state fixes the tree's
# internal feature-permutation / tie-breaking randomness so reruns build the
# same tree and report the same score.
treemodel = DecisionTreeClassifier(random_state=42).fit(X_smote, y_smote)

In [None]:
# Accuracy on the resampled data the tree was fitted on (not a held-out set).
treemodel.score(X=X_smote, y=y_smote)

0.7828615149196634

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest of 1000 trees fitted on the resampled data. random_state makes
# the bootstrap sampling and per-split feature selection repeatable, so the
# reported score does not drift between runs.
RFmodel = RandomForestClassifier(n_estimators=1000, random_state=42).fit(X_smote, y_smote)

In [None]:
# Accuracy on the resampled data the forest was fitted on (not a held-out set).
RFmodel.score(X=X_smote, y=y_smote)

0.7828615149196634