In [1]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including pandas/scikit-learn diagnostics; consider narrowing it to specific
# categories once the notebook is stable.
warnings.filterwarnings('ignore')
import spacy
# Loads spaCy's small English pipeline. `nlp` is not referenced in the cells
# visible in this section -- TODO confirm it is used elsewhere, otherwise this
# load is unnecessary startup cost.
nlp = spacy.load("en_core_web_sm")

In [2]:
# Load the combined review data set into a DataFrame.
# NOTE(review): this is an absolute, machine-specific location; point it at
# your own copy of the CSV before running on another machine.
COMBINED_DATA_CSV = r'C:\Users\Sithalrao\Downloads\combined_data_11.csv'
cmb_data = pd.read_csv(COMBINED_DATA_CSV)

In [3]:
# Make every review a plain string so the vectorizer can tokenize it.
# Missing reviews are mapped to the empty string first: a bare astype(str)
# would turn NaN into the literal text 'nan', which would then be counted as
# a genuine word in the TF-IDF vocabulary.
cmb_data['reviewText'] = cmb_data['reviewText'].fillna('').astype(str)

In [4]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(cmb_data, test_size=0.2, random_state=42)

# Binary target: 1 for a 5-star rating in `overall`, 0 for every other rating.
# `.assign` builds a new frame rather than writing a column into the object
# returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning (and that warning is silenced globally in this
# notebook, so the problem would go unnoticed). The vectorized comparison
# replaces the row-by-row `apply(lambda ...)` and yields the same 0/1 values.
train_data = train_data.assign(
    binary_labels=(train_data['overall'] == 5).astype('int64')
)
test_data = test_data.assign(
    binary_labels=(test_data['overall'] == 5).astype('int64')
)


In [5]:
# Build TF-IDF features, keeping at most 50,000 vocabulary terms.
# The vocabulary and IDF weights are learned from the training reviews only;
# the test reviews are merely projected onto that fitted vocabulary.
train_reviews = train_data['reviewText']
test_reviews = test_data['reviewText']

vectorizer = TfidfVectorizer(max_features=50_000)
X_train = vectorizer.fit_transform(train_reviews)
X_test = vectorizer.transform(test_reviews)


# Applying Multinomial Naïve Bayes model

In [13]:
# Fit a default-configured Multinomial Naive Bayes classifier on the binary
# labels (fit returns the estimator itself), then label the held-out reviews.
binary = MultinomialNB().fit(X_train, train_data['binary_labels'])
binary_pred = binary.predict(X_test)

# binary classification

In [20]:
# Report per-class precision/recall/F1 and overall accuracy of the binary
# model against the held-out labels.
binary_truth = test_data['binary_labels']

print("Binary classification problem report:")
print(classification_report(binary_truth, binary_pred))
print("Accuracy:", accuracy_score(binary_truth, binary_pred))


Binary classification problem report:
              precision    recall  f1-score   support

           0       0.83      0.79      0.81     83750
           1       0.82      0.85      0.83     92978

    accuracy                           0.82    176728
   macro avg       0.82      0.82      0.82    176728
weighted avg       0.82      0.82      0.82    176728

Accuracy: 0.8223428092888506


# five-class classification

In [17]:
# Same default Multinomial Naive Bayes setup, but trained directly on the
# values of the `overall` column instead of the collapsed binary target.
five_multiclass = MultinomialNB().fit(X_train, train_data['overall'])
five_multiclass_pred = five_multiclass.predict(X_test)

In [23]:
# Report per-class precision/recall/F1 and overall accuracy of the five-class
# model against the held-out `overall` ratings.
rating_truth = test_data['overall']

print("Five Class Classification problem report:")
print(classification_report(rating_truth, five_multiclass_pred))
print("Accuracy:", accuracy_score(rating_truth, five_multiclass_pred))

Five Class Classification problem report:
              precision    recall  f1-score   support

         1.0       0.66      0.59      0.62     21625
         2.0       0.45      0.00      0.01     13048
         3.0       0.38      0.11      0.17     19333
         4.0       0.41      0.16      0.23     29744
         5.0       0.65      0.97      0.78     92978

    accuracy                           0.62    176728
   macro avg       0.51      0.37      0.36    176728
weighted avg       0.56      0.62      0.54    176728

Accuracy: 0.6233590602507809
