### Import Libraries

In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost as xgb
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.pipeline import Pipeline
import pickle
import os
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import joblib

[nltk_data] Downloading package stopwords to C:\Users\Gowtham
[nltk_data]     Kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
root_dir = r"F:\PROJECT\ml-streamlit-showcase"

### Import Dataset

In [5]:
# dataset from https://www.kaggle.com/datasets/niraliivaghani/flipkart-product-customer-reviews-dataset
dataset = pd.read_csv('Dataset-SA.csv')

In [6]:
dataset.head()

Unnamed: 0,product_name,product_price,Rate,Review,Summary,Sentiment
0,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,super!,great cooler excellent air flow and for this p...,positive
1,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,5,awesome,best budget 2 fit cooler nice cooling,positive
2,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,the quality is good but the power of air is de...,positive
3,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,1,useless product,very bad product its a only a fan,negative
4,Candes 12 L Room/Personal Air Cooler??????(Whi...,3999,3,fair,ok ok product,neutral


In [7]:
# Drop unnecessary columns
dataset.drop(columns=['product_name', 'product_price'], inplace=True)

In [8]:
# Concatenate 'Review' and 'Summary' into a single 'Review text' column
dataset['Review text'] = dataset['Review'] + ' ' + dataset['Summary']
dataset.drop(columns=['Review', 'Summary'], inplace=True)

# Re-order columns
dataset = dataset[['Review text', 'Rate', 'Sentiment']]
dataset.head()

Unnamed: 0,Review text,Rate,Sentiment
0,super! great cooler excellent air flow and for...,5,positive
1,awesome best budget 2 fit cooler nice cooling,5,positive
2,fair the quality is good but the power of air ...,3,positive
3,useless product very bad product its a only a fan,1,negative
4,fair ok ok product,3,neutral


In [9]:
dataset['Rate'].unique()

# Remove rows for 'Rate' that are not equal to 1, 2, 3, 4, or 5
dataset = dataset[pd.to_numeric(dataset['Rate'], errors='coerce').between(1, 5)]

In [10]:
dataset['Sentiment'].unique()


array(['positive', 'negative', 'neutral'], dtype=object)

### Data Pre-processing

In [11]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [12]:
# print first 10 rows of X
print("First 10 rows of X:")
print(X[:10])

First 10 rows of X:
[['super! great cooler excellent air flow and for this price its so amazing and unbelievablejust love it'
  '5']
 ['awesome best budget 2 fit cooler nice cooling' '5']
 ['fair the quality is good but the power of air is decent' '3']
 ['useless product very bad product its a only a fan' '1']
 ['fair ok ok product' '3']
 ['awesome the cooler is really fantastic and provides good air flow highly recommended'
  '5']
 ['highly recommended very good product' '5']
 ['nice very nice' '3']
 ['unsatisfactory very bad cooler' '1']
 ['worth the money very good' '4']]


In [13]:
# Process the text data
corpus = []
ps = PorterStemmer()

# Get default English stopwords
default_stopwords = set(stopwords.words('english'))

# Define exception words to keep
exceptions = {'not', 'no', 'nor', 'very', 'too', 'so', 'never', 'don', 't', 'i', 'you', 'we', 'could', 'would'}

# Create a customized list
custom_stopwords = default_stopwords - exceptions

for review in tqdm(X, desc="Preprocessing reviews"):
    review = re.sub('[^a-zA-Z]', ' ', str(review))  # Keep only letters
    review = review.lower().split()
    review = [ps.stem(word) for word in review if word not in custom_stopwords]
    corpus.append(' '.join(review))

Preprocessing reviews: 100%|██████████| 205049/205049 [00:23<00:00, 8806.43it/s] 


In [14]:
# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=1500)
X = tfidf.fit_transform(corpus).toarray()

In [15]:
y

array(['positive', 'positive', 'positive', ..., 'positive', 'positive',
       'neutral'], dtype=object)

In [16]:
# Preparing test and train datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((164039, 1500), (41010, 1500), (164039,), (41010,))

### Logistic Regression

In [56]:
logistic_regressor = LogisticRegression()
logistic_regressor.fit(X_train, y_train)

In [57]:
y_pred = logistic_regressor.predict(X_test)

In [58]:
# Evaluate Logistic Regression model
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")

Confusion Matrix:
[[ 4175   105  1198]
 [  367   689  1022]
 [  423   173 32858]]

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.76      0.80      5478
     neutral       0.71      0.33      0.45      2078
    positive       0.94      0.98      0.96     33454

    accuracy                           0.92     41010
   macro avg       0.83      0.69      0.74     41010
weighted avg       0.91      0.92      0.91     41010

Accuracy: 0.92


### Naive bayes

In [None]:
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)
y_pred_nb = naive_bayes.predict(X_test)

In [62]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9198244330651061

Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.76      0.80      5478
     neutral       0.71      0.33      0.45      2078
    positive       0.94      0.98      0.96     33454

    accuracy                           0.92     41010
   macro avg       0.83      0.69      0.74     41010
weighted avg       0.91      0.92      0.91     41010



### SVC (linear)

In [None]:
svm_linear = SVC(kernel='linear')
svm_linear.fit(X_train, y_train)
y_pred_svm = svm_linear.predict(X_test)

In [None]:
print("Linear SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print("\nClassification Report:\n", classification_report(y_test, y_pred_svm))

### XGBoost

In [20]:
# Encoding the target variable
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [21]:
# Create DMatrix
dtrain = xgb.DMatrix(X_train, label=y_train_encoded)
dtest = xgb.DMatrix(X_test, label=y_test_encoded)

In [22]:
params = {
    'objective': 'multi:softmax',
    'num_class': len(le.classes_),
    'eval_metric': 'mlogloss',
    'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

num_rounds = 100

In [23]:
xgb_model = xgb.train(params, dtrain, num_rounds)
y_pred = xgb_model.predict(dtest)

In [24]:
# Decode Predictions
y_pred_decoded = le.inverse_transform(y_pred.astype(int))

In [26]:
# Evaluation of the XGBoost model
print("Accuracy:", accuracy_score(y_test_encoded, y_pred))
print("\nClassification Report:\n", classification_report(y_test_encoded, y_pred))

Accuracy: 0.9178736893440624

Classification Report:
               precision    recall  f1-score   support

           0       0.86      0.75      0.80      5478
           1       0.80      0.28      0.41      2078
           2       0.93      0.99      0.96     33454

    accuracy                           0.92     41010
   macro avg       0.86      0.67      0.72     41010
weighted avg       0.91      0.92      0.91     41010



### SGD Classifier

In [27]:
sgd_clf = SGDClassifier(loss='hinge', max_iter=1000, tol=1e-3, random_state=42)

In [None]:
# Train model
sgd_clf.fit(X_train, y_train)

# Predict
y_pred = sgd_clf.predict(X_test)

In [None]:
# Evaluate SGD Classifier model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9161424042916362
Classification Report:
               precision    recall  f1-score   support

    negative       0.84      0.76      0.80      5478
     neutral       0.70      0.27      0.39      2078
    positive       0.93      0.98      0.96     33454

    accuracy                           0.92     41010
   macro avg       0.82      0.67      0.72     41010
weighted avg       0.91      0.92      0.91     41010



### Choosing the model and Exporting

- Based on evaluation metrics and model performance, Naive Bayes and Logistic Regression is prefered

In [None]:
#Tuning Naive Bayes model using GridSearchCV

# Define model
lg = LogisticRegression()

# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=1500)
X = tfidf.fit_transform(corpus).toarray()

# Preparing test and train datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# param_grid for Logistic Regression
param_grid_lg = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2', 'l1'],  # l1 is only available for 'liblinear' solver
    'solver': ['liblinear', 'saga']  # saga supports l1 penalty
}

# Setup grid search
grid = GridSearchCV(estimator=lg, param_grid=param_grid_lg, cv=3, scoring='accuracy', n_jobs=-1)

# Fit grid search on training data
grid.fit(X_train, y_train)

# Best params and best score
print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

Best parameters: {'C': 1, 'penalty': 'l1', 'solver': 'saga'}
Best cross-validation accuracy: 0.9183060239603106




In [39]:
# After grid search finishes
best_nb = grid.best_estimator_

# Predict on test data
y_pred = best_nb.predict(X_test)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (optional, helps understand error types)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Set Accuracy: 0.9199

Classification Report:
              precision    recall  f1-score   support

    negative       0.84      0.77      0.80      5478
     neutral       0.71      0.33      0.45      2078
    positive       0.94      0.98      0.96     33454

    accuracy                           0.92     41010
   macro avg       0.83      0.69      0.74     41010
weighted avg       0.91      0.92      0.91     41010


Confusion Matrix:
[[ 4191    99  1188]
 [  370   682  1026]
 [  426   176 32852]]


In [40]:
# Tuning naive Bayes model using GridSearchCV
nb = MultinomialNB()

# Convert text to TF-IDF features
tfidf = TfidfVectorizer(max_features=1500)
X = tfidf.fit_transform(corpus).toarray()

# Preparing test and train datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define hyperparameter grid
param_grid = {
    'alpha': [1e-5, 5e-5, 1e-4, 5e-4, 1e-3],
    'fit_prior': [True, False]
}

# Setup grid search
grid = GridSearchCV(estimator=nb, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1)

# Fit grid search on training data
grid.fit(X_train, y_train)

# Best params and best score
print("Best parameters:", grid.best_params_)
print("Best cross-validation accuracy:", grid.best_score_)

Best parameters: {'alpha': 0.0005, 'fit_prior': True}
Best cross-validation accuracy: 0.9037789911253405


In [41]:
# After grid search finishes
best_nb = grid.best_estimator_

# Predict on test data
y_pred = best_nb.predict(X_test)

# Accuracy
test_accuracy = accuracy_score(y_test, y_pred)
print(f"Test Set Accuracy: {test_accuracy:.4f}")

# Detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Confusion matrix (optional, helps understand error types)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Test Set Accuracy: 0.9065

Classification Report:
              precision    recall  f1-score   support

    negative       0.83      0.72      0.77      5478
     neutral       0.73      0.16      0.26      2078
    positive       0.92      0.98      0.95     33454

    accuracy                           0.91     41010
   macro avg       0.83      0.62      0.66     41010
weighted avg       0.90      0.91      0.89     41010


Confusion Matrix:
[[ 3917    35  1526]
 [  365   330  1383]
 [  440    86 32928]]


In [43]:
model_path = os.path.join(root_dir , "models\sentiment_predictor.pkl")
model_path

'F:\\PROJECT\\ml-streamlit-showcase\\models\\sentiment_predictor.pkl'

In [44]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=500)),
    ('clf', LogisticRegression(C=1, penalty='l1', solver='saga', max_iter=1000))
])

# Train on full dataset
pipeline.fit(corpus, y)

# Save pipeline
joblib.dump(pipeline, model_path)

['F:\\PROJECT\\ml-streamlit-showcase\\models\\sentiment_predictor.pkl']

### Loading the model and testing

In [46]:
# load the model
with open(model_path, "rb") as f:
    loaded_model = joblib.load(f)

In [55]:

# Test input
Review_text = ["This product is amazing! I love it."]

# Make prediction
prediction = loaded_model.predict(Review_text)
probabilities = loaded_model.predict_proba(Review_text)[0]
classes = loaded_model.classes_

# Display results
print(f"Predicted Sentiment: {prediction[0]}")
print("Probabilities:")
for sentiment, prob in zip(classes, probabilities):
    print(f"{sentiment}:{prob*100:.2f}%")

Predicted Sentiment: positive
Probabilities:
negative:0.05%
neutral:0.61%
positive:99.35%
