In [1]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score, classification_report
print("All imports ready!")

All imports ready!


In [2]:
# Data Loading + Analysis 
# Load dataset
df = pd.read_csv('C:/Users/chinm/OneDrive/Desktop/flipkart_sentiment/data/data.csv', encoding='latin-1')

# Basic analysis
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nRating distribution:")
print(df['Ratings'].value_counts().sort_index())
print("\nNull values:")
print(df.isnull().sum())

# Create binary sentiment (positive=4-5, negative=1-3)
df = df.dropna(subset=['Review text', 'Ratings'])
df['sentiment'] = df['Ratings'].apply(lambda x: 1 if x >= 4 else 0)
print(f"\nSentiment balance: {df['sentiment'].value_counts().to_dict()}")

Dataset shape: (8518, 8)

Columns: ['Reviewer Name', 'Review Title', 'Place of Review', 'Up Votes', 'Down Votes', 'Month', 'Review text', 'Ratings']

Rating distribution:
Ratings
1     769
2     308
3     615
4    1746
5    5080
Name: count, dtype: int64

Null values:
Reviewer Name       10
Review Title        10
Place of Review     50
Up Votes            10
Down Votes          10
Month              465
Review text          8
Ratings              0
dtype: int64

Sentiment balance: {1: 6823, 0: 1687}


In [3]:
# Data Cleaning (Text Cleaning & Normalization)
def clean_text(text):
    """Remove special chars, normalize (satisfies preprocessing req)"""
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)  # URLs
    text = re.sub(r'[^a-z\s]', ' ', text)                 # Special chars
    text = re.sub(r'\s+', ' ', text).strip()              # Spaces
    return text

# Clean, combine title & review text
df['Review Title'] = df['Review Title'].fillna('')
df['full_text'] = (df['Review Title'] + ' ' + df['Review text']).apply(clean_text)
df = df[df['full_text'].str.len() > 20]  # Drop too short

print("Cleaned data:", df.shape)
print("Sample cleaned:", df['full_text'].iloc[0][:100] + '...')

Cleaned data: (8359, 10)
Sample cleaned: nice product nice product good quality but price is now rising which is a bad sign was an affordable...


In [None]:
# Text Embedding - TF-IDF 
X = df['full_text']
y = df['sentiment']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# TF-IDF (Bag-of-Words + TF-IDF required)
tfidf = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 3),  # Up to trigrams
    min_df=3,
    max_df=0.9
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print("TF-IDF features:")
print(f"Train: {X_train_tfidf.shape}")
print(f"Test: {X_test_tfidf.shape}")
print("Train balance:", dict(y_train.value_counts()))

TF-IDF features:
Train: (6687, 5004)
Test: (1672, 5004)
Train balance: {1: np.int64(5357), 0: np.int64(1330)}


In [5]:
# Model Training (ML Models)
# Logistic Regression 
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weights = {cls: w for cls, w in zip(classes, weights)}

model = LogisticRegression(
    max_iter=3000, 
    random_state=42,
    C=0.5,                    
    class_weight=class_weights 
)
model.fit(X_train_tfidf, y_train)
print("Model trained!")

Model trained!


In [6]:
# Model Evaluation - F1-Score 
y_pred = model.predict(X_test_tfidf)
f1 = f1_score(y_test, y_pred)

print("MODEL PERFORMANCE:")
print(f"F1-Score: {f1:.3f}")
print("\nDetailed Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("\nConfusion Matrix:")
print(pd.crosstab(y_test, y_pred, rownames=['Actual'], colnames=['Predicted']))

MODEL PERFORMANCE:
F1-Score: 0.918

Detailed Report:
              precision    recall  f1-score   support

           0       0.65      0.78      0.71       333
           1       0.94      0.90      0.92      1339

    accuracy                           0.87      1672
   macro avg       0.80      0.84      0.81      1672
weighted avg       0.88      0.87      0.88      1672


Confusion Matrix:
Predicted    0     1
Actual              
0          259    74
1          139  1200


In [7]:
# Pain Points Analysis (Objective: dissatisfaction insights)
neg_reviews = df[df['sentiment'] == 0]['full_text']

# CLEAN STOPWORDS + "READ MORE" artifacts
stop_words = {
    'the', 'and', 'for', 'are', 'but', 'this', 'with', 'one', 'get', 'now', 'very', 
    'more', 'good', 'product', 'read', 'shuttle', 'will', 'from', 'just', 'nice',
    'time', 'only', 'purchase', 'worth', 'terrific', 'awesome', 'mavis'
}

# SPLIT "readmore" artifacts
neg_words = []
for text in neg_reviews:
    # Fix "goodread" â†’ "good" + "read"
    text = re.sub(r'read$', '', text)  # Remove "read" suffix
    words = text.split()
    words = [w.rstrip('read') for w in words]  # Strip "read" endings
    words = [w for w in words if len(w) > 3 and w not in stop_words]
    neg_words.extend(words)

print("TRUE NEGATIVE PAIN POINTS:")
pain_points = Counter(neg_words).most_common(15)
for word, count in pain_points:
    print(f"  {word:<12} {count:3d}")

print("\nNegative review examples:")
for i, review in enumerate(neg_reviews.head(3)):
    print(f"{i+1}. {review[:120]}...")


TRUE NEGATIVE PAIN POINTS:
  quality      518
  shuttl       286
  shuttles     168
  money        156
  worst        144
  purchas      119
  wast         102
  damag         95
  awesom        79
  flipkart      72
  bett          70
  original      69
  days          67
  disappoint    66
  recommen      59

Negative review examples:
1. don t waste your money they didn t supplied yonex mavis outside cover was yonex ad inside was a cheapest sad to hear thi...
2. did not meet expectations worst product damaged shuttlecocks packed in new box it s not a original yonex product don t b...
3. fair quite o k but nowadays the quality of the corks like not as before to years back i am using mavis for more than yea...


In [8]:
# Save 
os.makedirs('../models', exist_ok=True)
joblib.dump(model, '../models/model.pkl')
joblib.dump(tfidf, '../models/tfidf.pkl')

print("SAVED")

SAVED
