In [10]:
pip install textblob

Collecting textblob
  Downloading textblob-0.19.0-py3-none-any.whl.metadata (4.4 kB)
Downloading textblob-0.19.0-py3-none-any.whl (624 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m624.3/624.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hInstalling collected packages: textblob
Successfully installed textblob-0.19.0
Note: you may need to restart the kernel to use updated packages.


In [12]:
# =========================
# Imports
# =========================
import pandas as pd
import numpy as np
import re

from textblob import TextBlob

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [17]:
# =========================
# Data Loading
# =========================
df = pd.read_csv("WhatsApp.csv", encoding="utf-8", on_bad_lines="skip")
df.head()


Unnamed: 0,user,message,year,month,day,hour,minute
0,group_notification,Messages and calls are end-to-end encrypted. N...,2021,March,27,0,0
1,group_notification,"Group creator created group ""JGEC MECH 2K17-21""\n",2018,January,8,16,43
2,group_notification,You were added\n,2018,January,8,16,43
3,+91 95648 92981,Keo clg jabi?\n,2021,March,26,21,41
4,+91 6296 534 775,Kobe bol??\n,2021,March,26,21,46


In [19]:
# =========================
# Data Cleaning & Filtering
# =========================

# Remove media messages
df = df[df["message"] != "<Media omitted>\n"]

# Optional: filter selected users
selected_users = [
    "Debsrijan", "Sourav Kase", "Ojha", "Sailesh",
    "Navid 2", "Riju", "Rakesh Mandal",
    "Vivek Ekka", "Laha", "Pranay Manna ME", "Av(mota)"
]

df = df[df["user"].isin(selected_users)]
df.shape


(2724, 7)

In [21]:
# =========================
# Text Preprocessing
# =========================
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"\d+", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = text.replace("\n", "").replace("\r", "")
    return text

df["clean_message"] = df["message"].apply(clean_text)
df[["message", "clean_message"]].head()


Unnamed: 0,message,clean_message
23,@919123094581\n,
24,@919123094581\n,
25,@919123094581\n,
32,Eta te a6e dekh last page e\n,eta te ae dekh last page e
34,31 theke khela suru\n,theke khela suru


In [23]:
# =========================
# Sentiment Label Generation (Binary)
# =========================
def get_sentiment(text):
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "positive"
    elif polarity < 0:
        return "negative"
    else:
        return None

df["sentiment"] = df["clean_message"].apply(get_sentiment)

# Remove neutral / undefined samples
df = df.dropna(subset=["sentiment"])
df["sentiment"].value_counts()


sentiment
positive    178
negative     73
Name: count, dtype: int64

In [25]:
# =========================
# Feature Extraction (TF-IDF)
# =========================
X = df["clean_message"]
y = df["sentiment"]

vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words="english"
)

X_vec = vectorizer.fit_transform(X)


In [27]:
# =========================
# Train-Test Split
# =========================
X_train, X_test, y_train, y_test = train_test_split(
    X_vec, y, test_size=0.2, random_state=42
)


In [29]:
# =========================
# Model Training
# =========================
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [31]:
# =========================
# Model Evaluation
# =========================
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.6666666666666666
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00        17
    positive       0.67      1.00      0.80        34

    accuracy                           0.67        51
   macro avg       0.33      0.50      0.40        51
weighted avg       0.44      0.67      0.53        51



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
