## SvaraAI Reply Classification Pipeline

A clean, end-to-end notebook to:
- Load and preprocess the dataset
- Train and compare baseline models (TF-IDF + classical ML)
- Fine-tune DistilBERT
- Evaluate with Accuracy and F1, then compare and conclude

The dataset is expected at `reply_classification_dataset.csv` in the same directory as this notebook and must contain `reply` and `label` columns.


In [2]:
# If running in a fresh environment, uncomment the following to install deps
# %pip install -q numpy pandas scikit-learn matplotlib seaborn
# %pip install -q transformers datasets evaluate torch accelerate
# %pip install -q lightgbm

import os
import numpy as np
import pandas as pd
import re
from typing import Tuple, Dict

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix

from lightgbm import LGBMClassifier

from datasets import Dataset
from transformers import AutoTokenizer,AutoModelForSequenceClassification, TrainingArguments,Trainer,DataCollatorWithPadding

# Seed passed to every split / estimator below that accepts `random_state`
RANDOM_STATE = 42
# Location of the labelled replies CSV, relative to the working directory
DATA_PATH = "reply_classification_dataset.csv"

# Fail fast, with a readable message, when the dataset is missing
dataset_found = os.path.exists(DATA_PATH)
assert dataset_found, f"Dataset not found at {DATA_PATH}. Place the CSV next to this notebook."


ModuleNotFoundError: No module named 'numpy'

In [None]:
# Load and preprocess dataset
RAW = pd.read_csv(DATA_PATH)
print(RAW.head())
print("Rows:", len(RAW))

# Normalize column names so the required-column check is insensitive to
# case and stray whitespace (str() guards against non-string column labels)
RAW.columns = [str(c).strip().lower() for c in RAW.columns]
assert {"reply", "label"}.issubset(RAW.columns), RAW.columns

# Single source of truth for label aliases -> canonical labels.
# Extend this dict (and only this dict) when new spellings show up.
label_map = {
    "positive": "positive",
    "pos": "positive",
    "negative": "negative",
    "neg": "negative",
    "neutral": "neutral",
}

# Trim/lower-case the raw label strings, then coerce known aliases
RAW["label"] = (
    RAW["label"].astype(str).str.strip().str.lower().replace(label_map)
)

# Keep only rows whose label is one of the canonical classes
VALID_LABELS = {"positive", "negative", "neutral"}
RAW = RAW[RAW["label"].isin(VALID_LABELS)].copy()

# Text cleaning helpers
# Runs of characters other than lowercase letters, digits, whitespace, apostrophes
CLEAN_RE = re.compile(r"[^a-z0-9\s']+")
# Runs of whitespace, collapsed to a single space
MULTISPACE = re.compile(r"\s+")


def clean_text(text: str) -> str:
    """Normalize a reply for modelling.

    Lower-cases the text, replaces every run of characters other than
    a-z, 0-9, whitespace and apostrophes with a space, collapses whitespace
    runs, and trims the ends. Non-string input yields an empty string.
    """
    if isinstance(text, str):
        lowered = text.strip().lower()
        symbols_removed = CLEAN_RE.sub(" ", lowered)
        return MULTISPACE.sub(" ", symbols_removed).strip()
    return ""

# Clean the reply text. Missing replies are blanked *before* the string cast:
# on pandas versions where astype(str) stringifies NaN, a missing reply would
# otherwise become the literal text "nan", pass the non-empty filter below and
# end up in the training data as a bogus example.
RAW["reply"] = RAW["reply"].fillna("").astype(str).apply(clean_text)

# Drop rows whose reply or label is empty after cleaning
RAW = RAW[(RAW["reply"].str.len() > 0) & (RAW["label"].str.len() > 0)].copy()

# Class balance of the cleaned dataset (cell output)
RAW["label"].value_counts(normalize=True).rename("proportion").to_frame()


In [None]:
# Train/val/test split

X = RAW["reply"].values
y = RAW["label"].values

# Stage 1: hold back 30% of the rows (stratified by label); the other 70% is train.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=0.30,
)
# Stage 2: halve the hold-out into validation and test (15% of all rows each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    random_state=RANDOM_STATE,
    test_size=0.50,
)

# Split sizes, shown as the cell output
tuple(len(part) for part in (X_train, X_val, X_test))


In [None]:
# Baseline models: TF-IDF + classical ML

# TF-IDF settings shared by all candidates. Every pipeline gets its OWN
# vectorizer built from these settings: Pipeline fits the step objects it is
# given in place, so sharing one instance would make the selected pipeline
# alias a transformer that later candidates refit.
TFIDF_PARAMS = dict(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    sublinear_tf=True,
)

# Candidate models. Estimators that accept `random_state` are seeded so the
# ranking below is reproducible across runs.
candidates = {
    "logreg": LogisticRegression(max_iter=1000),
    "linear_svc": LinearSVC(random_state=RANDOM_STATE),
    "sgd_log": SGDClassifier(loss="log_loss", max_iter=1000, random_state=RANDOM_STATE),
    "mnb": MultinomialNB(),
    "cnb": ComplementNB(),
    "lgbm": LGBMClassifier(
        objective="multiclass",
        num_class=3,
        n_estimators=300,
        learning_rate=0.1,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    ),
}

results = []
best_name = None
best_f1 = -1.0
best_acc = -1.0
best_model = None

for name, estimator in candidates.items():
    pipe = Pipeline([
        ("tfidf", TfidfVectorizer(**TFIDF_PARAMS)),
        ("clf", estimator),
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average="macro")
    results.append({"model": name, "val_accuracy": acc, "val_f1_macro": f1})

    # Select on validation macro-F1; on a (near-)tie prefer higher accuracy
    is_tie = best_name is not None and np.isclose(f1, best_f1)
    if f1 > best_f1 or (is_tie and acc > best_acc):
        best_name, best_f1, best_acc, best_model = name, f1, acc, pipe

assert best_model is not None, "No baseline candidate was evaluated"

# Keep the `vectorizer` name available: it is the fitted TF-IDF step of the winner
vectorizer = best_model.named_steps["tfidf"]

pd.DataFrame(results).sort_values("val_f1_macro", ascending=False).reset_index(drop=True)


In [None]:
# Evaluate best baseline on test set

# Retrain the selected pipeline on train + validation before the final test run.
# NOTE: this refits `best_model` in place, so from here on it has seen X_val.
trainval_texts = np.concatenate([X_train, X_val])
trainval_labels = np.concatenate([y_train, y_val])
best_model.fit(trainval_texts, trainval_labels)

y_pred_test = best_model.predict(X_test)
print("Best baseline:", best_name)
print(classification_report(y_test, y_pred_test, digits=4))

# Fixed class order so matrix rows/columns line up with the tick labels
class_order = ["negative", "neutral", "positive"]
cm = confusion_matrix(y_test, y_pred_test, labels=class_order)

fig, ax = plt.subplots(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=class_order,
    yticklabels=class_order,
    ax=ax,
)
ax.set_title(f"Baseline ({best_name}) Confusion Matrix")
ax.set_ylabel("True")
ax.set_xlabel("Predicted")
fig.tight_layout()
plt.show()


In [None]:
# Fine-tune DistilBERT

# Class names in a fixed order; the position in this list is the integer label id
label_list = ["negative", "neutral", "positive"]
label_to_id = {name: idx for idx, name in enumerate(label_list)}
id_to_label = dict(enumerate(label_list))


def _as_hf_dataset(texts, labels):
    """Build a `Dataset` with a `text` column and integer `label` ids from parallel sequences."""
    return Dataset.from_dict({
        "text": list(texts),
        "label": [label_to_id[lab] for lab in labels],
    })


train_ds = _as_hf_dataset(X_train, y_train)
val_ds = _as_hf_dataset(X_val, y_val)
test_ds = _as_hf_dataset(X_test, y_test)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_function(batch):
    """Tokenize the `text` column of a batch, truncating to at most 128 tokens."""
    return tokenizer(batch["text"], truncation=True, max_length=128)


train_tok, val_tok, test_tok = [
    split.map(tokenize_function, batched=True)
    for split in (train_ds, val_ds, test_ds)
]

# Drop the raw text column once it has been tokenized
cols_to_remove = ["text"] if "text" in train_tok.column_names else []
train_tok, val_tok, test_tok = [
    split.remove_columns(cols_to_remove)
    for split in (train_tok, val_tok, test_tok)
]

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
)

# Collator built from the tokenizer; handed to the Trainer for batching
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


def compute_metrics(eval_pred):
    """Return accuracy and macro-F1 for a `(logits, labels)` evaluation pair.

    The predicted class is the arg-max over the last axis of `logits`.
    """
    logits, labels = eval_pred
    predicted_ids = logits.argmax(axis=-1)
    return {
        "accuracy": accuracy_score(labels, predicted_ids),
        "f1": f1_score(labels, predicted_ids, average="macro"),
    }

# Version-tolerant TrainingArguments.
# The per-epoch evaluation keyword is spelled `eval_strategy` in newer
# transformers releases and `evaluation_strategy` in older ones. Try both, so a
# rename never silently drops us into the no-evaluation fallback below.
base_kwargs = dict(
    output_dir="bert_out",
    learning_rate=2e-5,
    num_train_epochs=3,
    logging_steps=50,
)
modern_kwargs = dict(
    base_kwargs,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
)
# Evaluate/save every epoch and reload the checkpoint with the best macro-F1
best_checkpoint_kwargs = dict(
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_f1",
    greater_is_better=True,
)

args = None
for eval_keyword in ("eval_strategy", "evaluation_strategy"):
    try:
        args = TrainingArguments(
            **modern_kwargs,
            **best_checkpoint_kwargs,
            **{eval_keyword: "epoch"},
        )
        break
    except TypeError:
        # This release does not accept that spelling; try the next one
        continue

if args is None:
    # Neither spelling was accepted: train without per-epoch evaluation and
    # say so, because the final model is then the last epoch, not the best one.
    print("WARNING: per-epoch evaluation is unavailable; training without best-checkpoint selection.")
    try:
        args = TrainingArguments(**modern_kwargs)
    except TypeError:
        # Very old versions may require per_gpu_* arguments
        args = TrainingArguments(
            **base_kwargs,
            per_gpu_train_batch_size=16,
            per_gpu_eval_batch_size=32,
        )

# Build the Trainer. Newer transformers releases take the tokenizer through
# `processing_class`; older ones only know the `tokenizer` keyword. Try the
# new name first and fall back when it is rejected as an unknown argument.
trainer_kwargs = dict(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)
try:
    trainer = Trainer(processing_class=tokenizer, **trainer_kwargs)
except TypeError:
    trainer = Trainer(tokenizer=tokenizer, **trainer_kwargs)

train_result = trainer.train()

# Metrics on the validation split (the Trainer's default eval dataset)
val_metrics = trainer.evaluate()
print(val_metrics)

# Metrics on the held-out test split
test_metrics = trainer.evaluate(test_tok)
print(test_metrics)

# Pull out the numbers used by the comparison table
bert_val_acc = val_metrics.get("eval_accuracy")
bert_val_f1 = val_metrics.get("eval_f1")
bert_test_acc = test_metrics.get("eval_accuracy")
bert_test_f1 = test_metrics.get("eval_f1")


In [None]:
# Compare and conclude

# Baseline validation metrics are taken from the model-selection loop, where the
# pipeline had been fit on the training split only. `best_model` was refit on
# train + validation before the test run, so predicting X_val with it now would
# score the model on data it was trained on and inflate the validation numbers.
best_row = next(r for r in results if r["model"] == best_name)
baseline_val_acc = best_row["val_accuracy"]
baseline_val_f1 = best_row["val_f1_macro"]

# Baseline test metrics from the refit (train + validation) pipeline
baseline_test_acc = accuracy_score(y_test, y_pred_test)
baseline_test_f1 = f1_score(y_test, y_pred_test, average="macro")

baseline_label = f"baseline_{best_name}"
comparison = pd.DataFrame([
    {"model": baseline_label, "split": "val", "accuracy": baseline_val_acc, "f1": baseline_val_f1},
    {"model": baseline_label, "split": "test", "accuracy": baseline_test_acc, "f1": baseline_test_f1},
    {"model": "distilbert", "split": "val", "accuracy": bert_val_acc, "f1": bert_val_f1},
    {"model": "distilbert", "split": "test", "accuracy": bert_test_acc, "f1": bert_test_f1},
])
comparison


### Conclusion
- If DistilBERT outperforms the best baseline on test-set macro-F1 (the F1 reported throughout this notebook), prefer it in production for better generalization on nuanced language.
- If the dataset is small and latency/cost constraints are strict, the best TF-IDF baseline is simpler, faster, and may be sufficient.
- Consider model monitoring and periodic re-training as data drifts.

