In [None]:
# # -------------------------
# # Comparison table
# # -------------------------
# results = []
# models = ["NaiveBayes","LogisticReg","TextCNN","BiLSTM","BERT","DistilBERT","Ensemble"]
# preds_list = [nb_preds, lr_preds, cnn_preds, lstm_preds, preds_bert, preds_distil, ensemble_preds]

# for name,preds in zip(models, preds_list):
#     results.append({
#         "model": name,
#         "accuracy": float(accuracy_score(test_labels, preds)),
#         "f1_spam": float(f1_score(test_labels, preds, pos_label=1))
#     })

# df = pd.DataFrame(results).round(4)
# print("\n=== Comparison ===")
# print(df)

# print("\nDone.")
# print("If the dataset load failed, ensure you ran 'huggingface-cli login' in your terminal.")




In [None]:
"""
Adjusted and robust version of spam_all_models.py
Run: python spam_all_models_fixed.py

Notes:
 - Handles various label formats robustly
 - Fixes undefined variables (train_texts, etc.)
 - Adds safer HF dataset loading and mapping
 - Keeps same model set but with clearer data flow and minor training stability tweaks
"""

import os
import sys
import math
import re
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    set_seed
)
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack, csr_matrix

# -------------------------
# Basic config & seeds
# -------------------------
# Suppress all Python warnings for the rest of the run.
warnings.filterwarnings("ignore")

# One seed value is pushed into transformers, NumPy and PyTorch.
SEED = 42
set_seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use CUDA when torch reports it as available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

# -------------------------
# Try loading dataset robustly
# -------------------------
HF_DATASET = "mshenoda/spam-messages"
try:
    print(f"Attempting to load dataset: {HF_DATASET}")
    ds = load_dataset(HF_DATASET)
except Exception as e:
    # Report the likely cause, then re-raise: nothing below can run without data.
    print("Failed to load dataset from HuggingFace:\n", e)
    print("If this is a permission issue, try running: huggingface-cli login")
    raise

# Some HF datasets may store label as int or str, and text column may be named differently
# Find text column name
example_cols = ds['train'].column_names
print("Dataset columns (train):", example_cols)

# Heuristics for column names: well-known names first, in priority order.
text_col = next((c for c in ('text', 'message', 'sms') if c in example_cols), None)
if text_col is None:
    # Fallback to the first string-typed column. The dtype is read defensively
    # because not every feature object is guaranteed to expose `.dtype`
    # (nested list/dict features may not).
    train_features = ds['train'].features
    text_col = next(
        (c for c in example_cols
         if getattr(train_features[c], 'dtype', None) in ('string', 'large_string')),
        None,
    )
if text_col is None:
    # Fail here with a clear message instead of guessing a column name that is
    # known to be absent and hitting a KeyError much later.
    raise ValueError(
        f"Could not find a text column in dataset columns: {example_cols}"
    )
print('Using text column:', text_col)

# Normalize labels into integers 0 (ham) / 1 (spam)
LABEL_TO_ID = {"ham": 0, "spam": 1, "0": 0, "1": 1}

def normalize_label(example):
    """Rewrite ``example['label']`` as the integer 0 (ham) or 1 (spam).

    The label is read from the ``'label'`` key, or ``'Label'`` when ``'label'``
    is absent. If neither yields a value, the first field holding a numeric
    0/1 is used instead.

    Args:
        example: mapping for one dataset row; it is modified in place.

    Returns:
        The same ``example`` object with ``example['label']`` set to 0 or 1.

    Raises:
        ValueError: if no label can be found, or the value is not one of
            ham/spam/0/1 (case and surrounding whitespace are ignored).
    """
    lab = example.get('label') if 'label' in example else example.get('Label')
    if lab is None:
        # Try to infer from other columns
        for c in example:
            value = example[c]
            if isinstance(value, (int, float)) and value in (0, 1):
                example['label'] = int(value)
                return example
        raise ValueError(f"Unable to normalize label: {lab}")

    # Known spellings first; tolerate case differences and stray whitespace.
    key = str(lab).strip().lower()
    if key in LABEL_TO_ID:
        example['label'] = LABEL_TO_ID[key]
        return example

    # Otherwise accept anything int-like, but only if it really is 0 or 1.
    try:
        as_int = int(lab)
    except (TypeError, ValueError):
        raise ValueError(f"Unable to normalize label: {lab}") from None
    # Reject out-of-range classes, and numeric values that int() would
    # silently truncate (e.g. 0.5 -> 0).
    if as_int not in (0, 1) or (not isinstance(lab, str) and as_int != lab):
        raise ValueError(f"Unable to normalize label: {lab}")
    example['label'] = as_int
    return example

# Apply normalization
ds = ds.map(normalize_label)

# Split aliases
train = ds['train']
if 'validation' in ds:
    val = ds['validation']
else:
    # No validation split available: hold out 10% of the training data.
    # Taking it from the test split would make val and test share rows.
    _holdout = train.train_test_split(test_size=0.1, seed=SEED)
    train, val = _holdout['train'], _holdout['test']
# Without a dedicated test split, evaluation falls back to the validation rows.
test = ds['test'] if 'test' in ds else val


def _texts_and_labels(split):
    """Return (texts, int labels) for one split using whole-column access."""
    return list(split[text_col]), [int(y) for y in split['label']]


# Convert to lists for the classic models
train_texts, train_labels = _texts_and_labels(train)
val_texts, val_labels = _texts_and_labels(val)
test_texts, test_labels = _texts_and_labels(test)

# -------------------------
# Simple tokenizer (NB, CNN, LSTM)
# -------------------------
def tokenize_simple(text):
    """Lower-case ``text`` and split it into word tokens plus '!', '?' and '.'.

    ``None`` yields an empty list; any other value is converted with ``str()``
    before tokenizing.
    """
    lowered = "" if text is None else str(text).lower()
    return re.findall(r"\w+|[!?.]", lowered)

# -------------------------
# MODEL 1: Manual Naive Bayes (with explicit priors & likelihoods)
# -------------------------
print("\n--- Model 1: Manual Naive Bayes ---")
# Vocabulary: every distinct training token, in sorted order.
vocab = sorted({tok for doc in train_texts for tok in tokenize_simple(doc)})
V = len(vocab)
vocab_index = dict(zip(vocab, range(V)))

# Per-class token counts and per-class document counts.
wc_spam, wc_ham = Counter(), Counter()
spam_docs = ham_docs = 0
for doc, doc_label in zip(train_texts, train_labels):
    doc_tokens = tokenize_simple(doc)
    if doc_label == 1:
        spam_docs += 1
        wc_spam.update(doc_tokens)
    else:
        ham_docs += 1
        wc_ham.update(doc_tokens)

# Class priors; with no training documents both default to 0.5.
total_docs = len(train_labels)
if total_docs > 0:
    P_spam = spam_docs / total_docs
    P_ham = ham_docs / total_docs
else:
    P_spam = P_ham = 0.5

total_spam_words = sum(wc_spam.values())
total_ham_words = sum(wc_ham.values())

# Laplace smoothing (alpha=1): add alpha to each count and alpha * V to each total.
alpha = 1.0
den_spam = total_spam_words + alpha * V
den_ham = total_ham_words + alpha * V

# Smoothed log-likelihood of every vocabulary word under each class.
loglik_spam = {w: math.log((wc_spam[w] + alpha) / den_spam) for w in vocab}
loglik_ham = {w: math.log((wc_ham[w] + alpha) / den_ham) for w in vocab}

# UNK log-prob for words not seen in training
log_unk_spam = math.log(alpha / den_spam)
log_unk_ham = math.log(alpha / den_ham)

def predict_nb(text):
    """Classify ``text`` with the manual Naive Bayes model.

    Starts from the log priors, adds the per-token log-likelihood for each
    class (unseen tokens use the UNK log-probability), and returns 1 (spam)
    only when the spam score is strictly greater; ties go to 0 (ham).
    """
    spam_score = math.log(P_spam + 1e-12)
    ham_score = math.log(P_ham + 1e-12)
    for tok in tokenize_simple(text):
        spam_score += loglik_spam.get(tok, log_unk_spam)
        ham_score += loglik_ham.get(tok, log_unk_ham)
    return int(spam_score > ham_score)

# Evaluate on the test messages.
nb_preds = list(map(predict_nb, test_texts))
print(classification_report(test_labels, nb_preds, digits=4))

# Print priors and top features (weights) to show the math
print(f"P(spam) = {P_spam:.4f}, P(ham) = {P_ham:.4f}")
# Log-likelihood ratio per word: positive values favor spam.
llr = {w: loglik_spam[w] - loglik_ham[w] for w in vocab}
top_spam_words = sorted(llr.items(), key=lambda item: item[1], reverse=True)[:20]
print("\nTop words favoring 'spam' (word, log-likelihood-ratio):")
for word, ratio in top_spam_words:
    print(f"{word}\t{ratio:.4f}")

# -------------------------
# MODEL 2: Logistic Regression (Handcrafted Features + TF-IDF)
# -------------------------
print("\n--- Model 2: Logistic Regression (Handcrafted + TF-IDF) ---")

def extract_features(text):
    """Compute handcrafted numeric features for one message.

    Args:
        text: the message; converted with ``str()`` before processing.

    Returns:
        dict with integer values, in this key order:
            len       - number of characters
            exclaim   - number of '!' characters
            digits    - number of digit characters
            uppercase - number of upper-case characters
            urls      - occurrences of 'http', 'www' or '.com' (case-insensitive)
            spam_kw   - how many of the listed spam keywords occur as substrings
                        (case-insensitive; each keyword counts at most once)
    """
    txt = str(text)
    lowered = txt.lower()  # computed once, shared by the case-insensitive features
    return {
        "len": len(txt),
        "exclaim": txt.count("!"),
        "digits": sum(ch.isdigit() for ch in txt),
        "uppercase": sum(1 for ch in txt if ch.isupper()),
        # In a raw string a single backslash escapes the dot; the previous
        # doubled backslash required a literal '\' and never matched ".com".
        "urls": len(re.findall(r"http|www|\.com", lowered)),
        "spam_kw": sum(1 for k in ["free","win","winner","cash","urgent","click"] if k in lowered)
    }

# Handcrafted features: one row per message, one column per feature.
X_train_feats = pd.DataFrame(list(map(extract_features, train_texts)))
X_test_feats = pd.DataFrame(list(map(extract_features, test_texts)))

# TF-IDF over unigrams and bigrams, fitted on the training texts only.
tf = TfidfVectorizer(ngram_range=(1, 2), min_df=2)
X_train_tfidf = tf.fit_transform(train_texts)
X_test_tfidf = tf.transform(test_texts)

# Wrap the handcrafted columns as sparse matrices so they can be stacked
# next to the TF-IDF matrices.
X_train_dense = csr_matrix(X_train_feats.to_numpy())
X_test_dense = csr_matrix(X_test_feats.to_numpy())

X_train_combined = hstack([X_train_tfidf, X_train_dense])
X_test_combined = hstack([X_test_tfidf, X_test_dense])

# Fit on the combined features and report test-set metrics.
lr = LogisticRegression(
    max_iter=2000,
    random_state=SEED,
    solver='saga',
    n_jobs=-1,
).fit(X_train_combined, train_labels)
lr_preds = lr.predict(X_test_combined)
print(classification_report(test_labels, lr_preds, digits=4))

# -------------------------
# Prepare vocab & encoding for CNN/LSTM
# -------------------------
print("\nPreparing vocab for CNN/LSTM...")
# Ids 0 and 1 are reserved for padding and unknown tokens; every new training
# token gets the next free id in order of first appearance.
vocab2 = {"<PAD>": 0, "<UNK>": 1}
for doc in train_texts:
    for token in tokenize_simple(doc):
        vocab2.setdefault(token, len(vocab2))

# Fixed sequence length used when encoding messages.
MAX_LEN = 40

def encode_text(text, max_len=MAX_LEN):
    """Turn ``text`` into a list of ``max_len`` vocabulary ids.

    Tokens missing from ``vocab2`` map to 1, longer inputs are truncated to
    ``max_len`` tokens, and shorter ones are right-padded with 0.
    """
    kept_tokens = tokenize_simple(text)[:max_len]
    ids = [vocab2.get(tok, 1) for tok in kept_tokens]
    # A non-positive repeat count yields an empty list, so full-length
    # sequences are returned without padding.
    return ids + [0] * (max_len - len(ids))

class TorchTextDataset(Dataset):
    """Dataset of (token-id tensor, label tensor) pairs built from raw texts."""

    def __init__(self, texts, labels, max_len=MAX_LEN):
        # Encode everything up front; __getitem__ then only builds tensors.
        self.X = [encode_text(message, max_len) for message in texts]
        self.y = labels

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        token_ids = torch.tensor(self.X[idx], dtype=torch.long)
        target = torch.tensor(int(self.y[idx]), dtype=torch.long)
        return token_ids, target

BATCH = 32
# Training batches are shuffled; test batches keep the dataset order.
train_dl = DataLoader(
    TorchTextDataset(train_texts, train_labels),
    batch_size=BATCH,
    shuffle=True,
)
test_dl = DataLoader(
    TorchTextDataset(test_texts, test_labels),
    batch_size=BATCH,
)

# Per-class loss weights: total_samples / (2 * class_count). A class missing
# from the training labels is treated as having a count of 1.
counts = Counter(train_labels)
total_samples = len(train_labels)
class_weights = torch.tensor(
    [total_samples / (2.0 * counts.get(cls, 1)) for cls in (0, 1)],
    dtype=torch.float,
).to(device)

# -------------------------
# MODEL 3: TextCNN
# -------------------------
print("\n--- Model 3: TextCNN ---")
class TextCNN(nn.Module):
    """Embedding -> three parallel 1-D convolutions (kernel 3/4/5) -> linear head.

    Each convolution output goes through ReLU and is max-pooled over the
    sequence axis; the three pooled vectors are concatenated and projected to
    2 logits.
    """

    def __init__(self, vocab_size, embed_dim=100, num_filters=100):
        super().__init__()
        # Index 0 is the padding id.
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.conv3 = nn.Conv1d(embed_dim, num_filters, kernel_size=3)
        self.conv4 = nn.Conv1d(embed_dim, num_filters, kernel_size=4)
        self.conv5 = nn.Conv1d(embed_dim, num_filters, kernel_size=5)
        self.fc = nn.Linear(num_filters * 3, 2)

    def forward(self, x):
        # Conv1d expects channels before length, so swap the last two axes
        # of the embedded batch.
        embedded = self.embed(x).transpose(1, 2)
        pooled = [
            torch.relu(conv(embedded)).max(dim=2).values
            for conv in (self.conv3, self.conv4, self.conv5)
        ]
        return self.fc(torch.cat(pooled, dim=1))

cnn_model = TextCNN(len(vocab2)).to(device)
optimizer = optim.Adam(cnn_model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss(weight=class_weights)

EPOCHS = 2
for epoch in range(1, EPOCHS + 1):
    cnn_model.train()
    total_loss = 0.0
    for X, y in train_dl:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        loss = criterion(cnn_model(X), y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Report the mean batch loss for the epoch.
    print(f"[CNN] Epoch {epoch}/{EPOCHS} loss: {total_loss/len(train_dl):.4f}")

# Predict the test set batch by batch with gradients disabled.
cnn_model.eval()
cnn_preds = []
with torch.no_grad():
    for X, _ in test_dl:
        batch_preds = cnn_model(X.to(device)).argmax(dim=1)
        cnn_preds.extend(batch_preds.cpu().numpy().tolist())
print(classification_report(test_labels, cnn_preds, digits=4))

print("\nExample conv3 filter weights (first filter):")
first_filter = cnn_model.conv3.weight.data[0]
print(first_filter[:10].cpu().numpy())

# -------------------------
# MODEL 4: BiLSTM
# -------------------------
print("\n--- Model 4: BiLSTM ---")
class BiLSTM(nn.Module):
    """Embedding -> single bidirectional LSTM -> linear head with 2 logits.

    The classifier input is the concatenation of the last two entries of the
    LSTM's final hidden-state tensor.
    """

    def __init__(self, vocab_size, embed_dim=100, hidden=128):
        super().__init__()
        # Index 0 is the padding id.
        self.embed = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden * 2, 2)

    def forward(self, x):
        # Only the final hidden state is used; per-step outputs and the cell
        # state are discarded.
        _, (h_n, _) = self.lstm(self.embed(x))
        summary = torch.cat((h_n[-2], h_n[-1]), dim=1)
        return self.fc(summary)

lstm_model = BiLSTM(len(vocab2)).to(device)
optimizer_l = optim.Adam(lstm_model.parameters(), lr=1e-3)
criterion_l = nn.CrossEntropyLoss(weight=class_weights)

for epoch in range(1, EPOCHS + 1):
    lstm_model.train()
    total_loss = 0.0
    for X, y in train_dl:
        X, y = X.to(device), y.to(device)
        optimizer_l.zero_grad()
        loss = criterion_l(lstm_model(X), y)
        loss.backward()
        optimizer_l.step()
        total_loss += loss.item()
    # Report the mean batch loss for the epoch.
    print(f"[LSTM] Epoch {epoch}/{EPOCHS} loss: {total_loss/len(train_dl):.4f}")

# Predict the test set batch by batch with gradients disabled.
lstm_model.eval()
lstm_preds = []
with torch.no_grad():
    for X, _ in test_dl:
        batch_preds = lstm_model(X.to(device)).argmax(dim=1)
        lstm_preds.extend(batch_preds.cpu().numpy().tolist())
print(classification_report(test_labels, lstm_preds, digits=4))


In [None]:
# ================================
# MANUAL INPUT PREDICTION SECTION
# ================================

def predict_manual(text):
    """Run one message through all four trained models and print each verdict.

    Args:
        text: the message to classify.

    Returns:
        The Logistic Regression prediction (0 = ham, 1 = spam), which is
        treated as the default model.
    """
    print("\n=== Manual Prediction ===")

    # 1) Naive Bayes prediction
    nb_pred = predict_nb(text)

    # 2) Logistic Regression: handcrafted features stacked after the TF-IDF
    #    columns, the same layout used when the model was fitted.
    handcrafted = csr_matrix(pd.DataFrame([extract_features(text)]).values)
    tfidf_row = tf.transform([text])
    lr_pred = lr.predict(hstack([tfidf_row, handcrafted]))[0]

    # 3) CNN prediction on a single-row batch of encoded ids
    cnn_model.eval()
    X_enc = torch.tensor([encode_text(text)], dtype=torch.long).to(device)
    with torch.no_grad():
        cnn_pred = cnn_model(X_enc).argmax(dim=1).item()

    # 4) LSTM prediction on the same encoded batch
    lstm_model.eval()
    with torch.no_grad():
        lstm_pred = lstm_model(X_enc).argmax(dim=1).item()

    label_map = {0: "HAM", 1: "SPAM"}

    print(f"Naive Bayes:          {label_map[nb_pred]}")
    print(f"Logistic Regression:  {label_map[lr_pred]}")
    print(f"TextCNN:              {label_map[cnn_pred]}")
    print(f"BiLSTM:               {label_map[lstm_pred]}")

    return lr_pred  # default model


# ================================
# CLI MODE (runs when script is executed)
# ================================
if __name__ == "__main__":
    # Interactive loop: classify each typed message until the user quits.
    while True:
        try:
            msg = input("\nType a message (or 'exit'): ")
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C ends the session quietly, no traceback.
            print()
            break
        command = msg.strip().lower()
        if command == "exit":
            break
        if not command:
            # Nothing typed: ask again instead of classifying an empty message.
            continue
        # Pass the message exactly as typed so the features see the raw text.
        predict_manual(msg)
