In [19]:
# ==============================
# Test Emails Dictionary
# ==============================
test_emails = {
    "email1": {
        "text": "Subject: Congratulations! You have won a free iPhone. Click here to claim your prize.",
        "expected": "Spam"
    },
    "email2": {
        "text": "Subject: Work from home opportunity\nEarn $5000 per week with no experience required. Apply now!",
        "expected": "Spam"
    },
    "email3": {
        "text": "Subject: Urgent account alert\nYour bank account has been suspended. Verify your details immediately.",
        "expected": "Spam"
    },
    "email4": {
        "text": "Subject: Meeting reminder\nDon’t forget our project meeting tomorrow at 10 AM.",
        "expected": "Ham"
    },
    "email5": {
        "text": "Subject: Dinner plans\nHey, are we still on for dinner tonight?",
        "expected": "Ham"
    },
    "email6": {
        "text": "Subject: Monthly statement\nYour monthly account statement is now available. Please log in to view details.",
        "expected": "Ham"
    }
}

test_emails_extended = {
    # Spam examples
    "email7": {
        "text": "Subject: Claim your lottery winnings\nYou have won $1,000,000 in the global lottery. Send your details to claim.",
        "expected": "Spam"
    },
    "email8": {
        "text": "Subject: Limited time discount\nGet 70% off all electronics today only. Visit our store now!",
        "expected": "Spam"
    },
    "email9": {
        "text": "Subject: Suspicious login attempt\nWe detected a login from an unknown device. Verify your account immediately.",
        "expected": "Spam"
    },
    "email10": {
        "text": "Subject: Free vacation package\nBook now and enjoy a free trip to the Bahamas. Offer ends soon!",
        "expected": "Spam"
    },

    # Ham examples
    "email11": {
        "text": "Subject: Weekly team update\nHi team, please find attached the weekly progress report. Let’s discuss tomorrow.",
        "expected": "Ham"
    },
    "email12": {
        "text": "Subject: Family gathering\nDon’t forget the family dinner this Sunday at grandma’s house.",
        "expected": "Ham"
    },
    "email13": {
        "text": "Subject: Doctor appointment reminder\nThis is a reminder for your scheduled appointment on January 5th at 3 PM.",
        "expected": "Ham"
    },
    "email14": {
        "text": "Subject: Travel itinerary\nYour flight to Rome is confirmed. Please check in online 24 hours before departure.",
        "expected": "Ham"
    }
}

test_emails.update(test_emails_extended)

In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ==============================
# 1. Load dataset
# ==============================
df = pd.read_csv("spam_ham_dataset.csv", usecols=["text", "label_num"])

# ==============================
# 2. Preprocess dataset
# ==============================
df["raw_text"] = df["text"].fillna("")
df["label"] = df["label_num"].astype(int)

# Deduplicate within each label
df = df.groupby('label', group_keys=False).apply(
    lambda x: x.drop_duplicates(subset=['raw_text'])
).reset_index(drop=True)

print("Label distribution AFTER dedup:")
print(df["label"].value_counts())

# ==============================
# 3. Text cleaning function
# ==============================
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", " URL ", text)   # replace URLs
    text = re.sub(r"\d+", " NUMBER ", text)    # replace numbers
    text = re.sub(r"[^\w\s]", " ", text)       # remove punctuation
    text = re.sub(r"\s+", " ", text).strip()   # remove extra spaces
    return text

df["clean_text"] = df["raw_text"].apply(clean_text)
df["clean_text"] = df["clean_text"].fillna("")
df = df[df["clean_text"].str.strip() != ""].reset_index(drop=True)

# ==============================
# 4. Train/Test Split
# ==============================
X_train, X_test, y_train, y_test = train_test_split(
    df["clean_text"], df["label"], test_size=0.2, random_state=42, stratify=df["label"]
)

# ==============================
# 5. Build Pipelines
# ==============================
pipelines = {
    "Decision Tree": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=30000, ngram_range=(1,3), analyzer='char_wb')),
        ("clf", DecisionTreeClassifier(max_depth=50, random_state=42, class_weight="balanced"))
    ]),
    "Random Forest": Pipeline([
        ("tfidf", TfidfVectorizer(max_features=30000, ngram_range=(1,3), analyzer='char_wb')),
        ("clf", RandomForestClassifier(n_estimators=200, max_depth=50, random_state=42, class_weight="balanced"))
    ])
}

# ==============================
# 6. Train & Evaluate Models
# ==============================
for name, model in pipelines.items():
    print(f"\n=== {name} ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred))

# ==============================
# 7. Prediction Helper (using Random Forest by default)
# ==============================
def predict_email(text, threshold=0.7, model=pipelines["Random Forest"]):
    """Predict whether an email is Spam or Ham."""
    text = clean_text(text)
    probs = model.predict_proba([text])[0]
    print(f"Ham probability : {probs[0]:.4f}")
    print(f"Spam probability: {probs[1]:.4f}")
    label = "Spam" if probs[1] >= threshold else "Ham"
    print("Prediction:", label)

# ==============================
# 8. Test with Examples
# ==============================
spam_email = """
FREE FREE FREE!!! 
WIN MONEY NOW!!! 
CLICK THIS LINK TO CLAIM YOUR PRIZE!!!
"""

ham_email = """
Subject: Your Monthly Statement Is Ready

Your monthly account statement is now available.
Please log in to your account to view the details.
Thank you for choosing our service.
"""

print("\n--- Spam Example ---")
predict_email(spam_email)

print("\n--- Ham Example ---")
predict_email(ham_email)


print("\n=== Testing Dictionary Emails ===")
for key, email in test_emails.items():
    print(f"\n{key} (Expected: {email['expected']})")
    predict_email(email["text"], threshold=0.8)  # stricter threshold\

# ==============================
# 9. Leakage Check (Optional)
# ==============================
train_set = set(X_train)
test_set = set(X_test)
print("\nExact text overlap:", len(train_set & test_set))


# ==============================
# 10. Create a large synthetic test set
# ==============================
test_emails = {
    f"Spam_{i}": {
        "text": f"Congratulations! You won ${100+i*10} cash! Click here to claim your prize: URL",
        "expected": "Spam"
    } for i in range(50)  # 50 spam examples
}

# Add some typical ham emails
ham_texts = [
    "Subject: Meeting Reminder\nDon't forget the meeting at 10 AM tomorrow.",
    "Subject: Monthly Report\nPlease find attached the monthly report.",
    "Subject: Greetings\nHope you are doing well. Let's catch up soon.",
    "Subject: Invoice\nYour invoice for last month is attached.",
    "Subject: Appointment\nYour doctor's appointment is scheduled for next week."
]

for i, text in enumerate(ham_texts*10):  # 50 ham examples
    test_emails[f"Ham_{i}"] = {"text": text, "expected": "Ham"}

# Convert test_emails to lists for evaluation
X_test_large = [v["text"] for v in test_emails.values()]
y_test_large = [1 if v["expected"] == "Spam" else 0 for v in test_emails.values()]

# ==============================
# 11. Evaluate each model on synthetic test set
# ==============================
for name, model in pipelines.items():
    y_pred_large = model.predict(X_test_large)
    print(f"\n=== {name} on Large Test Set ===")
    print("Accuracy:", accuracy_score(y_test_large, y_pred_large))
    print(classification_report(y_test_large, y_pred_large, target_names=["Ham", "Spam"]))

# ==============================
# 12. Optional: test individual emails
# ==============================
print("\n--- Example Predictions ---")
for key, email in list(test_emails.items())[:5]:  # first 5 examples
    print(f"\n{key} (Expected: {email['expected']})")
    predict_email(email["text"])


  df = df.groupby('label', group_keys=False).apply(


Label distribution AFTER dedup:
label
0    3531
1    1462
Name: count, dtype: int64

=== Logistic Regression ===
Accuracy: 0.908908908908909
              precision    recall  f1-score   support

           0       0.99      0.88      0.93       706
           1       0.77      0.98      0.86       293

    accuracy                           0.91       999
   macro avg       0.88      0.93      0.90       999
weighted avg       0.93      0.91      0.91       999


=== Decision Tree ===
Accuracy: 0.9019019019019019
              precision    recall  f1-score   support

           0       0.94      0.92      0.93       706
           1       0.82      0.86      0.84       293

    accuracy                           0.90       999
   macro avg       0.88      0.89      0.88       999
weighted avg       0.90      0.90      0.90       999


=== Random Forest ===
