In [17]:
import kagglehub
import pandas as pd
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [20]:
# Load the data: download the SpamAssassin dataset through kagglehub, then
# read its CSV file from the location the download call returned.
path = kagglehub.dataset_download(
    "ganiyuolalekan/spam-assassin-email-classification-dataset"
)
csv_file = os.path.join(path, "spam_assassin.csv")
df = pd.read_csv(csv_file)

In [21]:
# Drop rows whose label or message text is missing, then cast the label to int.
# Written as a single chain so the cell builds the cleaned frame in one step.
df = (
    df
    .dropna(subset=['target', 'text'])
    .assign(target=lambda frame: frame['target'].astype(int))
)

In [22]:
# Train/test split: hold out 20% of the rows for evaluation. The fixed
# random_state keeps the split the same from run to run.
texts = df['text']
labels = df['target']
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Vectorization: turn the raw text into TF-IDF features. The vectorizer is
# fitted on the training split only and then applied unchanged to the test split.
tfidf_options = {"stop_words": "english", "max_features": 5000}
vectorizer = TfidfVectorizer(**tfidf_options)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [24]:
# Model training and evaluation: fit each candidate classifier on the TF-IDF
# training features, then print its accuracy and per-class metrics on the
# held-out test split.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
}
class_labels = ["Not Spam", "Spam"]

for name in models:
    model = models[name]
    model.fit(X_train_vec, y_train)
    preds = model.predict(X_test_vec)

    print(f"\n--- {name} ---")
    accuracy = accuracy_score(y_test, preds)
    print("Accuracy:", accuracy)
    report = classification_report(y_test, preds, target_names=class_labels)
    print(report)


--- Logistic Regression ---
Accuracy: 0.9887931034482759
              precision    recall  f1-score   support

    Not Spam       0.98      1.00      0.99       779
        Spam       1.00      0.97      0.98       381

    accuracy                           0.99      1160
   macro avg       0.99      0.98      0.99      1160
weighted avg       0.99      0.99      0.99      1160


--- Naive Bayes ---
Accuracy: 0.9853448275862069
              precision    recall  f1-score   support

    Not Spam       0.98      1.00      0.99       779
        Spam       1.00      0.96      0.98       381

    accuracy                           0.99      1160
   macro avg       0.99      0.98      0.98      1160
weighted avg       0.99      0.99      0.99      1160


--- Random Forest ---
Accuracy: 0.9939655172413793
              precision    recall  f1-score   support

    Not Spam       0.99      1.00      1.00       779
        Spam       1.00      0.98      0.99       381

    accuracy          