In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import GridSearchCV

# Naive Bayes Classifiers for Spam/Ham classification

## Problem Description

The goal is to build a machine learning model to classify emails as spam or not spam. 

## Data Description and Analyses

The dataset is taken from https://www.kaggle.com/datasets/ozlerhakan/spam-or-not-spam-dataset

In [2]:
# Read the spam / not-spam email corpus from disk
data = pd.read_csv('../datasets/spam_or_not_spam.csv')

# Preview the dataset: the three leading and the three trailing rows
pd.concat([data.iloc[:3], data.iloc[-3:]])

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
2997,thank you for shopping with us gifts for all ...,1
2998,the famous ebay marketing e course learn to s...,1
2999,hello this is chinese traditional 子 件 NUMBER世...,1


In [3]:
# Class balance: how many emails carry each value of the 'label' column
labels = data['label']
label_counts = labels.value_counts()
label_counts

label
0    2500
1     500
Name: count, dtype: int64

In [4]:
# Number of missing entries in every column
data.isnull().sum()

email    1
label    0
dtype: int64

The dataset contains 3,000 entries, with 2,500 labeled as non-spam (0) and 500 labeled as spam (1). 
There is one row with a missing value in the 'email' column, which will be removed to ensure data consistency.

In [5]:
# Remove the row whose email text is missing. The result is reassigned rather
# than mutated with inplace=True, so the step reads as an explicit
# "input -> output" transformation.
data = data.dropna(subset=['email'])

# Fail fast if any missing email text survived the cleaning step, since the
# vectorizer downstream expects a string for every row.
assert data['email'].notna().all(), "email column still contains missing values"

## Split the data into training and testing sets

In [6]:
# Hold out 20% of the emails for testing; the fixed random_state makes the
# split reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(
    data['email'],
    data['label'],
    test_size=0.2,
    random_state=42,
)

# Show the size of each partition (train features/labels, then test features/labels)
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

(2399,)
(2399,)
(600,)
(600,)


## Convert text data into TF-IDF features

The TF-IDF transformer must be fitted only on the training data to avoid data leakage. This mimics real-world scenarios where the model only has access to training data and must generalize to unseen data.

In [7]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')

# Fit on the training emails only, then reuse the fitted vectorizer for the
# test emails so the test set has no influence on the learned features.
# NOTE(review): X_train / X_test are rebound here from raw text to the
# transformed matrices, so re-running this cell without first re-running the
# split cell would pass already-vectorized data to the vectorizer.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

## Train and Tune Classifiers Using GridSearchCV

We will train and evaluate three different Naive Bayes classifiers to identify the most effective one for this classification task.

Note: GaussianNB requires a dense matrix, whereas MultinomialNB and BernoulliNB can efficiently work with sparse matrices.

In [8]:
# Candidate Naive Bayes variants. Each entry bundles a display name, an
# unfitted estimator and the hyperparameter grid to search for it.
models = [
    dict(
        name='GaussianNB',
        classifier=GaussianNB(),
        param_grid=dict(var_smoothing=[1e-9, 1e-8, 1e-7, 1e-6, 1e-5]),
    ),
    dict(
        name='MultinomialNB',
        classifier=MultinomialNB(),
        param_grid=dict(alpha=[0.1, 0.5, 1.0, 1.5, 2.0]),
    ),
    dict(
        name='BernoulliNB',
        classifier=BernoulliNB(),
        param_grid=dict(
            alpha=[0.1, 0.5, 1.0, 1.5, 2.0],
            binarize=[0.0, 0.5, 1.0],
        ),
    ),
]

In [9]:
# Tune, fit and evaluate each classifier in turn.
for model in models:
    print(f'Tuning hyperparameters for: {model["name"]}')

    # GaussianNB needs dense input, so it gets dense copies of the TF-IDF
    # matrices. The copies live in per-iteration variables: the shared
    # X_train / X_test stay untouched, so the other classifiers still receive
    # the sparse matrices and this cell can be re-run without failing on a
    # second .toarray() call.
    if model['name'] == 'GaussianNB':
        X_train_model = X_train.toarray()
        X_test_model = X_test.toarray()
    else:
        X_train_model = X_train
        X_test_model = X_test

    # Cross-validated grid search over this model's hyperparameters,
    # selecting the combination with the best F1 score.
    grid_search = GridSearchCV(
        estimator=model['classifier'],
        param_grid=model['param_grid'],
        cv=5, scoring='f1'
    )
    grid_search.fit(X_train_model, y_train)

    # Best parameters and their cross-validated score
    print(f'Best parameters for {model["name"]}: {grid_search.best_params_}')
    print(f'Best F1 score for {model["name"]}: {grid_search.best_score_}')

    # Use the best estimator found by the search to predict on the held-out test set
    best_clf = grid_search.best_estimator_
    y_pred = best_clf.predict(X_test_model)

    # Print evaluation metrics for the test set
    print(f'Accuracy for {model["name"]}: {accuracy_score(y_test, y_pred)}')
    print(classification_report(y_test, y_pred))
    print('-' * 50)


Tuning hyperparameters for: GaussianNB
Best parameters for GaussianNB: {'var_smoothing': 1e-05}
Best F1 score for GaussianNB: 0.8646142721397524
Accuracy for GaussianNB: 0.9716666666666667
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       500
           1       0.94      0.89      0.91       100

    accuracy                           0.97       600
   macro avg       0.96      0.94      0.95       600
weighted avg       0.97      0.97      0.97       600

--------------------------------------------------
Tuning hyperparameters for: MultinomialNB
Best parameters for MultinomialNB: {'alpha': 0.1}
Best F1 score for MultinomialNB: 0.9399704785554219
Accuracy for MultinomialNB: 0.99
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       500
           1       1.00      0.94      0.97       100

    accuracy                           0.99       600
   macro avg       0.99      0.97     