# DS 634 Final Project: Spam Classification

Author: Christian Laggui  
Date: 4/6/2025  
NJIT Email: cl623@njit.edu  

Required packages: pip install scikit-learn pandas numpy torch

In [95]:
# Required packages: pip install scikit-learn pandas numpy torch
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import os

def get_data_path():
    """
    Returns the path to the data file, working in both scripts and Jupyter notebooks.
    Assumes the data file is in the same directory as the script/notebook.
    """
    try:
        # For regular Python scripts
        script_dir = os.path.dirname(os.path.abspath(__file__))
        return os.path.join(script_dir, 'spambase.data')
    except NameError:
        # For Jupyter notebooks
        return 'spambase.data'

# Load the data
# Note: The last column is the target variable (spam = 1, non-spam = 0)
try:
    data = pd.read_csv(get_data_path(), header=None)
except FileNotFoundError:
    print("Error: Could not find 'spambase.data'")
    print("Please ensure 'spambase.data' is in the same directory as this script/notebook")
    raise

# Split into features and target
X = data.iloc[:, :-1]  # All columns except the last one
y = data.iloc[:, -1]   # Last column (target)

# Scale the features for SVM and GRU
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Convert data to PyTorch tensors for GRU
X_tensor = torch.FloatTensor(X_scaled)
y_tensor = torch.FloatTensor(y.values)



Code for evaluating Random Forest, SVM, and GRU models

In [97]:
# Custom Dataset for GRU
class SpamDataset(Dataset):
    def __init__(self, X, y):
        self.X = X
        self.y = y
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

# GRU Model
class GRUClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=2):
        super(GRUClassifier, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        
        # GRU layer
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
        
        # Fully connected layer
        self.fc = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        # Reshape input for GRU: [batch_size, sequence_length, input_size]
        x = x.unsqueeze(1)
        
        # Initialize hidden state
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        # Forward propagate GRU
        out, _ = self.gru(x, h0)
        
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        out = self.sigmoid(out)
        return out.squeeze()

def evaluate_gru_model(X, y, model_name):
    """
    Evaluates GRU model using 10-fold stratified cross-validation.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        # Split data
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Create data loaders
        train_dataset = SpamDataset(X_train, y_train)
        test_dataset = SpamDataset(X_test, y_test)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
        test_loader = DataLoader(test_dataset, batch_size=32)
        
        # Initialize model
        model = GRUClassifier(input_size=X.shape[1], hidden_size=64).to(device)
        criterion = nn.BCELoss()
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
        
        # Training
        model.train()
        for epoch in range(10):  # 10 epochs per fold
            for batch_X, batch_y in train_loader:
                batch_X, batch_y = batch_X.to(device), batch_y.to(device)
                optimizer.zero_grad()
                outputs = model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
        
        # Evaluation
        model.eval()
        y_pred = []
        y_true = []
        with torch.no_grad():
            for batch_X, batch_y in test_loader:
                batch_X = batch_X.to(device)
                outputs = model(batch_X)
                predictions = (outputs > 0.5).float().cpu().numpy()
                y_pred.extend(predictions)
                y_true.extend(batch_y.numpy())
        
        # Calculate metrics
        cm = confusion_matrix(y_true, y_pred)
        tn, fp, fn, tp = cm.ravel()
        
        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
        tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
        fpr = fp / (tn + fp) if (tn + fp) > 0 else 0
        fnr = fn / (tp + fn) if (tp + fn) > 0 else 0
        tss = tpr + tnr - 1
        hss = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) if ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) > 0 else 0

        # Calculate additional metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0
        f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        fdr = fp / (fp + tp) if (fp+tp) > 0 else 0
        
        fold_metrics.append({
            'Fold': fold + 1,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
            'TPR': tpr, 'TNR': tnr, 'FPR': fpr, 'FNR': fnr,
            'TSS': tss, 'HSS': hss,
            'Prec.': precision, 
            'NPV': npv,    
            'F1': f1,         
            'Sens.': tpr,  
            'Specif.': tnr,   
            'Acc.': accuracy,
            'FDR': fdr,
            'Confusion Matrix': [[tp, fn], [fp, tn]]  
        })
        
    fold_metrics_df = pd.DataFrame(fold_metrics)
    average_metrics = fold_metrics_df.mean(numeric_only=True).to_dict()
    average_metrics['Fold'] = 'Average'
    average_metrics_df = pd.DataFrame([average_metrics])
    results_df = pd.concat([fold_metrics_df, average_metrics_df], ignore_index=True)
    return results_df


# Load the data
# Note: The last column is the target variable (spam = 1, non-spam = 0)
data = pd.read_csv('c:/Users/clagg/Downloads/spambase/spambase.data', header=None)

# Split into features and target
X = data.iloc[:, :-1]  # All columns except the last one
y = data.iloc[:, -1]   # Last column (target)

# Scale the features for SVM
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

def evaluate_model(model, X, y, model_name):
    """
    Evaluates a given model using 10-fold stratified cross-validation and calculates
    performance metrics.

    Args:
        model: The machine learning model to evaluate (e.g., RandomForestClassifier, SVC).
        X: The feature data.
        y: The target data.
        model_name (str):  The name of the model
    Returns:
        pandas.DataFrame: A DataFrame containing the performance metrics for each fold
                          and the average metrics.
    """
    skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    fold_metrics = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        # Convert indices to numpy arrays if they aren't already
        train_index = np.array(train_index)
        test_index = np.array(test_index)
        
        # Handle both numpy arrays and pandas DataFrames
        if isinstance(X, pd.DataFrame):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        else:
            X_train, X_test = X[train_index], X[test_index]
            
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        cm = confusion_matrix(y_test, y_pred)
        tn, fp, fn, tp = cm.ravel()

        tpr = tp / (tp + fn) if (tp + fn) > 0 else 0
        tnr = tn / (tn + fp) if (tn + fp) > 0 else 0
        fpr = fp / (tn + fp) if (tn + fp) > 0 else 0
        fnr = fn / (tp + fn) if (tp + fn) > 0 else 0
        tss = tpr + tnr - 1
        hss = (2 * (tp * tn - fp * fn)) / ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) if ((tp + fn) * (fn + tn) + (tp + fp) * (fp + tn)) > 0 else 0

        # Calculate additional metrics
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        npv = tn / (tn + fn) if (tn + fn) > 0 else 0
        f1 = 2 * tp / (2 * tp + fp + fn) if (2 * tp + fp + fn) > 0 else 0
        accuracy = (tp + tn) / (tp + tn + fp + fn)
        fdr = fp / (fp + tp) if (fp + tp) > 0 else 0
        
        fold_metrics.append({
            'Fold': fold + 1,
            'TP': tp, 'TN': tn, 'FP': fp, 'FN': fn,
            'TPR': tpr, 'TNR': tnr, 'FPR': fpr, 'FNR': fnr,
            'TSS': tss, 'HSS': hss,
            'Prec.': precision,
            'NPV': npv,       
            'F1': f1,           
            'Sens.': tpr,  
            'Specif.': tnr,   
            'Acc.': accuracy,     
            'FDR': fdr,
            'Confusion Matrix': [[tp, fn], [fp, tn]]
        })

    fold_metrics_df = pd.DataFrame(fold_metrics)
    average_metrics = fold_metrics_df.mean(numeric_only=True).to_dict()
    average_metrics['Fold'] = 'Average'
    average_metrics_df = pd.DataFrame([average_metrics])
    results_df = pd.concat([fold_metrics_df, average_metrics_df], ignore_index=True)
    return results_df


#### Random Forest Implementation
The parameters used are:
* n_estimators: 100
* max_depth: None
* random_state: 42

In [99]:
# 1. Random Forest Implementation
print("Random Forest Results:")
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
rf_results = evaluate_model(rf_classifier, X, y, "Random Forest")
print(rf_results.to_string(index=False, formatters={
    'Sens.': '{:.4f}'.format,
    'Specif.': '{:.4f}'.format,
    'Prec.': '{:.4f}'.format,
    'NPV': '{:.4f}'.format,
    'FPR': '{:.4f}'.format,
    'FDR': '{:.4f}'.format,
    'FNR': '{:.4f}'.format,
    'Acc.': '{:.4f}'.format,
    'F1': '{:.4f}'.format,
    'TSS': '{:.4f}'.format,
    'HSS': '{:.4f}'.format
}))

Random Forest Results:
   Fold    TP    TN   FP   FN      TPR      TNR    FPR    FNR    TSS    HSS  Prec.    NPV     F1  Sens. Specif.   Acc.    FDR       Confusion Matrix
      1 169.0 270.0  9.0 13.0 0.928571 0.967742 0.0323 0.0714 0.8963 0.8998 0.9494 0.9541 0.9389 0.9286  0.9677 0.9523 0.0506  [[169, 13], [9, 270]]
      2 169.0 271.0  7.0 13.0 0.928571 0.974820 0.0252 0.0714 0.9034 0.9086 0.9602 0.9542 0.9441 0.9286  0.9748 0.9565 0.0398  [[169, 13], [7, 271]]
      3 171.0 271.0  7.0 11.0 0.939560 0.974820 0.0252 0.0604 0.9144 0.9179 0.9607 0.9610 0.9500 0.9396  0.9748 0.9609 0.0393  [[171, 11], [7, 271]]
      4 171.0 274.0  5.0 10.0 0.944751 0.982079 0.0179 0.0552 0.9268 0.9313 0.9716 0.9648 0.9580 0.9448  0.9821 0.9674 0.0284  [[171, 10], [5, 274]]
      5 169.0 266.0 13.0 12.0 0.933702 0.953405 0.0466 0.0663 0.8871 0.8862 0.9286 0.9568 0.9311 0.9337  0.9534 0.9457 0.0714 [[169, 12], [13, 266]]
      6 169.0 271.0  8.0 12.0 0.933702 0.971326 0.0287 0.0663 0.9050 0.9086 0.9548 

#### SVM Implementation
The parameters used are:
* Kernel: RBF
* C: 100
* random_state: 42

In [101]:
# 2. SVM Implementation
print("\nSupport Vector Machine Results:")
# Using RBF kernel with optimized parameters
svm_classifier = SVC(kernel='rbf', C=10, gamma='scale', random_state=42)
svm_results = evaluate_model(svm_classifier, X_scaled, y, "SVM")
print(svm_results.to_string(index=False, formatters={
    'Sens.': '{:.4f}'.format,
    'Specif.': '{:.4f}'.format,
    'Prec.': '{:.4f}'.format,
    'NPV': '{:.4f}'.format,
    'FPR': '{:.4f}'.format,
    'FDR': '{:.4f}'.format,
    'FNR': '{:.4f}'.format,
    'Acc.': '{:.4f}'.format,
    'F1': '{:.4f}'.format,
    'TSS': '{:.4f}'.format,
    'HSS': '{:.4f}'.format
}))


Support Vector Machine Results:
   Fold    TP    TN   FP   FN      TPR      TNR    FPR    FNR    TSS    HSS  Prec.    NPV     F1  Sens. Specif.   Acc.    FDR       Confusion Matrix
      1 161.0 264.0 15.0 21.0 0.884615 0.946237 0.0538 0.1154 0.8309 0.8356 0.9148 0.9263 0.8994 0.8846  0.9462 0.9219 0.0852 [[161, 21], [15, 264]]
      2 164.0 265.0 13.0 18.0 0.901099 0.953237 0.0468 0.0989 0.8543 0.8584 0.9266 0.9364 0.9136 0.9011  0.9532 0.9326 0.0734 [[164, 18], [13, 265]]
      3 170.0 264.0 14.0 12.0 0.934066 0.949640 0.0504 0.0659 0.8837 0.8820 0.9239 0.9565 0.9290 0.9341  0.9496 0.9435 0.0761 [[170, 12], [14, 264]]
      4 157.0 266.0 13.0 24.0 0.867403 0.953405 0.0466 0.1326 0.8208 0.8297 0.9235 0.9172 0.8946 0.8674  0.9534 0.9196 0.0765 [[157, 24], [13, 266]]
      5 167.0 265.0 14.0 14.0 0.922652 0.949821 0.0502 0.0773 0.8725 0.8725 0.9227 0.9498 0.9227 0.9227  0.9498 0.9391 0.0773 [[167, 14], [14, 265]]
      6 162.0 270.0  9.0 19.0 0.895028 0.967742 0.0323 0.1050 0.8628 0.87

#### The GRU model architecture consists of:
* An embedding layer.
* A GRU layer with 64 hidden units and 2 layers.
* A dense output layer with a sigmoid activation function.

The paramaters used are:
* Adam Optimizer
    * Learning Rate: 0.001
    * Binary Cross Entropy Loss
* Epochs: 10
* Batch size: 32
* random_state: 42

In [103]:
# 3. GRU evaluation
print("\nGRU Results:")
gru_results = evaluate_gru_model(X_tensor, y_tensor, "GRU")
print(gru_results.to_string(index=False, formatters={
    'Sens.': '{:.4f}'.format,
    'Specif.': '{:.4f}'.format,
    'Prec.': '{:.4f}'.format,
    'NPV': '{:.4f}'.format,
    'FPR': '{:.4f}'.format,
    'FDR': '{:.4f}'.format,
    'FNR': '{:.4f}'.format,
    'Acc.': '{:.4f}'.format,
    'F1': '{:.4f}'.format,
    'TSS': '{:.4f}'.format,
    'HSS': '{:.4f}'.format
}))


GRU Results:
   Fold    TP    TN   FP   FN      TPR      TNR    FPR    FNR    TSS    HSS  Precision    NPV     F1  Sensitivity  Specificity  Accuracy    FDR       Confusion Matrix
      1 163.0 260.0 19.0 19.0 0.895604 0.931900 0.0681 0.1044 0.8275 0.8275   0.895604 0.9319 0.8956     0.895604     0.931900  0.917570 0.1044 [[163, 19], [19, 260]]
      2 169.0 264.0 14.0 13.0 0.928571 0.949640 0.0504 0.0714 0.8782 0.8774   0.923497 0.9531 0.9260     0.928571     0.949640  0.941304 0.0765 [[169, 13], [14, 264]]
      3 171.0 261.0 17.0 11.0 0.939560 0.938849 0.0612 0.0604 0.8784 0.8734   0.909574 0.9596 0.9243     0.939560     0.938849  0.939130 0.0904 [[171, 11], [17, 261]]
      4 164.0 267.0 12.0 17.0 0.906077 0.956989 0.0430 0.0939 0.8631 0.8673   0.931818 0.9401 0.9188     0.906077     0.956989  0.936957 0.0682 [[164, 17], [12, 267]]
      5 174.0 260.0 19.0  7.0 0.961326 0.931900 0.0681 0.0387 0.8932 0.8829   0.901554 0.9738 0.9305     0.961326     0.931900  0.943478 0.0984  [[174,

In [104]:
# Show feature importance for Random Forest
rf_classifier.fit(X, y)
feature_importances = pd.DataFrame({
    'feature': range(X.shape[1]),
    'importance': rf_classifier.feature_importances_
}).sort_values('importance', ascending=False)
print("\nTop 10 Most Important Features (Random Forest):")
print(feature_importances.head(10).to_string(index=False))


Top 10 Most Important Features (Random Forest):
 feature  importance
      51    0.122275
      52    0.095507
       6    0.080334
      15    0.063160
      54    0.059750
      55    0.056913
      56    0.051001
      24    0.043059
      20    0.042029
      18    0.033238
