In [21]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score
from ucimlrepo import fetch_ucirepo
import joblib
import os
import warnings
import re

In [22]:

# Silence every warning for the rest of the session so cell outputs stay compact.
warnings.filterwarnings('ignore')


def _sanitize_feature_names(names):
    """Return `names` with every '[', ']' and '<' replaced by '_'."""
    # Presumably required because XGBoost rejects feature names containing
    # these characters -- confirm against the installed XGBoost version.
    return [re.sub(r'[\[\]<]', '_', name) for name in names]


# Artefacts expected from the data exploration notebook. If any file is
# missing, all three variables fall back to empty containers so that the
# training helper can detect the situation through `.empty` and skip.
try:
    X_scaled = pd.read_csv('../data/processed/scaled_features.csv')
    X_pca = pd.read_csv('../data/processed/pca_features.csv').drop(columns='Class')
    y = pd.read_csv('../data/processed/targets.csv').squeeze()
    print("Processed data loaded successfully.")
except FileNotFoundError as load_error:
    print(f"Error loading processed data: {load_error}")
    print("Please ensure the data exploration notebook has been run first.")
    X_scaled = pd.DataFrame()
    X_pca = pd.DataFrame()
    y = pd.Series()

# Raw features/targets of UCI repository dataset id 94, fetched at run time.
spambase = fetch_ucirepo(id=94)
X_raw = spambase.data.features
y_raw = spambase.data.targets.squeeze()
X_raw.columns = _sanitize_feature_names(X_raw.columns)

# Apply the same renaming to the scaled features when they were loaded.
if not X_scaled.empty:
    X_scaled.columns = _sanitize_feature_names(X_scaled.columns)


Processed data loaded successfully.


In [23]:

def training_and_evaluation(model, X, y, test_size=0.2, random_state=42):
    """Fit `model` on a stratified train split and score it on the held-out part.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    X, y : pandas objects
        Features and target. Both must expose ``.empty``.
    test_size, random_state :
        Forwarded unchanged to ``train_test_split`` (the split is stratified on ``y``).

    Returns
    -------
    tuple
        ``(model, accuracy, precision)`` measured on the test split, or
        ``(None, 0, 0)`` when either ``X`` or ``y`` is empty.
    """
    model_name = model.__class__.__name__

    # Guard clause: nothing to train on, report and hand back placeholders.
    if X.empty or y.empty:
        print(f"Skipping {model_name} due to missing data.")
        return None, 0, 0

    split = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    features_train, features_test, target_train, target_test = split

    model.fit(features_train, target_train)
    predictions = model.predict(features_test)

    metrics = {
        "Accuracy": accuracy_score(target_test, predictions),
        "Precision": precision_score(target_test, predictions),
    }

    print(f"Model: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")
    print("-" * 30)

    return model, metrics["Accuracy"], metrics["Precision"]


# Accumulates one record per (dataset, model) run for the final comparison.
results = []

In [24]:
print("\nTraining on Raw Data")

# Baseline 1: logistic regression on the raw (unscaled) features.
lr_raw = LogisticRegression(max_iter=1000, random_state=42)
lr_raw_model, acc_lr_raw, prec_lr_raw = training_and_evaluation(lr_raw, X_raw, y_raw)
results.append(
    dict(
        Data='Raw',
        Model='Logistic Regression',
        Accuracy=acc_lr_raw,
        Precision=prec_lr_raw,
        Model_Object=lr_raw_model,
    )
)

# Baseline 2: gradient-boosted trees on the same raw features.
xgb_raw = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_raw_model, acc_xgb_raw, prec_xgb_raw = training_and_evaluation(xgb_raw, X_raw, y_raw)
results.append(
    dict(
        Data='Raw',
        Model='XGBoost',
        Accuracy=acc_xgb_raw,
        Precision=prec_xgb_raw,
        Model_Object=xgb_raw_model,
    )
)




Training on Raw Data
Model: LogisticRegression
Accuracy: 0.9283
Precision: 0.9160
------------------------------
Model: XGBClassifier
Accuracy: 0.9490
Precision: 0.9365
------------------------------


In [25]:

print("\nTraining on Scaled Data")

# Logistic regression on the pre-scaled features loaded from disk.
lr_scaled = LogisticRegression(max_iter=1000, random_state=42)
lr_scaled_model, acc_lr_scaled, prec_lr_scaled = training_and_evaluation(lr_scaled, X_scaled, y)
results.append(
    dict(
        Data='Scaled',
        Model='Logistic Regression',
        Accuracy=acc_lr_scaled,
        Precision=prec_lr_scaled,
        Model_Object=lr_scaled_model,
    )
)

# XGBoost on the same scaled features.
xgb_scaled = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_scaled_model, acc_xgb_scaled, prec_xgb_scaled = training_and_evaluation(xgb_scaled, X_scaled, y)
results.append(
    dict(
        Data='Scaled',
        Model='XGBoost',
        Accuracy=acc_xgb_scaled,
        Precision=prec_xgb_scaled,
        Model_Object=xgb_scaled_model,
    )
)



Training on Scaled Data
Model: LogisticRegression
Accuracy: 0.9294
Precision: 0.9209
------------------------------
Model: XGBClassifier
Accuracy: 0.9490
Precision: 0.9365
------------------------------


In [26]:

print("\nTraining on PCA Data")

# Use the same max_iter=1000 solver budget as the raw and scaled runs so the
# three logistic-regression baselines are configured identically. Warnings are
# globally ignored in this notebook, so hitting the default iteration cap
# would otherwise go unnoticed.
lr_pca = LogisticRegression(max_iter=1000, random_state=42)
lr_pca_model, acc_lr_pca, prec_lr_pca = training_and_evaluation(lr_pca, X_pca, y)
results.append({'Data': 'PCA', 'Model': 'Logistic Regression', 'Accuracy': acc_lr_pca, 'Precision': prec_lr_pca, 'Model_Object': lr_pca_model})

# XGBoost on the PCA-projected features, configured like the other XGBoost runs.
xgb_pca = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_pca_model, acc_xgb_pca, prec_xgb_pca = training_and_evaluation(xgb_pca, X_pca, y)
results.append({'Data': 'PCA', 'Model': 'XGBoost', 'Accuracy': acc_xgb_pca, 'Precision': prec_xgb_pca, 'Model_Object': xgb_pca_model})



Training on PCA Data
Model: LogisticRegression
Accuracy: 0.8599
Precision: 0.8634
------------------------------
Model: XGBClassifier
Accuracy: 0.8675
Precision: 0.8414
------------------------------


In [27]:
# Skipped runs are recorded with Model_Object=None and 0/0 metrics (see
# training_and_evaluation). Drop them before ranking so a placeholder can
# never be reported or serialized as the "best" model, and so the fallback
# message below is actually reachable when nothing was trained.
trained_results = [entry for entry in results if entry.get('Model_Object') is not None]

if trained_results:
    results_df = pd.DataFrame(trained_results)
    # Rank once: accuracy first, precision as the tie-breaker.
    ranked_df = results_df.sort_values(by=['Accuracy', 'Precision'], ascending=False)

    print("\nModel Performance Comparison")
    print(ranked_df[['Data', 'Model', 'Accuracy', 'Precision']])

    best_model_entry = ranked_df.iloc[0]
    best_model = best_model_entry['Model_Object']
    best_model_name = f"{best_model_entry['Model']} ({best_model_entry['Data']})"

    print("\nBest Performing Model")
    print(f"The best model is {best_model_name} with:")
    print(f"  - Accuracy: {best_model_entry['Accuracy']:.4f}")
    print(f"  - Precision: {best_model_entry['Precision']:.4f}")

    # exist_ok=True avoids the check-then-create race of a separate exists() test.
    model_dir = '../models/'
    os.makedirs(model_dir, exist_ok=True)
    model_path = os.path.join(model_dir, 'best_spam_classifier.joblib')
    joblib.dump(best_model, model_path)
    print(f"\nBest model saved to: {model_path}")
else:
    print("\nModel training was skipped due to missing data. No model was saved.")


Model Performance Comparison
     Data                Model  Accuracy  Precision
1     Raw              XGBoost  0.948969   0.936464
3  Scaled              XGBoost  0.948969   0.936464
2  Scaled  Logistic Regression  0.929425   0.920904
0     Raw  Logistic Regression  0.928339   0.915966
5     PCA              XGBoost  0.867535   0.841360
4     PCA  Logistic Regression  0.859935   0.863354

Best Performing Model
The best model is XGBoost (Raw) with:
  - Accuracy: 0.9490
  - Precision: 0.9365

Best model saved to: ../models/best_spam_classifier.joblib


#### Hyperparameter Tuning

In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Stratified 80/20 split of the raw data; only the training part is passed to
# the grid search, the test part is used once for the final report.
X_train, X_test, y_train, y_test = train_test_split(X_raw, y_raw, test_size=0.2, random_state=42, stratify=y_raw)
print(f"Data split into training ({len(X_train)} samples) and testing ({len(X_test)} samples).")

# Ratio of label-0 to label-1 training samples, used as one candidate for
# XGBoost's scale_pos_weight. Counts are computed once and looked up by label
# with .loc so the lookup is unambiguously label-based.
class_counts = y_train.value_counts()
scale_pos_weight_value = class_counts.loc[0] / class_counts.loc[1]
print(f"Calculated scale_pos_weight: {scale_pos_weight_value:.2f}")

# 2 * 2 * 2 * 2 * 3 = 48 candidate combinations.
param_grid = {
    'max_depth': [5, 7],
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.2],
    'gamma': [0.1, 0.2],
    'scale_pos_weight': [scale_pos_weight_value, 2.0, 2.5]
}

xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# NOTE(review): candidates are ranked on recall alone, which tends to favour
# the largest scale_pos_weight regardless of the precision cost -- confirm that
# this trade-off is intended for the spam use case.
print("\nStarting GridSearchCV to find the best hyperparameters for recall...")
grid_search = GridSearchCV(
    estimator=xgb,
    param_grid=param_grid,
    scoring='recall',
    cv=3,
    verbose=1,
    n_jobs=-1  # use all available cores
)

grid_search.fit(X_train, y_train)

print("\nGridSearchCV complete.")
print(f"Best parameters found: {grid_search.best_params_}")


print("\n--- Training and Evaluating Final Optimized Model ---")
# best_estimator_ is available because GridSearchCV's refit option is left at
# its default; it is evaluated on the held-out test split only.
best_xgb_tuned = grid_search.best_estimator_
y_pred_tuned = best_xgb_tuned.predict(X_test)

print("\nClassification Report on Test Set:")
# target_names assumes label 0 means "Not Spam" and label 1 means "Spam" --
# TODO confirm against the dataset description.
print(classification_report(y_test, y_pred_tuned, target_names=['Not Spam', 'Spam']))

# Create the output directory here as well, so this cell does not depend on an
# earlier cell having created it as a side effect.
tuned_model_dir = '../models/'
os.makedirs(tuned_model_dir, exist_ok=True)
model_path_tuned = os.path.join(tuned_model_dir, 'best_tuned_spam_classifier.joblib')
joblib.dump(best_xgb_tuned, model_path_tuned)
print(f"\nBest tuned model saved to: {model_path_tuned}")

Data split into training (3680 samples) and testing (921 samples).
Calculated scale_pos_weight: 1.54

Starting GridSearchCV to find the best hyperparameters for recall...
Fitting 3 folds for each of 48 candidates, totalling 144 fits

GridSearchCV complete.
Best parameters found: {'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 7, 'n_estimators': 100, 'scale_pos_weight': 2.5}

--- Training and Evaluating Final Optimized Model ---

Classification Report on Test Set:
              precision    recall  f1-score   support

    Not Spam       0.97      0.95      0.96       558
        Spam       0.92      0.95      0.94       363

    accuracy                           0.95       921
   macro avg       0.94      0.95      0.95       921
weighted avg       0.95      0.95      0.95       921


Best tuned model saved to: ../models/best_tuned_spam_classifier.joblib
