In [1]:
# ============================================================================
# 03_modeling.ipynb
# Model Training and Evaluation for Multi-Class Diabetes Classification
# Dataset: BRFSS 2015 - Diabetes Health Indicators (3 Classes)
# 
# OPTIMIZATION GOAL: HIGH RECALL
# Medical Context: In diabetes screening, it is more important to identify
# all potential diabetes cases (high recall) even if it means some false 
# positives. Missing a diabetes case (false negative) has more serious 
# health consequences than a false alarm (false positive).
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Add src directory to path to import custom modules
sys.path.append('../src/core')

# Import custom modules for feature engineering and modeling
from feature_engineering import apply_all_feature_engineering
from modeling import (
    train_logistic_regression,
    train_random_forest,
    train_xgboost,
    train_svm,
    evaluate_model,
    plot_confusion_matrix,
    plot_classification_report,
    plot_roc_curves,
    compare_models,
    save_model
)

# Scikit-learn imports for scaling
from sklearn.preprocessing import StandardScaler

# Shared look-and-feel for every figure produced by this notebook
sns.set_style("whitegrid")
sns.set_palette("colorblind")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Destination folders: figures for this modeling stage, and serialized models
output_dir = "../outputs/figures/modeling"
models_dir = "../outputs/models"

# Create each folder if it is missing (exist_ok=True makes re-runs safe),
# then report its location
for dir_label, dir_path in (("Output", output_dir), ("Models", models_dir)):
    os.makedirs(dir_path, exist_ok=True)
    print(f"{dir_label} directory created: {dir_path}")


Output directory created: ../outputs/figures/modeling
Models directory created: ../outputs/models


# Load Preprocessed Data

In [2]:
# =============================================================================
# 1. LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "="*80)
print("STEP 1: LOAD PREPROCESSED DATA")
print("="*80)

# Every input of this notebook is read from one directory of preprocessed CSVs
processed_dir = "../data/processed"

# Load scaled training and test data
# These datasets have continuous features scaled (StandardScaler)
# Binary and ordinal features remain unchanged
features_train = pd.read_csv(f"{processed_dir}/features_train_scaled.csv")
features_test = pd.read_csv(f"{processed_dir}/features_test_scaled.csv")

# Load target variables
# squeeze("columns") collapses only the column axis, so a single-column file
# always yields a Series. A bare squeeze() would also collapse the row axis
# and return a scalar for a one-row file.
target_train = pd.read_csv(f"{processed_dir}/target_train.csv").squeeze("columns")
target_test = pd.read_csv(f"{processed_dir}/target_test.csv").squeeze("columns")

# Load SMOTE-resampled training data
# SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic samples
# for minority classes to balance the dataset
features_train_smote = pd.read_csv(f"{processed_dir}/features_train_smote.csv")
target_train_smote = pd.read_csv(f"{processed_dir}/target_train_smote.csv").squeeze("columns")

# Fail fast if a target file is not a single column or is misaligned with its
# feature matrix; otherwise the mismatch would only surface during training.
for split_name, split_features, split_target in (
    ("train", features_train, target_train),
    ("train (SMOTE)", features_train_smote, target_train_smote),
    ("test", features_test, target_test),
):
    assert isinstance(split_target, pd.Series), (
        f"{split_name}: target file must contain exactly one column"
    )
    assert len(split_features) == len(split_target), (
        f"{split_name}: {len(split_features)} feature rows vs "
        f"{len(split_target)} target rows"
    )

# Display data shapes to verify successful loading
print("\nDATA SHAPES:")
print("-" * 60)
print(f"Training Set (Original):     {features_train.shape}")
print(f"Training Set (SMOTE):        {features_train_smote.shape}")
print(f"Test Set:                    {features_test.shape}")

# Display target distribution to understand class imbalance
# Class 0 = No Diabetes
# Class 1 = Prediabetes
# Class 2 = Diabetes
print("\nTARGET DISTRIBUTION:")
print("-" * 60)
print("Training Set (Original):")
print(target_train.value_counts().sort_index()) # pyright: ignore[reportAttributeAccessIssue]
print("\nTraining Set (SMOTE):")
print(target_train_smote.value_counts().sort_index()) # pyright: ignore[reportAttributeAccessIssue]
print("\nTest Set:")
print(target_test.value_counts().sort_index()) # pyright: ignore[reportAttributeAccessIssue]



STEP 1: LOAD PREPROCESSED DATA

DATA SHAPES:
------------------------------------------------------------
Training Set (Original):     (183824, 21)
Training Set (SMOTE):        (456129, 21)
Test Set:                    (45957, 21)

TARGET DISTRIBUTION:
------------------------------------------------------------
Training Set (Original):
Diabetes_012
0.0    152043
1.0      3703
2.0     28078
Name: count, dtype: int64

Training Set (SMOTE):
Diabetes_012
0.0    152043
1.0    152043
2.0    152043
Name: count, dtype: int64

Test Set:
Diabetes_012
0.0    38012
1.0      926
2.0     7019
Name: count, dtype: int64


# Feature Engineering

In [3]:
# =============================================================================
# 2. FEATURE ENGINEERING
# =============================================================================
print("\n" + "="*80)
print("STEP 2: FEATURE ENGINEERING")
print("="*80)

print("\nAPPLYING FEATURE ENGINEERING:")
print("-" * 60)
print("Creating new features:")
print("  • HealthRiskScore: Sum of risk factors (HighBP, HighChol, Stroke, etc.)")
print("  • LifestyleScore: Sum of positive health habits (PhysActivity, Fruits, Veggies)")
print("  • BMI Categories: One-hot encoded BMI groups (Underweight, Normal, Overweight, Obese)")
print("  • Age Groups: One-hot encoded age groups (Young, Middle, Senior)")
print("  • Interaction Features: Product of related features (BMI×HighBP, Age×BMI, GenHlth×PhysActivity)")

# Apply feature engineering to all three datasets
# This creates composite features that may have better predictive power
# than individual features alone
# NOTE(review): the inputs are the already-scaled feature files, and the SMOTE
# set presumably contains interpolated values; confirm that the bins and sums
# inside apply_all_feature_engineering are meaningful on such inputs.
features_train_eng = apply_all_feature_engineering(features_train)
features_test_eng = apply_all_feature_engineering(features_test)
features_train_smote_eng = apply_all_feature_engineering(features_train_smote)

# The three frames are engineered independently, yet they are later scaled
# with one shared scaler. Verify they ended up with the same columns in the
# same order, so a schema drift is reported here and not downstream.
for split_name, engineered in (
    ("test", features_test_eng),
    ("SMOTE train", features_train_smote_eng),
):
    if not engineered.columns.equals(features_train_eng.columns):
        differing = sorted(
            set(engineered.columns).symmetric_difference(features_train_eng.columns)
        )
        raise ValueError(
            f"Engineered {split_name} columns do not match the training columns "
            f"(differing names: {differing}; an empty list means only the order differs)"
        )

# Display feature engineering results
print("\nFeature engineering completed")
print(f"Original features:  {features_train.shape[1]}")
print(f"Engineered features: {features_train_eng.shape[1]}")
print(f"New features added:  {features_train_eng.shape[1] - features_train.shape[1]}")



STEP 2: FEATURE ENGINEERING

APPLYING FEATURE ENGINEERING:
------------------------------------------------------------
Creating new features:
  • HealthRiskScore: Sum of risk factors (HighBP, HighChol, Stroke, etc.)
  • LifestyleScore: Sum of positive health habits (PhysActivity, Fruits, Veggies)
  • BMI Categories: One-hot encoded BMI groups (Underweight, Normal, Overweight, Obese)
  • Age Groups: One-hot encoded age groups (Young, Middle, Senior)
  • Interaction Features: Product of related features (BMI×HighBP, Age×BMI, GenHlth×PhysActivity)
Feature engineering completed. New shape: (183824, 33)
Feature engineering completed. New shape: (45957, 33)
Feature engineering completed. New shape: (456129, 33)

Feature engineering completed
Original features:  21
Engineered features: 33
New features added:  12


# Scale New Continuous Features

In [4]:
# =============================================================================
# 3. SCALE NEW CONTINUOUS FEATURES
# =============================================================================
step_banner = "=" * 80
print("\n" + step_banner)
print("STEP 3: SCALE NEW CONTINUOUS FEATURES")
print(step_banner)

# Engineered columns that receive standardization in this step
new_continuous_features = [
    'HealthRiskScore',
    'LifestyleScore',
    'BMI_x_HighBP',
    'Age_x_BMI',
    'GenHlth_x_PhysActivity',
]

print("\nSCALING NEW CONTINUOUS FEATURES:")
print("-" * 60)
print(f"Features to scale: {new_continuous_features}")

# StandardScaler applies (x - mean) / std per column
scaler_new = StandardScaler()

# Learn mean/std from the original training set only, and standardize it
train_scaled_values = scaler_new.fit_transform(
    features_train_eng[new_continuous_features]
)
features_train_eng[new_continuous_features] = train_scaled_values

# Reuse the training statistics for the remaining sets: transform() only,
# never fit_transform(), so no information from these sets reaches the scaler
for fitted_elsewhere in (features_test_eng, features_train_smote_eng):
    fitted_elsewhere[new_continuous_features] = scaler_new.transform(
        fitted_elsewhere[new_continuous_features]
    )

print("\nNew continuous features scaled")



STEP 3: SCALE NEW CONTINUOUS FEATURES

SCALING NEW CONTINUOUS FEATURES:
------------------------------------------------------------
Features to scale: ['HealthRiskScore', 'LifestyleScore', 'BMI_x_HighBP', 'Age_x_BMI', 'GenHlth_x_PhysActivity']

New continuous features scaled
