In [1]:
# ============================================================================
# 03_modeling.ipynb
# Model Training and Evaluation for Multi-Class Diabetes Classification
# Dataset: BRFSS 2015 - Diabetes Health Indicators (3 Classes)
# 
# OPTIMIZATION GOAL: HIGH RECALL
# Medical Context: In diabetes screening, it is more important to identify
# all potential diabetes cases (high recall) even if it means some false 
# positives. Missing a diabetes case (false negative) has more serious 
# health consequences than a false alarm (false positive).
# ============================================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Make the project's custom modules (feature_engineering, modeling) importable.
# The membership check keeps this cell idempotent: re-running it in a live
# kernel no longer appends a duplicate entry to sys.path each time.
# NOTE: the path is relative, so it resolves against the kernel's working
# directory.
src_core_dir = '../src/core'
if src_core_dir not in sys.path:
    sys.path.append(src_core_dir)

# Import custom modules for feature engineering and modeling
from feature_engineering import apply_all_feature_engineering
from modeling import (
    train_logistic_regression,
    train_random_forest,
    train_xgboost,
    train_svm,
    evaluate_model,
    plot_confusion_matrix,
    plot_classification_report,
    plot_roc_curves,
    compare_models,
    save_model
)

# Scikit-learn imports for scaling
from sklearn.preprocessing import StandardScaler

# Plot styling shared by every figure in this notebook
sns.set_style("whitegrid")
sns.set_palette("colorblind")
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Destinations for saved figures and trained models
output_dir = "../outputs/figures/modeling"
models_dir = "../outputs/models"

# Create each destination if it is missing, then report it
for dir_label, dir_path in (("Output", output_dir), ("Models", models_dir)):
    os.makedirs(dir_path, exist_ok=True)
    print(f"{dir_label} directory created: {dir_path}")


Output directory created: ../outputs/figures/modeling
Models directory created: ../outputs/models


# Load Preprocessed Data

In [2]:
# =============================================================================
# 1. LOAD PREPROCESSED DATA
# =============================================================================
print("\n" + "="*80)
print("STEP 1: LOAD PREPROCESSED DATA")
print("="*80)

# Single location for every processed CSV read in this cell
processed_dir = "../data/processed"


def _load_target(csv_path: str) -> pd.Series:
    """Read a single-column target CSV and return it as a pandas Series.

    A bare ``squeeze()`` is shape-dependent: it yields a scalar for a 1x1
    frame and silently leaves a DataFrame when the file has more than one
    column. Squeezing only the column axis and checking the result makes the
    return type unambiguous.

    Raises:
        ValueError: if the CSV does not contain exactly one column.
    """
    frame = pd.read_csv(csv_path)
    target = frame.squeeze("columns")
    if not isinstance(target, pd.Series):
        raise ValueError(
            f"Expected exactly one target column in {csv_path}, "
            f"found {frame.shape[1]}: {list(frame.columns)}"
        )
    return target


# Load scaled training and test data
# These datasets have continuous features scaled (StandardScaler)
# Binary and ordinal features remain unchanged
features_train = pd.read_csv(f"{processed_dir}/features_train_scaled.csv")
features_test = pd.read_csv(f"{processed_dir}/features_test_scaled.csv")

# Load target variables (one label per row of the matching feature file)
target_train = _load_target(f"{processed_dir}/target_train.csv")
target_test = _load_target(f"{processed_dir}/target_test.csv")

# Load SMOTE-resampled training data
# SMOTE (Synthetic Minority Over-sampling Technique) creates synthetic samples
# for minority classes to balance the dataset
features_train_smote = pd.read_csv(f"{processed_dir}/features_train_smote.csv")
target_train_smote = _load_target(f"{processed_dir}/target_train_smote.csv")

# Fail fast if features and labels are misaligned, or if the splits do not
# share the same column layout; either would corrupt training/evaluation later.
assert len(features_train) == len(target_train), \
    "Original training features and target have different row counts"
assert len(features_train_smote) == len(target_train_smote), \
    "SMOTE training features and target have different row counts"
assert len(features_test) == len(target_test), \
    "Test features and target have different row counts"
assert list(features_train.columns) == list(features_test.columns), \
    "Training and test feature columns differ"
assert list(features_train.columns) == list(features_train_smote.columns), \
    "Original and SMOTE training feature columns differ"

# Display data shapes to verify successful loading
print("\nDATA SHAPES:")
print("-" * 60)
print(f"Training Set (Original):     {features_train.shape}")
print(f"Training Set (SMOTE):        {features_train_smote.shape}")
print(f"Test Set:                    {features_test.shape}")

# Display target distribution to understand class imbalance
# Class 0 = No Diabetes
# Class 1 = Prediabetes
# Class 2 = Diabetes
print("\nTARGET DISTRIBUTION:")
print("-" * 60)
print("Training Set (Original):")
print(target_train.value_counts().sort_index())
print("\nTraining Set (SMOTE):")
print(target_train_smote.value_counts().sort_index())
print("\nTest Set:")
print(target_test.value_counts().sort_index())



STEP 1: LOAD PREPROCESSED DATA

DATA SHAPES:
------------------------------------------------------------
Training Set (Original):     (183824, 21)
Training Set (SMOTE):        (456129, 21)
Test Set:                    (45957, 21)

TARGET DISTRIBUTION:
------------------------------------------------------------
Training Set (Original):
Diabetes_012
0.0    152043
1.0      3703
2.0     28078
Name: count, dtype: int64

Training Set (SMOTE):
Diabetes_012
0.0    152043
1.0    152043
2.0    152043
Name: count, dtype: int64

Test Set:
Diabetes_012
0.0    38012
1.0      926
2.0     7019
Name: count, dtype: int64
