# Imports

In [12]:
# ============================================================================
# 03_modeling.ipynb
# Modeling and Evaluation for Diabetes Binary Classification
# Extended with Hyperparameter Tuning, Multiple Models, ANN, and Cost-Benefit Analysis
# Integrated with preprocessing module
# ============================================================================
"""
Notes:
- RandomizedSearchCV is used for efficiency.
- To keep runtime manageable, all four tuned models (Random Forest, XGBoost, Logistic Regression, Neural Network) are tuned on a 30% stratified subsample of the training set; the best configuration is then refit on the full training set.
- CV for RandomizedSearchCV is set to cv=5 as requested.
"""
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import warnings
warnings.filterwarnings('ignore')

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import RandomizedSearchCV, cross_val_score, train_test_split
from sklearn.calibration import CalibratedClassifierCV
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.neural_network import MLPClassifier

import xgboost as xgb

from scipy.stats import randint, uniform

from pyexpat import features

# Make the project's `dpp` package importable. The directory added to sys.path
# is the one two levels above the current working directory.
# NOTE(review): this depends on where the kernel was started — confirm the
# notebook is always launched from its own folder.
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
if project_root not in sys.path:
    # Guard keeps sys.path free of duplicate entries when this cell is re-run.
    sys.path.insert(0, project_root)

# Import the preprocessing entry point and record whether it is usable, so the
# data-loading cell can react to a missing module.
try:
    from dpp.preprocessing import get_preprocessed_data
    PREPROCESSING_AVAILABLE = True
except ImportError as e:
    PREPROCESSING_AVAILABLE = False
    print(f"Warning: Could not import preprocessing module. Error: {e}")
    # The previous message announced placeholder data, but no fallback data is
    # created here; say what actually happens instead.
    print("Data loading depends on this module; data-dependent cells cannot run until the import succeeds.")

# Load Preprocessed Data

In [13]:
# ============================================================================
# 1. LOAD PREPROCESSED DATA
# ============================================================================
print("="*80)
print("LOAD PREPROCESSED DATA")
print("="*80)

# The tuning and fitting cells read features_train / target_train directly, so
# a failed load must stop the notebook here with a clear error instead of
# printing a warning and surfacing later as an unrelated NameError.
if not PREPROCESSING_AVAILABLE:
    raise RuntimeError(
        "Preprocessing module not available: cannot load the training/test data. "
        "Fix the `dpp.preprocessing` import in the imports cell and re-run."
    )

try:
    # The returned object is indexed with the four keys below.
    data = get_preprocessed_data()
    features_train = data['features_train']
    target_train = data['target_train']
    features_test = data['features_test']
    target_test = data['target_test']
except Exception as e:
    # Keep the availability flag consistent for any cell that inspects it,
    # then re-raise so the original traceback is shown.
    PREPROCESSING_AVAILABLE = False
    print(f"Error loading preprocessed data: {e}")
    raise

# Cheap sanity checks: features and targets must describe the same rows.
assert len(features_train) == len(target_train), "train features/target length mismatch"
assert len(features_test) == len(target_test), "test features/target length mismatch"

print(f"Training set size: {len(features_train):,}")
print(f"Test set size: {len(features_test):,}")
print(f"Number of features: {features_train.shape[1]}")

LOAD PREPROCESSED DATA
[preprocessing] Loading data from: C:\Users\Eyyub\Desktop\StackFuel\PortfolioProjekt\DPP-Stackfuel-Data-Science-Projekt\data\raw\diabetes-health-indicators-dataset\diabetes_binary_health_indicators_BRFSS2015.csv
Training set size: 311,002
Test set size: 45,895
Number of features: 24


# Model Initialization

In [14]:
# ============================================================================
# 2. MODEL INITIALIZATION (base estimators)
# ============================================================================
print("\n" + "="*80)
print("MODEL INITIALIZATION (BASE ESTIMATORS)")
print("="*80)

# Untuned estimators keyed by display name. Entries are added in the same
# order in which they are later iterated when fitting the default models.
base_estimators = {}

# Tree ensembles
base_estimators['Random Forest'] = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
base_estimators['XGBoost'] = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

# Linear models
base_estimators['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=1000)
# LinearSVC has no predict_proba; it is wrapped in CalibratedClassifierCV
# (5-fold) right here so the SVM entry exposes probabilities as well.
base_estimators['SVM'] = CalibratedClassifierCV(
    LinearSVC(random_state=42, max_iter=10000, dual=False),
    cv=5,
)

# Simple baselines
base_estimators['K-Nearest Neighbors'] = KNeighborsClassifier(n_neighbors=5)
base_estimators['Decision Tree'] = DecisionTreeClassifier(random_state=42)
base_estimators['Naive Bayes'] = GaussianNB()

# Multi-layer perceptron
base_estimators['Neural Network'] = MLPClassifier(random_state=42, max_iter=500)



MODEL INITIALIZATION (BASE ESTIMATORS)


# Hyperparameter Tuning

In this section, hyperparameter tuning is performed only for four selected models:
Random Forest, XGBoost, Logistic Regression, and Neural Network.

The rationale behind this selective tuning approach is based on a balance between 
computational efficiency and expected impact on model performance:

1. Effectiveness of Hyperparameters:
   These four models have hyperparameters that significantly influence their 
   predictive performance. For example, the number of trees and depth in Random Forest,
   learning rate and tree depth in XGBoost, regularization strength in Logistic Regression,
   and architecture parameters in Neural Networks can greatly affect results.

2. Computational Cost:
   Exhaustive hyperparameter tuning (e.g., GridSearchCV) is computationally expensive,
   especially when applied to many models with large parameter grids. Focusing on 
   models with the highest potential gain optimizes resource usage.

3. Simplicity and Speed of Other Models:
   Other models like Decision Trees, K-Nearest Neighbors, Naive Bayes, and Linear SVM 
   typically have fewer or less impactful hyperparameters, or are inherently faster to train.
   Their default parameters often provide reasonable baseline performance.

4. Practical Workflow:
   This approach allows for a manageable and efficient modeling pipeline, prioritizing 
   tuning efforts where they are most likely to yield substantial improvements.

If desired, hyperparameter tuning can be extended to the remaining models with the
same RandomizedSearchCV approach used here, or by tuning only a small subset of
parameters, to balance performance gains against computational cost.


In [None]:
# ============================================================================
# 3. HYPERPARAMETER TUNING (RandomizedSearchCV, CV=5)
# ============================================================================
print("\n" + "="*80)
print("HYPERPARAMETER TUNING")
print("Note: Tuning for expensive models (RF, XGB) runs on a 30% subsample to save time.")
print("="*80)

# Stratified 30% slice of the training data used as the tuning set; the
# complementary 70% returned by the split is discarded.
# NOTE(review): the Logistic Regression and Neural Network tuning cells are
# also handed this subsample, not only RF/XGB as the banner states — confirm intent.
features_sub, _, target_sub, _ = train_test_split(
    features_train,
    target_train,
    train_size=0.30,
    stratify=target_train,
    random_state=42,
)

# Search space per model name. Every entry is a finite list of candidate
# values, so each space is a discrete grid that RandomizedSearchCV samples from.
param_distributions = {}

param_distributions['Random Forest'] = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[8, 12, 20, None],
    min_samples_split=[2, 5, 10],
    max_features=['sqrt', 'log2', 0.5],
)

param_distributions['XGBoost'] = dict(
    n_estimators=[100, 200],
    max_depth=[3, 6],
    learning_rate=[0.05, 0.1, 0.2],
    subsample=[0.7, 0.8, 1.0],
)

# liblinear is the only solver offered; both penalties in the list are paired with it.
param_distributions['Logistic Regression'] = dict(
    C=[0.01, 0.1, 1, 10],
    penalty=['l1', 'l2'],
    solver=['liblinear'],
    class_weight=[None, 'balanced'],
)

param_distributions['Neural Network'] = dict(
    hidden_layer_sizes=[(50,), (100,), (50, 50)],
    activation=['relu', 'tanh'],
    alpha=[0.0001, 0.001],
    learning_rate=['constant', 'adaptive'],
)

# Final fitted models (tuned or default), keyed by model name.
trained_models = {}




HYPERPARAMETER TUNING
Note: Tuning for expensive models (RF, XGB) runs on a 30% subsample to save time.


In [16]:
# Helper function to run RandomizedSearchCV and refit best estimator on full training set
def tune_and_refit(name, estimator, param_dist, X_tune, y_tune, X_full, y_full, n_iter=20, cv=5):
    """
    Tune `estimator` with RandomizedSearchCV, then fit the winning configuration
    on the full training data.

    The search runs on X_tune/y_tune (typically a subsample, for speed) and is
    scored with ROC AUC. The search itself does not refit a model; instead a
    fresh clone of `estimator` is configured with the best parameters and
    fitted once on X_full/y_full. This avoids a throw-away fit on the tuning
    subsample and guarantees the returned model starts from an unfitted state.

    Parameters
    ----------
    name : str
        Label used in the progress messages.
    estimator : scikit-learn compatible estimator
        Unfitted template; it is cloned, never fitted in place.
    param_dist : dict
        Parameter distributions/lists passed to RandomizedSearchCV. If this is
        a finite grid with fewer than `n_iter` combinations, scikit-learn
        evaluates each combination once instead of `n_iter` samples.
    X_tune, y_tune : array-like
        Data used for the cross-validated search.
    X_full, y_full : array-like
        Data used for the final fit.
    n_iter : int, default=20
        Number of sampled parameter settings.
    cv : int, default=5
        Number of cross-validation folds.

    Returns
    -------
    tuple
        (fitted best estimator, best mean cross-validated ROC AUC).
    """
    # Function-scope import so this cell is self-contained.
    from sklearn.base import clone

    print(f"[tuning] {name}: Starting RandomizedSearchCV (n_iter={n_iter}, cv={cv}) ...")
    rs = RandomizedSearchCV(
        estimator=estimator,
        param_distributions=param_dist,
        n_iter=n_iter,
        scoring='roc_auc',
        cv=cv,
        random_state=42,
        n_jobs=-1,
        verbose=1,
        # The final model is fitted on X_full below, so a refit on the tuning
        # subsample would be wasted work. With a single metric, best_params_
        # and best_score_ are still populated.
        refit=False
    )
    # perf_counter is monotonic, so the durations cannot be skewed by clock changes.
    t0 = time.perf_counter()
    rs.fit(X_tune, y_tune)
    t1 = time.perf_counter()
    print(f"[tuning] {name}: RandomizedSearchCV finished in {(t1-t0)/60:.2f} min. Best CV score: {rs.best_score_:.4f}")
    print(f"[tuning] {name}: Best params: {rs.best_params_}")

    # Build an unfitted copy with the winning parameters and fit it on the full training data
    best_est = clone(estimator).set_params(**rs.best_params_)
    print(f"[tuning] {name}: Refitting best estimator on the full training set ...")
    t2 = time.perf_counter()
    best_est.fit(X_full, y_full)
    t3 = time.perf_counter()
    print(f"[tuning] {name}: Refit finished in {(t3-t2)/60:.2f} min.")
    return best_est, rs.best_score_



In [18]:
# Tuning loop for selected models
# NOTE(review): tuned_models is not populated by any cell visible here; the
# fitted models are stored in trained_models. Kept in case later cells use it.
tuned_models = {}
# Best cross-validated score per tuned model. Re-running this cell resets the
# dict, so the other tuning cells must be re-run afterwards as well.
tuned_scores = {}

# Random Forest tuning (on subsample, then refit on full)
# Guard on param_distributions — the dict that is actually indexed below —
# which also matches the guards used by the other tuning cells.
if 'Random Forest' in param_distributions:
    rf_est = RandomForestClassifier(random_state=42, n_jobs=-1)
    rf_param_dist = param_distributions['Random Forest']
    best_rf, best_rf_score = tune_and_refit(
        'Random Forest',
        rf_est,
        rf_param_dist,
        features_sub,
        target_sub,
        features_train,
        target_train,
        n_iter=20,
        cv=5,
    )
    trained_models['Random Forest'] = best_rf
    tuned_scores['Random Forest'] = best_rf_score



[tuning] Random Forest: Starting RandomizedSearchCV (n_iter=20, cv=5) ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[tuning] Random Forest: RandomizedSearchCV finished in 15.08 min. Best CV score: 0.9598
[tuning] Random Forest: Best params: {'n_estimators': 100, 'min_samples_split': 5, 'max_features': 'log2', 'max_depth': None}
[tuning] Random Forest: Refitting best estimator on the full training set ...
[tuning] Random Forest: Refit finished in 0.55 min.


In [19]:
# XGBoost: search on the subsample, final fit on the full training set.
# Runs only when a search space for 'XGBoost' is defined.
if 'XGBoost' in param_distributions:
    xgb_param_dist = param_distributions['XGBoost']
    xgb_est = xgb.XGBClassifier(
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss',
        n_jobs=-1,
    )
    best_xgb, best_xgb_score = tune_and_refit(
        name='XGBoost',
        estimator=xgb_est,
        param_dist=xgb_param_dist,
        X_tune=features_sub,
        y_tune=target_sub,
        X_full=features_train,
        y_full=target_train,
        n_iter=20,
        cv=5,
    )
    # Record the fitted model and its best cross-validated score
    trained_models['XGBoost'], tuned_scores['XGBoost'] = best_xgb, best_xgb_score



[tuning] XGBoost: Starting RandomizedSearchCV (n_iter=20, cv=5) ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits
[tuning] XGBoost: RandomizedSearchCV finished in 2.36 min. Best CV score: 0.9635
[tuning] XGBoost: Best params: {'subsample': 0.7, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1}
[tuning] XGBoost: Refitting best estimator on the full training set ...
[tuning] XGBoost: Refit finished in 0.13 min.


In [20]:
# Logistic Regression: search on the subsample, final fit on the full training set.
# Runs only when a search space for 'Logistic Regression' is defined.
if 'Logistic Regression' in param_distributions:
    lr_param_dist = param_distributions['Logistic Regression']
    lr_est = LogisticRegression(random_state=42, max_iter=1000)
    best_lr, best_lr_score = tune_and_refit(
        name='Logistic Regression',
        estimator=lr_est,
        param_dist=lr_param_dist,
        X_tune=features_sub,
        y_tune=target_sub,
        X_full=features_train,
        y_full=target_train,
        n_iter=20,
        cv=5,
    )
    # Record the fitted model and its best cross-validated score
    trained_models['Logistic Regression'], tuned_scores['Logistic Regression'] = best_lr, best_lr_score



[tuning] Logistic Regression: Starting RandomizedSearchCV (n_iter=20, cv=5) ...
Fitting 5 folds for each of 16 candidates, totalling 80 fits
[tuning] Logistic Regression: RandomizedSearchCV finished in 21.08 min. Best CV score: 0.8135
[tuning] Logistic Regression: Best params: {'solver': 'liblinear', 'penalty': 'l2', 'class_weight': None, 'C': 0.1}
[tuning] Logistic Regression: Refitting best estimator on the full training set ...
[tuning] Logistic Regression: Refit finished in 0.05 min.


In [None]:
# Neural Network (MLP): search on the subsample, final fit on the full training set.
# Runs only when a search space for 'Neural Network' is defined.
if 'Neural Network' in param_distributions:
    nn_param_dist = param_distributions['Neural Network']
    nn_est = MLPClassifier(random_state=42, max_iter=500)
    best_nn, best_nn_score = tune_and_refit(
        name='Neural Network',
        estimator=nn_est,
        param_dist=nn_param_dist,
        X_tune=features_sub,
        y_tune=target_sub,
        X_full=features_train,
        y_full=target_train,
        n_iter=20,
        cv=5,
    )
    # Record the fitted model and its best cross-validated score
    trained_models['Neural Network'], tuned_scores['Neural Network'] = best_nn, best_nn_score



[tuning] Neural Network: Starting RandomizedSearchCV (n_iter=20, cv=5) ...
Fitting 5 folds for each of 20 candidates, totalling 100 fits


In [None]:
# Fit every base estimator that was not tuned, using its default configuration
# and the full training set, and register it next to the tuned models.
for name, est in base_estimators.items():
    # Skip names that already have a trained model, and empty (None) slots.
    if name not in trained_models and est is not None:
        print(f"[fit-default] Fitting default estimator for {name} on full training set ...")
        t0 = time.time()
        est.fit(features_train, target_train)
        t1 = time.time()
        print(f"[fit-default] {name} fitted in {(t1-t0)/60:.2f} min.")
        trained_models[name] = est
