# Imports

In [1]:
# ============================================================================
# 03_modeling.ipynb
# Modeling and Evaluation for Diabetes Binary Classification
# Extended with Hyperparameter Tuning, Multiple Models, ANN, and Cost-Benefit Analysis
# Integrated with preprocessing module
# ============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.utils.class_weight import compute_class_weight
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.neural_network import MLPClassifier
import sys
import os
import warnings
warnings.filterwarnings('ignore')

# Make the `dpp` package importable by putting the project root on sys.path.
# The root is taken as the grandparent of the current working directory, so this
# only resolves correctly when the kernel is started two levels below the root
# (NOTE(review): confirm the notebook's location in the repository layout).
project_root = os.path.dirname(os.path.dirname(os.getcwd()))
# Only insert when the root is not already first on the path, so re-running this
# cell does not stack duplicate entries at the front of sys.path.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

# Import preprocessing module; PREPROCESSING_AVAILABLE records whether it
# succeeded so that later cells can check it before calling get_preprocessed_data.
try:
    from dpp.preprocessing import get_preprocessed_data
    PREPROCESSING_AVAILABLE = True
except ImportError as e:
    PREPROCESSING_AVAILABLE = False
    print(f"Warning: Could not import preprocessing module. Error: {e}")
    # State what was searched and be honest about the consequence: this notebook
    # defines no fallback dataset, so data-dependent cells cannot run.
    print(f"Project root added to sys.path: {project_root}")
    print("No fallback data is available; cells that need the preprocessed data will not run.")

# Load Preprocessed Data

In [2]:
# ============================================================================
# 1. LOAD PREPROCESSED DATA
# ============================================================================
# Unpacks the train/test split from the dict returned by get_preprocessed_data()
# into features_train / target_train / features_test / target_test.
# The model-training cell reads these names unconditionally and this notebook
# defines no placeholder dataset, so a failed load stops here with an explicit
# error instead of surfacing later as a NameError.
print("="*80)
print("LOAD PREPROCESSED DATA")
print("="*80)

if not PREPROCESSING_AVAILABLE:
    print("Preprocessing module not available.")
    raise RuntimeError(
        "Preprocessing module not available: the train/test data cannot be loaded. "
        "Fix the dpp.preprocessing import in the imports cell and re-run."
    )

# Load preprocessed data from preprocessing module
try:
    data = get_preprocessed_data()
    features_train = data['features_train']
    target_train = data['target_train']
    features_test = data['features_test']
    target_test = data['target_test']

    # Cheap invariants: every feature row needs exactly one target value.
    assert len(features_train) == len(target_train), "train features/target length mismatch"
    assert len(features_test) == len(target_test), "test features/target length mismatch"

    print(f"Training set size: {len(features_train):,}")
    print(f"Test set size: {len(features_test):,}")
    print(f"Number of features: {features_train.shape[1]}")
except Exception as e:
    # Keep the availability flag in sync with reality, report the cause, then
    # re-raise the original exception so the run stops at the real failure.
    PREPROCESSING_AVAILABLE = False
    print(f"Error loading preprocessed data: {e}")
    raise

LOAD PREPROCESSED DATA
[preprocessing] Loading data from: C:\Users\Eyyub\Desktop\StackFuel\PortfolioProjekt\DPP-Stackfuel-Data-Science-Projekt\data\raw\diabetes-health-indicators-dataset\diabetes_binary_health_indicators_BRFSS2015.csv
Training set size: 311,002
Test set size: 45,895
Number of features: 24


# Model Initialization and Training

In [4]:
# ============================================================================
# 2. MODEL INITIALIZATION AND TRAINING
# ============================================================================
# Creates one estimator per model family with base parameters, then fits each
# of them on the training split.
print(f"\n{'=' * 80}")
print("MODEL INITIALIZATION AND TRAINING")
print("=" * 80)

# Estimators keyed by display name; insertion order is the training order.
models = dict([
    (
        'Random Forest',
        RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=42),
    ),
    (
        'XGBoost',
        xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False, random_state=42),
    ),
    (
        'Logistic Regression',
        LogisticRegression(max_iter=1000, random_state=42),
    ),
    (
        # LinearSVC wrapped in CalibratedClassifierCV (3-fold), chosen for
        # faster SVM training.
        'SVM',
        CalibratedClassifierCV(
            LinearSVC(dual=False, max_iter=10000, random_state=42),
            cv=3,
        ),
    ),
    ('K-Nearest Neighbors', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(max_iter=500, random_state=42)),
])

# Fit every estimator in place. fit() hands back the estimator itself, so
# trained_models ends up holding the same objects as `models`.
trained_models = {}
for name, model in models.items():
    print(f"Training {name}...")
    trained_models[name] = model.fit(features_train, target_train)

print("All models trained successfully.")


MODEL INITIALIZATION AND TRAINING
Training Random Forest...
Training XGBoost...
Training Logistic Regression...
Training SVM...
Training K-Nearest Neighbors...
Training Decision Tree...
Training Naive Bayes...
Training Neural Network...
All models trained successfully.
