# Gradient boosting model

One, two, you know what to do...

## Notebook set up

### Imports

In [None]:
# Standard library imports
from itertools import combinations, permutations
from pathlib import Path

# Third party imports
import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.feature_selection import SelectPercentile
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score, ConfusionMatrixDisplay
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, KBinsDiscretizer

### Configuration

In [None]:
# Sample sizes, use 1 for full dataset
# Fraction of the training data sampled for hyperparameter optimization
OPTIMIZATION_SAMPLE = 0.1
# Fraction of the training data sampled for cross-validated evaluation
EVALUATION_SAMPLE = 0.2
# Number of cross-validation folds used for every cross-validation call
CV_FOLDS = 3
# Number of parallel jobs passed to scikit-learn via n_jobs
N_JOBS = 8

### Data loading

In [None]:
# Remote location of the training CSV
train_df_path = 'https://gperdrizet.github.io/FSA_devops/assets/data/unit3/diabetes_prediction_train.csv'

# Read the complete training dataset
df = pd.read_csv(train_df_path)

# Hold out 20% of the rows as an internal test set and renumber both
# partitions from zero so later column-wise concatenations align by position
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.2, random_state=315)
)

# Preview the first few training rows, transposed so each row is a column here
train_df.head().T

In [None]:
# Summarize the training data: column names, dtypes and non-null counts.
# Used here as a quick check for missing values and unexpected types.
train_df.info()

In [None]:
# Count the distinct values in each column of the training data
train_df.nunique()

## 1. Data cleaning

In [None]:
# Remove the 'id' column from both the training and test partitions
train_df = train_df.drop(columns=['id'])
test_df = test_df.drop(columns=['id'])

print('ID column removed')
print(f'Remaining columns: {train_df.columns.tolist()}')

## 2. Preprocessing

### 2.1. Column definitions

In [None]:
# Name of the target column
label = 'diagnosed_diabetes'

# Continuous numerical features (these are the inputs to the discretization step)
numerical_features = [
    'age',
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'bmi',
    'waist_to_hip_ratio',
    'systolic_bp',
    'diastolic_bp',
    'heart_rate',
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
]

# Ordered categorical features to ordinal-encode
ordinal_features = [
    'education_level',
    'income_level',
    'alcohol_consumption_per_week',
]

# Category orderings for the ordinal features, lowest to highest; each is
# wrapped in an outer list so the three can be concatenated for the encoder
education_categories = [['No formal', 'Highschool', 'Graduate', 'Postgraduate']]
income_categories = [['Low', 'Lower-Middle', 'Middle', 'Upper-Middle', 'High']]
alcohol_categories = [list(range(1, 10))]

# Unordered categorical features to one-hot encode
nominal_features = [
    'gender',
    'ethnicity',
    'smoking_status',
    'employment_status',
    'family_history_diabetes',
    'hypertension_history',
    'cardiovascular_history',
]

### 2.2. Ordinal encoding

In [None]:
# Ordinal encoder with explicit category orderings; values outside the
# listed categories are encoded as -1
ordinal_encoder = OrdinalEncoder(
    categories=education_categories + income_categories + alcohol_categories,
    handle_unknown='use_encoded_value',
    unknown_value=-1
)

# Learn the encoding from the training data only
ordinal_encoder.fit(train_df[ordinal_features])

# Replace the raw ordinal columns with their encoded values in both
# partitions (drop-then-assign moves the columns to the end of each frame)
for frame in (train_df, test_df):
    encoded_values = ordinal_encoder.transform(frame[ordinal_features])
    frame.drop(columns=ordinal_features, inplace=True)
    frame[ordinal_features] = encoded_values

### 2.3. Nominal encoding

In [None]:
# One-hot encoder: the first level of each feature is dropped, the output is
# a dense array, and categories unseen during fitting are ignored
onehot_encoder = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')

# Learn the categories from the training data only
onehot_encoder.fit(train_df[nominal_features])
onehot_columns = onehot_encoder.get_feature_names_out(nominal_features)

# Training data: swap the raw nominal columns for their encoded versions
train_onehot_df = pd.DataFrame(
    onehot_encoder.transform(train_df[nominal_features]),
    columns=onehot_columns
)
train_df = pd.concat([train_df.drop(columns=nominal_features), train_onehot_df], axis=1)

# Test data: same replacement, reusing the fitted encoder
test_onehot_df = pd.DataFrame(
    onehot_encoder.transform(test_df[nominal_features]),
    columns=onehot_columns
)
test_df = pd.concat([test_df.drop(columns=nominal_features), test_onehot_df], axis=1)

## 3. Feature engineering

In [None]:
# Snapshot of the predictor columns at this point (every column except the
# label); the synthetic features created later are built from this list
features = [column for column in train_df.columns if column != label]

### 3.1. Feature discretization

In [None]:
# Column names for the discretized copies of the numerical features
binned_columns = [f'binned_{feature}' for feature in numerical_features]

# KBinsDiscretizer with five ordinal-encoded bins per feature; the bin edges
# are learned from the training data only
binning_transformer = KBinsDiscretizer(n_bins=5, encode='ordinal')
binning_transformer.fit(train_df[numerical_features])

# Append the binned features to the training DataFrame
train_binned_df = pd.DataFrame(
    binning_transformer.transform(train_df[numerical_features]),
    columns=binned_columns
)
train_df = pd.concat([train_df, train_binned_df], axis=1)

# Append the binned features to the test DataFrame
test_binned_df = pd.DataFrame(
    binning_transformer.transform(test_df[numerical_features]),
    columns=binned_columns
)
test_df = pd.concat([test_df, test_binned_df], axis=1)

### 3.2. Clustering

#### 3.2.1. Heart health clusters

In [None]:
# Blood pressure and heart rate measurements plus the medical history flags.
# The raw 'hypertension_history' and 'cardiovascular_history' columns were
# replaced by one-hot encoded columns during nominal encoding, so indexing
# them by their original names raises a KeyError. Select whichever columns
# derived from them are actually present instead.
history_columns = [
    column for column in train_df.columns
    if column.startswith(('hypertension_history', 'cardiovascular_history'))
]
heart_features = ['systolic_bp', 'diastolic_bp', 'heart_rate'] + history_columns

# Fit KMeans clustering model on the training data only. KMeans is
# unsupervised and ignores any target, so the label is not passed.
kmeans_model = KMeans(n_clusters=4, random_state=315)
kmeans_model.fit(train_df[heart_features])

# Add cluster membership as a new feature
train_df['heart_cluster'] = kmeans_model.predict(train_df[heart_features])
test_df['heart_cluster'] = kmeans_model.predict(test_df[heart_features])

#### 3.2.2. Cholesterol clusters

In [None]:
# Lipid panel measurements used to group samples
cholesterol_features = [
    'cholesterol_total',
    'hdl_cholesterol',
    'ldl_cholesterol',
    'triglycerides',
]

# Fit a four-cluster KMeans model on the training data
kmeans_model = KMeans(n_clusters=4, random_state=315).fit(
    train_df[cholesterol_features],
    train_df[label]
)

# Record each sample's cluster assignment as a new feature in both partitions
for frame in (train_df, test_df):
    frame['cholesterol_cluster'] = kmeans_model.predict(frame[cholesterol_features])

#### 3.2.3. Lifestyle clusters

In [None]:
# Activity, diet, sleep, screen time and alcohol features used to group samples
lifestyle_features = [
    'physical_activity_minutes_per_week',
    'diet_score',
    'sleep_hours_per_day',
    'screen_time_hours_per_day',
    'alcohol_consumption_per_week',
]

# Fit a four-cluster KMeans model on the training data
kmeans_model = KMeans(n_clusters=4, random_state=315).fit(
    train_df[lifestyle_features],
    train_df[label]
)

# Record each sample's cluster assignment as a new feature in both partitions
for frame in (train_df, test_df):
    frame['lifestyle_cluster'] = kmeans_model.predict(frame[lifestyle_features])

### 3.3. Other synthetic features

In [None]:
# Accumulators for the synthetic columns, keyed by feature name; they are
# filled by the cells below and added to the DataFrames in one step at the end
new_train_features, new_test_features = {}, {}

#### 3.3.1. Difference features

In [None]:
# One 'a-b' difference feature for every unordered pair of base features
for left, right in combinations(features, 2):
    column = f'{left}-{right}'
    new_train_features[column] = train_df[left].sub(train_df[right])
    new_test_features[column] = test_df[left].sub(test_df[right])

#### 3.3.2. Sum features

In [None]:
# One 'a+b' sum feature for every unordered pair of base features
for left, right in combinations(features, 2):
    column = f'{left}+{right}'
    new_train_features[column] = train_df[left].add(train_df[right])
    new_test_features[column] = test_df[left].add(test_df[right])

#### 3.3.3. Ratio features

In [None]:
# Shifted denominators, computed once per feature rather than once per pair.
# Each feature is shifted so that its *training* minimum maps to 1, which keeps
# every training denominator >= 1 even when the raw minimum is zero or
# negative (adding the minimum, as before, does not guarantee that). The same
# training-derived shift is applied to the test data so both partitions are
# transformed identically; test values below the training minimum are clipped
# so the denominator never reaches zero.
train_denominators = {}
test_denominators = {}

for feature in features:
    offset = 1 - train_df[feature].min()
    train_denominators[feature] = train_df[feature] + offset
    test_denominators[feature] = (test_df[feature] + offset).clip(lower=1)

# One 'a/b' ratio feature for every ordered pair of base features
for feature_a, feature_b in permutations(features, 2):
    feature_name = f'{feature_a}/{feature_b}'
    new_train_features[feature_name] = train_df[feature_a] / train_denominators[feature_b]
    new_test_features[feature_name] = test_df[feature_a] / test_denominators[feature_b]

#### 3.3.4. Reciprocal features

In [None]:
# One '1/x' feature per base feature. The feature is shifted so that its
# *training* minimum maps to 1 (subtracting, not adding, the minimum), which
# keeps the denominator >= 1. The same shift is reused for the test data so
# both partitions are transformed identically; test values below the training
# minimum are clipped to avoid division by zero.
for feature in features:
    feature_name = f'1/{feature}'
    offset = 1 - train_df[feature].min()
    new_train_features[feature_name] = 1 / (train_df[feature] + offset)
    new_test_features[feature_name] = 1 / (test_df[feature] + offset).clip(lower=1)

#### 3.3.5. Log features

In [None]:
# One log feature per base feature. The feature is shifted so that its
# *training* minimum maps to 1 (subtracting, not adding, the minimum), which
# keeps the logarithm's argument >= 1. The same shift is reused for the test
# data so both partitions are transformed identically; test values below the
# training minimum are clipped so the logarithm stays defined.
for feature in features:
    feature_name = f'log{feature}'
    offset = 1 - train_df[feature].min()
    new_train_features[feature_name] = np.log(train_df[feature] + offset)
    new_test_features[feature_name] = np.log((test_df[feature] + offset).clip(lower=1))

#### 3.3.6. Square root features

In [None]:
# One square-root feature per base feature. The feature is shifted so that its
# *training* minimum maps to 1 (subtracting, not adding, the minimum), which
# keeps the radicand >= 1. The same shift is reused for the test data so both
# partitions are transformed identically; test values below the training
# minimum are clipped so the square root stays defined.
for feature in features:
    feature_name = f'root{feature}'
    offset = 1 - train_df[feature].min()
    new_train_features[feature_name] = np.sqrt(train_df[feature] + offset)
    new_test_features[feature_name] = np.sqrt((test_df[feature] + offset).clip(lower=1))

#### 3.3.7. Square features

In [None]:
# One squared feature per base feature. These are stored under their own
# 'square' prefix: reusing the 'root' prefix would silently overwrite every
# square-root feature in the dictionaries. The feature is shifted so that its
# *training* minimum maps to 1, and the same training-derived shift is applied
# to the test data (clipped at 1) so both partitions are transformed
# identically.
for feature in features:
    feature_name = f'square{feature}'
    offset = 1 - train_df[feature].min()
    new_train_features[feature_name] = (train_df[feature] + offset) ** 2
    new_test_features[feature_name] = (test_df[feature] + offset).clip(lower=1) ** 2
    

#### 3.3.8. Add new features

In [None]:
# Build one DataFrame per partition from the collected features, then append
# all new columns with a single concat per partition
train_extras_df = pd.DataFrame(new_train_features)
test_extras_df = pd.DataFrame(new_test_features)

train_df = pd.concat([train_df, train_extras_df], axis=1)
test_df = pd.concat([test_df, test_extras_df], axis=1)

# Summarize the expanded training data
train_df.info()

## 4. Model training and optimization

In [None]:
# Random subsets of the training data: a smaller one for the hyperparameter
# search and a larger one for cross-validated evaluation.
# NOTE(review): both draws use random_state=315 on the same DataFrame, so the
# two samples probably overlap - confirm this is intended.
train_optimization_df = (
    train_df
    .sample(frac=OPTIMIZATION_SAMPLE, random_state=315)
    .reset_index(drop=True)
)
train_eval_df = (
    train_df
    .sample(frac=EVALUATION_SAMPLE, random_state=315)
    .reset_index(drop=True)
)

### 4.1. Baseline model performance

In [None]:
# Create baseline model with default hyperparameters. The random state is
# fixed, as it is for the other models in this notebook, so the baseline
# scores are reproducible.
baseline_model = Pipeline([
    ('scaler', MinMaxScaler()),
    ('feature_selector', SelectPercentile()),
    ('classifier', HistGradientBoostingClassifier(random_state=315))
])

# Estimate AUC with cross-validation on the evaluation sample.
# - Features and labels must come from the same rows: sub-sampling only the
#   features leaves X and y with different lengths and cross_val_score raises.
# - scoring='roc_auc' evaluates the classifier's continuous scores, the same
#   scorer the randomized search uses; make_scorer(roc_auc_score) would score
#   hard class predictions instead.
baseline_scores = cross_val_score(
    baseline_model,
    train_eval_df.drop(columns=['diagnosed_diabetes']),
    train_eval_df['diagnosed_diabetes'],
    cv=CV_FOLDS,
    scoring='roc_auc',
    n_jobs=N_JOBS
)

print(f'Baseline model mean cross-validation score (ROC-AUC): {np.mean(baseline_scores):.4f}')

### 4.2. Hyperparameter optimization with RandomizedSearchCV

In [None]:
%%time

from scipy.stats import uniform, randint, loguniform

# Define the pipeline
pipeline = Pipeline([
    ('scaler', MinMaxScaler()),
    ('feature_selector', SelectPercentile()),
    ('classifier', HistGradientBoostingClassifier(random_state=315))
])

# Best cross-validation score (ROC-AUC): 0.7083

# Best parameters:
#   classifier__early_stopping: True
#   classifier__l2_regularization: 62.63475886951615
#   classifier__learning_rate: 0.03649801250450118
#   classifier__max_bins: 64
#   classifier__max_depth: None
#   classifier__max_iter: 413
#   classifier__min_samples_leaf: 43
#   classifier__n_iter_no_change: 30
#   classifier__validation_fraction: 0.1
#   feature_selector__percentile: 25

# Took 171 minutes on 100,000 samples with n_jobs=1

# Define parameter distributions for randomized search
param_distributions = {
    'feature_selector__percentile': uniform(0.25, 0.95),
    'classifier__learning_rate': loguniform(0.001, 0.3),
    'classifier__max_iter': randint(100, 501),
    'classifier__max_depth': [10, 15, 20, 25, None],
    'classifier__min_samples_leaf': randint(5, 51),
    'classifier__l2_regularization': loguniform(1e-4, 100.0),
    'classifier__max_bins': [64, 128, 255],
    'classifier__early_stopping': [True],
    'classifier__validation_fraction': [0.1],
    'classifier__n_iter_no_change': [30]
}

# Create RandomizedSearchCV
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_distributions,
    n_iter=200,
    cv=CV_FOLDS,
    scoring='roc_auc',
    n_jobs=N_JOBS,
    random_state=315,
    verbose=0
)

# Fit the randomized search
random_search.fit(
    train_optimization_df.drop(columns=['diagnosed_diabetes']),
    train_optimization_df['diagnosed_diabetes']
)

# Get the best model from random search
optimized_model = random_search.best_estimator_

print(f'\nBest mean cross-validation score (ROC-AUC): {random_search.best_score_:.4f}')
print(f'\nBest parameters:')

for param, value in random_search.best_params_.items():
    print(f'  {param}: {value}')

print()

### 4.3. Evaluate optimized model

In [None]:
# Evaluate the optimized pipeline with cross-validation on the evaluation
# sample. scoring='roc_auc' evaluates the classifier's continuous scores and
# is the same scorer the randomized search optimized; make_scorer(roc_auc_score)
# would score hard class predictions and give numbers that are not comparable
# to the search's best score.
scores = cross_val_score(
    optimized_model,
    train_eval_df.drop(columns=['diagnosed_diabetes']),
    train_eval_df['diagnosed_diabetes'],
    cv=CV_FOLDS,
    scoring='roc_auc',
    n_jobs=N_JOBS
)

print(f'Optimized model mean cross-validation score (ROC-AUC): {np.mean(scores):.4f}')

### 4.4. Compare baseline vs optimized performance

In [None]:
# Per-fold scores for each model and the matching tick labels
comparison_data = [baseline_scores, scores]
labels = ['Baseline\n(Unoptimized)', 'Optimized']

# Draw the two score distributions as side-by-side boxplots
plt.title('Cross-Validation Performance')
plt.boxplot(
    comparison_data,
    tick_labels=labels,
    patch_artist=True,
    widths=0.6
)
plt.ylabel('ROC-AUC Score')
plt.show()

# Report mean and standard deviation of the fold scores for each model
for model_name, fold_scores in (('Baseline ', baseline_scores), ('Optimized', scores)):
    mean_score = np.mean(fold_scores)
    score_std = np.std(fold_scores)
    print(f'{model_name} - mean score (ROC-AUC): {mean_score:.4f}, Std: {score_std:.4f}')

### 4.5. Confusion matrices

In [None]:
# Features and true labels of the evaluation sample. The predictions below are
# made for exactly these rows, so the confusion matrices must be computed
# against these labels; using the labels of the full training set gives
# arrays of different lengths and raises an error.
eval_features = train_eval_df.drop(columns=['diagnosed_diabetes'])
eval_labels = train_eval_df['diagnosed_diabetes']

# Get out-of-fold predictions for both models using cross-validation
baseline_predictions = cross_val_predict(
    baseline_model,
    eval_features,
    eval_labels,
    cv=CV_FOLDS,
    n_jobs=N_JOBS
)

optimized_predictions = cross_val_predict(
    optimized_model,
    eval_features,
    eval_labels,
    cv=CV_FOLDS,
    n_jobs=N_JOBS
)

# Plot confusion matrices side by side, normalized over the true labels
fig, axes = plt.subplots(1, 2, figsize=(8, 4))

panels = (
    ('Unoptimized model', baseline_predictions),
    ('Optimized model', optimized_predictions),
)

for ax, (title, predictions) in zip(axes, panels):
    ax.set_title(title)

    ConfusionMatrixDisplay.from_predictions(
        eval_labels,
        predictions,
        normalize='true',
        ax=ax,
        colorbar=False
    )

plt.tight_layout()
plt.show()

## 5. Save Model

In [None]:
# Destination for the serialized model: ../models/gradient_boosting.joblib
model_name = 'gradient_boosting'
model_dir = Path('../models')
model_path = model_dir / f'{model_name}.joblib'

# Create the output directory (and any missing parents) if needed
model_dir.mkdir(parents=True, exist_ok=True)

# Serialize the optimized pipeline to disk
joblib.dump(optimized_model, model_path)

# Report where the model was written and its size in mebibytes
file_size_mb = model_path.stat().st_size / (1024**2)
print('Model saved to:', model_path)
print(f'File size: {file_size_mb:.2f} MB')