# Coding Block 2 - Hyperparameter Optimization

### Load the packages

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time

### Read the dataset

In [5]:
# Load the imputed/cleaned dataset from a path relative to the notebook's
# working directory. The create_model functions in this notebook expect the
# frame to contain an 'Outcome' column, which they use as the target.
df = pd.read_csv('../data/df_imputed_clean.csv')

### Copy the code from your last successful classifiers (RF, XGBoost, ...)
Or use the function below for XGBoost/RF.

In [6]:
def create_model(data, model_type="xgboost"):
    """
    Fit an untuned baseline classifier and report its hold-out performance.

    Parameters:
    -----------
    data : DataFrame
        Dataset holding the feature columns plus the 'Outcome' target column.
    model_type : str
        Which estimator to fit: "random_forest" or "xgboost" (default: "xgboost").

    Returns:
    --------
    dict
        Keys "model" (the fitted estimator), "X", "y" (full features/target)
        and "X_train", "X_test", "y_train", "y_test" (the 80/20 split).

    Raises:
    -------
    ValueError
        If model_type is neither "random_forest" nor "xgboost".
    """
    # Everything except 'Outcome' is a feature; 'Outcome' is the label
    features = data.drop(columns='Outcome')
    target = data['Outcome']

    # Hold out 20% of the rows for evaluation (fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    # Build the requested estimator; libraries are imported lazily so that
    # xgboost is only required when it is actually selected
    if model_type == "random_forest":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(random_state=42)
    elif model_type == "xgboost":
        import xgboost as xgb
        model = xgb.XGBClassifier(random_state=42)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # Both estimators share the same fit interface
    model.fit(X_train, y_train)

    # Score the fitted model on the held-out rows
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, predictions))

    # Hand back the model together with the data it was built from
    results = {
        "model": model,
        "X": features,
        "y": target,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    return results

### Define the parameter grid for GridSearchCV or use RandomizedSearchCV

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

def create_model(data, model_type="xgboost", search_type="grid"):
    """
    Create, train, and optimize ML models using GridSearchCV or RandomizedSearchCV.

    Parameters:
    -----------
    data : DataFrame
        The dataset containing features and the 'Outcome' target column.
    model_type : str
        The type of model to create ("random_forest" or "xgboost", default: "xgboost").
    search_type : str
        Type of hyperparameter tuning to use ("grid" for GridSearchCV,
        "random" for RandomizedSearchCV with 20 sampled candidates).

    Returns:
    --------
    dict
        Dictionary containing the best estimator ("model"), the winning
        hyperparameters ("best_params"), X and y data, and train/test splits.

    Raises:
    -------
    ValueError
        If model_type or search_type is not one of the supported values.
    """
    # Validate arguments up front so a typo fails immediately instead of
    # after the data has been split.
    if model_type not in ("random_forest", "xgboost"):
        raise ValueError(f"Unsupported model type: {model_type}")
    if search_type not in ("grid", "random"):
        raise ValueError(f"Unsupported search type: {search_type}")

    # Separate features and target
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']

    # Split the data (stratified so both splits keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create model and select the matching parameter grid
    if model_type == "random_forest":
        model = RandomForestClassifier(random_state=42)
        param_grid = {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 10, 20],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False]
        }
    else:
        model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
        param_grid = {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
            'subsample': [0.8, 1.0],
            'colsample_bytree': [0.8, 1.0]
        }

    # Select optimization strategy
    if search_type == "grid":
        search = GridSearchCV(model, param_grid, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
    else:
        # Seed the sampler: without random_state the 20 sampled candidates
        # (and therefore the selected model) change from run to run.
        search = RandomizedSearchCV(
            model,
            param_grid,
            scoring="accuracy",
            cv=5,
            n_jobs=-1,
            verbose=1,
            n_iter=20,
            random_state=42,
        )

    # Train model with hyperparameter tuning (cross-validated on the training split only)
    search.fit(X_train, y_train)
    best_model = search.best_estimator_

    # Evaluate the best estimator on the held-out test split
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print("Best Parameters:", search.best_params_)
    print(classification_report(y_test, y_pred))

    # Return model and data
    return {
        "model": best_model,
        "best_params": search.best_params_,
        "X": X,
        "y": y,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }


In [8]:
# Exhaustive grid search over the random-forest parameter grid; the returned
# dict holds the best estimator, its parameters, and the train/test splits.
model_results = create_model(df, model_type="random_forest", search_type="grid")


Fitting 5 folds for each of 162 candidates, totalling 810 fits
Random_Forest Model Accuracy: 0.7603
Best Parameters: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
              precision    recall  f1-score   support

         0.0       0.82      0.81      0.82        97
         1.0       0.64      0.65      0.65        49

    accuracy                           0.76       146
   macro avg       0.73      0.73      0.73       146
weighted avg       0.76      0.76      0.76       146



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
from skopt import BayesSearchCV  # Bayesian Optimization
from skopt.space import Real, Integer, Categorical

def create_model(data, model_type="xgboost", search_type="bayesian"):
    """
    Create, train, and optimize ML models using Bayesian Optimization.

    Parameters:
    -----------
    data : DataFrame
        The dataset containing features and the 'Outcome' target column.
    model_type : str
        The type of model to create ("random_forest" or "xgboost", default: "xgboost").
    search_type : str
        Only "bayesian" (BayesSearchCV) is implemented here. Any other value
        is accepted for backward compatibility but triggers a UserWarning,
        and Bayesian optimization is run regardless.

    Returns:
    --------
    dict
        Dictionary containing the best estimator ("model"), the winning
        hyperparameters ("best_params"), X and y data, and train/test splits.

    Raises:
    -------
    ValueError
        If model_type is neither "random_forest" nor "xgboost".
    """
    import warnings

    # search_type used to be silently ignored; make the mismatch visible so a
    # caller asking for e.g. "grid" knows a Bayesian search is what actually runs.
    if search_type != "bayesian":
        warnings.warn(
            f"search_type={search_type!r} is not supported by this function; "
            "running Bayesian optimization (BayesSearchCV) instead.",
            UserWarning,
            stacklevel=2,
        )

    # Fail fast on an unknown model before doing any work
    if model_type not in ("random_forest", "xgboost"):
        raise ValueError(f"Unsupported model type: {model_type}")

    # Separate features and target
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']

    # Split the data (stratified so both splits keep the class ratio of y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create model and select the matching search space
    if model_type == "random_forest":
        model = RandomForestClassifier(random_state=42)
        param_space = {
            'n_estimators': Integer(50, 300),
            'max_depth': Integer(3, 30),
            'min_samples_split': Integer(2, 10),
            'min_samples_leaf': Integer(1, 4),
            'bootstrap': Categorical([True, False])
        }
    else:
        model = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
        param_space = {
            'n_estimators': Integer(50, 300),
            # Log-uniform prior: learning rates are sampled evenly across orders of magnitude
            'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
            'max_depth': Integer(3, 10),
            'subsample': Real(0.5, 1.0),
            'colsample_bytree': Real(0.5, 1.0)
        }

    # Bayesian Optimization with BayesSearchCV
    bayes_search = BayesSearchCV(
        model,
        param_space,
        n_iter=30,  # Number of iterations for Bayesian Optimization
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=42
    )

    # Train model with Bayesian Optimization (cross-validated on the training split only)
    bayes_search.fit(X_train, y_train)
    best_model = bayes_search.best_estimator_

    # Evaluate the best estimator on the held-out test split
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print("Best Parameters:", bayes_search.best_params_)
    print(classification_report(y_test, y_pred))

    # Return model and data
    return {
        "model": best_model,
        "best_params": bayes_search.best_params_,
        "X": X,
        "y": y,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test
    }


In [14]:
# The create_model defined in the preceding cell only implements Bayesian
# optimization (BayesSearchCV), so request it by its actual name rather than
# the stale "grid" label carried over from the earlier cell.
model_results = create_model(df, model_type="random_forest", search_type="bayesian")

Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Fitting 5 folds for each of 1 candidates, totalling 5 fi

### Perform GridSearchCV or RandomizedSearchCV and tune the hyperparameters of the model
The hyperparameter tuning may not finish in the available time; that is fine.

In [20]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb

def create_model(data, model_type="xgboost", n_iter=10):
    """
    Tune a classifier with a seeded RandomizedSearchCV and evaluate the winner.

    Parameters:
    -----------
    data : DataFrame
        Dataset holding the feature columns plus the 'Outcome' target column.
    model_type : str
        Which estimator to tune: "random_forest" or "xgboost" (default: "xgboost").
    n_iter : int
        How many parameter combinations RandomizedSearchCV samples (default: 10).

    Returns:
    --------
    dict
        Keys "model" (best estimator), "best_params" (its hyperparameters),
        "X", "y" (full features/target) and "X_train", "X_test", "y_train",
        "y_test" (the stratified 80/20 split).

    Raises:
    -------
    ValueError
        If model_type is neither "random_forest" nor "xgboost".
    """
    # Everything except 'Outcome' is a feature; 'Outcome' is the label
    features = data.drop(columns='Outcome')
    target = data['Outcome']

    # Stratified 80/20 hold-out split with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42, stratify=target
    )

    # Pick the estimator together with the candidate values to sample from
    if model_type == "random_forest":
        estimator = RandomForestClassifier(random_state=42)
        candidates = {
            'n_estimators': [50, 100, 200, 300, 500],
            'max_depth': [None, 10, 20, 30, 50],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'bootstrap': [True, False],
        }
    elif model_type == "xgboost":
        estimator = xgb.XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")
        candidates = {
            'n_estimators': [50, 100, 200, 300, 500],
            'learning_rate': [0.01, 0.05, 0.1, 0.2],
            'max_depth': [3, 5, 7, 10],
            'subsample': [0.6, 0.8, 1.0],
            'colsample_bytree': [0.6, 0.8, 1.0],
        }
    else:
        raise ValueError(f"Unsupported model type: {model_type}")

    # 5-fold cross-validated random search, scored on accuracy, using all cores
    search_settings = dict(
        param_distributions=candidates,
        n_iter=n_iter,
        scoring="accuracy",
        cv=5,
        n_jobs=-1,
        verbose=1,
        random_state=42,
    )
    search = RandomizedSearchCV(estimator, **search_settings)

    # Tune on the training split only; the test split stays untouched
    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    best_params = search.best_params_

    # Report hold-out performance of the selected configuration
    predictions = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print("Best Parameters:", best_params)
    print(classification_report(y_test, predictions))

    # Hand back the tuned model together with the data it was built from
    results = {
        "model": best_model,
        "best_params": best_params,
        "X": features,
        "y": target,
        "X_train": X_train,
        "X_test": X_test,
        "y_train": y_train,
        "y_test": y_test,
    }
    return results

# Train a random-forest model with RandomizedSearchCV (15 sampled candidates)
model_results = create_model(df, model_type="random_forest", n_iter=15)

Fitting 5 folds for each of 15 candidates, totalling 75 fits
Random_Forest Model Accuracy: 0.7671
Best Parameters: {'n_estimators': 200, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_depth': 50, 'bootstrap': False}
              precision    recall  f1-score   support

         0.0       0.83      0.81      0.82        97
         1.0       0.65      0.67      0.66        49

    accuracy                           0.77       146
   macro avg       0.74      0.74      0.74       146
weighted avg       0.77      0.77      0.77       146

