# Coding Block 2 - Hyperparameter Optimization

### Load the packages

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

### Read the dataset

In [10]:
data = pd.read_csv("../data/df_imputed_clean.csv")
data = data.drop(columns=['Mahalanobis_Distance','Multivariate_Outlier','Outlier'])

### Copy the code from your last successful classifiers (RF, XGBoost, ...)
Or use function below for XGBoost/RF

In [11]:
def create_model(data, model_type="xgboost"):
    """
    Create and train ML models on the given dataset
    
    Parameters:
    -----------
    data : DataFrame
        The dataset containing features and target variable
    model_type : str
        The type of model to create (default: "xgboost")
        
    Returns:
    --------
    dict
        Dictionary containing the trained model, X and y data, and train/test splits
    """
    # Separate features and target
    X = data.drop('Outcome', axis=1)
    y = data['Outcome']
    
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Create and train model based on type
    if model_type == "random_forest":
        from sklearn.ensemble import RandomForestClassifier
        model = RandomForestClassifier(random_state=42)
        model.fit(X_train, y_train)
    elif model_type == "xgboost":
        import xgboost as xgb
        model = xgb.XGBClassifier(random_state=42)
        model.fit(X_train, y_train)
    else:
        raise ValueError(f"Unsupported model type: {model_type}")
    
    # Evaluate the model
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model_type.title()} Model Accuracy: {accuracy:.4f}")
    print(classification_report(y_test, y_pred))
    
    # Return model and data
    return {
        "model": model
    }

### Define the parameter grid for GridSearchCV or use RandomizedSearchCV

### Perform GridSearchCV or RandomizedSearchCV and tune the hyperparameters of the model
Maybe the hyperparameter tuning won't finish in time though. No problem.

In [12]:
# Split the data into features (X) and target (y)
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the parameter grid for XGBoost
param_grid_xgb = {
    "n_estimators": [50, 100, 200],  # Number of boosting rounds
    "max_depth": [3, 6, 9],  # Maximum depth of trees
    "learning_rate": [0.01, 0.1, 0.2],  # Learning rate
    "subsample": [0.8, 1.0],  # Fraction of samples used for training
    "colsample_bytree": [0.8, 1.0],  # Fraction of features used for training
    "gamma": [0, 0.1, 0.2]  # Minimum loss reduction for a split
}

# Initialize the XGBoost model
xgb_classifier = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric="logloss")

# Perform GridSearchCV for XGBoost
grid_search_xgb = GridSearchCV(estimator=xgb_classifier, param_grid=param_grid_xgb, scoring="accuracy", cv=5, n_jobs=-1, verbose=1)
grid_search_xgb.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best Cross-Validation Accuracy for XGBoost:", grid_search_xgb.best_score_)

# Evaluate the best model on the test set
best_xgb_model = grid_search_xgb.best_estimator_
y_pred_xgb = best_xgb_model.predict(X_test)

# Print classification report and accuracy
print("\nTest Set Evaluation for XGBoost:")
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=["No Diabetes", "Diabetes"]))

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
Best Parameters for XGBoost: {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 50, 'subsample': 1.0}
Best Cross-Validation Accuracy for XGBoost: 0.7803566165635132

Test Set Evaluation for XGBoost:
Accuracy: 0.7602739726027398

Classification Report:
              precision    recall  f1-score   support

 No Diabetes       0.77      0.91      0.83        97
    Diabetes       0.72      0.47      0.57        49

    accuracy                           0.76       146
   macro avg       0.75      0.69      0.70       146
weighted avg       0.75      0.76      0.74       146



Parameters: { "use_label_encoder" } are not used.



In [13]:

# Load dataset
X = data.drop("Outcome", axis=1)
y = data["Outcome"]

# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize XGBoost Classifier
xgb_classifier = xgb.XGBClassifier(use_label_encoder=False, eval_metric="logloss")

# Define the parameter grid
param_grid = {
    "n_estimators": [50, 100, 200],   # Number of trees
    "max_depth": [3, 5, 7],           # Depth of trees
    "learning_rate": [0.01, 0.1, 0.2], # Learning rate
    "subsample": [0.8, 1.0],          # Fraction of samples used per tree
    "colsample_bytree": [0.8, 1.0]    # Fraction of features used per tree
}

# Setup GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_classifier, 
    param_grid=param_grid, 
    scoring="accuracy", 
    cv=5, 
    n_jobs=-1, 
    verbose=1
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print best parameters and best accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy:", grid_search.best_score_)

# Use the best model to make predictions
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate on test data
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)


NameError: name 'xgb' is not defined

In [None]:
import sklearn

print("version: ", sklearn.__version__)

version:  1.2.2
