# Coding Block 4 - Automated model and hyperparameter tuning with AutoGluon

### Load the packages

In [11]:
#!pip install autogluon.tabular  > /dev/null 2>&1
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# AutoML
from autogluon.tabular import TabularPredictor


### Read the dataset 
You can also compare processed and non-processed data. The autogluon library will do some preprocessing as well.

In [12]:
data_cleaned = pd.read_csv("../data/df_imputed_clean.csv")
data = pd.read_csv("../data/df_imputed_clean.csv")
data = data.drop(columns=['Mahalanobis_Distance','Multivariate_Outlier','Outlier'])

In [13]:
# Define the target column
target_column = "Outcome"

# Initialize TabularPredictor with raw data
predictor_raw = TabularPredictor(label=target_column).fit(train_data=data)

# Evaluate on the test set
performance_raw = predictor_raw.evaluate(data)
print("Performance with Raw Data (AutoGluon Preprocessing):", performance_raw)

# Separate features and target
X = data.drop(columns=[target_column])
y = data[target_column]

# Define preprocessing for numerical and categorical features
numerical_features = X.select_dtypes(include=["int64", "float64"]).columns
categorical_features = X.select_dtypes(include=["object"]).columns

numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),  # Fill missing values with mean
    ("scaler", StandardScaler())  # Scale numerical features
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),  # Fill missing values with mode
    ("onehot", OneHotEncoder(handle_unknown="ignore"))  # Encode categorical features
])

# Combine preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Apply preprocessing
X_processed = preprocessor.fit_transform(X)

# Convert back to a DataFrame
processed_data = pd.DataFrame(X_processed, columns=preprocessor.get_feature_names_out())
processed_data[target_column] = y

# Display the first few rows of processed data
print(processed_data.head())

No path specified. Models will be saved in: "AutogluonModels\ag-20250320_140736"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20250320_140736"
AutoGluon Version:  1.1.0
Python Version:     3.9.21
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       3.52 GB / 15.63 GB (22.5%)
Disk Space Avail:   672.

Performance with Raw Data (AutoGluon Preprocessing): {'accuracy': 0.9382716049382716, 'balanced_accuracy': 0.9177894064878123, 'mcc': 0.8615672528814801, 'roc_auc': 0.9707779663010062, 'f1': 0.9036402569593148, 'precision': 0.9590909090909091, 'recall': 0.854251012145749}
   num__Unnamed: 0  num__Pregnancies  num__Glucose  num__BloodPressure  \
0        -1.729677          0.640733      0.928449           -0.024242   
1        -1.724925         -0.857325     -1.200742           -0.518338   
2        -1.720173          1.239957      2.111334           -0.683037   
3        -1.715421         -0.857325     -1.065555           -0.518338   
4        -1.710669          0.341122     -0.153045            0.140457   

   num__SkinThickness  num__Insulin  num__BMI  num__DiabetesPedigreeFunction  \
0            0.692857      0.367315  0.195747                       0.572044   
1            0.021786     -1.062642 -0.845724                      -0.352984   
2           -0.336118      0.310324 -1.336

In [21]:
from sklearn.model_selection import train_test_split

# Define the target column
target_column = "Outcome"

# Split the data into features (X) and target (y)
X = data.drop(columns=[target_column])
y = data[target_column]

# Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Combine features and target for AutoGluon
train_data = pd.concat([X_train, y_train], axis=1)
test_data = pd.concat([X_test, y_test], axis=1)

# Display the shapes of the training and testing sets
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

# Initialize TabularPredictor
predictor = TabularPredictor(label=target_column, eval_metric= "f1").fit(train_data=train_data)

# Display the model leaderboard
leaderboard = predictor.leaderboard()
print(leaderboard)
performance = predictor.evaluate(test_data)
print("Test Set Performance:", performance)

# Make predictions on the test set
y_pred = predictor.predict(test_data)
print("Predictions on Test Set:", y_pred)

No path specified. Models will be saved in: "AutogluonModels\ag-20250321_083611"
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets.
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='best_quality'   : Maximize accuracy. Default time_limit=3600.
	presets='high_quality'   : Strong accuracy with fast inference speed. Default time_limit=3600.
	presets='good_quality'   : Good accuracy with very fast inference speed. Default time_limit=3600.
	presets='medium_quality' : Fast training time, ideal for initial prototyping.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20250321_083611"
AutoGluon Version:  1.1.0
Python Version:     3.9.21
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          32
Memory Avail:       2.75 GB / 15.63 GB (17.6%)
Disk Space Avail:   672.

Training data shape: (583, 10)
Testing data shape: (146, 10)


	0.7	 = Validation score   (f1)
	0.91s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBM ...
	0.6588	 = Validation score   (f1)
	0.86s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: RandomForestGini ...
	0.6341	 = Validation score   (f1)
	0.33s	 = Training   runtime
	0.03s	 = Validation runtime
Fitting model: RandomForestEntr ...
	0.6173	 = Validation score   (f1)
	0.29s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	0.625	 = Validation score   (f1)
	0.81s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesGini ...
	0.6133	 = Validation score   (f1)
	0.35s	 = Training   runtime
	0.06s	 = Validation runtime
Fitting model: ExtraTreesEntr ...
	0.6053	 = Validation score   (f1)
	0.34s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
No improvement since epoch 6: early stopping
	0.75	 = Validation score   (f1)
	0.86s	 = Training   runtime
	0.01s	 = Valida

                  model  score_val eval_metric  pred_time_val  fit_time  \
0   WeightedEnsemble_L2   0.780488          f1       0.024220  3.825591   
1        NeuralNetTorch   0.765432          f1       0.015796  1.685366   
2       NeuralNetFastAI   0.750000          f1       0.007416  0.860039   
3            LightGBMXT   0.700000          f1       0.013973  0.908449   
4         LightGBMLarge   0.698795          f1       0.008423  1.946876   
5               XGBoost   0.666667          f1       0.006173  0.669173   
6              LightGBM   0.658824          f1       0.009454  0.860232   
7      RandomForestGini   0.634146          f1       0.032137  0.333944   
8              CatBoost   0.625000          f1       0.000000  0.806008   
9      RandomForestEntr   0.617284          f1       0.051039  0.286753   
10       ExtraTreesGini   0.613333          f1       0.058609  0.345797   
11       ExtraTreesEntr   0.605263          f1       0.053820  0.341422   
12       KNeighborsUnif  

### Use the Autogluon library
Use the library autogluon for automated hyperparametertuning and model benchmarking. The fit function of the TabularPredictor object allows for setting the option: <br>
<i>presets = {‘best_quality’, ‘high_quality’, ‘good_quality’, ‘medium_quality’, ‘experimental_quality’, ‘optimize_for_deployment’, ‘interpretable’, ‘ignore_text’}</i> <br>

medium_quality can limit the depths of hyperparameter optimization..

### Show the leaderboard
TabularPredictor objects from Autogluon provide a function "leaderboard"

### Show the feature importance table
The TabularPredictor class from Autogluon also provides a function "feature_importance"