# Gradient Boosting Classification with Scikit-Learn

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier


In [None]:
# Load the wine dataset as pandas objects, then separate features (X) from labels (y)
wine = datasets.load_wine(as_frame=True)
X, y = wine.data, wine.target

In [None]:
# Preview the first rows of the feature table
X.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [16]:
# Preview the first entries of the target labels
y.head()

0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64

In [None]:
# Hold out 20% of the samples as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Instantiate and train the model.
# random_state is pinned because gradient boosting draws random numbers while
# growing its trees (e.g. to permute candidate features at each split); without
# a seed, scores can differ between kernel restarts.
model = GradientBoostingClassifier(random_state=42)
model.fit(X_train, y_train)

In [None]:
# Evaluate the model with 3-fold cross-validation on the training set:
# each fold is held out once while the model trains on the other two,
# and the three fold scores are averaged into a single number.
score = cross_val_score(
    estimator=model,
    X=X_train,
    y=y_train,
    cv=3,
    n_jobs=1,  # run the folds on a single CPU core
).mean()
print(f"GB Model Score: {score:.4f}")

GB Model Score: 0.9014


In [9]:
# Candidate hyperparameter values for tuning (4 * 5 * 4 = 80 combinations)
param_grid = {
    # number of boosting stages, i.e. number of trees
    'n_estimators': [10, 50, 100, 200],
    # scales the contribution of each tree
    'learning_rate': [0.0001, 0.001, 0.01, 0.1, 1.0],
    # maximum depth of each tree
    'max_depth': [3, 5, 7, 9],
}

In [11]:
# Exhaustive hyperparameter search: GridSearchCV tries every combination in
# param_grid and scores each one with 3-fold cross-validation
model_2 = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,  # use all available CPU cores
)
model_2.fit(X_train, y_train)

In [12]:
# See best hyperparameters found
# (the combination from param_grid with the highest mean cross-validated score)
model_2.best_params_

{'learning_rate': 1.0, 'max_depth': 3, 'n_estimators': 10}

In [13]:
# See best score achieved with those hyperparameters
# (mean cross-validated score of the best combination; no scoring= was passed to
# GridSearchCV, so this is the classifier's default score, i.e. accuracy)
model_2.best_score_

0.9716312056737589

## Gradient Boosting Regression

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [41]:
# Load the diabetes regression dataset.
# NOTE: X and y are rebound here, replacing the wine features/labels used earlier.
data = datasets.load_diabetes()
X = data.data
y = data.target

In [43]:
# Split data: hold out 20% for testing, with a fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
# random_state is pinned so the fitted trees, and therefore the metrics computed
# from y_pred, are the same on every run; an unseeded gradient boosting model can
# give slightly different results after each kernel restart.
model_3 = GradientBoostingRegressor(random_state=42)
model_3.fit(X_train, y_train)

# Predict on the held-out test set for evaluation
y_pred = model_3.predict(X_test)

In [None]:
# Score the test-set predictions with three regression metrics:
# mean squared error, mean absolute error and R^2
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

mse, mae, r2

(2919.412519547588, 44.67625462394229, 0.4489752921167316)

In [50]:
# Try to improve the model with hyperparameter tuning
param_grid_reg = {
    'n_estimators': [100, 300, 500, 700],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 7],  # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],   # minimum samples required to be at a leaf node
    'subsample': [0.5, 0.7, 1.0],  # fraction of samples used for fitting each base learner
    'criterion': ['friedman_mse']  # function to measure the quality of a split
}

In [51]:
# Search every combination in param_grid_reg with 3-fold cross-validation.
# Candidates are ranked by negated MSE, so a higher (closer to zero) score is better.
grid = GridSearchCV(
    estimator=model_3,
    param_grid=param_grid_reg,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,  # use all available CPU cores
)
grid.fit(X_train, y_train)

In [52]:
# Predict on the test set with the tuned search object
# (y_pred is overwritten with the new predictions)
y_pred = grid.predict(X_test)

# Recompute the same three metrics for the tuned model
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

mse, mae, r2

(2748.3664315742362, 42.76377055709648, 0.48125939723343425)

In [53]:
# Show the winning hyperparameters and their mean cross-validated score.
# The score is negative because the search was scored with 'neg_mean_squared_error'.
print(grid.best_params_, grid.best_score_, sep="\n")

{'criterion': 'friedman_mse', 'learning_rate': 0.01, 'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300, 'subsample': 0.5}
-3217.328656859347
