# Histogram Gradient Boosting (GBMs) via Grid Search CV # 

In [2]:
import numpy as np
import pandas
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt

from sklearn.experimental import enable_hist_gradient_boosting  # explicitly require this experimental feature
from sklearn.ensemble import HistGradientBoostingClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

import time

Run Histogram-based GBMs on TWO datasets: One for the 28x28 images, one for the 64x64 images.

## 28 x 28 images ##

In [21]:
data = pandas.read_csv('data28.csv')
y = data['label'].values
X = data[data.columns[1:]].values

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 1)

In [22]:
grid = {'max_iter' : 250 * np.arange(1, 13)} # 250, 500, ..., 3000
gbm=HistGradientBoostingClassifier(max_depth=4, learning_rate=0.01) # choose learning rate 0.01, depth 4
gbm_cv_=GridSearchCV(gbm,grid,cv=5,n_jobs=-1,verbose=4) # run all jobs in parallel, and print the CV accuracy per fold on the terminal (verbose = 4)
t0 = time.time()
gbm_cv_.fit(X_train,y_train)
t1 = time.time()
print('Grid search for 28 x 28 images took', t1 - t0, 'seconds')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Grid search for 28 x 28 images took 6105.600783824921 seconds


In [23]:
print("tuned hyperparameters: (best parameters) ",gbm_cv_.best_params_)
print("CV accuracy :",gbm_cv_.best_score_)

tuned hyperparameters: (best parameters)  {'max_iter': 2500}
CV accuracy : 0.9393333333333332


In [24]:
max_iter = gbm_cv_.best_params_['max_iter']

clf = HistGradientBoostingClassifier(max_depth=4, max_iter=max_iter, learning_rate=0.01)
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time()
print('Training for 28 x 28 images took', t1 - t0, 'seconds')

Training for 28 x 28 images took 223.2854940891266 seconds


In [25]:
# training accuracy
train_acc = accuracy_score(y_train, clf.predict(X_train)) 
train_acc

1.0

In [26]:
# test accuracy
test_acc = accuracy_score(y_test, clf.predict(X_test)) 
test_acc

0.9617142857142857

In [27]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
# make sure to run the above cells first before running this!!!!
with open('gbm_28_gridcv.txt', 'w') as file:
    file.write(str(gbm_cv_.cv_results_))

## 64 x 64 images ##

In [11]:
data = pandas.read_csv('data64.csv')
y = data['label'].values
X = data[data.columns[1:]].values

# 70-30 train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.7, random_state = 1)

In [12]:
grid = {'max_iter' : 250 * np.arange(1, 13)} # 250, 500, ..., 3000
gbm=HistGradientBoostingClassifier(max_depth=4, learning_rate=0.01) # choose learning rate 0.01, depth 4
gbm_cv=GridSearchCV(gbm,grid,cv=5,n_jobs=-1,verbose=4) # run all jobs in parallel (n_jobs=-1), and print the CV accuracy per fold on the terminal (verbose = 4)
t0 = time.time()
gbm_cv.fit(X_train,y_train)
t1 = time.time()
print('Grid search for 64 x 64 images took', t1 - t0, 'seconds')

Fitting 5 folds for each of 12 candidates, totalling 60 fits
Grid search for 64 x 64 images took 24686.5692858696 seconds


In [20]:
print("tuned hyperparameters: (best parameters) ",gbm_cv.best_params_)
print("CV accuracy :",gbm_cv.best_score_)

tuned hyperparameters: (best parameters)  {'max_iter': 2250}
CV accuracy : 0.9146666666666666


In [14]:
max_iter = gbm_cv.best_params_['max_iter']

clf = HistGradientBoostingClassifier(max_depth=4, max_iter=max_iter, learning_rate=0.01)
t0 = time.time()
clf.fit(X_train, y_train)
t1 = time.time()
print('Training for 64 x 64 images took', t1 - t0, 'seconds')

Training for 64 x 64 images took 994.8379521369934 seconds


In [15]:
# training accuracy
train_acc = accuracy_score(y_train, clf.predict(X_train)) 
train_acc

1.0

In [16]:
# test accuracy
test_acc = accuracy_score(y_test, clf.predict(X_test)) 
test_acc

0.9382857142857143

In [17]:
# write cv results (such as mean fit times for each hyperparam configuration) to file
with open('gbm_64_gridcv.txt', 'w') as file:
    file.write(str(gbm_cv.cv_results_))