<a href="https://colab.research.google.com/github/gav-ip/cse176-fall-proj/blob/part2-LeNet5/xgboost_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**XG Boost**
Change runtime type to T4 GPU

In [1]:
from google.colab import files
uploaded = files.upload()

Saving model.pkl to model.pkl


In [5]:
from google.colab import files
uploaded = files.upload()

Saving model2.pkl to model2.pkl


In [6]:
from google.colab import files
uploaded = files.upload()

Saving MNIST.mat to MNIST.mat


In [7]:
from google.colab import files
uploaded = files.upload()

Saving MNIST-LeNet5.mat to MNIST-LeNet5.mat


In [2]:
# run this if on local local machine
# skip if on google colab
%pip install -r ../requirements.txt

[31mERROR: Could not open requirements file: [Errno 2] No such file or directory: '../requirements.txt'[0m[31m
[0m

In [3]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.io import loadmat
import seaborn as sns
import numpy as np
import pandas as pd
import xgboost as xgb

#**LOAD MNIST**

In [4]:
mnist = loadmat('./MNIST.mat')

# Extract data
X_train_full = mnist['train_fea']
y_train_full = mnist['train_gnd'].ravel()
X_test = mnist['test_fea']
y_test = mnist['test_gnd'].ravel()

# Split training into train and validation (55k train, 5k val)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=5000, random_state=42)

# shift labels to 0-9
y_train = y_train - 1
y_val = y_val - 1
y_test = y_test - 1

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")


FileNotFoundError: [Errno 2] No such file or directory: './MNIST.mat'

#**MODEL FITTING AND EARLY STOPPING**
*- ran on Google Colab T4 GPU*

In [None]:
# Early stopping callback to prevent overfitting
early_stop = xgb.callback.EarlyStopping(
    rounds=20,
    metric_name='mlogloss',
    data_name='validation_0',
    save_best=True,
    min_delta=1e-3    # (default 1e-2) Improvement threshold for continued training, decreasing to prevent premature stopping or increase to mitigate overfitting
)
# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(
    tree_method="hist",
    n_estimators=2000,
    eval_metric="mlogloss",
    device = 'cuda',
    learning_rate = 0.2,
    max_depth = 6,   # Balanced value of ranges of max_depth of the decision tree to prevent overfitting
    callbacks=[early_stop]
)

## Fitting model
(may take a couple minute to run)

In [None]:
# Fitting model using train and validation sets
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False  # Set to True if you want to see the log stream for training (validation_0) and validation (validation_1)
)

results = clf.evals_result()

epochs = len(results['validation_0']['mlogloss'])
x_axis = range(0, epochs)
print("done fitting")

#**Performance Analysis**

In [None]:
plt.figure(figsize=(10, 6))

plt.axvline(x=clf.best_iteration, color='r', linestyle='--', alpha=0.7, label=f'Best iteration = {clf.best_iteration}')

plt.plot(x_axis, results['validation_0']['mlogloss'], label='Training Log Loss')
plt.plot(x_axis, results['validation_1']['mlogloss'], label='Validation Log Loss')
plt.legend()
plt.xlabel('Epochs (Boosting Rounds)')
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss: Training vs Validation')
plt.grid(True)
plt.show()

print(f"Best model found at iteration: {clf.best_iteration}")

# This automatically uses the best iteration
y_test_pred = clf.predict(X_test)

# Calculate test accuracy
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (using best model): {test_acc:.4f}")

#**Generate confusion matrix**
evaluating on the test set

In [None]:
cm = confusion_matrix(y_test, y_test_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(
            cm,
            cmap="magma",
            annot=True,
            fmt="d",
            cbar=True,
        )
plt.title('Confusion Matrix: MNIST Digit Classification', fontsize=14)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.show()

print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred, digits=4))

#**Load LeNet5**

In [None]:
LeNet = loadmat('./MNIST-LeNet5.mat')

print("Keys in .mat file: ", LeNet.keys())

# Extract data
X_train_full = LeNet['train_fea']
y_train_full = LeNet['train_gnd'].ravel()
X_test = LeNet['test_fea']
y_test = LeNet['test_gnd'].ravel()

# Split training into train and validation (55k train, 5k val)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=5000, random_state=42)

# shift labels to 0-9
y_train = y_train - 1
y_val = y_val - 1
y_test = y_test - 1

print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

#**MODEL FITTING AND EARLY STOPPING**

In [None]:
# Early stopping callback to prevent overfitting
early_stop = xgb.callback.EarlyStopping(
    rounds=20,
    metric_name='mlogloss',
    data_name='validation_0',
    save_best=True,
    min_delta=1e-3    # (default 1e-2) Improvement threshold for continued training, decreasing to prevent premature stopping or increase to mitigate overfitting
)

# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(
    tree_method="hist",
    n_estimators=2000,
    eval_metric="mlogloss",
    learning_rate = 0.2,
    device = 'cuda',
    max_depth = 6,   # Balanced value of ranges of max_depth of the decision tree to prevent overfitting
    callbacks=[early_stop]
)

#Fitting Model

In [None]:
# Fitting model using train and validation sets
clf.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False  # Set to True if you want to see the log stream for training (validation_0) and validation (validation_1)
)

results = clf.evals_result()

epochs = len(results['validation_0']['mlogloss'])
x_axis = range(0, epochs)
print("done fitting")

#**Performance Analysis**

In [None]:
plt.figure(figsize=(10, 6))

plt.axvline(x=clf.best_iteration, color='r', linestyle='--', alpha=0.7, label=f'Best iteration = {clf.best_iteration}')

plt.plot(x_axis, results['validation_0']['mlogloss'], label='Training Log Loss')
plt.plot(x_axis, results['validation_1']['mlogloss'], label='Validation Log Loss')
plt.legend()
plt.xlabel('Epochs (Boosting Rounds)')
plt.ylabel('Log Loss')
plt.title('XGBoost Log Loss: Training vs Validation')
plt.grid(True)
plt.show()

print(f"Best model found at iteration: {clf.best_iteration}")

# This automatically uses the best iteration
y_test_pred = clf.predict(X_test)

# Calculate test accuracy
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (using best model): {test_acc:.4f}")

#**Generate confusion matrix**


In [None]:
cm = confusion_matrix(y_test, y_test_pred)

plt.figure(figsize=(10, 8))
sns.heatmap(
            cm,
            cmap="magma",
            annot=True,
            fmt="d",
            cbar=True,
        )
plt.title('Confusion Matrix: MNIST-LeNet5 Digit Classification', fontsize=14)
plt.xlabel('Predicted Label', fontsize=12)
plt.ylabel('True Label', fontsize=12)
plt.show()

print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred, digits=4))

In [15]:
import pickle

# Load pixel-based XGBoost
with open('model.pkl', 'rb') as f:
    model_pixel = pickle.load(f)

# Load LeNet feature XGBoost
with open('model2.pkl', 'rb') as f:
    model_lenet = pickle.load(f)


In [11]:
import scipy.io

mnist = scipy.io.loadmat("MNIST.mat")
mnist_lenet = scipy.io.loadmat("MNIST-LeNet5.mat")

print(mnist.keys())
print(mnist_lenet.keys())


dict_keys(['__header__', '__version__', '__globals__', 'train_fea', 'train_gnd', 'test_fea', 'test_gnd'])
dict_keys(['__header__', '__version__', '__globals__', 'test_fea', 'test_gnd', 'train_fea', 'train_gnd'])


In [12]:
import scipy.io
import numpy as np

mnist = scipy.io.loadmat("MNIST.mat")

Xtrain = mnist["train_fea"]         # shape (60000, 784)
ytrain = mnist["train_gnd"].ravel() # shape (60000,)
Xtest  = mnist["test_fea"]          # shape (10000, 784)
ytest  = mnist["test_gnd"].ravel()

print("Pixel feature shapes:")
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

mnist_l = scipy.io.loadmat("MNIST-LeNet5.mat")

Xtrain_l = mnist_l["train_fea"]        # shape (60000, 800)
ytrain_l = mnist_l["train_gnd"].ravel()
Xtest_l  = mnist_l["test_fea"]         # shape (10000, 800)
ytest_l  = mnist_l["test_gnd"].ravel()

print("LeNet5 feature shapes:")
print(Xtrain_l.shape, ytrain_l.shape)
print(Xtest_l.shape, ytest_l.shape)



Pixel feature shapes:
(60000, 784) (60000,)
(10000, 784) (10000,)
LeNet5 feature shapes:
(60000, 800) (60000,)
(10000, 800) (10000,)


In [18]:
import pickle
import scipy.io
from sklearn.metrics import accuracy_score

# Load model properly

with open('model.pkl', 'rb') as f:
    mp = pickle.load(f)
model_pixel = mp["model"]

with open('model2.pkl', 'rb') as f:
    ml = pickle.load(f)
model_lenet = ml["model"]

# Load MNIST FEATURES (.mat)

mnist = scipy.io.loadmat("MNIST.mat")
Xtrain = mnist["train_fea"]
ytrain = mnist["train_gnd"].ravel()
Xtest  = mnist["test_fea"]
ytest  = mnist["test_gnd"].ravel()

mnist_l = scipy.io.loadmat("MNIST-LeNet5.mat")
Xtrain_l = mnist_l["train_fea"]
ytrain_l = mnist_l["train_gnd"].ravel()
Xtest_l  = mnist_l["test_fea"]
ytest_l  = mnist_l["test_gnd"].ravel()

# Test pixel model

y_pred_pixel = model_pixel.predict(Xtest)
pixel_error = 1 - accuracy_score(ytest, y_pred_pixel)
print("Pixel model test error:", pixel_error)

# Test LeNet model

y_pred_lenet = model_lenet.predict(Xtest_l)
lenet_error = 1 - accuracy_score(ytest_l, y_pred_lenet)
print("LeNet model test error:", lenet_error)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


Pixel model test error: 0.9976
LeNet model test error: 0.9986


In [19]:
mp = pickle.load(open("model.pkl", "rb"))
model_pixel = mp["model"]

ml = pickle.load(open("model2.pkl", "rb"))
model_lenet = ml["model"]
import pickle

with open("xgb_pixel_clean.pkl", "wb") as f:
    pickle.dump(model_pixel, f)

with open("xgb_lenet_clean.pkl", "wb") as f:
    pickle.dump(model_lenet, f)
