<a href="https://colab.research.google.com/github/gav-ip/cse176-fall-proj/blob/part2/xgboost_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#**XG Boost**
Change runtime type to T4 GPU

In [None]:
# Install the dependencies listed in ../requirements.txt into this kernel's environment.
# Skip this cell when running on Google Colab.
%pip install -r ../requirements.txt

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from scipy.io import loadmat
import seaborn as sns
import numpy as np
import pandas as pd
import xgboost as xgb

#**LOAD MNIST**

In [None]:
# Read the MATLAB file into a dict-like mapping of variable name -> array.
mnist = loadmat('MNIST.mat')

# Pull out features and labels; ravel() flattens each label array to 1-D.
X_train_full, y_train_full = mnist['train_fea'], mnist['train_gnd'].ravel()
X_test, y_test = mnist['test_fea'], mnist['test_gnd'].ravel()

# Hold out 5,000 training samples for validation (the original note expects a
# 55k/5k split); the fixed seed keeps the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=5000,
    random_state=42,
)

# Shift every label array down by one so the classes become 0-9
# (presumably the file stores them as 1-10 -- confirm against the data).
y_train, y_val, y_test = (labels - 1 for labels in (y_train, y_val, y_test))

# Report the shape of each split as a quick sanity check.
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")


#**MODEL FITTING AND EARLY STOPPING**
*- ran on Google Colab T4 GPU*

In [None]:
# Early stopping callback to prevent overfitting.
#
# The callback must watch the held-out validation split. fit() is called with
# eval_set=[(X_train, y_train), (X_val, y_val)], which XGBoost names by position:
# 'validation_0' is the training data and 'validation_1' is the validation data.
# Monitoring 'validation_0' would stop (and pick the "best" model) based on
# training loss, which cannot detect overfitting.
early_stop = xgb.callback.EarlyStopping(
    rounds=20,                 # give up after 20 rounds without enough improvement
    metric_name='mlogloss',
    data_name='validation_1',  # monitor the validation split, not the training split
    save_best=True,            # return the model truncated at the best iteration
    min_delta=1e-3,            # (XGBoost default is 0.0) minimum improvement that counts as
                               # progress; lower it to avoid premature stopping, raise it
                               # to stop earlier and mitigate overfitting
)

# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(
    tree_method="hist",
    n_estimators=2000,         # upper bound on boosting rounds; early stopping may end sooner
    eval_metric="mlogloss",
    device='cuda',             # train on the GPU (see the T4 runtime note above)
    max_depth=6,               # moderate tree depth to limit overfitting
    callbacks=[early_stop],
)

## Fitting model

In [None]:
# Train on the training split while tracking the metric on both evaluation sets.
# XGBoost labels the eval sets by position: 'validation_0' is the training data
# and 'validation_1' is the held-out validation data.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,  # switch to True to stream the per-round metrics for both sets
)

# Per-round metric history; the plotting cell reads `results` and `x_axis`.
results = clf.evals_result()
epochs = len(results['validation_0']['mlogloss'])
x_axis = range(epochs)

print("done fitting")

#**Performance Analysis**

In [None]:
# Learning curves: log loss per boosting round for both evaluation sets.
fig, ax = plt.subplots(figsize=(10, 6))

# Vertical marker at the round early stopping selected as best.
ax.axvline(
    x=clf.best_iteration,
    color='r',
    linestyle='--',
    alpha=0.7,
    label=f'Best iteration = {clf.best_iteration}',
)

# 'validation_0' is the training split, 'validation_1' the validation split.
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Training Log Loss')
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Validation Log Loss')

ax.legend()
ax.set_xlabel('Epochs (Boosting Rounds)')
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss: Training vs Validation')
ax.grid(True)
plt.show()

print(f"Best model found at iteration: {clf.best_iteration}")

# predict() automatically uses the best iteration found by early stopping.
y_test_pred = clf.predict(X_test)

# Fraction of test samples whose predicted class matches the true class.
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (using best model): {test_acc:.4f}")

#**Generate confusion matrix**
evaluating on the test set

In [None]:
# Count test predictions per (true label, predicted label) pair.
cm = confusion_matrix(y_test, y_test_pred)

# Draw the counts as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(cm, annot=True, fmt="d", cmap="magma", cbar=True, ax=ax)
ax.set_title('Confusion Matrix: MNIST Digit Classification', fontsize=14)
ax.set_xlabel('Predicted Label', fontsize=12)
ax.set_ylabel('True Label', fontsize=12)
plt.show()

# Per-class precision / recall / F1, printed to four decimal places.
print("\nClassification Report:\n")
print(classification_report(y_test, y_test_pred, digits=4))

#**Load LeNet5**

In [None]:
# Read the MATLAB file into a dict-like mapping of variable name -> array.
LeNet = loadmat('../MNIST-LeNet5.mat')

print("Keys in .mat file: ", LeNet.keys())

# Pull out features and labels; ravel() flattens each label array to 1-D.
# NOTE: these assignments replace the MNIST variables from the earlier section.
X_train_full, y_train_full = LeNet['train_fea'], LeNet['train_gnd'].ravel()
X_test, y_test = LeNet['test_fea'], LeNet['test_gnd'].ravel()

# Hold out 5,000 training samples for validation (the original note expects a
# 55k/5k split); the fixed seed keeps the split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full,
    y_train_full,
    test_size=5000,
    random_state=42,
)

# Shift every label array down by one so the classes become 0-9
# (presumably the file stores them as 1-10 -- confirm against the data).
y_train, y_val, y_test = (labels - 1 for labels in (y_train, y_val, y_test))

# Report the shape of each split as a quick sanity check.
print(f"Training set: {X_train.shape}, {y_train.shape}")
print(f"Validation set: {X_val.shape}, {y_val.shape}")
print(f"Test set: {X_test.shape}, {y_test.shape}")

#**MODEL FITTING AND EARLY STOPPING**

In [None]:
# Early stopping callback to prevent overfitting.
#
# The callback must watch the held-out validation split. fit() is called with
# eval_set=[(X_train, y_train), (X_val, y_val)], which XGBoost names by position:
# 'validation_0' is the training data and 'validation_1' is the validation data.
# Monitoring 'validation_0' would stop (and pick the "best" model) based on
# training loss, which cannot detect overfitting.
early_stop = xgb.callback.EarlyStopping(
    rounds=20,                 # give up after 20 rounds without enough improvement
    metric_name='mlogloss',
    data_name='validation_1',  # monitor the validation split, not the training split
    save_best=True,            # return the model truncated at the best iteration
    min_delta=1e-3,            # (XGBoost default is 0.0) minimum improvement that counts as
                               # progress; lower it to avoid premature stopping, raise it
                               # to stop earlier and mitigate overfitting
)

# Use "hist" for constructing the trees, with early stopping enabled.
clf = xgb.XGBClassifier(
    tree_method="hist",
    n_estimators=2000,         # upper bound on boosting rounds; early stopping may end sooner
    eval_metric="mlogloss",
    device='cuda',             # train on the GPU (see the T4 runtime note at the top)
    max_depth=6,               # moderate tree depth to limit overfitting
    callbacks=[early_stop],
)

#Fitting Model

In [None]:
# Train on the training split while tracking the metric on both evaluation sets.
# XGBoost labels the eval sets by position: 'validation_0' is the training data
# and 'validation_1' is the held-out validation data.
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,  # switch to True to stream the per-round metrics for both sets
)

# Per-round metric history; the plotting cell reads `results` and `x_axis`.
results = clf.evals_result()
epochs = len(results['validation_0']['mlogloss'])
x_axis = range(epochs)

print("done fitting")

#**Performance Analysis**

In [None]:
# Learning curves: log loss per boosting round for both evaluation sets.
fig, ax = plt.subplots(figsize=(10, 6))

# Vertical marker at the round early stopping selected as best.
ax.axvline(
    x=clf.best_iteration,
    color='r',
    linestyle='--',
    alpha=0.7,
    label=f'Best iteration = {clf.best_iteration}',
)

# 'validation_0' is the training split, 'validation_1' the validation split.
ax.plot(x_axis, results['validation_0']['mlogloss'], label='Training Log Loss')
ax.plot(x_axis, results['validation_1']['mlogloss'], label='Validation Log Loss')

ax.legend()
ax.set_xlabel('Epochs (Boosting Rounds)')
ax.set_ylabel('Log Loss')
ax.set_title('XGBoost Log Loss: Training vs Validation')
ax.grid(True)
plt.show()

print(f"Best model found at iteration: {clf.best_iteration}")

# predict() automatically uses the best iteration found by early stopping.
y_test_pred = clf.predict(X_test)

# Fraction of test samples whose predicted class matches the true class.
test_acc = accuracy_score(y_test, y_test_pred)
print(f"Test Accuracy (using best model): {test_acc:.4f}")