# Gradient Boosting Classifier

In [14]:
# Import
import polars as pl
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [18]:
# Load the training table; every column other than "label" is used as a feature.
data = pl.read_parquet('../data/training_data.parquet')

# One Float32 cast expression per feature column, in the table's column order.
feature_exprs = [
    pl.col(name).cast(pl.Float32)
    for name in data.columns
    if name != "label"
]
X = data.select(feature_exprs).to_numpy()

# Encode the raw labels as integer class ids. The fitted `encoder` stays in
# scope so predictions can be mapped back to the original label values.
y = data["label"].to_numpy()
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

print(f"Features shape: {X.shape}, Labels shape: {y_encoded.shape}")

Features shape: (455006, 785), Labels shape: (455006,)


In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, KFold, StratifiedKFold
from sklearn.pipeline import Pipeline
import time
# histogram
from sklearn.ensemble import HistGradientBoostingClassifier

# Single-step pipeline. The step name "hgb" is what the `hgb__` prefixes in
# `param_grid` below refer to.
steps = [
    ('hgb', HistGradientBoostingClassifier(max_iter = 100, random_state = 42))
]

gb_pipeline = Pipeline(steps)

# Stratified folds keep the class proportions of `y_encoded` in every split,
# so per-fold accuracy is not skewed by an unlucky class mix. Shuffling with a
# fixed seed keeps the folds reproducible.
inner_cv = StratifiedKFold(n_splits = 3, random_state = 42, shuffle = True)

# 3 x 3 = 9 candidates, each fitted once per fold, plus one final refit of the
# best candidate on all of X.
param_grid = {
    'hgb__learning_rate': [0.01, 0.1, 0.2],
    'hgb__max_depth': [3, 5, 7],
}

gb_grid = GridSearchCV(
    estimator = gb_pipeline,
    param_grid = param_grid,
    scoring = 'accuracy',
    cv = inner_cv,
    n_jobs = 6,
    verbose = 1
)

# Time the whole search (all candidate fits and the final refit).
gb_start = time.time()
gb_grid.fit(X, y_encoded)
gb_end = time.time()
gb_diff = gb_end - gb_start
print("Gradient Boosting Training Complete")
print(f"Time to train Gradient Boosting Model: {np.round(gb_diff, 2)} seconds")

# Surface the outcome of the search; `best_score_` is the mean cross-validated
# accuracy of the winning parameter combination.
print(f"Best parameters: {gb_grid.best_params_}")
print(f"Best mean CV accuracy: {gb_grid.best_score_:.4f}")

In [None]:
# Everything this cell needs is imported here, so it does not depend on the
# (expensive) grid-search cell having been run first.
import time

from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split

# 12% train, 88% test. `stratify` keeps the class proportions of `y_encoded`
# the same in both partitions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    train_size=0.12,
    test_size=0.88,
    stratify=y_encoded,
    random_state=42,
)

print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")

# Train a single model with a small learning rate (0.01) on the training split.
model = HistGradientBoostingClassifier(
    max_iter=100,
    learning_rate=0.01,
    max_depth=3,
    random_state=42
)

# Time only the fit itself.
start = time.time()
model.fit(X_train, y_train)
end = time.time()

print("Single Model Fit Complete")
print(f"Time to train: {np.round(end - start, 2)} seconds")

# Evaluate on the held-out test split (mean accuracy).
test_acc = model.score(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Train shape: (54600, 785), Test shape: (400406, 785)
Single Model Fit Complete
Time to train: 27.8 seconds
Test accuracy: 0.9204


In [25]:
# Fit one HistGradientBoostingClassifier with learning_rate=0.1 on the training
# split and report its accuracy on the held-out test split.
hgb_params = {
    "max_iter": 100,
    "learning_rate": 0.1,
    "max_depth": 3,
    "random_state": 42,
}
model = HistGradientBoostingClassifier(**hgb_params)

# Time only the fit itself.
start = time.time()
model.fit(X_train, y_train)
end = time.time()
elapsed_seconds = np.round(end - start, 2)

print("Single Model Fit Complete")
print(f"Time to train: {elapsed_seconds} seconds")

# Mean accuracy on the test split.
test_acc = model.score(X_test, y_test)
print(f"Test accuracy: {test_acc:.4f}")

Single Model Fit Complete
Time to train: 11.64 seconds
Test accuracy: 0.9430


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_predict

# `gb_grid` was refit on all of X, so `gb_grid.predict(X)` would score the
# model on its own training data and overstate performance. Instead, produce
# out-of-fold predictions: the best pipeline is refit on each training fold and
# every sample is predicted by a model that never saw it. The search's own CV
# splitter is reused so the folds match the ones used for tuning.
# NOTE: this costs one extra fit per fold, and the hyperparameters were chosen
# on these same folds, so the numbers are still mildly optimistic.
y_pred = cross_val_predict(
    gb_grid.best_estimator_,
    X,
    y_encoded,
    cv=gb_grid.cv,
)
print("Accuracy:", accuracy_score(y_encoded, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_encoded, y_pred))
print(classification_report(y_encoded, y_pred))
