## Random Forest

In [5]:
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score,
    classification_report
)

### Loading Data

In [6]:
# Training set location, relative to the notebook's working directory.
train_path = "../data/training_data.parquet"

train_df = pd.read_parquet(train_path)
print("Training data shape:", train_df.shape)

# Target vector is the "label" column; the feature matrix is every other column.
y = train_df["label"].to_numpy()
X = train_df.drop(columns=["label"]).to_numpy()

print("X (Features) shape:", X.shape)
print("y (Label Vector) shape:", y.shape)

# Map class labels to integer codes. encoder.classes_ keeps the original
# labels, indexed by their code, so predictions can be decoded later.
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

# Show the code -> label mapping so downstream reports can be read back.
print("\nClasses:")
for code, class_name in enumerate(encoder.classes_):
    print(f"{code} -> {class_name}")

Training data shape: (68250, 785)
X (Features) shape: (68250, 784)
y (Label Vector) shape: (68250,)

Classes:
0 -> airplane
1 -> ice cream
2 -> spreadsheet
3 -> sword


### Model Setup & Training

This section builds a pipeline around `RandomForestClassifier`, defines a stratified 5-fold cross-validation scheme, and sets up a hyperparameter grid of 16 combinations. `GridSearchCV` then searches over these settings, scoring each candidate by accuracy, to find the best-performing Random Forest model.

In [7]:
# Cross-validation splitter: 5 stratified, shuffled folds with a fixed seed.
# same inner CV as gradient boosting so results are comparable
rf_inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Single-step pipeline. The step name "rf" is the prefix that the
# "rf__<param>" keys of the grid below are routed through.
rf_steps = [("rf", RandomForestClassifier(random_state=42, n_jobs=-1))]
rf_pipeline = Pipeline(steps=rf_steps)

# Two values for each of four hyperparameters: 2 * 2 * 2 * 2 = 16 candidates.
rf_param_grid = dict(
    rf__n_estimators=[100, 200],
    rf__max_depth=[10, 20],
    rf__min_samples_split=[2, 5],
    rf__min_samples_leaf=[1, 2],
)

# The search itself runs with n_jobs=1; the forest above is built with
# n_jobs=-1 (presumably to avoid nesting two levels of parallelism).
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    cv=rf_inner_cv,
    scoring="accuracy",
    n_jobs=1,
    verbose=1,
)


Timing the model: the cell below fits the grid search and reports how long training took.

In [8]:
# Measure how long the whole rf_grid.fit call takes.
# time.perf_counter() is a monotonic, high-resolution clock intended for
# measuring intervals; time.time() follows the system wall clock, which can be
# adjusted (e.g. by NTP) during a multi-minute run and skew the difference.
# Note: rf_start / rf_end are perf_counter readings, only meaningful as a
# difference, not as timestamps.
rf_start = time.perf_counter()
rf_grid.fit(X, y_encoded)
rf_end = time.perf_counter()
rf_diff = rf_end - rf_start  # elapsed seconds

print("\nRandom Forest Training Complete")
print(f"Time to train Random Forest Model: {np.round(rf_diff, 2)} seconds | {np.round(rf_diff / 60, 2)} minutes")

Fitting 5 folds for each of 16 candidates, totalling 80 fits

Random Forest Training Complete
Time to train Random Forest Model: 709.59 seconds | 11.83 minutes


### Evaluating the Model

In [9]:

# Read both grid-search summary values first, then report them:
#   best_params_ - the hyperparameter combination with the highest mean CV score
#   best_score_  - that combination's mean cross-validated accuracy
best_params = rf_grid.best_params_
cv_accuracy = rf_grid.best_score_

print(f"\nBest Hyperparameters: {best_params}")
print(f"Best Cross-Validation Accuracy: {cv_accuracy:.4f}")


Best Hyperparameters: {'rf__max_depth': 20, 'rf__min_samples_leaf': 1, 'rf__min_samples_split': 2, 'rf__n_estimators': 200}
Best Cross-Validation Accuracy: 0.9441


In [10]:
# Winning pipeline selected by the grid search.
best_rf_model = rf_grid.best_estimator_

# NOTE: X is the same data that rf_grid.fit was given, so everything below is
# an in-sample (training) measurement and will look better than the
# cross-validated accuracy reported by rf_grid.best_score_.
y_pred = best_rf_model.predict(X)

accuracy = accuracy_score(y_encoded, y_pred)
print(f"\nRandom Forest Training Accuracy: {accuracy:.4f}")

# Per-class precision / recall / F1, labelled with the original class names.
print("\nRandom Forest Classification Report:")
report_text = classification_report(
    y_encoded,
    y_pred,
    target_names=encoder.classes_,
)
print(report_text)



Random Forest Training Accuracy: 0.9971

Random Forest Classification Report:
              precision    recall  f1-score   support

    airplane       1.00      1.00      1.00     18194
   ice cream       1.00      0.99      1.00     14776
 spreadsheet       1.00      1.00      1.00     20424
       sword       0.99      1.00      0.99     14856

    accuracy                           1.00     68250
   macro avg       1.00      1.00      1.00     68250
weighted avg       1.00      1.00      1.00     68250

