In [23]:
# -*- coding: utf-8 -*-
"""XGBoost_train_validation_test.ipynb

Automatically generated by Colab.

Original file is located at...
"""

# Import Libraries
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [24]:
# Constants
DATA_PATH = '/content/drive/MyDrive/distance_prediction_project/preprocessed_data/'
TARGET_COL = 'zloc'
MODEL_PATH = './weights/xgboost_model.json'
SCALER_PATH = './weights/scaler.joblib'
NUMERICAL_COLS_PATH = './weights/numerical_columns.joblib'
ONEHOT_CATEGORIES = ['Misc', 'bicycle', 'car', 'people', 'train', 'truck']  # Expected categories

In [2]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [25]:
# Load Data
def load_data(file):
    df = pd.read_csv(file)
    df = df[df[TARGET_COL] < 90].reset_index(drop=True)  # Filter outliers
    return df.drop(['filename', 'weather', 'depth_min'], axis=1)

In [26]:
train = load_data(DATA_PATH + 'iou1_train.csv')
valid = load_data(DATA_PATH + 'iou1_valid.csv')
test = load_data(DATA_PATH + 'iou1_test.csv')

In [27]:
# Generate One-Hot Encoded Columns
def preprocess_onehot(df):
    # Ensure 'person' is mapped to 'people' to match your categories
    df['class'] = df['class'].replace({'person': 'people'})

    # Create one-hot encoded columns
    df_onehot = pd.get_dummies(df, columns=['class'], prefix='', prefix_sep='')

    # Add missing categories with 0s
    for cat in ONEHOT_CATEGORIES:
        if cat not in df_onehot.columns:
            df_onehot[cat] = 0
    return df_onehot

In [28]:
train = preprocess_onehot(train)
valid = preprocess_onehot(valid)
test = preprocess_onehot(test)

In [29]:
# Preprocess Data
def preprocess(df, scaler=None, numerical_cols=None, fit_scaler=False):
    # Separate features
    numerical = [col for col in df.columns if col not in ONEHOT_CATEGORIES + [TARGET_COL]]
    onehot = df[ONEHOT_CATEGORIES]

    # Scale numerical features
    if fit_scaler:
        scaler = StandardScaler().fit(df[numerical])
        numerical_cols = numerical
    X_scaled = scaler.transform(df[numerical]) if scaler else df[numerical]

    # Combine features
    X_processed = np.concatenate([X_scaled, onehot], axis=1)
    y = df[TARGET_COL].values
    return X_processed, y, scaler, numerical_cols

In [30]:
# Fit scaler on training data
X_train, y_train, scaler, numerical_cols = preprocess(train, fit_scaler=True)
X_valid, y_valid, _, _ = preprocess(valid, scaler, numerical_cols)
X_test, y_test, _, _ = preprocess(test, scaler, numerical_cols)


In [32]:
import os

os.makedirs(os.path.dirname(SCALER_PATH), exist_ok=True)
joblib.dump(scaler, SCALER_PATH)

os.makedirs(os.path.dirname(NUMERICAL_COLS_PATH), exist_ok=True)
joblib.dump(numerical_cols, NUMERICAL_COLS_PATH)

['./weights/numerical_columns.joblib']

In [33]:
# Convert to DMatrix
d_train = xgb.DMatrix(X_train, label=y_train)
d_valid = xgb.DMatrix(X_valid, label=y_valid)
d_test = xgb.DMatrix(X_test, label=y_test)

In [34]:
# Hyperparameters (Pre-tuned)
best_params = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.9,
    'learning_rate': 0.01,
    'max_depth': 9,
    'min_child_weight': 3,
    'alpha': 1,
    'lambda': 0.9,
    'subsample': 0.7,
    'gamma': 0.3,
    'n_estimators': 1000
}

In [35]:
# Train Model with Early Stopping
model = xgb.train(
    params=best_params,
    dtrain=d_train,
    evals=[(d_train, 'train'), (d_valid, 'eval')],
    early_stopping_rounds=50,
    num_boost_round=5000,
    verbose_eval=100
)


[0]	train-rmse:16.75147	eval-rmse:16.89485


Parameters: { "n_estimators" } are not used.



[100]	train-rmse:6.53204	eval-rmse:6.66077
[200]	train-rmse:2.96249	eval-rmse:3.26260
[300]	train-rmse:1.82294	eval-rmse:2.34487
[400]	train-rmse:1.46205	eval-rmse:2.12310
[500]	train-rmse:1.30357	eval-rmse:2.04861
[600]	train-rmse:1.20208	eval-rmse:2.01044
[700]	train-rmse:1.11607	eval-rmse:1.98192
[800]	train-rmse:1.03788	eval-rmse:1.95910
[900]	train-rmse:0.97359	eval-rmse:1.94135
[1000]	train-rmse:0.91673	eval-rmse:1.92567
[1100]	train-rmse:0.86439	eval-rmse:1.91237
[1200]	train-rmse:0.81665	eval-rmse:1.90206
[1300]	train-rmse:0.77399	eval-rmse:1.89341
[1400]	train-rmse:0.73414	eval-rmse:1.88711
[1500]	train-rmse:0.69991	eval-rmse:1.88138
[1600]	train-rmse:0.66488	eval-rmse:1.87670
[1700]	train-rmse:0.63589	eval-rmse:1.87213
[1800]	train-rmse:0.60996	eval-rmse:1.86865
[1900]	train-rmse:0.58332	eval-rmse:1.86499
[2000]	train-rmse:0.55979	eval-rmse:1.86041
[2100]	train-rmse:0.53593	eval-rmse:1.85719
[2200]	train-rmse:0.51540	eval-rmse:1.85435
[2300]	train-rmse:0.49670	eval-rmse:1.852

In [36]:
# Save Model
model.save_model(MODEL_PATH)

In [37]:
# Evaluate on Test Set
preds = model.predict(d_test)
rmse = np.sqrt(mean_squared_error(y_test, preds))
mae = mean_absolute_error(y_test, preds)

In [38]:
print(f'Test RMSE: {rmse:.4f}')
print(f'Test MAE: {mae:.4f}')

Test RMSE: 1.9579
Test MAE: 1.0849
