# XGBoost
* TODO: add averages for cells and stations, NB inference code will need to be adapted too
* TODO: target manipulations/engineering
    * rolling autocorrelation
* TODO: (vector leaf) multi-output regression
* TODO: maybe for week 10-11, we should have radically different approach that does not rely on lag feats, since these will be heavily inpacted by compound errors
    * maybe we should train lin reg for trend and xgboost for seasonality only on idx feats

In [1]:
# %%
import math
from pathlib import Path
import os
os.environ["WANDB_NOTEBOOK_NAME"] = "xgboost_train.ipynb"  # Manually set the notebook name
from typing import Callable

import polars as pl
import numpy as np
import xgboost as xgb
import optuna
from optuna_integration.wandb import WeightsAndBiasesCallback
from optuna_integration.xgboost import XGBoostPruningCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.compose import TransformedTargetRegressor
import shap

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import wandb
from wandb.integration.xgboost import WandbCallback
from tqdm.notebook import tqdm

import sys
sys.path.append('.')

# Import utility functions
import utils


In [2]:
# %%
DEBUG = False


In [3]:
# %%
# Read the CSV files
data_dir = Path('input-data')
thp_vol = pl.read_csv(data_dir / 'traffic_DLThpVol.csv')  # This is the target variable
prb = pl.read_csv(data_dir / 'traffic_DLPRB.csv')
thp_time = pl.read_csv(data_dir / 'traffic_DLThpTime.csv')
mr_number = pl.read_csv(data_dir / 'traffic_MR_number.csv')

target_dataframes = {
    'thp_vol': thp_vol,
    'prb': prb,
    'thp_time': thp_time,
    'mr_number': mr_number
}

idx_hour_series = thp_vol['']

for k, v in target_dataframes.items():
    # Drop the first column (idx hour)
    target_dataframes[k] = v.drop('')


In [4]:
# %%
xgb_hyperparams = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    # 'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.7,
    # 'colsample_bytree': 0.8,
    # 'verbosity': 2,
    'early_stopping_rounds': 10,
    'n_estimators': 100,
}

config = {
    'lags': [1, 2, 3, 6, 12, 13, 14, 24, 25, 26, 48, 49, 72],
    'rolling_avgs': [1, 3, 9, 24, 48, 72, 86],
    'delta_reference_points': [(1, 2), (1, 3), (1, 6), (1, 24), (24, 25), (48, 49)],
    'std_windows': [3, 6, 12, 24, 48, 72, 86],
    'num_zeros_windows': [6, 12, 24],
    'hour_shifts': [0, 6, 12, 18],
    'weekday_shifts': [0, 3, 6],
    'train_percentage': 0.6,
    'val_percentage': 0.3,  # The rest is test
    'run_shap': False,
    'target_df_names': [  # dataframes used as target variables
        'thp_vol', 
        'mr_number',
        # 'vol_per_prb',
    ],
    'feat_base_df_names': [  # dataframes used to create the features
        'thp_vol', 
        'mr_number',
        'vol_per_user', 
        # 'vol_per_prb',
    ],
}

if DEBUG:
    target_dataframes = {k: v.head(400).select(v.columns[:800]) for k, v in target_dataframes.items()}
    # Shorten every list in config to max three elements
    config = {k: v[:3] if isinstance(v, list) else v for k, v in config.items()}

config = {**xgb_hyperparams, **config}


In [5]:
# %%
# Initialize W&B
run = wandb.init(
    project="traffic-forecasting-challenge", 
    job_type='train', 
    entity="esedx12", 
    config=config, 
    save_code=True, 
    mode=('dryrun' if DEBUG else 'online')
)

# Save utils.py to W&B
utils_path = Path('utils.py')
if utils_path.exists():
    wandb.save(str(utils_path))


[34m[1mwandb[0m: Currently logged in as: [33mesedx12[0m. Use [1m`wandb login --relogin`[0m to force relogin


 ## Feature Engineering

 The feature engineering steps are handled by utility functions.

In [6]:
# %%
# Use first config.train_percentage of dataframe rows for training, and the rest for validation and testing
num_rows = len(target_dataframes['thp_vol'])
num_train_rows = round(num_rows * wandb.config.train_percentage)
num_val_rows = round(num_rows * wandb.config.val_percentage)

train_dataframes = {k: v.head(num_train_rows) for k, v in target_dataframes.items()}
train_idx_hour_series = idx_hour_series.head(num_train_rows)

val_dataframes = {k: v.slice(num_train_rows + 1, num_val_rows) for k, v in target_dataframes.items()}
val_idx_hour_series = idx_hour_series.slice(num_train_rows + 1, num_val_rows)

# Create long format dataframes using utility functions
long_train_df = utils.create_long_format_df(train_dataframes, train_idx_hour_series, wandb.config)
long_val_df = utils.create_long_format_df(val_dataframes, val_idx_hour_series, wandb.config)

dropped_cols = ['idx_hour', 'beam_id']
target_cols = list(target_dataframes.keys())

X_train, y_train = long_train_df.drop(dropped_cols + target_cols), long_train_df.select(target_cols)
X_val, y_val = long_val_df.drop(dropped_cols + target_cols), long_val_df.select(target_cols)

wandb.config.update({
    'train_shape': X_train.shape, 
    'val_shape': X_val.shape, 
    'features': X_train.columns, 
    'targets': y_train.columns
})


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

 ## Train Models
*  TODO if indicated for performance reasons, get the max idx_hour with a null and return it so we can shorten the df for multi-step predict
* TODO also add target transformations (maybe sklearn can help)
* TODO normalize somehow if data is on very different scales for different beams

In [7]:
# %%
models = {}
for target_name in y_train.columns:
    model = xgb.XGBRegressor(**xgb_hyperparams, callbacks=[WandbCallback(log_model=True)])
    print(f"\nFitting model for {target_name}:")
    model.fit(
        X_train.to_pandas(), 
        y_train[target_name].to_pandas(), 
        eval_set=[(X_train.to_pandas(), y_train[target_name].to_pandas()), 
                  (X_val.to_pandas(), y_val[target_name].to_pandas())], 
        verbose=25
    )
    models[target_name] = model



Fitting model for thp_vol:
[0]	validation_0-mae:0.36062	validation_1-mae:0.34899
[25]	validation_0-mae:0.21061	validation_1-mae:0.20305
[50]	validation_0-mae:0.20551	validation_1-mae:0.20049
[75]	validation_0-mae:0.20402	validation_1-mae:0.20080

Fitting model for prb:
[0]	validation_0-mae:0.56195	validation_1-mae:0.56734
[25]	validation_0-mae:0.26076	validation_1-mae:0.26140
[50]	validation_0-mae:0.25204	validation_1-mae:0.25384
[75]	validation_0-mae:0.24967	validation_1-mae:0.25256
[99]	validation_0-mae:0.24823	validation_1-mae:0.25200

Fitting model for thp_time:
[0]	validation_0-mae:0.46431	validation_1-mae:0.45966
[25]	validation_0-mae:0.24542	validation_1-mae:0.24187
[50]	validation_0-mae:0.23922	validation_1-mae:0.23722
[75]	validation_0-mae:0.23734	validation_1-mae:0.23646
[99]	validation_0-mae:0.23586	validation_1-mae:0.23616

Fitting model for mr_number:
[0]	validation_0-mae:0.58309	validation_1-mae:0.59065
[25]	validation_0-mae:0.20207	validation_1-mae:0.20304
[50]	validati

In [9]:
# %%
# Save models
for target_name, model in models.items():
    model_dir = Path('checkpoints') / wandb.run.name
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / f'{target_name}.ubj'
    model.save_model(model_path)
    wandb.save(str(model_path))

 ## Evaluation and Logging

In [10]:
# %%
# Iterate through each model in models
for target, target_model in models.items():
    print(f"Processing target: {target}")

    # Predict
    train_preds = target_model.predict(X_train.to_pandas())
    val_preds = target_model.predict(X_val.to_pandas())

    # Compute MAE values
    train_mae = mean_absolute_error(y_train[target].to_pandas(), train_preds)
    val_mae = mean_absolute_error(y_val[target].to_pandas(), val_preds)

    # Log the best score to wandb
    # XGBoost does not have best_iteration attribute in scikit-learn API
    evals_result = target_model.evals_result()
    best_iteration = len(evals_result['validation_0']['mae'])  # Last iteration
    best_val_mae = evals_result['validation_1']['mae'][-1]
    best_train_mae = evals_result['validation_0']['mae'][-1]

    wandb.log({
        f'{target}_best_val_mae': best_val_mae, 
        f'{target}_best_round': best_iteration, 
        f'{target}_best_train_mae': best_train_mae
    })

    # Convert evaluation results to a DataFrame
    eval_df = pl.DataFrame({
        'Round': list(range(1, len(evals_result['validation_0']['mae']) + 1)),
        'Train MAE': evals_result['validation_0']['mae'],
        'Val MAE': evals_result['validation_1']['mae']
    })

    # Log eval_df to wandb
    wandb.log({f'{target}_eval_df': wandb.Table(data=eval_df.to_pandas())})

    # Plot the results using Plotly
    fig = px.line(
        eval_df.to_pandas(), 
        x='Round', 
        y=['Train MAE', 'Val MAE'],
        labels={'value': 'Mean Absolute Error'}, 
        title=f'Training and Validation MAE over Boosting Rounds for {target}'
    )
    
    fig.update_layout(
        legend=dict(
            title='Dataset',
            itemsizing='constant'
        )
    )

    # Log the plot to wandb
    wandb.log({f"{target}_MAE_Plot": fig})

    # Optionally, display the plot
    fig.show()

    print(f"Best Val MAE for {target}: {best_val_mae}")
    print(f"Round: {best_iteration}")


Processing target: thp_vol


Best Val MAE for thp_vol: 0.2008071520950386
Round: 77
Processing target: prb


Best Val MAE for prb: 0.2519975820163693
Round: 100
Processing target: thp_time


Best Val MAE for thp_time: 0.23616248640245366
Round: 100
Processing target: mr_number


Best Val MAE for mr_number: 0.19285431611752188
Round: 100


In [11]:
# %%
if wandb.config.run_shap:
    # Create a SHAP explainer for the XGBoost model
    # Assuming 'thp_vol' is one of the targets
    target_name = 'thp_vol'
    explainer = shap.TreeExplainer(models[target_name], X_val.to_pandas())
    
    # Calculate SHAP values for the val set
    shap_values = explainer.shap_values(X_val.to_pandas())
    
    # Log SHAP plots to wandb
    shap_bar = shap.summary_plot(shap_values, X_val.to_pandas(), plot_type="bar", show=False)
    plt.savefig("shap_bar.png")
    wandb.log({"SHAP Bar Plot": wandb.Image("shap_bar.png")})
    plt.clf()
    
    shap_summary = shap.summary_plot(shap_values, X_val.to_pandas(), show=False)
    plt.savefig("shap_summary.png")
    wandb.log({"SHAP Summary Plot": wandb.Image("shap_summary.png")})
    plt.clf()

    # Optionally, add more SHAP plots as needed


In [12]:
# %%
wandb.finish()


VBox(children=(Label(value='5.274 MB of 5.274 MB uploaded (0.098 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
best_iteration,▁███
best_score,▂█▆▁
epoch,▁▂▂▃▄▄▅▆▁▂▂▃▄▅▅▆▇▇█▁▂▃▃▄▅▅▆▇▇▁▂▂▃▄▅▅▆▇▇█
mr_number_best_round,▁
mr_number_best_train_mae,▁
mr_number_best_val_mae,▁
prb_best_round,▁
prb_best_train_mae,▁
prb_best_val_mae,▁
thp_time_best_round,▁

0,1
best_iteration,99.0
best_score,0.19285
epoch,99.0
mr_number_best_round,100.0
mr_number_best_train_mae,0.18979
mr_number_best_val_mae,0.19285
prb_best_round,100.0
prb_best_train_mae,0.24823
prb_best_val_mae,0.252
thp_time_best_round,100.0
