# XGBoost
* TODO: add averages for cells and stations, NB inference code will need to be adapted too
* TODO: target manipulations/engineering
    * rolling autocorrelation
* TODO: (vector leaf) multi-output regression
* TODO: maybe for week 10-11, we should have radically different approach that does not rely on lag feats, since these will be heavily inpacted by compound errors
    * maybe we should train lin reg for trend and xgboost for seasonality only on idx feats

In [16]:
# %%
import utils
import sys
from tqdm.notebook import tqdm
from wandb.integration.xgboost import WandbCallback
import wandb
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from optuna_integration.xgboost import XGBoostPruningCallback
from optuna_integration.wandb import WeightsAndBiasesCallback
import optuna
import xgboost as xgb
import numpy as np
import polars as pl
import pickle
from typing import Callable
import math
from pathlib import Path
import os
# Manually set the notebook name
os.environ["WANDB_NOTEBOOK_NAME"] = "xgboost_train.ipynb"


sys.path.append('.')

# Import utility functions

In [17]:
# %%
DEBUG = False

In [18]:
# %%
xgb_hyperparams = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    # 'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.7,
    # 'colsample_bytree': 0.8,
    # 'verbosity': 2,
    'early_stopping_rounds': 10,
    'n_estimators': 20,
}

config = {
    'lags': [1, 2, 3, 6, 12, 13, 14, 24, 25, 26, 48, 49, 72],
    'rolling_avgs': [1, 3, 9, 24, 48, 72, 86],
    'delta_reference_points': [(1, 2), (1, 3), (1, 6), (1, 24), (24, 25), (48, 49)],
    'std_windows': [3, 6, 12, 24, 48, 72, 86],
    'num_zeros_windows': [6, 12, 24],
    'hour_shifts': [0, 6, 12, 18],
    'weekday_shifts': [0, 3, 6],
    'train_percentage': 0.6,
    'val_percentage': 0.3,  # The rest is test
    'run_shap': False,
    'target_df_names': [  # dataframes used as target variables
        'thp_vol',
        'mr_number',
        # 'vol_per_prb',
    ],
    'feat_base_df_names': [  # dataframes used to create the features
        'thp_vol',
        'mr_number',
        # 'vol_per_user',
        # 'vol_per_prb',
    ],
}

In [19]:
# %%
# Read the CSV files
data_dir = Path('input-data')
target_dataframes = {
    # This is the target variable
    'thp_vol': pl.read_csv(data_dir / 'traffic_DLThpVol.csv'),
    'prb': pl.read_csv(data_dir / 'traffic_DLPRB.csv'),
    'thp_time': pl.read_csv(data_dir / 'traffic_DLThpTime.csv'),
    'mr_number': pl.read_csv(data_dir / 'traffic_MR_number.csv')
}

# Filter target dataframes based on config
target_dataframes = {
    k: v for k, v in target_dataframes.items() if k in config['target_df_names']}

idx_hour_series = target_dataframes['thp_vol']['']

# Drop the first column (idx hour) from each dataframe
for k in target_dataframes:
    target_dataframes[k] = target_dataframes[k].drop('')

# Debug mode: shorten dataframes and config lists
if DEBUG:
    target_dataframes = {k: v.head(200).select(
        v.columns[:800]) for k, v in target_dataframes.items()}
    config = {k: v[:3] if isinstance(
        v, list) else v for k, v in config.items()}

# Merge xgb_hyperparams into config
config.update(xgb_hyperparams)

In [20]:
# %%
# Initialize W&B
run = wandb.init(
    project="traffic-forecasting-challenge",
    job_type='train',
    entity="esedx12",
    config=config,
    save_code=True,
    mode=('dryrun' if DEBUG else 'online')
)

# Save utils.py to W&B
utils_path = Path('utils.py')
if utils_path.exists():
    wandb.save(str(utils_path))

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112493777828705, max=1.0…

 ## Feature Engineering

 The feature engineering steps are handled by utility functions.

In [21]:
# %%
# Use first config.train_percentage of dataframe rows for training, and the rest for validation and testing
num_rows = len(target_dataframes['thp_vol'])
num_train_rows = round(num_rows * wandb.config.train_percentage)
num_val_rows = round(num_rows * wandb.config.val_percentage)

train_dataframes = {k: v.head(num_train_rows)
                    for k, v in target_dataframes.items()}
train_idx_hour_series = idx_hour_series.head(num_train_rows)

val_dataframes = {k: v.slice(num_train_rows + 1, num_val_rows)
                  for k, v in target_dataframes.items()}
val_idx_hour_series = idx_hour_series.slice(num_train_rows + 1, num_val_rows)

# Create long format dataframes using utility functions
long_train_df = utils.create_long_format_df(
    train_dataframes, train_idx_hour_series, wandb.config)
long_val_df = utils.create_long_format_df(
    val_dataframes, val_idx_hour_series, wandb.config)

target_cols = list(target_dataframes.keys())

X_train, y_train = long_train_df.drop(
    target_cols), long_train_df.select(target_cols)
X_val, y_val = long_val_df.drop(target_cols), long_val_df.select(target_cols)

wandb.config.update({
    'num_train_samples': len(X_train),
    'num_val_samples': len(X_val),
    'features': X_train.columns,
    'targets': y_train.columns
})

Creating TS features for base dataframes...:   0%|          | 0/2 [00:00<?, ?it/s]

Creating TS features for base dataframes...:   0%|          | 0/2 [00:00<?, ?it/s]

 ## Train Models
*  TODO if indicated for performance reasons, get the max idx_hour with a null and return it so we can shorten the df for multi-step predict
* TODO also add target transformations (maybe sklearn can help)
* TODO normalize somehow if data is on very different scales for different beams

### Fit models

In [7]:
# %%
models = {}
for target_name in y_train.columns:
    model = xgb.XGBRegressor(
        **xgb_hyperparams, callbacks=[WandbCallback(log_model=True)])
    print(f"\nFitting model for {target_name}:")
    model.fit(
        X_train.to_numpy(),
        y_train[target_name].to_numpy(),
        eval_set=[(X_train.to_numpy(), y_train[target_name].to_numpy()),
                  (X_val.to_numpy(), y_val[target_name].to_numpy())],
        verbose=25
    )
    models[target_name] = model


Fitting model for thp_vol:
[0]	validation_0-mae:0.36061	validation_1-mae:0.34890
[19]	validation_0-mae:0.21552	validation_1-mae:0.20634

Fitting model for mr_number:
[0]	validation_0-mae:0.58309	validation_1-mae:0.59065
[19]	validation_0-mae:0.21446	validation_1-mae:0.21526


### Save models

In [8]:
# %%
for target_name, model in models.items():
    model_dir = Path('checkpoints') / wandb.run.name
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / f'{target_name}.ubj'
    pickle.dump(model, open(model_path, 'wb'))
    wandb.save(str(model_path))

 ## Evaluation and Logging

In [9]:
# %%
# Iterate through each model in models
for target, target_model in models.items():
    print(f"Processing target: {target}")

    # Predict
    train_preds = target_model.predict(X_train.to_pandas())
    val_preds = target_model.predict(X_val.to_pandas())

    # Compute MAE values
    train_mae = mean_absolute_error(y_train[target].to_pandas(), train_preds)
    val_mae = mean_absolute_error(y_val[target].to_pandas(), val_preds)

    # Log the best score to wandb
    # XGBoost does not have best_iteration attribute in scikit-learn API
    evals_result = target_model.evals_result()
    best_iteration = len(evals_result['validation_0']['mae'])  # Last iteration
    best_val_mae = evals_result['validation_1']['mae'][-1]
    best_train_mae = evals_result['validation_0']['mae'][-1]

    wandb.log({
        f'{target}_best_val_mae': best_val_mae,
        f'{target}_best_round': best_iteration,
        f'{target}_best_train_mae': best_train_mae
    })

    # Convert evaluation results to a DataFrame
    eval_df = pl.DataFrame({
        'Round': list(range(1, len(evals_result['validation_0']['mae']) + 1)),
        'Train MAE': evals_result['validation_0']['mae'],
        'Val MAE': evals_result['validation_1']['mae']
    })

    # Log eval_df to wandb
    wandb.log({f'{target}_eval_df': wandb.Table(data=eval_df.to_pandas())})

    # Plot the results using Plotly
    fig = px.line(
        eval_df.to_pandas(),
        x='Round',
        y=['Train MAE', 'Val MAE'],
        labels={'value': 'Mean Absolute Error'},
        title=f'Training and Validation MAE over Boosting Rounds for {target}'
    )

    fig.update_layout(
        legend=dict(
            title='Dataset',
            itemsizing='constant'
        )
    )

    # Log the plot to wandb
    wandb.log({f"{target}_MAE_Plot": fig})

    # Optionally, display the plot
    fig.show()

    print(f"Best Val MAE for {target}: {best_val_mae}")
    print(f"Round: {best_iteration}")

Processing target: thp_vol


Best Val MAE for thp_vol: 0.2063427548677024
Round: 20
Processing target: mr_number


Best Val MAE for mr_number: 0.2152638340811775
Round: 20


In [10]:
# %%
if wandb.config.run_shap:
    # Create a SHAP explainer for the XGBoost model
    # Assuming 'thp_vol' is one of the targets
    target_name = 'thp_vol'
    explainer = shap.TreeExplainer(models[target_name], X_val.to_pandas())

    # Calculate SHAP values for the val set
    shap_values = explainer.shap_values(X_val.to_pandas())

    # Log SHAP plots to wandb
    shap_bar = shap.summary_plot(
        shap_values, X_val.to_pandas(), plot_type="bar", show=False)
    plt.savefig("shap_bar.png")
    wandb.log({"SHAP Bar Plot": wandb.Image("shap_bar.png")})
    plt.clf()

    shap_summary = shap.summary_plot(
        shap_values, X_val.to_pandas(), show=False)
    plt.savefig("shap_summary.png")
    wandb.log({"SHAP Summary Plot": wandb.Image("shap_summary.png")})
    plt.clf()

    # Optionally, add more SHAP plots as needed

In [11]:
# %%
wandb.finish()

VBox(children=(Label(value='1.019 MB of 1.174 MB uploaded (0.002 MB deduped)\r'), FloatProgress(value=0.867857…

0,1
best_iteration,▁▁
best_score,▁█
epoch,▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██▁▁▂▂▂▃▃▄▄▄▅▅▅▆▆▇▇▇██
mr_number_best_round,▁
mr_number_best_train_mae,▁
mr_number_best_val_mae,▁
thp_vol_best_round,▁
thp_vol_best_train_mae,▁
thp_vol_best_val_mae,▁
validation_0-mae,▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁█▇▆▅▅▄▄▃▃▃▂▂▂▂▁▁▁▁▁▁

0,1
best_iteration,19.0
best_score,0.21526
epoch,19.0
mr_number_best_round,20.0
mr_number_best_train_mae,0.21446
mr_number_best_val_mae,0.21526
thp_vol_best_round,20.0
thp_vol_best_train_mae,0.21552
thp_vol_best_val_mae,0.20634
