# XGBoost
* TODO: add averages for cells and stations, NB inference code will need to be adapted too
* TODO: target manipulations/engineering
    * rolling autocorrelation
* TODO: (vector leaf) multi-output regression
* TODO: maybe for week 10-11, we should have radically different approach that does not rely on lag feats, since these will be heavily inpacted by compound errors
    * maybe we should train lin reg for trend and xgboost for seasonality only on idx feats

## Roadmap Note
Regarding your plan to expand the script to first fit a linear model and then apply XGBoost on the residuals, that's a solid approach known as model stacking or residual modeling. This can be set up as a parameter in W&B for flexibility. When you're ready to implement it, you might consider:

* Implementing a Pipeline: Use scikit-learn's Pipeline to chain the linear model and XGBoost.
* Parameterization: Add a parameter in your config (e.g., use_linear_model) to toggle this behavior.
* Logging: Use W&B to track both models' performances separately and combined.

In [1]:
# %%
from tqdm.notebook import tqdm
from wandb.integration.xgboost import WandbCallback
import wandb
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from optuna_integration.xgboost import XGBoostPruningCallback
from optuna_integration.wandb import WeightsAndBiasesCallback
import optuna
import xgboost as xgb
import sklearn
import numpy as np
import polars as pl
import pandas as pd
import pickle
from typing import Callable
import math
from pathlib import Path
import os
import yaml

import utils

# Manually set the notebook name
os.environ["WANDB_NOTEBOOK_NAME"] = "xgboost_train.ipynb"

In [2]:
# %%
DEBUG = False
# config_file_path = Path('configs') / 'linear_config.yaml'
config_file_path = Path('configs') / 'autoregressive_config.yaml'
# config_file_path = Path('configs') / '168hour_shift_config.yaml'
# config_file_path = Path('configs') / 'SWEEP_autoregressive_config.yaml'

In [3]:
# Load the YAML configuration file
with open(config_file_path, 'r') as file:
    config = yaml.safe_load(file)

# Extract xgb_hyperparams from config
xgb_hyperparams = config.get('xgb_hyperparams', {})

# Merge xgb_hyperparams into config
config.update(xgb_hyperparams)

In [4]:
# %%
# Read the CSV files
data_dir = Path('input-data')
target_dataframes = {
    # This is the target variable
    'thp_vol': pl.read_csv(data_dir / 'traffic_DLThpVol.csv'),
    'prb': pl.read_csv(data_dir / 'traffic_DLPRB.csv'),
    'thp_time': pl.read_csv(data_dir / 'traffic_DLThpTime.csv'),
    'mr_number': pl.read_csv(data_dir / 'traffic_MR_number.csv')
}

# Filter target dataframes based on config
target_dataframes = {
    k: v for k, v in target_dataframes.items() if k in config['target_df_names']}

idx_hour_series = target_dataframes['thp_vol']['']

# Drop the first column (idx hour) from each dataframe
for k in target_dataframes:
    target_dataframes[k] = target_dataframes[k].drop('')

# Debug mode: shorten dataframes and config lists
if DEBUG:
    target_dataframes = {k: v.head(200).select(
        v.columns[:800]) for k, v in target_dataframes.items()}
    config = {k: v[:3] if isinstance(
        v, list) else v for k, v in config.items()}

# Merge xgb_hyperparams into config
config.update(xgb_hyperparams)

In [5]:
# %%
# Initialize W&B
run = wandb.init(
    project="traffic-forecasting-challenge",
    job_type='train',
    entity="esedx12",
    config=config,
    save_code=True,
    mode=('dryrun' if DEBUG else 'online')
)

# Save utils.py to W&B
utils_path = Path('utils.py')
if utils_path.exists():
    wandb.save(str(utils_path))

[34m[1mwandb[0m: Currently logged in as: [33mesedx12[0m. Use [1m`wandb login --relogin`[0m to force relogin


 ## Feature Engineering

 The feature engineering steps are handled by utility functions.

In [6]:
# %%
# Use first config.train_percentage of dataframe rows for training, and the rest for validation and testing
num_rows = len(target_dataframes['thp_vol'])
num_train_rows = round(num_rows * wandb.config.train_percentage)
num_val_rows = round(num_rows * wandb.config.val_percentage)

# Make feature dataframes
feature_dfs = utils.create_all_feature_dfs(
    target_dataframes, idx_hour_series, config)

train_target_dfs = {k: v.head(num_train_rows)
                    for k, v in target_dataframes.items()}
train_feature_dfs = {k: v.head(num_train_rows)
                     for k, v in feature_dfs.items()}
train_idx_hour_series = idx_hour_series.head(num_train_rows)

val_target_dfs = {k: v.slice(num_train_rows + 1, num_val_rows)
                  for k, v in target_dataframes.items()}
val_feature_dfs = {k: v.slice(num_train_rows + 1, num_val_rows)
                   for k, v in feature_dfs.items()}
val_idx_hour_series = idx_hour_series.slice(num_train_rows + 1, num_val_rows)

In [7]:
# Create long format dataframes using utility functions
long_train_df = utils.create_long_format_df(
    train_target_dfs, train_feature_dfs, train_idx_hour_series, wandb.config)
long_val_df = utils.create_long_format_df(
    val_target_dfs, val_feature_dfs, val_idx_hour_series, wandb.config)

target_cols = list(target_dataframes.keys())

# Assuming long_train_df and long_val_df are pandas DataFrames
X_train = long_train_df.drop(columns=target_cols)
y_train = long_train_df[target_cols]

X_val = long_val_df.drop(columns=target_cols)
y_val = long_val_df[target_cols]

wandb.config.update({
    'num_train_samples': len(X_train),
    'num_val_samples': len(X_val),
    'features': X_train.columns.to_list(),
    'targets': y_train.columns.to_list()
})

 ## Train Models
*  TODO if indicated for performance reasons, get the max idx_hour with a null and return it so we can shorten the df for multi-step predict
* TODO also add target transformations (maybe sklearn can help)
* TODO normalize somehow if data is on very different scales for different beams

### Fit models

In [8]:
# sk-learn linear model
if config['model'] == 'linear':
    models = {}
    for target in target_cols:
        model = sklearn.linear_model.LinearRegression()
        model.fit(pd.get_dummies(X_train), y_train[target])
        models[target] = model
        # wandb log and print some metrics, like mae
        y_pred = model.predict(pd.get_dummies(X_val))
        mae = sklearn.metrics.mean_absolute_error(y_val[target], y_pred)
        wandb.log({f'mae_{target}': mae})
        print(f'MAE for {target}: {mae}')

In [9]:
X_train.columns[:20]

Index(['beam_id', 'cell_id', 'station_id', 'daily_hours_shifted_0h',
       'daily_hours_shifted_6h', 'daily_hours_shifted_12h',
       'daily_hours_shifted_18h', 'weekday_shifted_0d', 'weekday_shifted_3d',
       'weekday_shifted_6d', 'thp_vol_lag_87', 'thp_vol_lag_88',
       'thp_vol_lag_89', 'thp_vol_lag_92', 'thp_vol_lag_98', 'thp_vol_lag_99',
       'thp_vol_lag_100', 'thp_vol_lag_110', 'thp_vol_lag_111',
       'thp_vol_lag_112'],
      dtype='object')

In [10]:
# xgboost model
if config['model'] == 'xgboost':
    models = {}
    for target_name in y_train.columns:
        model = xgb.XGBRegressor(
            **xgb_hyperparams, callbacks=[WandbCallback(log_model=True)])
        print(f"\nFitting model for {target_name}:")
        model.fit(
            X_train,
            y_train[target_name],
            eval_set=[(X_train, y_train[target_name]),
                      (X_val, y_val[target_name])],
            verbose=25
        )
        models[target_name] = model


Fitting model for thp_vol:
[0]	validation_0-mae:0.41088	validation_1-mae:0.39245
[25]	validation_0-mae:0.35608	validation_1-mae:0.34583
[50]	validation_0-mae:0.31886	validation_1-mae:0.31534
[75]	validation_0-mae:0.29392	validation_1-mae:0.29580
[100]	validation_0-mae:0.27729	validation_1-mae:0.28351
[125]	validation_0-mae:0.26615	validation_1-mae:0.27544
[150]	validation_0-mae:0.25855	validation_1-mae:0.27033
[175]	validation_0-mae:0.25340	validation_1-mae:0.26741
[200]	validation_0-mae:0.24965	validation_1-mae:0.26568
[225]	validation_0-mae:0.24673	validation_1-mae:0.26490
[250]	validation_0-mae:0.24440	validation_1-mae:0.26452
[275]	validation_0-mae:0.24245	validation_1-mae:0.26425
[276]	validation_0-mae:0.24240	validation_1-mae:0.26432

Fitting model for mr_number:
[0]	validation_0-mae:0.65652	validation_1-mae:0.65030
[25]	validation_0-mae:0.53847	validation_1-mae:0.53418
[50]	validation_0-mae:0.45116	validation_1-mae:0.44888
[75]	validation_0-mae:0.38777	validation_1-mae:0.38736


### Save models

In [11]:
# %%
for target_name, model in models.items():
    model_dir = Path('checkpoints') / wandb.run.name
    model_dir.mkdir(parents=True, exist_ok=True)
    model_path = model_dir / f'{target_name}.ubj'
    pickle.dump(model, open(model_path, 'wb'))
    wandb.save(str(model_path))

 ## Evaluation and Logging

In [12]:
# %%
# Iterate through each model in models
for target, target_model in models.items():
    print(f"Processing target: {target}")

    # Predict
    train_preds = target_model.predict(X_train)
    val_preds = target_model.predict(X_val)

    # Compute MAE values
    train_mae = mean_absolute_error(y_train[target], train_preds)
    val_mae = mean_absolute_error(y_val[target], val_preds)

    # Log the best score to wandb
    # XGBoost does not have best_iteration attribute in scikit-learn API
    evals_result = target_model.evals_result()
    best_iteration = len(evals_result['validation_0']['mae'])  # Last iteration
    best_val_mae = evals_result['validation_1']['mae'][-1]
    best_train_mae = evals_result['validation_0']['mae'][-1]

    wandb.log({
        f'{target}_best_val_mae': best_val_mae,
        f'{target}_best_round': best_iteration,
        f'{target}_best_train_mae': best_train_mae
    })

    # Convert evaluation results to a DataFrame
    eval_df = pl.DataFrame({
        'Round': list(range(1, len(evals_result['validation_0']['mae']) + 1)),
        'Train MAE': evals_result['validation_0']['mae'],
        'Val MAE': evals_result['validation_1']['mae']
    })

    # Log eval_df to wandb
    wandb.log({f'{target}_eval_df': wandb.Table(data=eval_df.to_pandas())})

    # Plot the results using Plotly
    fig = px.line(
        eval_df.to_pandas(),
        x='Round',
        y=['Train MAE', 'Val MAE'],
        labels={'value': 'Mean Absolute Error'},
        title=f'Training and Validation MAE over Boosting Rounds for {target}'
    )

    fig.update_layout(
        legend=dict(
            title='Dataset',
            itemsizing='constant'
        )
    )

    # Log the plot to wandb
    wandb.log({f"{target}_MAE_Plot": fig})

    # Optionally, display the plot
    fig.show()

    print(f"Best Val MAE for {target}: {best_val_mae}")
    print(f"Round: {best_iteration}")

Processing target: thp_vol


Best Val MAE for thp_vol: 0.2642814720902956
Round: 278
Processing target: mr_number


Best Val MAE for mr_number: 0.2422304536975673
Round: 500


In [13]:
# %%
if wandb.config.run_shap:
    # Create a SHAP explainer for the XGBoost model
    # Assuming 'thp_vol' is one of the targets
    target_name = 'thp_vol'
    explainer = shap.TreeExplainer(models[target_name], X_val)

    # Calculate SHAP values for the val set
    shap_values = explainer.shap_values(X_val)

    # Log SHAP plots to wandb
    shap_bar = shap.summary_plot(
        shap_values, X_val, plot_type="bar", show=False)
    plt.savefig("shap_bar.png")
    wandb.log({"SHAP Bar Plot": wandb.Image("shap_bar.png")})
    plt.clf()

    shap_summary = shap.summary_plot(
        shap_values, X_val, show=False)
    plt.savefig("shap_summary.png")
    wandb.log({"SHAP Summary Plot": wandb.Image("shap_summary.png")})
    plt.clf()

    # Optionally, add more SHAP plots as needed

In [14]:
# %%
wandb.finish()

VBox(children=(Label(value='249.365 MB of 349.213 MB uploaded (0.063 MB deduped)\r'), FloatProgress(value=0.71…

0,1
best_iteration,▁█
best_score,█▁
epoch,▁▁▂▂▂▂▃▃▃▄▄▄▄▅▁▁▂▂▂▂▃▃▃▃▄▄▄▅▅▅▅▆▆▆▇▇▇▇██
mr_number_best_round,▁
mr_number_best_train_mae,▁
mr_number_best_val_mae,▁
thp_vol_best_round,▁
thp_vol_best_train_mae,▁
thp_vol_best_val_mae,▁
validation_0-mae,▄▃▃▂▂▂▂▂▁▁▁▁▁▁█▇▅▄▃▃▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
best_iteration,499.0
best_score,0.24223
epoch,499.0
mr_number_best_round,500.0
mr_number_best_train_mae,0.22801
mr_number_best_val_mae,0.24223
thp_vol_best_round,278.0
thp_vol_best_train_mae,0.24232
thp_vol_best_val_mae,0.26428
