# XGBoost
* TODO: add averages for cells and stations, NB inference code will need to be adapted too
* TODO: target manipulations/engineering
* TODO: (vector leaf) multi-output regression

In [2]:
import math
from pathlib import Path
import os
os.environ["WANDB_NOTEBOOK_NAME"] = "xgboost_train.ipynb"  # Manually set the notebook name

import polars as pl
import numpy as np
import xgboost as xgb
import optuna
from optuna_integration.wandb import WeightsAndBiasesCallback
from optuna_integration.xgboost import XGBoostPruningCallback
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import shap

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import wandb
from tqdm.notebook import tqdm

In [3]:
DEBUG = True

In [4]:
# Read the CSV files
data_dir = Path('input-data')
thp_vol = pl.read_csv(data_dir / 'traffic_DLThpVol.csv')  # This is the target variable
prb = pl.read_csv(data_dir / 'traffic_DLPRB.csv')
thp_time = pl.read_csv(data_dir / 'traffic_DLThpTime.csv')
mr_number = pl.read_csv(data_dir / 'traffic_MR_number.csv')

dataframes = {
    'thp_vol': thp_vol,
    'prb': prb,
    'thp_time': thp_time,
    'mr_number': mr_number
}

# Rename first col to 'hour'
for k, v in dataframes.items():
    dataframes[k] = v.rename({'': "idx_hour"})

In [50]:
xgb_hyperparams = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    # 'max_depth': 6,
    'eta': 0.1,
    'subsample': 0.8,
    # 'colsample_bytree': 0.8,
    # 'verbosity': 2,
}

config = {
    'num_boost_round': 1000,
    'early_stopping_rounds': 10,
    'lags': [1, 2, 3, 6, 12, 13, 14, 24, 25, 26, 48, 49, 72],
    'rolling_avgs': [1, 3, 9, 24, 48, 72],
    'delta_reference_points': [(1, 2), (1, 3), (1, 6), (1, 24), (24, 25), (48, 49)],
    'std_windows': [24, 48, 72],
    'hour_shifts': [0, 6, 12, 18],
    'weekday_shifts': [0, 3, 6],
    'train_percentage': 0.7,
}

if DEBUG:
    dataframes = {k: v.head(400).select(v.columns[:100]) for k, v in dataframes.items()}
    # shorten every list in config to max three elements
    config = {k: v[:3] if isinstance(v, list) else v for k, v in config.items()}

config = xgb_hyperparams | config

In [51]:
wandb.init(project="traffic-forecasting-challenge", entity="esedx12", config=config, mode=('dryrun' if DEBUG else 'online'))

[34m[1mwandb[0m: Currently logged in as: [33mesedx12[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112996488938936, max=1.0…

In [52]:
# Features from manipulating the idx hour column

def create_hour_feats(idx_hour_series: pl.Series, lags: list[int]) -> pl.DataFrame:
    """
    Create shifted versions of daily 24h count
    """
    daily_hour_series = idx_hour_series % 24  # TODO correct?
    result_df = pl.DataFrame()
    for lag in lags:
        result_df = result_df.with_columns(
            (daily_hour_series + lag % 24).alias(f'24h_shifted_{lag}h')
        )
    return result_df

def create_weekday_feats(idx_hour_series: pl.Series, lags: list[int]) -> pl.DataFrame:
    """
    Create shifted versions of weekday count
    """
    day_series = idx_hour_series // 24
    weekday_series = day_series % 7  # TODO correct?
    result_df = pl.DataFrame()
    for lag in lags:
        result_df = result_df.with_columns(
            (weekday_series + lag % 7).alias(f'weekday_shifted_{lag}d')
        )
    return result_df

In [53]:
# Features from manipulating inidividual time series

def create_lag_feats(series: pl.Series, lags: list[int]) -> pl.DataFrame:
    """
    Create lag features
    """
    result_df = pl.DataFrame()
    for lag in lags:
        result_df = result_df.with_columns(
            series.shift(lag).alias(f'lag_{lag}')
        )
    return result_df

def create_rolling_avg_feats(series: pl.Series, windows: list[int]) -> pl.DataFrame:
    """
    Create rolling average features for a specified series
    """
    result_df = pl.DataFrame()
    for window in windows:
        result_df = result_df.with_columns(
            series.shift(1).rolling_mean(window).alias(f'rolling_avg_{window}')  # Shift by 1 to avoid data leakage
        )
    return result_df

def create_delta_feats(series: pl.Series, reference_points: list[tuple[int, int]]) -> pl.DataFrame:
    """
    Create delta features for a specified series
    """
    result_df = pl.DataFrame()
    for ref_point in reference_points:
        result_df = result_df.with_columns(
            (series.shift(ref_point[0]) - series.shift(ref_point[1])).alias(f'delta_{ref_point[0]}_{ref_point[1]}')
        )
    return result_df

def create_std_feats(series: pl.Series, windows: list[int]) -> pl.DataFrame:
    """
    Create rolling standard deviation features for a specified series
    """
    result_df = pl.DataFrame()
    for window in windows:
        result_df = result_df.with_columns(
            series.shift(1).rolling_std(window).alias(f'std_{window}')
        )
    return result_df

In [54]:
def extract_cell_id(beam_id: str) -> str:
    """
    Extract the cell ID from a beam ID.
    """
    return "_".join(beam_id.split("_")[0:2])

def extract_station_id(beam_id: str) -> int:
    """
    Extract the station ID from a beam ID.
    """
    return beam_id.split("_")[0]

def compute_cell_avg(dataframe: pl.DataFrame, cell_id: str) -> pl.Series:
    """
    Compute the average of a DataFrame for a given cell.
    """
    cell_cols = [col for col in dataframe.columns if col.startswith(cell_id)]
    cell_data = dataframe.select(cell_cols)
    return cell_data.mean(axis=1)

def compute_station_avg(dataframe: pl.DataFrame, station_id: int) -> pl.Series:
    """
    Compute the average of a DataFrame for a given station.
    """
    station_cols = [col for col in dataframe.columns if col.startswith(station_id)]
    station_data = dataframe.select(station_cols)
    return station_data.mean(axis=1)

In [55]:
def create_time_features(idx_hours: pl.Series, hour_shifts: list[int], weekday_shifts: list[int]) -> pl.DataFrame:
    """
    Create time-related features like hourly and weekday shifts.
    """
    daily_hours = create_hour_feats(idx_hours, hour_shifts)
    weekdays = create_weekday_feats(idx_hours, weekday_shifts)
    return pl.concat([daily_hours, weekdays], how="horizontal")

def create_ts_features(ts: pl.Series, rolling_avgs: list[int], lags: list[int], delta_reference_points: list[int], std_windows: list[int]) -> pl.DataFrame:
    """
    Create beam-specific features.
    """
    rolling_avg_feats = create_rolling_avg_feats(ts, rolling_avgs)
    rolling_avg_feats = rolling_avg_feats.rename({col: f"rolling_avg_{col}" for col in rolling_avg_feats.columns})
    
    lag_feats = create_lag_feats(ts, lags)
    lag_feats = lag_feats.rename({col: f"lag_{col}" for col in lag_feats.columns})
    
    delta_feats = create_delta_feats(ts, delta_reference_points)
    delta_feats = delta_feats.rename({col: f"delta_{col}" for col in delta_feats.columns})
    
    std_feats = create_std_feats(ts, std_windows)
    std_feats = std_feats.rename({col: f"std_{col}" for col in std_feats.columns})

    return pl.concat([rolling_avg_feats, lag_feats, delta_feats, std_feats], how="horizontal")

In [56]:
def create_id_features(beam_id: str, length: int) -> pl.DataFrame:
    """
    Create DataFrame columns for cell and station IDs based on beam ID.
    """
    beam_id_col = pl.Series([beam_id] * length).alias("beam_id").to_frame()

    cell_id = extract_cell_id(beam_id)
    cell_id_col = pl.Series([cell_id] * length).alias("cell_id").to_frame()

    station_id = extract_station_id(beam_id)
    station_id_col = pl.Series([station_id] * length).alias("station_id").to_frame() 

    return pl.concat([beam_id_col, cell_id_col, station_id_col], how="horizontal")

In [57]:
def create_features(dataframes: dict[str, pl.DataFrame], config: wandb.Config) -> pl.DataFrame:
    """
    Create features for the traffic forecasting model.
    """
    beam_dfs = []
    idx_hours = dataframes['thp_vol']['idx_hour'].to_frame()

    for beam_id in tqdm(dataframes['thp_vol'].drop('idx_hour').columns):
        feature_dfs = []

        # Create hour and weekday features
        feature_dfs.append(create_time_features(idx_hours['idx_hour'], config.hour_shifts, config.weekday_shifts))

        # Create beam, cell, and station ID features
        feature_dfs.append(create_id_features(beam_id, len(idx_hours)))

        for df_name, df in dataframes.items():
            ts = df[beam_id]
            
            # Create lags, rolling averages, etc. for the time series
            df_feats = create_ts_features(ts, config.rolling_avgs, config.lags, config.delta_reference_points, config.std_windows)
            df_feats = df_feats.rename({col: f"{df_name}_{col}" for col in df_feats.columns})

            # # Create features from the cell and station averages, NB need to apply lag/avg methods to avoid data leakage
            # cell_avg = compute_cell_avg_ts(df, cell_id)
            # station_avg = compute_station_avg(df, station_id)

            feature_dfs.append(df_feats)

        # TODO also add target transformations (maybe sklearn can help)
        targets = pl.DataFrame([
            dataframes['thp_vol'][beam_id].alias("thp_vol"), 
            dataframes['prb'][beam_id].alias("prb"), 
            dataframes['thp_time'][beam_id].alias("thp_time"), 
            dataframes['mr_number'][beam_id].alias("mr_number")
            ])
        targets = targets.rename({col: f"target_{col}" for col in targets.columns})

        # Combine all features for this beam
        beam_df = pl.concat([idx_hours] + feature_dfs + [targets], how="horizontal").drop_nulls()

        # Append the DataFrame for this beam to the list
        beam_dfs.append(beam_df)

    # Concatenate all beams into a single DataFrame
    return pl.concat(beam_dfs, how="vertical")

In [58]:
# Use first config.train_percentage of dataframe rows for training, and the rest for testing

train_dataframes = {k: v.head(math.floor(v.height * wandb.config.train_percentage)) for k, v in dataframes.items()}
test_dataframes = {k: v.tail(math.ceil(v.height * (1 - wandb.config.train_percentage))) for k, v in dataframes.items()}

train_feats = create_features(train_dataframes, wandb.config)
test_feats = create_features(test_dataframes, wandb.config)

wandb.log({'train shape': train_feats.shape, 'test shape': test_feats.shape})

  0%|          | 0/2880 [00:00<?, ?it/s]

  0%|          | 0/2880 [00:00<?, ?it/s]

In [59]:
dropped_cols = ['idx_hour', 'beam_id', 'cell_id', 'station_id']
target_cols = [col for col in train_feats.columns if col.startswith("target")]

# Fit an xgboost model
X_train, y_train = train_feats.drop(target_cols + dropped_cols), train_feats[target_cols]
X_test, y_test = test_feats.drop(target_cols + dropped_cols), test_feats[target_cols]

wandb.config.update({
    'train_feats': X_train.columns,
    'train_shape': X_train.shape,
    'test_shape': X_test.shape,
})

In [60]:

dtrain = xgb.DMatrix(X_train.to_numpy(), label=y_train.to_numpy())
dtest = xgb.DMatrix(X_test.to_numpy(), label=y_test.to_numpy())

watchlist = [(dtrain, 'train'), (dtest, 'test')]

# Dictionary to store evaluation results
evals_result = {}

In [None]:
# TODO change to xgb fit (?)
# TODO add optuna
#     wandbc = WeightsAndBiasesCallback(metric_name="accuracy", wandb_kwargs=wandb_kwargs)
bst = xgb.train(xgb_hyperparams, 
                dtrain, 
                config['num_boost_round'], 
                watchlist, 
                evals_result=evals_result, 
                verbose_eval=5, 
                early_stopping_rounds=config['early_stopping_rounds'],
                )

## Train
* We use the sk-learn API

In [None]:
model = xgb.XGBRegressor(
    

In [62]:
# Predict
train_preds = bst.predict(dtrain)
test_preds = bst.predict(dtest)

# Extract MAE values
train_mae = evals_result['train']['mae']
test_mae = evals_result['test']['mae']

# Create a DataFrame using polars
eval_df = pl.DataFrame({
    'Round': range(1, len(train_mae) + 1),
    'Train MAE': train_mae,
    'Test MAE': test_mae
})

best_test_mae = eval_df['Test MAE'].min()
best_test_mae_round = eval_df.filter(pl.col('Test MAE') == best_test_mae)['Round'][0]
best_train_mae = eval_df['Train MAE'].min()

# Log to wandb
wandb.log({'best_test_mae': best_test_mae, 'best_round': best_test_mae_round, 'best_train_mae': best_train_mae})
wandb.log({'eval_df': wandb.Table(data=eval_df.to_pandas())})

print(f"Best Test MAE: {best_test_mae}")
print(f"Round: {best_test_mae_round}")

# Save model
model_path = Path(f'checkpoints/{wandb.run.name}.ubj')
bst.save_model(model_path)

Best Test MAE: 0.21346909727772195
Round: 238


In [70]:
test_preds
# WHAT IS MAE CALCULATED ON ??

array([[0.7476496 , 1.0659671 , 0.6641862 , 0.9606448 ],
       [0.61770576, 0.96787477, 0.5101617 , 0.9006841 ],
       [1.3303899 , 1.624158  , 1.4107857 , 1.2676449 ],
       ...,
       [0.08550844, 0.12701981, 0.09883477, 0.12284534],
       [0.08078336, 0.09582741, 0.08413868, 0.09643217],
       [0.06998763, 0.11047703, 0.10029584, 0.11158922]], dtype=float32)

In [63]:
# Assuming you have the train_mae and test_mae from evals_result
train_mae = evals_result['train']['mae']
test_mae = evals_result['test']['mae']
rounds = range(1, len(train_mae) + 1)

# Generate the plot using Plotly
fig = px.line(
    x=rounds, 
    y=[train_mae, test_mae],
    labels={'x': 'Boosting Round', 'value': 'Mean Absolute Error'}, 
    title='Training and Test MAE over Boosting Rounds'
)
fig.update_layout(
    legend=dict(
        title='Legend',
        itemsizing='constant'
    ),
    legend_title_text='Dataset'
)
fig.data[0].name = 'Train MAE'
fig.data[1].name = 'Test MAE'

# Log the plot to wandb
wandb.log({"MAE Plot": fig})

# Optionally, display the plot
fig.show()

In [64]:
# # Create a SHAP explainer for the XGBoost model
# explainer = shap.TreeExplainer(bst)

# # Calculate SHAP values for the test set
# shap_values = explainer.shap_values(X_test)

# # Generate and display a summary plot of SHAP values
# shap.summary_plot(shap_values, X_test)

# # Optional: Generate a dependence plot for a specific feature (replace 'feature_index' with the actual feature index)
# # shap.dependence_plot(feature_index, shap_values, X_test)

# # Optional: Generate a force plot for the first instance in the test set
# # shap.force_plot(explainer.expected_value, shap_values[0, :], X_test[0, :])

## Inference

In [65]:
def predict_multi_step(input_dfs: dict[pl.DataFrame], model: xgb.Booster, num_steps: int) -> pl.DataFrame:
    """
    Predict multiple steps into the future using a trained model.

    Parameters:
    - input_dfs: A dictionary containing the required DataFrames. 
                  Each key corresponds to a different type of DataFrame.
    - model: The trained XGBoost model.
    - steps: The number of steps to predict into the future.

    Returns:
    - A Polars DataFrame with the predictions.
    """
    # Create a copy of the input DataFrames
    dataframes = {k: v.clone() for k, v in input_dfs.items()}

    # Create a list to store the predictions
    preds = []

    for step in range(num_steps):
        # Create features for the current step
        features = create_features(dataframes)

        # Extract the features
        X = features.drop(["target"])

        # Predict
        dmatrix = xgb.DMatrix(X.to_numpy())
        y_pred = model.predict(dmatrix)

        # Append the predictions to the list
        preds.append(y_pred)

        # Update the DataFrames with the new predictions
        for df_name, df in dataframes.items():
            # Get the target column
            target_col = df.columns[-1]

            # Shift the target column by 1
            df[target_col] = df[target_col].shift(-1)

            # Update the target column with the new predictions
            df[target_col][-1] = y_pred

    # Concatenate the predictions into a single DataFrame
    return pl.DataFrame(preds).transpose()

In [1]:
def preds_vec_to_df_row(preds: np.ndarray, idxs: pl.DataFrame, output_df: pl.DataFrame) -> pl.Series:
    """
    Convert a numpy array of predictions to a Polars DataFrame with the correct col and row indices.
    """
    

    pred_format_df = pl.concat([idxs, [pl.Series(preds).alias('preds').to_frame()]], how='horizontal')

    
    

NameError: name 'np' is not defined

In [73]:
def mean_absolute_error(Y_true: pl.DataFrame, Y_pred: pl.DataFrame) -> float:
    """
    Compute the mean absolute error between two DataFrames.
    """
    return (Y_true - Y_pred).abs().mean()

In [68]:
wandb.finish()

VBox(children=(Label(value='0.045 MB of 0.045 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
best_round,▁
best_test_mae,▁
best_train_mae,▁

0,1
best_round,238.0
best_test_mae,0.21347
best_train_mae,0.20819


## Create Submission CSV

* Hours in 5 weeks: 840
* Hours in 6 weeks: 1008
* We need period 841-1008 (841-1009 with Python list indexing)

* Hours in 10 weeks: 1680
* Hours in 11 weeks: 1848

In [18]:
def create_half_submission_df(input_df: pl.DataFrame, weeks: str]) -> pl.DataFrame:
    """
    Create a submission CSV file from a Polars DataFrame.
    """
    if weeks == '5w-6w':
        range = [841, 1008]
    elif weeks == '10w-11w':
        range = [1681, 1848]

    # Choose rows with first column 'idx_hour' having the values 671-840.
    input_df = input_df.filter(pl.col('idx_hour').is_in(range)).with_row_index()

    # Check that shape of dataframe is (168, 2881)
    assert input_df.shape == (168, 2881), f"Expected shape (168, 2881), got {input_df.shape}"

    # Check that there is no null value in the dataframe
    assert input_df.is_null().any().any() == False, "Submission dataframe contains null values"

    # Stack the dataframe with f'traffic_DLThpVol_test_5w-6w_{hour}_{beam_id}' as index
    # where it cycles through the values 671-840 for hour and then the beam_ids, which are colnames of input_df
    return input_df.unpivot(index='idx_hour').with_columns(
        (pl.struct(pl.all()).map_elements(lambda row: f'traffic_DLThpVol_test_5w-6w_{row['row_index']}_{row["variable"]}', return_dtype=pl.String)).alias('ID')
    ).select(['ID', 'value']).rename({'value': 'Target'})


def create_submission_csv(input_df: pl.DataFrame, output_filename='traffic_forecast.csv', archiving_dir='submission-csvs-archive') -> pl.DataFrame:
    """
    Create a submission CSV file from data in input format that's been extended to cover weeks 5-6 and 10-11.
    """

    # Create half submission dataframes
    half_submission_5w_6w = create_half_submission_df(input_df, '5w-6w')
    half_submission_10w_11w = create_half_submission_df(input_df, '10w-11w')

    # Concatenate the two half submission dataframes
    submission_df = pl.concat([half_submission_5w_6w, half_submission_10w_11w], how='vertical')

    # Save the submission dataframe to a CSV file for submission
    submission_df.write_csv(output_filename)
    
    # Save the submission dataframe to a CSV file for archiving
    if archiving_dir:
        archiving_dir = Path(archiving_dir)
        archiving_dir.mkdir(parents=True, exist_ok=True)
        submission_df.write_csv(archiving_dir / f'{wandb.run.name}_{output_filename}')

    return submission_df