In [None]:
import pandas as pd

# Preprocessing and features
from skforecast.preprocessing import RollingFeatures
from skforecast.recursive import ForecasterRecursiveMultiSeries

from lightgbm import LGBMRegressor

from src.paths import TRANSFORMED_DATA_DIR

In [11]:
# Load the transformed table and index it by timestamp in one chain, so the
# calendar-based exogenous features can be derived from the index later.
csv_path = TRANSFORMED_DATA_DIR / 'ts_tabular_2023_1_to_2024_10.csv'
data = (
    pd.read_csv(csv_path)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .set_index('datetime')
)
data.head()

Unnamed: 0_level_0,ba_AECI,ba_AVA,ba_AZPS,ba_BANC,ba_BPAT,ba_CAL,ba_CAR,ba_CENT,ba_CHPD,ba_CISO,...,ba_TEN,ba_TEPC,ba_TEX,ba_TIDC,ba_TPWR,ba_TVA,ba_US48,ba_WACM,ba_WALC,ba_WAUW
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-01-01,59909.0,38266.0,74476.0,39939.0,167930.0,621178.0,478148.0,644349.0,7685.0,512043.0,...,342521.0,31528.0,899540.0,5677.0,14380.0,342521.0,9389054.0,87420.0,18327.0,2469.0
2023-01-02,58056.0,40165.0,76802.0,46635.0,187046.0,692440.0,519117.0,669956.0,8058.0,572456.0,...,354751.0,33047.0,970461.0,6594.0,16639.0,354751.0,9984103.0,93155.0,19378.0,2751.0
2023-01-03,58551.0,43507.0,77934.0,47536.0,185754.0,736465.0,530603.0,719373.0,7708.0,610457.0,...,367694.0,34068.0,965338.0,7051.0,16742.0,367694.0,10451658.0,95885.0,19580.0,2914.0
2023-01-04,73122.0,42674.0,75801.0,47250.0,188878.0,732759.0,524179.0,760561.0,7763.0,606172.0,...,374897.0,32894.0,981661.0,7263.0,16474.0,374897.0,10528590.0,99419.0,18547.0,2835.0
2023-01-05,77401.0,41295.0,77519.0,45100.0,173307.0,718498.0,538507.0,771291.0,7668.0,598329.0,...,429229.0,34291.0,1011497.0,7123.0,14473.0,429229.0,10760439.0,98267.0,18793.0,2829.0


In [None]:
# Give the DatetimeIndex an explicit daily frequency.
# NOTE(review): with pandas' default asfreq behaviour, any calendar day absent
# from the CSV would appear as an all-NaN row after this call -- worth checking.
data = data.asfreq(freq="1D")

In [16]:
# Build calendar-based exogenous features from the DatetimeIndex.
# Every feature column is prefixed with "exog_" so it can be selected later
# with data.filter(like="exog_").
import holidays

# Cover every year present in the index rather than a hard-coded list, so the
# holiday flag stays correct if the input file is extended.
us_holidays = holidays.US(years=data.index.year.unique().tolist())
data["exog_is_holiday"] = data.index.map(lambda day: day in us_holidays).astype(int)

data["exog_month"] = data.index.month
data["exog_day_of_week"] = data.index.dayofweek  # Monday = 0 ... Sunday = 6
data["exog_is_weekend"] = data["exog_day_of_week"].isin([5, 6]).astype(int)

# Meteorological seasons:
#   1 = Winter (12, 1, 2); 2 = Spring (3, 4, 5);
#   3 = Summer (6, 7, 8);  4 = Autumn (9, 10, 11)
# `month % 12` maps December to 0 so it is grouped with January and February.
# The previous formula, ((month - 1) // 3) + 1, produced calendar quarters
# (Jan-Mar = 1, ..., Oct-Dec = 4), which did not match the intended seasons.
data["exog_season"] = ((data["exog_month"] % 12) // 3) + 1

In [18]:
# Preview only the engineered exogenous columns
exog_preview = data.filter(like="exog_")
exog_preview.head()

Unnamed: 0_level_0,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2023-01-01,1,1,6,1,1
2023-01-02,1,1,0,0,1
2023-01-03,0,1,1,0,1
2023-01-04,0,1,2,0,1
2023-01-05,0,1,3,0,1


In [29]:
# Autoregressive inputs: the values 1 day, 1 week, 6 months and 12 months back
lag_steps = [1, 7, 182, 365]

# Rolling means over 1 week, 1 month, 6 months and 12 months
rolling_means = RollingFeatures(
    stats=['mean'] * 4,
    window_sizes=[7, 30, 182, 365],
)

# Multi-series recursive forecaster backed by LightGBM; the series identity is
# passed to the regressor as an ordinal code.
forecaster = ForecasterRecursiveMultiSeries(
    regressor=LGBMRegressor(random_state=123, verbose=-1),
    lags=lag_steps,
    window_features=rolling_means,
    encoding='ordinal',
)

# Targets are the "ba_" columns; calendar features are the "exog_" columns.
target_series = data.filter(like="ba_")
calendar_exog = data.filter(like="exog_")
forecaster.fit(series=target_series, exog=calendar_exog)
forecaster

In [39]:
# Inspect the training matrix the forecaster builds for a two-series subset
# (AECI and AVA), using the same exogenous columns as in the fit.
first_ba = data[["ba_AECI", "ba_AVA"]]
exog_columns = data.filter(like="exog_")
train_matrices = forecaster.create_train_X_y(series=first_ba, exog=exog_columns)
X_train = train_matrices[0]  # keep only the first returned element (the predictors)
display(X_train.head())

Unnamed: 0_level_0,lag_1,lag_7,lag_182,lag_365,roll_mean_7,roll_mean_30,roll_mean_182,roll_mean_365,_level_skforecast,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-01-01,77857.0,64340.0,73125.0,59909.0,74842.428571,68214.233333,64997.917582,65021.356164,0,1,1,0,0,1
2024-01-02,78611.0,73005.0,78119.0,58056.0,76881.142857,68614.866667,65028.06044,65072.594521,0,0,1,1,0,1
2024-01-03,81919.0,75775.0,73746.0,58551.0,78154.571429,69185.133333,65048.93956,65137.972603,0,0,1,2,0,1
2024-01-04,78631.0,77685.0,68196.0,73122.0,78562.571429,69593.4,65075.78022,65192.986301,0,0,1,3,0,1
2024-01-05,78007.0,79943.0,67293.0,77401.0,78608.571429,69879.7,65129.686813,65206.369863,0,0,1,4,0,1


In [43]:
# Training rows whose ordinal series code is 0 (the first BA)
is_first_level = X_train["_level_skforecast"].eq(0)
X_train.loc[is_first_level].head(3)

Unnamed: 0_level_0,lag_1,lag_7,lag_182,lag_365,roll_mean_7,roll_mean_30,roll_mean_182,roll_mean_365,_level_skforecast,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-01-01,77857.0,64340.0,73125.0,59909.0,74842.428571,68214.233333,64997.917582,65021.356164,0,1,1,0,0,1
2024-01-02,78611.0,73005.0,78119.0,58056.0,76881.142857,68614.866667,65028.06044,65072.594521,0,0,1,1,0,1
2024-01-03,81919.0,75775.0,73746.0,58551.0,78154.571429,69185.133333,65048.93956,65137.972603,0,0,1,2,0,1


In [None]:
# Training rows whose ordinal series code is 1 (the second BA)
is_second_level = X_train["_level_skforecast"].eq(1)
X_train.loc[is_second_level].head(3)

Unnamed: 0_level_0,lag_1,lag_7,lag_182,lag_365,roll_mean_7,roll_mean_30,roll_mean_182,roll_mean_365,_level_skforecast,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2024-01-01,36136.0,38164.0,32641.0,38266.0,37940.714286,38696.4,34903.824176,35826.660274,1,1,1,0,0,1
2024-01-02,36077.0,38846.0,32721.0,40165.0,37642.571429,38578.166667,34922.703297,35820.663014,1,0,1,1,0,1
2024-01-03,38528.0,39924.0,34764.0,43507.0,37597.142857,38525.6,34954.60989,35816.178082,1,0,1,2,0,1


In [38]:
# Raw rows from the first training date onward, for cross-checking the lag
# columns in X_train against the source values.
first_train_day = pd.Timestamp("2024-01-01")
data.loc[first_train_day:].head()

Unnamed: 0_level_0,ba_AECI,ba_AVA,ba_AZPS,ba_BANC,ba_BPAT,ba_CAL,ba_CAR,ba_CENT,ba_CHPD,ba_CISO,...,ba_TVA,ba_US48,ba_WACM,ba_WALC,ba_WAUW,exog_is_holiday,exog_month,exog_day_of_week,exog_is_weekend,exog_season
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2024-01-01,78611.0,36077.0,79345.0,40454.0,168877.0,625536.0,621622.0,759462.0,6637.0,516489.0,...,467122.0,10655169.0,88369.0,16582.0,2150.0,1,1,0,0,1
2024-01-02,81919.0,38528.0,80633.0,45783.0,180449.0,685735.0,706554.0,801892.0,6911.0,563905.0,...,496786.0,11691464.0,92783.0,16596.0,2308.0,0,1,1,0,1
2024-01-03,78631.0,39052.0,84726.0,45870.0,181536.0,701915.0,735424.0,798097.0,6915.0,578043.0,...,548142.0,11940253.0,95553.0,17667.0,2422.0,0,1,2,0,1
2024-01-04,78007.0,38172.0,85372.0,45849.0,178161.0,706456.0,692597.0,816406.0,6785.0,578584.0,...,538419.0,11873250.0,97184.0,17782.0,2263.0,0,1,3,0,1
2024-01-05,80639.0,38133.0,87058.0,44772.0,180235.0,700442.0,754904.0,814876.0,7336.0,574812.0,...,542626.0,12001513.0,100482.0,17730.0,2336.0,0,1,4,0,1
