## Libraries

In [None]:
# Install mlforecast together with its `lag_transforms` extra into the kernel's environment
# (-qqq keeps pip quiet). NOTE(review): no version is pinned here.
%pip install -qqq "mlforecast[lag_transforms]"

In [2]:
from pathlib import Path

import lightgbm as lgb
import mlforecast
import numpy as np
import pandas as pd
from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean, SeasonalRollingMean

In [None]:
import mlforecast

In [None]:
# Show which mlforecast version this notebook is running against.
mlforecast.__version__

### Calendar

In [None]:
# Column -> dtype map for calendar.csv; only these columns plus 'date' are loaded.
cal_dtypes = dict(
    d='category',
    wm_yr_wk=np.uint16,
    event_name_1='category',
    event_type_1='category',
    event_name_2='category',
    event_type_2='category',
    snap_CA=np.uint8,
    snap_TX=np.uint8,
    snap_WI=np.uint8,
)
cal = pd.read_csv(
    'calendar.csv',
    dtype=cal_dtypes,
    usecols=[*cal_dtypes, 'date'],
    parse_dates=['date'],
)
# Missing event values become an explicit 'nan' category in every event column.
event_cols = [name for name in cal_dtypes if name.startswith('event')]
cal = cal.assign(**{
    name: cal[name].cat.add_categories('nan').fillna('nan')
    for name in event_cols
})

### Prices

In [None]:
# Column -> dtype map used when reading sell_prices.csv.
prices_dtypes = dict(
    store_id='category',
    item_id='category',
    wm_yr_wk=np.uint16,
    sell_price=np.float32,
)
prices = pd.read_csv(
    'sell_prices.csv',
    dtype=prices_dtypes,
)

### Sales

In [None]:
# Identifier columns are categorical; 'item_id' reuses the categorical dtype of prices.item_id.
sales_dtypes = {
    'id': 'category',
    'item_id': prices.item_id.dtype,
    'dept_id': 'category',
    'cat_id': 'category',
    'store_id': 'category',
    'state_id': 'category',
}
# One float32 entry per daily column name d_0 ... d_1941.
sales_dtypes.update({f'd_{day}': np.float32 for day in range(1942)})
sales = pd.read_csv('sales_train_evaluation.csv', dtype=sales_dtypes)

In [None]:
# Wide -> long: the identifier columns are kept, every other column becomes a ('d', 'y') row.
long = pd.melt(
    sales,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='y',
)

In [None]:
# Restrict the data to rows whose cat_id is "HOBBIES".
long = long.loc[long['cat_id'] == "HOBBIES"]

In [None]:
%%time
print(long.shape[0])
long['date_idx'] = long['d'].str.replace('d_', '').astype('int32')
dates = sorted(long['date_idx'].unique())
long = long.sort_values(['id', 'date_idx'])
long['d'] = long['d'].astype(cal.d.dtype)
long = long.merge(cal, on=['d'])
long = long.merge(prices, on=['store_id', 'item_id', 'wm_yr_wk'])
last_wmyrwk = long['wm_yr_wk'].max()
last_date_train = long['date'].max()
long = long.drop(columns=['d', 'date_idx', 'wm_yr_wk'])
long = long.sample(frac=1.0, random_state=0).reset_index(drop=True)
long=long[long['sell_price'].notna()]


In [None]:
# `date_idx` is no longer a column of `long` (it was dropped after the merges), so the
# window start is expressed as a calendar date: the date of the 400th-from-last day index.
first_kept_date = cal.loc[cal['d'] == f'd_{dates[-400]}', 'date'].iloc[0]
# Rows were shuffled earlier, so the cumulative "has sold at least once" flag is computed
# on a time-ordered copy of the needed columns and then realigned to `long`'s row order.
in_time_order = long[['id', 'date', 'y']].sort_values(['id', 'date'])
without_leading_zeros = (
    in_time_order['y'].gt(0)
    .groupby(in_time_order['id'], observed=True)
    .transform('cummax')
    .reindex(long.index)
)
above_min_date = long['date'] >= first_kept_date
# Keep rows from each series' first sale onwards that also fall inside the window.
keep_mask = without_leading_zeros & above_min_date
long = long[keep_mask]
print(long.shape[0])

In [None]:
# Future exogenous features: calendar rows after the last training date joined with the
# prices from the last training week onwards.
future_cal = cal.loc[cal['date'] > last_date_train]
future_prices = prices.loc[prices['wm_yr_wk'] >= last_wmyrwk].copy()
# Series id is '<item_id>_<store_id>_evaluation', cast to the dtype of sales.id.
future_ids = (
    future_prices['item_id'].astype(str)
    + '_'
    + future_prices['store_id'].astype(str)
    + '_evaluation'
)
future_prices['id'] = future_ids.astype(sales.id.dtype)
X_df = (
    future_prices
    .merge(future_cal, on='wm_yr_wk')
    .drop(columns=['store_id', 'item_id', 'wm_yr_wk', 'd'])
)

In [None]:
# Strip the '_evaluation' suffix from the ids and store them as a fresh categorical.
X_df = X_df.assign(id=X_df['id'].str.replace('_evaluation', '').astype('category'))

In [None]:
# Persist the future-features frame to X_df.csv without the index column.
X_df.to_csv("X_df.csv",index=False)

## Training

In [None]:
# Keyword arguments forwarded to LGBMRegressor.
model_params = dict(
    verbose=-1,
    force_col_wise=True,
    num_leaves=256,
    n_estimators=50,
)

# Daily-frequency forecaster: weekly lags 7..56, an expanding mean on lag 1, and the same
# set of rolling / seasonal rolling means on lags 7, 14 and 28.
fcst = MLForecast(
    models=[lgb.LGBMRegressor(**model_params)],
    freq='D',
    lags=list(range(7, 57, 7)),
    lag_transforms={
        1: [ExpandingMean()],
        **{
            lag: [RollingMean(7), RollingMean(14), RollingMean(28), SeasonalRollingMean(7, 4)]
            for lag in (7, 14, 28)
        },
    },
    date_features=['year', 'month', 'day', 'dayofweek', 'quarter', 'week'],
    num_threads=4,
)

In [None]:
# Remove the item, department and state identifier columns.
long = long.drop(columns=['item_id', 'dept_id', 'state_id'])

In [None]:
# The target column 'y' is called 'sold' from here on.
long = long.rename(columns={"y": "sold"})

In [None]:
# One-column frame holding the monthly period of each row's date.
y = long['date'].dt.to_period('M').to_frame()

In [None]:
# Name the monthly-period column 'YearMonth'.
y = y.rename(columns={'date': 'YearMonth'})

In [None]:
# Splice the columns of `y` in after the first four columns of `long`.
leading_cols, trailing_cols = long.iloc[:, :4], long.iloc[:, 4:]
long = pd.concat([leading_cols, y, trailing_cols], axis='columns')

In [None]:
# Strip the '_evaluation' suffix from the ids and store them as a fresh categorical.
long = long.assign(id=long['id'].str.replace('_evaluation', '').astype('category'))

In [None]:
# Store the target as 16-bit integers.
long = long.astype({'sold': 'int16'})

In [None]:
# Preview the first rows of the training frame.
long.head()

In [None]:
# Show the dtype of every column in the training frame.
long.dtypes

In [None]:
# Total of the per-column (and index) byte counts reported by memory_usage().
long.memory_usage().sum()

In [None]:
# Persist the training frame to long.csv without the index column.
long.to_csv("long.csv",index=False)

In [None]:
def downcast(df):
    """Shrink the columns of ``df`` to smaller dtypes, in place, and return ``df``.

    Per column, depending on its current dtype:

    * NumPy integer (signed or unsigned): cast to the first of int8 / int16 / int32
      whose inclusive range contains the column's min and max, otherwise int64.
    * NumPy float: cast to float16 only when the values are in range *and* survive the
      round trip unchanged; otherwise to float32 when in range; otherwise float64.
      float16 keeps roughly three significant digits, so a range check alone would
      silently round values such as prices.
    * object: the column named ``'date'`` is parsed with ``pd.to_datetime`` using the
      ``%Y-%m-%d`` format; every other object column becomes ``category``.
    * anything else (category, datetime, bool, extension dtypes, ...): left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    for name in df.columns:
        column = df[name]
        dtype = column.dtype
        # Extension dtypes are not NumPy dtypes; checking the dtype kind (instead of
        # searching the dtype's string form) keeps e.g. interval columns out of the
        # numeric branches.
        if not isinstance(dtype, np.dtype):
            continue

        if dtype.kind in 'iu':
            low, high = column.min(), column.max()
            target = np.int64
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                # Inclusive bounds: the extreme values themselves are representable.
                if limits.min <= low and high <= limits.max:
                    target = candidate
                    break
            df[name] = column.astype(target)
        elif dtype.kind == 'f':
            low, high = column.min(), column.max()
            target = np.float64
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if not (limits.min <= low and high <= limits.max):
                    continue
                if candidate is np.float16:
                    # Only accept half precision when no value changes; NaNs are ignored
                    # in the comparison because NaN != NaN.
                    round_trip = column.astype(np.float16).astype(dtype)
                    if not ((round_trip == column) | column.isna()).all():
                        continue
                target = candidate
                break
            df[name] = column.astype(target)
        elif dtype == object:
            if name == 'date':
                df[name] = pd.to_datetime(column, format='%Y-%m-%d')
            else:
                df[name] = column.astype('category')
    return df


In [None]:
# Shrink the column dtypes of the training frame (downcast modifies and returns the same frame).
long=downcast(long)

In [None]:
%%time
fcst.fit(
    long.drop(columns=['YearMonth'],inplace=False),
    id_col='id',
    time_col='date',
    target_col='sold',
    static_features=['id',  'cat_id', 'store_id'],
)

## Forecasting

In [None]:
# Predict with a horizon of 28, passing X_df as the future features; %time reports the duration.
%time preds = fcst.predict(28, X_df=X_df)

In [None]:
 # NOTE(review): same predict call as the timed cell above, so `preds` is recomputed here.
 preds = fcst.predict(28, X_df=X_df)

In [None]:
# Display the forecast frame.
preds

## Converting To Horizontal


In [None]:
# One row per id, one column per forecast date; columns are then relabelled F1..F28.
wide = preds.pivot_table(index='id', columns='date')
wide.columns = pd.Index([f'F{step}' for step in range(1, 29)], name=None)
wide.index = wide.index.rename('id')
wide