In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from functools import partial
from typing import Optional

In [2]:
# Load the three raw input tables; only the calendar's "date" column is parsed as datetime.
df_calendar = pd.read_csv(
    "../data/calendar.csv",
    parse_dates=["date"],
)
df_prices = pd.read_csv("../data/sell_prices.csv")
df_wide_train_val = pd.read_csv("../data/sales_train_validation.csv")

## Calendar
* create a single event name (instead of event_name_1, event_name_2)
* remove SNAP columns of states other than the selected store's state (CA here)
* create a yearly seasonality using day-of-year
    * or a yearly seasonality using week-of-year (divide the period T by 7 and take t="week_of_year")
* remove year (cannot predict a new offset)
* remove (for now) day_of_year and week_of_year (they can be meaningful)
* truncate to the relevant dates

In [3]:
# Peek at the wide sales table: id columns followed by one d_<n> column per day.
df_wide_train_val.head(2)

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [4]:
def create_dict_from_df(df: pd.DataFrame, key_col: str, val_col: str):
    """Build a ``{key: value}`` lookup from two columns of ``df``.

    Args:
        df: source table.
        key_col: column whose values become the dictionary keys.
        val_col: column whose values become the dictionary values.

    Returns:
        A dict mapping each ``key_col`` value to the ``val_col`` value of the
        same row.
    """
    keyed = df.set_index(key_col)
    lookup = keyed[val_col].to_dict()
    return lookup

In [5]:
# Scope of this notebook: a single department within a single store.
store_id = "CA_1"
dept_id = "FOODS_3"
# The state code is the part of the store id before the first underscore.
state_id = store_id.partition("_")[0]

In [6]:
# Column names used for the date key and the sales target in the long-format tables.
date_col = "date"
target_col = "sales"

In [7]:
# Map the calendar's day labels (column "d") to dates, and build the reverse lookup as well.
day2date = create_dict_from_df(df_calendar, key_col="d", val_col=date_col)
date2day = dict(zip(day2date.values(), day2date.keys()))

In [8]:
max_lag_size = 30
end_date = pd.Timestamp("2015-12-31")
# Start max_lag_size days before 2012-01-01 so that the window includes the max lag.
start_date = pd.Timestamp("2012-01-01") - pd.Timedelta(days=max_lag_size)
# Day labels (calendar column "d") matching the two boundary dates.
start_day = date2day[start_date]
end_day = date2day[end_date]

In [9]:
# SNAP indicator columns that belong to states other than the selected one.
snap_cols = [
    col
    for col in df_calendar.columns
    if col.startswith("snap_") and not col.endswith(state_id)
]

In [10]:
# Drop the day label ("d") and the SNAP columns of the other states.
cols_to_drop = ["d", *snap_cols]
df_calendar = df_calendar.drop(columns=cols_to_drop)

# Keep only the rows whose date lies between start_date and end_date (both inclusive).
in_date_range = df_calendar["date"].between(start_date, end_date)
df_calendar = df_calendar[in_date_range]
df_calendar

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA
307,2011-12-02,11144,Friday,7,12,2011,,,,,1
308,2011-12-03,11145,Saturday,1,12,2011,,,,,1
309,2011-12-04,11145,Sunday,2,12,2011,,,,,1
310,2011-12-05,11145,Monday,3,12,2011,,,,,1
311,2011-12-06,11145,Tuesday,4,12,2011,,,,,1
...,...,...,...,...,...,...,...,...,...,...,...
1793,2015-12-27,11548,Sunday,2,12,2015,,,,,0
1794,2015-12-28,11548,Monday,3,12,2015,,,,,0
1795,2015-12-29,11548,Tuesday,4,12,2015,,,,,0
1796,2015-12-30,11548,Wednesday,5,12,2015,,,,,0


In [11]:
# df_calendar['event_name_1'].unique()

In [12]:
class EventTransformer(BaseEstimator, TransformerMixin):
    """Add one indicator column per event name found in ``event_cols``.

    For every row the non-null values of ``event_cols`` are collected; a row
    without any event is assigned ``event_baseline``. The collected events are
    pivoted into one count column per distinct event name and appended to the
    input frame.

    Parameters
    ----------
    event_cols : columns holding event names (null when there is no event).
    event_baseline : label given to rows that have no event at all.
    date_col : name of the date column. Kept for interface compatibility; the
        indicator columns are aligned by row position, so it is not read.
    drop_baseline : drop the indicator column of ``event_baseline``.
    drop_event_cols : drop the original ``event_cols`` after encoding.
    """

    def __init__(
        self,
        event_cols: list[str],
        event_baseline: str="No Event",
        date_col: str="date",
        drop_baseline: bool=False,
        drop_event_cols: bool=False
    ):
        # Only store the constructor arguments here. Helpers bound to these values are
        # created at call time so they cannot go stale after ``set_params``.
        self.event_cols = event_cols
        self.event_baseline = event_baseline
        self.date_col = date_col
        self.drop_baseline = drop_baseline
        self.drop_event_cols = drop_event_cols
        self._wide_event_cols: pd.Index  # indicator columns created by the last transform()

    @staticmethod
    def merge_events(event_series, event_baseline: str) -> list[str]:
        """Return the non-null events of one row, or ``[event_baseline]`` if there are none."""
        events = event_series[pd.notnull(event_series)].tolist()
        return events if events else [event_baseline]

    @staticmethod
    def unpivot_events(s: pd.Series, event_baseline: str, max_events: int) -> list[str]:
        """Turn one row of indicators back into a list of event names.

        The names of the columns equal to 1 are returned, padded with
        ``event_baseline`` up to ``max_events`` entries.
        """
        event_lst = s.index[s == 1].tolist()
        # A negative repeat count yields an empty list, so no padding is added when full.
        return event_lst + [event_baseline] * (max_events - len(event_lst))

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """No-op; the indicator columns are derived from the data passed to ``transform``."""
        return self

    def transform(self, X):
        """Append the event indicator columns to a copy of ``X``.

        Returns:
            ``X`` plus one column per event name, without ``event_cols`` when
            ``drop_event_cols`` is set.
        """
        X = X.copy()
        event_col = "all_events"
        merge = partial(self.merge_events, event_baseline=self.event_baseline)
        # One list of events per row, in the row order of X.
        events_per_row = X[self.event_cols].apply(merge, axis=1).rename(event_col)
        # Long representation keyed by row position rather than by date, so the result
        # stays aligned with X even when dates are unsorted or repeated.
        long_events = (
            events_per_row
            .reset_index(drop=True)
            .rename_axis("row")
            .reset_index()
            .explode(column=event_col)
            .reset_index(drop=True)
        )
        X_wide_events = pd.crosstab(index=long_events["row"], columns=long_events[event_col])
        # Every row has at least one entry (the baseline), so the crosstab has len(X) rows.
        X_wide_events.index = X.index
        if self.drop_baseline:
            # errors="ignore": the baseline column is absent when every row has an event.
            X_wide_events = X_wide_events.drop(columns=self.event_baseline, errors="ignore")
        # Always remember the created columns; inverse_transform needs them in every mode.
        self._wide_event_cols = X_wide_events.columns
        X = pd.concat([X, X_wide_events], axis=1)

        if self.drop_event_cols:
            X = X.drop(columns=self.event_cols)
        return X

    def inverse_transform(self, X):
        """Remove the indicator columns, rebuilding ``event_cols`` if they were dropped.

        Rebuilt rows hold ``event_baseline`` (not null) where there was no
        event, so this is not an exact inverse of ``transform``.
        """
        X = X.copy()
        if self.drop_event_cols:
            unpivot = partial(
                self.unpivot_events,
                event_baseline=self.event_baseline,
                max_events=len(self.event_cols),
            )
            X[self.event_cols] = X[self._wide_event_cols].apply(unpivot, axis=1, result_type="expand")
        X = X.drop(columns=self._wide_event_cols)
        return X

In [13]:
# Calendar columns that hold the event names of a day.
event_cols = ["event_name_1", "event_name_2"]

# Keep the original event columns, but drop the indicator column of the "No Event" baseline.
event_transformer = EventTransformer(
    event_cols=event_cols,
    date_col=date_col,
    event_baseline="No Event",
    drop_event_cols=False,
    drop_baseline=True,
)

df_calendar = event_transformer.fit_transform(df_calendar)
df_calendar

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,...,OrthodoxEaster,Pesach End,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay
307,2011-12-02,11144,Friday,7,12,2011,,,,,...,0,0,0,0,0,0,0,0,0,0
308,2011-12-03,11145,Saturday,1,12,2011,,,,,...,0,0,0,0,0,0,0,0,0,0
309,2011-12-04,11145,Sunday,2,12,2011,,,,,...,0,0,0,0,0,0,0,0,0,0
310,2011-12-05,11145,Monday,3,12,2011,,,,,...,0,0,0,0,0,0,0,0,0,0
311,2011-12-06,11145,Tuesday,4,12,2011,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1793,2015-12-27,11548,Sunday,2,12,2015,,,,,...,0,0,0,0,0,0,0,0,0,0
1794,2015-12-28,11548,Monday,3,12,2015,,,,,...,0,0,0,0,0,0,0,0,0,0
1795,2015-12-29,11548,Tuesday,4,12,2015,,,,,...,0,0,0,0,0,0,0,0,0,0
1796,2015-12-30,11548,Wednesday,5,12,2015,,,,,...,0,0,0,0,0,0,0,0,0,0


## Train-val
* truncate based on start and end date, and filter by department, state and store
* filter 1: keep series with a relatively low average
    * mean below 0.5 and above 0.15
* filter 2: keep series that are not too sparse in the last 6 months
* wide-to-long transformation

In [14]:
# Columns that together identify a single time series.
index_cols = ["store_id", "item_id"]
index_cols

['store_id', 'item_id']

In [15]:
# Hierarchical id columns (object dtype) that are not part of the series key; they are dropped below.
object_cols = df_wide_train_val.select_dtypes("O").columns
drop_train_cols = sorted(set(object_cols) - set(index_cols))
drop_train_cols

['cat_id', 'dept_id', 'id', 'state_id']

In [16]:
# Restrict the wide sales table to the selected department and store, then drop the unused ids.
is_selected = (df_wide_train_val["dept_id"] == dept_id) & (df_wide_train_val["store_id"] == store_id)
df_wide_train_val = df_wide_train_val[is_selected].drop(columns=drop_train_cols)

df_wide_train_val.head()

Unnamed: 0,item_id,store_id,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
2226,FOODS_3_001,CA_1,1,1,1,1,1,0,1,2,...,0,0,1,2,0,0,1,0,0,1
2227,FOODS_3_002,CA_1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2228,FOODS_3_003,CA_1,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,0,1,0
2229,FOODS_3_004,CA_1,0,0,0,0,0,0,0,0,...,1,2,0,0,0,0,0,2,0,1
2230,FOODS_3_005,CA_1,1,0,1,2,2,0,1,1,...,0,0,2,0,0,0,0,0,1,0


In [17]:
class SparseTSFilter(BaseEstimator, TransformerMixin):
    """Keep intermittent series whose mean target value lies between fixed bounds.

    ``fit`` computes, for every row of the wide table, the mean over all
    remaining columns and remembers the ``index_cols`` keys of the rows whose
    mean is within ``[mean_lb, mean_ub]`` (both ends inclusive).
    ``transform`` keeps only the rows with a remembered key.
    """

    def __init__(self, index_cols: list[str], mean_lb: float=0.15, mean_ub: float=0.5):
        self.index_cols = index_cols
        self.mean_lb = mean_lb  # lowest mean a kept series may have
        self.mean_ub = mean_ub  # highest mean a kept series may have
        self._sparse_series: pd.DataFrame

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """Find the series keys whose row mean falls inside the configured band."""
        # Object columns outside the key cannot be averaged, so they are removed first.
        extra_object_cols = X.select_dtypes("O").columns.difference(set(self.index_cols))
        mean_demand = (
            X
            .drop(columns=extra_object_cols)
            .set_index(self.index_cols)
            .mean(axis=1)
        )
        within_band = mean_demand.between(self.mean_lb, self.mean_ub)
        kept_keys = mean_demand[within_band].reset_index()
        self._sparse_series = kept_keys[self.index_cols]
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return the rows of ``X`` whose key was selected during ``fit``."""
        # The default inner merge acts as the filter.
        return pd.merge(X.copy(), self._sparse_series, on=self.index_cols)


class PredictabilityFilter(BaseEstimator, TransformerMixin):
    """Filter out non-predictable series (too many zeros).

    A series is kept when its ``q``-quantile, computed over the wide columns
    between ``date2day[start_date]`` and ``date2day[end_date]`` (label slice,
    both ends included), is greater than 0.

    Parameters
    ----------
    q : quantile level passed to ``DataFrame.quantile``.
    start_date, end_date : boundaries of the evaluation window.
    date2day : mapping from a date to the name of its wide column.
    """

    def __init__(self, q: float, start_date: pd.Timestamp, end_date: pd.Timestamp, date2day: dict):
        self.q = q
        self.start_date = start_date
        self.end_date = end_date
        self.date2day = date2day
        self._df_q_sales: pd.DataFrame  # per-series quantile computed in fit()
        self._df_predictable_series: pd.DataFrame  # id columns of the series that are kept

    @property
    def q_percent(self) -> int:
        """The quantile level as a whole percentage, used to name the quantile column."""
        # round() instead of a bare int(): 0.29 * 100 is 28.999999999999996, which
        # int() would truncate to 28.
        return int(round(self.q * 100))

    @property
    def n_days(self) -> int:
        """Number of days between ``start_date`` and ``end_date``."""
        return (self.end_date - self.start_date).days

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """Compute the per-series quantile and remember the series where it is positive."""
        col_name = f"q{self.q_percent}_sales"
        # Every object column is treated as part of the series identifier.
        id_cols = X.select_dtypes("O").columns.tolist()
        first_day = self.date2day[self.start_date]
        last_day = self.date2day[self.end_date]
        window = X.set_index(id_cols).loc[:, first_day:last_day]
        # Vectorised row-wise quantile; avoids building one Series per row through apply().
        self._df_q_sales = window.quantile(q=self.q, axis=1).to_frame(name=col_name)
        is_predictable = self._df_q_sales[col_name] > 0
        self._df_predictable_series = self._df_q_sales[is_predictable].reset_index()[id_cols]
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return the rows of ``X`` that belong to a series kept during ``fit``."""
        X = X.copy()
        id_cols = X.select_dtypes("O").columns.tolist()
        # The default inner merge acts as the filter.
        X = pd.merge(X, self._df_predictable_series, on=id_cols)
        return X



class WidetoLongTransformer(BaseEstimator, TransformerMixin):
    """Reshape a wide table (one ``<wide_prefix><n>`` column per period) into long format.

    Parameters
    ----------
    index_cols : columns identifying a series; they stay as columns in the result.
    long_dict : mapping from a wide column name (``f"{wide_prefix}{n}"``) to the
        value stored in ``long_col``.
    wide_prefix : common prefix of the wide value columns.
    long_col : name of the column holding the mapped period values.
    target_col : name of the column holding the former wide values.
    """

    def __init__(self, index_cols: list[str], long_dict: dict, wide_prefix: str="d_", long_col: str="date", target_col: str="sales"):
        self.index_cols = index_cols
        self.long_dict = long_dict
        self.wide_prefix = wide_prefix
        self.long_col = long_col
        self.target_col = target_col
        self.cols_to_drop: list[str]  # object columns outside index_cols, set in fit()

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """Remember the object columns that are not part of ``index_cols``."""
        self.cols_to_drop = X.select_dtypes("O").columns.difference(set(self.index_cols)).tolist()
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return the long-format table with ``index_cols``, ``long_col`` and ``target_col``."""
        X = X.copy()
        X = (
            pd.wide_to_long(
                X.drop(columns=self.cols_to_drop),
                stubnames=self.wide_prefix,
                # Use the configured key columns, not a same-named notebook global.
                i=self.index_cols,
                j=self.long_col,
            )
            .rename(columns={self.wide_prefix: self.target_col})
        )
        X = X.reset_index()
        # wide_to_long leaves only the suffix in long_col; rebuild the wide name to look it up.
        X[self.long_col] = X[self.long_col].map(lambda el: self.long_dict[f"{self.wide_prefix}{el}"])
        return X



class IntervalTransformer(BaseEstimator, TransformerMixin):
    """Add run-length counters of zero and non-zero target values per group.

    * ``periods since last <target_col>``: number of consecutive zero-valued
      periods up to and including the current one (0 on a non-zero period).
    * ``periods since last 0 <target_col>``: number of consecutive non-zero
      periods up to and including the current one (0 on a zero period).
    """

    def __init__(self, groupby_cols: list[str], target_col: str, set_non_zero_inrervals: bool=True, set_zero_intervals: bool=False):
        self.groupby_cols = groupby_cols
        self.target_col = target_col
        self.set_non_zero_inrervals = set_non_zero_inrervals
        self.set_zero_intervals = set_zero_intervals

    @staticmethod
    def _streak_lengths(flags: np.ndarray, reset_positions: np.ndarray) -> np.ndarray:
        """Running count of flagged entries, restarted at every reset position."""
        segments = np.split(flags, reset_positions)
        counted = [np.where(segment == 1, segment.cumsum(), 0) for segment in segments]
        return np.concatenate(counted)

    @staticmethod
    def create_non_zero_interval_series(s: pd.Series):
        """Length of the current run of zeros at each position (0 where the value is non-zero)."""
        values = s.values
        zero_flags = (values == 0).astype(int)
        # The count restarts at every position holding a positive value.
        positive_positions = np.where(values > 0)[0]
        return IntervalTransformer._streak_lengths(zero_flags, positive_positions)

    @staticmethod
    def create_zero_interval_series(s: pd.Series):
        """Length of the current run of positive values at each position (0 where the value is zero)."""
        values = s.values
        positive_flags = (values > 0).astype(int)
        # The count restarts at every position holding a zero.
        zero_positions = np.where(values == 0)[0]
        return IntervalTransformer._streak_lengths(positive_flags, zero_positions)

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return a copy of ``X`` with the requested counter columns added."""
        X = X.copy()
        requested = []
        if self.set_non_zero_inrervals:
            requested.append((f'periods since last {self.target_col}', self.create_non_zero_interval_series))
        if self.set_zero_intervals:
            requested.append((f'periods since last 0 {self.target_col}', self.create_zero_interval_series))

        for new_col, counter in requested:
            X[new_col] = X.groupby(self.groupby_cols)[self.target_col].transform(counter)

        return X

In [18]:
# Step 1: keep series whose mean lies between 0.15 and 0.5.
sparse_ts_filter = SparseTSFilter(
    index_cols=index_cols,
    mean_lb=0.15,
    mean_ub=0.5,
)
# Step 2: keep series whose 0.85-quantile from 2015-07-01 up to end_date is positive.
predictability_filter = PredictabilityFilter(
    q=0.85,
    start_date=pd.Timestamp("2015-07-01"),
    end_date=end_date,
    date2day=date2day,
)
# Step 3: reshape to long format, mapping the d_<n> labels to dates.
wide_to_long_transformer = WidetoLongTransformer(
    index_cols=index_cols,
    long_dict=day2date,
    long_col=date_col,
    target_col=target_col,
)
# Step 4: add both run-length counters per series.
interval_transformer = IntervalTransformer(
    groupby_cols=index_cols,
    target_col=target_col,
    set_non_zero_inrervals=True,
    set_zero_intervals=True,
)

In [19]:
# The two filters run on the wide table; the reshape and the interval features follow.
pipeline_steps = [
    ("sparse_ts_filter", sparse_ts_filter),
    ("predictability_filter", predictability_filter),
    ("wide_to_long_transformer", wide_to_long_transformer),
    ("interval_transformer", interval_transformer),
]
train_val_pipe = Pipeline(steps=pipeline_steps)

In [20]:
# Run the pipeline on the wide table, then keep only the dates inside the modelling window.
df_targets = train_val_pipe.fit_transform(df_wide_train_val)
targets_in_range = df_targets["date"].between(start_date, end_date)
df_targets = df_targets[targets_in_range]
print(df_targets.shape)
df_targets.head()

(205758, 6)


Unnamed: 0,store_id,item_id,date,sales,periods since last sales,periods since last 0 sales
307,CA_1,FOODS_3_001,2011-12-02,0,1,0
308,CA_1,FOODS_3_001,2011-12-03,0,2,0
309,CA_1,FOODS_3_001,2011-12-04,0,3,0
310,CA_1,FOODS_3_001,2011-12-05,2,0,1
311,CA_1,FOODS_3_001,2011-12-06,2,0,2


## Price
* filter by date range (start date to end date), store and relevant items


In [21]:
class MomentumTransformer(BaseEstimator, TransformerMixin):
    """Add ``<target_col>_momentum``: the value minus its rolling mean, computed per group.

    Parameters
    ----------
    groupby_cols : columns identifying a series.
    target_col : column the momentum is computed for.
    window_size : size of the rolling window.
    closed : passed to ``Series.rolling``; the default ``"left"`` excludes the
        current observation from the window.
    """

    def __init__(self, groupby_cols: list[str], target_col: str, window_size: int, closed: str="left"):
        self.groupby_cols = groupby_cols
        self.target_col = target_col
        self.window_size = window_size
        self.closed = closed

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """No-op; the transformer is stateless."""
        return self

    def _minus_rolling_mean(self, series: pd.Series) -> pd.Series:
        """Difference between each value of one group and that group's rolling mean."""
        rolling_mean = series.rolling(window=self.window_size, closed=self.closed).mean()
        return series - rolling_mean

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return a copy of ``X`` with the momentum column added."""
        X = X.copy()
        grouped_target = X.groupby(self.groupby_cols)[self.target_col]
        X[f"{self.target_col}_momentum"] = grouped_target.transform(self._minus_rolling_mean)
        return X

In [22]:
class IdTransformer(BaseEstimator, TransformerMixin):
    """Concatenate several id columns into a single string id column.

    Parameters
    ----------
    id_cols : string columns to join, in this order.
    id_col_name : name of the created column (inserted as the first column).
    sep : literal separator placed between the id values.
    drop : drop ``id_cols`` after the combined column has been created.
    """

    def __init__(self, id_cols: list[str], id_col_name: str="unique_id", sep: str="__", drop: bool=False):
        self.id_cols = id_cols
        self.sep = sep
        self.drop = drop
        self.id_col_name = id_col_name
        self._id_col2loc: dict[str, int]  # original column position of every id column

    def fit(self, X: pd.DataFrame, y: Optional[pd.Series]=None):
        """Remember the position of each id column so ``inverse_transform`` can restore it.

        Raises:
            KeyError: if one of ``id_cols`` is not a column of ``X``.
        """
        # Without this check argmax() would silently report position 0 for a missing column.
        missing = [id_col for id_col in self.id_cols if id_col not in X.columns]
        if missing:
            raise KeyError(f"id columns not found in X: {missing}")
        self._id_col2loc = {
            id_col: int((X.columns == id_col).argmax())
            for id_col in self.id_cols
        }
        return self

    def transform(self, X: pd.DataFrame, y: Optional[pd.Series]=None) -> pd.DataFrame:
        """Return a copy of ``X`` with the combined id column added as first column."""
        X = X.copy()
        X = self._add_id_col(X)
        if self.drop:
            X = X.drop(columns=self.id_cols)
        return X

    def _add_id_col(self, X: pd.DataFrame) -> pd.DataFrame:
        """Insert the ``sep``-joined id column at position 0 of ``X`` (in place)."""
        id_series = X[self.id_cols[0]]
        for id_col in self.id_cols[1:]:
            id_series = id_series.str.cat(X[id_col], sep=self.sep)
        X.insert(0, self.id_col_name, id_series)
        return X

    def inverse_transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Remove the combined id column and restore any missing ``id_cols``.

        Id columns that are still present (``drop=False``) are left untouched.
        Missing ones are rebuilt by splitting the combined id and are inserted
        at the position recorded in ``fit``.
        """
        X = X.copy()
        combined = X[self.id_col_name]
        if len(self.id_cols) == 1:
            # Nothing was joined, so there is nothing to split.
            id_parts = combined.to_frame(name=0)
        else:
            # regex=False: treat sep literally whatever its length.
            # n caps the splits so a separator inside the last id value is preserved.
            id_parts = combined.str.split(
                pat=self.sep, n=len(self.id_cols) - 1, expand=True, regex=False
            )
        X = X.drop(columns=self.id_col_name)
        restored = {id_col: id_parts[i] for i, id_col in enumerate(self.id_cols)}
        # Insert in ascending position so every target position is valid when reached.
        for id_col, id_loc in sorted(self._id_col2loc.items(), key=lambda item: item[1]):
            if id_col in X.columns:
                continue
            # Clamp in case X has fewer columns than the frame seen in fit().
            X.insert(min(id_loc, X.shape[1]), id_col, restored[id_col])
        return X

In [23]:
# Items that remain after the train/val pipeline; used below to restrict the price table.
items = df_targets['item_id'].unique()
price_col = "sell_price"  # name of the price column in the price table

In [24]:
# Keep only the prices of the selected store for the items that survived the filters.
is_relevant_price = (df_prices["store_id"] == store_id) & df_prices["item_id"].isin(items)
df_prices = df_prices[is_relevant_price]

In [25]:
# Attach the calendar features by date and the weekly prices by series key and week.
# A missing price is filled with 0 and flagged through is_item_exists.
df_features = (
    df_targets[index_cols + [date_col]]
    .merge(df_calendar, on="date")
    .merge(df_prices, on=index_cols + ["wm_yr_wk"], how="left")
    .fillna(value={price_col: 0})
)
df_features['is_item_exists'] = (df_features[price_col] > 0).astype(int)

In [26]:
# Renames the date column to "ds" and the target column to "y".
nixtla_mapper = {date_col: "ds", target_col: "y"}

In [27]:
# Collapse the series key columns into a single id column, then apply the ds/y renaming.
id_transformer = IdTransformer(id_cols=index_cols, drop=True)
df_features = id_transformer.fit_transform(df_features).rename(columns=nixtla_mapper)
df_features

Unnamed: 0,unique_id,ds,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,...,PresidentsDay,Purim End,Ramadan starts,StPatricksDay,SuperBowl,Thanksgiving,ValentinesDay,VeteransDay,sell_price,is_item_exists
0,CA_1__FOODS_3_001,2011-12-02,11144,Friday,7,12,2011,,,,...,0,0,0,0,0,0,0,0,2.50,1
1,CA_1__FOODS_3_001,2011-12-03,11145,Saturday,1,12,2011,,,,...,0,0,0,0,0,0,0,0,2.50,1
2,CA_1__FOODS_3_001,2011-12-04,11145,Sunday,2,12,2011,,,,...,0,0,0,0,0,0,0,0,2.50,1
3,CA_1__FOODS_3_001,2011-12-05,11145,Monday,3,12,2011,,,,...,0,0,0,0,0,0,0,0,2.50,1
4,CA_1__FOODS_3_001,2011-12-06,11145,Tuesday,4,12,2011,,,,...,0,0,0,0,0,0,0,0,2.50,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205753,CA_1__FOODS_3_821,2015-12-27,11548,Sunday,2,12,2015,,,,...,0,0,0,0,0,0,0,0,4.98,1
205754,CA_1__FOODS_3_821,2015-12-28,11548,Monday,3,12,2015,,,,...,0,0,0,0,0,0,0,0,4.98,1
205755,CA_1__FOODS_3_821,2015-12-29,11548,Tuesday,4,12,2015,,,,...,0,0,0,0,0,0,0,0,4.98,1
205756,CA_1__FOODS_3_821,2015-12-30,11548,Wednesday,5,12,2015,,,,...,0,0,0,0,0,0,0,0,4.98,1


In [28]:
# Same id and renaming treatment for the targets.
# NOTE: this re-fits the shared id_transformer, so its stored column positions now
# describe df_targets rather than df_features.
df_targets = id_transformer.fit_transform(df_targets).rename(columns=nixtla_mapper)
df_targets

Unnamed: 0,unique_id,ds,y,periods since last sales,periods since last 0 sales
307,CA_1__FOODS_3_001,2011-12-02,0,1,0
308,CA_1__FOODS_3_001,2011-12-03,0,2,0
309,CA_1__FOODS_3_001,2011-12-04,0,3,0
310,CA_1__FOODS_3_001,2011-12-05,2,0,1
311,CA_1__FOODS_3_001,2011-12-06,2,0,2
...,...,...,...,...,...
263874,CA_1__FOODS_3_821,2015-12-27,2,0,2
263875,CA_1__FOODS_3_821,2015-12-28,2,0,3
263876,CA_1__FOODS_3_821,2015-12-29,1,0,4
263877,CA_1__FOODS_3_821,2015-12-30,1,0,5


In [29]:
# Output files live in <parent of the working directory>/data and carry the date range in their name.
date_range_str = f"{start_date.date()}_{end_date.date()}"
root_dir = Path.cwd().parent
data_dir = root_dir / "data"
target_features_path = data_dir / f"target_features_{date_range_str}.csv"
features_path = data_dir / f"features_{date_range_str}.csv"

In [30]:
# Show where the two CSV files will be written.
print(target_features_path)
features_path

/Users/galkampel/Desktop/Projects/m5_forecasting/data/target_features_2011-12-02_2015-12-31.csv


PosixPath('/Users/galkampel/Desktop/Projects/m5_forecasting/data/features_2011-12-02_2015-12-31.csv')

In [31]:
# Persist both tables as CSV without the row index.
df_targets.to_csv(target_features_path, index=False)
df_features.to_csv(features_path, index=False)