In [1]:
import pandas as pd
import numpy as np

In [2]:
df_calendar = pd.read_csv("../data/calendar.csv", parse_dates=["date"])
df_wide_train_val = pd.read_csv("../data/sales_train_validation.csv")
df_prices = pd.read_csv("../data/sell_prices.csv")

In [3]:
def fourier_modes(df: pd.DataFrame, t_col: str="day_of_year", T: float=365.25 , k: int=1, prefix: str="year") -> pd.DataFrame:
    for order in range(1, k+1):
        df[f"{prefix}_sin_{order}"] = np.sin(2 * np.pi * (order * df[t_col] / T))
        df[f"{prefix}_cos_{order}"] = np.cos(2 * np.pi * (order * df[t_col] / T))
    return df

In [4]:
df_calendar.insert(2, "day_of_year", df_calendar['date'].dt.dayofyear)
df_calendar.insert(4, "week_of_year", df_calendar['date'].dt.isocalendar().week)
df_calendar['event_name'] = df_calendar['event_name_1'].apply(lambda s: s if isinstance(s, str) else "No Event")
df_calendar['event'] = df_calendar['event_name'].apply(lambda el: 1 if el != "No Event" else 0)
df_calendar = df_calendar[['date', 'wm_yr_wk', 'day_of_year', 'weekday', 'week_of_year', 'month', 'year', 'd', 'event_name', 'event', "snap_CA",	"snap_TX",	"snap_WI"]]

df_calendar = fourier_modes(df_calendar)
# create a mapping from d_i days to date
day2date = df_calendar.set_index('d')['date'].to_dict()
date2day = {date: day for day, date in day2date.items()}
df_calendar

Unnamed: 0,date,wm_yr_wk,day_of_year,weekday,week_of_year,month,year,d,event_name,event,snap_CA,snap_TX,snap_WI,year_sin_1,year_cos_1
0,2011-01-29,11101,29,Saturday,4,1,2011,d_1,No Event,0,0,0,0,0.478434,0.878124
1,2011-01-30,11101,30,Sunday,4,1,2011,d_2,No Event,0,0,0,0,0.493468,0.869764
2,2011-01-31,11101,31,Monday,5,1,2011,d_3,No Event,0,0,0,0,0.508356,0.861147
3,2011-02-01,11101,32,Tuesday,5,2,2011,d_4,No Event,0,1,1,0,0.523094,0.852275
4,2011-02-02,11101,33,Wednesday,5,2,2011,d_5,No Event,0,1,0,1,0.537677,0.843151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1964,2016-06-15,11620,167,Wednesday,24,6,2016,d_1965,No Event,0,0,1,1,0.265563,-0.964094
1965,2016-06-16,11620,168,Thursday,24,6,2016,d_1966,No Event,0,0,0,0,0.248940,-0.968519
1966,2016-06-17,11620,169,Friday,24,6,2016,d_1967,No Event,0,0,0,0,0.232243,-0.972658
1967,2016-06-18,11621,170,Saturday,24,6,2016,d_1968,No Event,0,0,0,0,0.215477,-0.976509


In [5]:
start_date = pd.Timestamp("2012-01-01")
start_day = date2day[start_date]
end_date = pd.Timestamp("2016-01-31")
end_day = date2day[end_date]

In [1]:
date_cols = (df_wide_train_val.columns[(df_wide_train_val.columns == start_day).argmax(): (df_wide_train_val.columns == end_day).argmax()+1 ]).tolist()
cat_cols = df_wide_train_val.select_dtypes("O").columns.tolist()
cols = cat_cols + date_cols
# cols

NameError: name 'df_wide_train_val' is not defined

In [7]:
dept_id = "FOODS_3"
store_id = "CA_1"

In [8]:
df_wide_train_val = df_wide_train_val.query("dept_id == @dept_id and store_id == @store_id")[cols]

In [10]:
df_wide_train_val.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_338,d_339,d_340,d_341,...,d_1820,d_1821,d_1822,d_1823,d_1824,d_1825,d_1826,d_1827,d_1828,d_1829
2226,FOODS_3_001_CA_1_validation,FOODS_3_001,FOODS_3,FOODS,CA_1,CA,3,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2227,FOODS_3_002_CA_1_validation,FOODS_3_002,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,1,3,6,3,0,1,2,2,10,4
2228,FOODS_3_003_CA_1_validation,FOODS_3_003,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2229,FOODS_3_004_CA_1_validation,FOODS_3_004,FOODS_3,FOODS,CA_1,CA,0,0,0,0,...,0,2,3,0,1,1,1,0,1,0
2230,FOODS_3_005_CA_1_validation,FOODS_3_005,FOODS_3,FOODS,CA_1,CA,2,0,3,2,...,0,0,1,0,1,0,0,0,2,1


In [11]:
lb_mean_demand = 0.15
ub_mean_demand = 0.5
sparse_series = (
    df_wide_train_val
    .set_index(["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"])
    .mean(axis=1)
    [lambda mean_demand: mean_demand.between(lb_mean_demand, ub_mean_demand)]
)
print(sparse_series.shape)
sparse_series

(144,)


id                           item_id      dept_id  cat_id  store_id  state_id
FOODS_3_001_CA_1_validation  FOODS_3_001  FOODS_3  FOODS   CA_1      CA          0.374665
FOODS_3_003_CA_1_validation  FOODS_3_003  FOODS_3  FOODS   CA_1      CA          0.214477
FOODS_3_006_CA_1_validation  FOODS_3_006  FOODS_3  FOODS   CA_1      CA          0.179625
FOODS_3_009_CA_1_validation  FOODS_3_009  FOODS_3  FOODS   CA_1      CA          0.207105
FOODS_3_016_CA_1_validation  FOODS_3_016  FOODS_3  FOODS   CA_1      CA          0.417560
                                                                                   ...   
FOODS_3_790_CA_1_validation  FOODS_3_790  FOODS_3  FOODS   CA_1      CA          0.263405
FOODS_3_796_CA_1_validation  FOODS_3_796  FOODS_3  FOODS   CA_1      CA          0.244638
FOODS_3_799_CA_1_validation  FOODS_3_799  FOODS_3  FOODS   CA_1      CA          0.180295
FOODS_3_806_CA_1_validation  FOODS_3_806  FOODS_3  FOODS   CA_1      CA          0.305630
FOODS_3_815_CA_1_valid

In [13]:
sparse_df = (
    df_wide_train_val
    .set_index(keys=["state_id",'store_id', 'item_id'])
    .loc[sparse_series.reset_index()[["state_id", 'store_id', 'item_id']].values.tolist()]
    .drop(columns=["id", "dept_id", "cat_id"])
)
sparse_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,d_338,d_339,d_340,d_341,d_342,d_343,d_344,d_345,d_346,d_347,...,d_1820,d_1821,d_1822,d_1823,d_1824,d_1825,d_1826,d_1827,d_1828,d_1829
state_id,store_id,item_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
CA,CA_1,FOODS_3_001,3,0,0,0,1,1,1,2,0,1,...,0,0,0,0,1,0,0,0,0,0
CA,CA_1,FOODS_3_003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,CA_1,FOODS_3_006,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,1,3
CA,CA_1,FOODS_3_009,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,3,1,1,0,1,2
CA,CA_1,FOODS_3_016,0,0,0,0,0,0,0,0,0,0,...,3,5,3,1,0,2,5,0,5,3
CA,CA_1,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
CA,CA_1,FOODS_3_790,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,CA_1,FOODS_3_796,0,0,0,0,0,0,0,0,0,0,...,2,7,7,2,5,8,5,5,2,7
CA,CA_1,FOODS_3_799,0,0,0,1,0,2,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
CA,CA_1,FOODS_3_806,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,0,0,2,0,3


In [16]:
df_train_val = pd.wide_to_long(sparse_df.reset_index(), stubnames='d_', i=["state_id", "store_id", "item_id"], j='date')
df_train_val = (
    df_train_val
    .reset_index(level="date")
    # .reorder_levels(order=["state_id", "store_id", "item_id"])
    .rename(columns={"d_": "sales"})
)
df_train_val["date"] = df_train_val["date"].map(lambda el: day2date[f"d_{el}"])
df_train_val = df_train_val.set_index("date", append=True)

In [None]:
def set_days_since_last_demand(s: pd.Series) -> pd.Series:
    non_zero_demand_dates = s.loc[s > 0].index.get_level_values("date")
    np.diff

In [29]:
sales_series = df_train_val.loc[pd.IndexSlice["CA", "CA_1", "FOODS_3_001"]]
(sales_series.loc[sales_series['sales'] > 0].index.diff().days - 1).fillna(0)

Index([0.0, 3.0, 0.0, 0.0, 0.0, 1.0, 3.0, 1.0, 0.0, 2.0,
       ...
       0.0, 0.0, 1.0, 2.0, 1.0, 4.0, 1.0, 0.0, 1.0, 4.0],
      dtype='float64', name='date', length=348)

In [50]:
non_zero_demand_series = sales_series.loc[sales_series['sales'] > 0]
non_zero_demand_series.loc[:, 'sales'] = (non_zero_demand_series.index.diff().days - 1).fillna(0).values

s_new = (sales_series['sales'] == 0).astype(int)
# 
s_new.loc[non_zero_demand_series.index] = -non_zero_demand_series['sales']
# days since non-zero sales
s_new.cumsum()

Unnamed: 0_level_0,sales
date,Unnamed: 1_level_1
2012-01-01,0
2012-01-05,3
2012-01-06,0
2012-01-07,0
2012-01-08,0
...,...
2016-01-16,4
2016-01-18,1
2016-01-19,0
2016-01-21,1


In [57]:
min_idx = (sales_series['sales'] == 0).idxmax()
zero_demand_series = sales_series.loc[min_idx:].loc[]
zero_demand_series.loc[:, 'sales'] = (zero_demand_series.index.diff().days - 1).fillna(0).values

s_new2 = (sales_series['sales'] > 0).astype(int)
# 
s_new2.loc[zero_demand_series.index] = -zero_demand_series['sales']
# days since non-zero sales
s_new2.cumsum()

date
2012-01-01    1
2012-01-02    1
2012-01-03    1
2012-01-04    1
2012-01-05    2
             ..
2016-01-27    1
2016-01-28    1
2016-01-29    1
2016-01-30    1
2016-01-31    1
Name: sales, Length: 1492, dtype: int64

In [58]:
df_train_val.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sales
state_id,store_id,item_id,date,Unnamed: 4_level_1
CA,CA_1,FOODS_3_001,2012-01-01,3
CA,CA_1,FOODS_3_001,2012-01-02,0
CA,CA_1,FOODS_3_001,2012-01-03,0
CA,CA_1,FOODS_3_001,2012-01-04,0
CA,CA_1,FOODS_3_001,2012-01-05,1
CA,CA_1,FOODS_3_001,2012-01-06,1
CA,CA_1,FOODS_3_001,2012-01-07,1
CA,CA_1,FOODS_3_001,2012-01-08,2
CA,CA_1,FOODS_3_001,2012-01-09,0
CA,CA_1,FOODS_3_001,2012-01-10,1


In [56]:
s_new.cumsum().head(20)

date
2012-01-01    0
2012-01-02    1
2012-01-03    2
2012-01-04    3
2012-01-05    0
2012-01-06    0
2012-01-07    0
2012-01-08    0
2012-01-09    1
2012-01-10    0
2012-01-11    1
2012-01-12    2
2012-01-13    3
2012-01-14    0
2012-01-15    1
2012-01-16    0
2012-01-17    0
2012-01-18    1
2012-01-19    2
2012-01-20    0
Name: sales, dtype: int64

In [30]:
(sales_series.loc[sales_series['sales'] == 0].index.diff().days - 1).fillna(0)

Index([0.0, 0.0, 0.0, 4.0, 1.0, 0.0, 0.0, 1.0, 2.0, 0.0,
       ...
       2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0],
      dtype='float64', name='date', length=1144)

In [28]:
df_train_val.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sales
state_id,store_id,item_id,date,Unnamed: 4_level_1
CA,CA_1,FOODS_3_001,2012-01-01,3
CA,CA_1,FOODS_3_001,2012-01-02,0
CA,CA_1,FOODS_3_001,2012-01-03,0
CA,CA_1,FOODS_3_001,2012-01-04,0
CA,CA_1,FOODS_3_001,2012-01-05,1
CA,CA_1,FOODS_3_001,2012-01-06,1
CA,CA_1,FOODS_3_001,2012-01-07,1
CA,CA_1,FOODS_3_001,2012-01-08,2
CA,CA_1,FOODS_3_001,2012-01-09,0
CA,CA_1,FOODS_3_001,2012-01-10,1
