In [1]:
from  datetime import datetime, timedelta
import gc
import numpy as np
import pandas as pd
import lightgbm as lgb
import scipy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error

import matplotlib
%matplotlib inline

  from pandas import MultiIndex, Int64Index


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('m5-accuracy-goes-again-csc-22'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

m5-accuracy-goes-again-csc-22/calendar.csv
m5-accuracy-goes-again-csc-22/sample_submission (2).csv
m5-accuracy-goes-again-csc-22/prices_sell.csv
m5-accuracy-goes-again-csc-22/sales_train_competition.csv


In [3]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [4]:
pd.options.display.max_columns = 50

In [5]:
h = 28 
max_lags = 28
tr_last = 1913
fday = datetime(2016,4,25)
fday, tr_last, max_lags

(datetime.datetime(2016, 4, 25, 0, 0), 1913, 28)

In [6]:
def create_dt(is_train = True, nrows = None, first_day = 1213):
    prices = pd.read_csv("m5-accuracy-goes-again-csc-22/prices_sell.csv", dtype = PRICE_DTYPES)
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
            
    cal = pd.read_csv("m5-accuracy-goes-again-csc-22/calendar.csv", dtype = CAL_DTYPES)
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = pd.read_csv("m5-accuracy-goes-again-csc-22/sales_train_competition.csv", 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype)
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt


In [7]:
def create_fea(dt):
    lags = [28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
    }
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


In [8]:
FIRST_DAY = 1200


In [9]:
%%time

df = create_dt(is_train=True, first_day= FIRST_DAY)
df.shape

CPU times: user 11.8 s, sys: 3.36 s, total: 15.2 s
Wall time: 15.2 s


(21290801, 22)

In [10]:
df.head()


Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1200,0.0,2014-05-12,11415,1,3,5,2014,0,0,0,0,0.0,1.0,1.0,8.27
1,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1201,0.0,2014-05-13,11415,5,4,5,2014,0,0,0,0,0.0,1.0,0.0,8.27
2,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1202,0.0,2014-05-14,11415,6,5,5,2014,0,0,0,0,0.0,0.0,1.0,8.27
3,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1203,0.0,2014-05-15,11415,4,6,5,2014,0,0,0,0,0.0,1.0,1.0,8.27
4,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1204,0.0,2014-05-16,11415,0,7,5,2014,0,0,0,0,0.0,0.0,0.0,8.27


In [11]:
df.columns


Index(['id', 'item_id', 'dept_id', 'store_id', 'cat_id', 'state_id', 'd',
       'sales', 'date', 'wm_yr_wk', 'weekday', 'wday', 'month', 'year',
       'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2',
       'snap_CA', 'snap_TX', 'snap_WI', 'sell_price'],
      dtype='object')

In [12]:
%%time

create_fea(df)
df.shape

  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


CPU times: user 20.3 s, sys: 2.47 s, total: 22.7 s
Wall time: 22.8 s


(21290801, 25)

In [13]:
df.tail()


Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price,lag_28,rmean_28_28,week
21290796,FOODS_3_825_WI_3_evaluation,3046,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,3.99,1.0,1.25,16
21290797,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1912,1.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.29,2.0,1.107143,16
21290798,FOODS_3_826_WI_3_evaluation,3047,6,9,2,2,d_1913,3.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.29,4.0,1.25,16
21290799,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1912,0.0,2016-04-23,11613,2,1,4,2016,0,0,0,0,0.0,0.0,0.0,1.01,0.0,1.785714,16
21290800,FOODS_3_827_WI_3_evaluation,3048,6,9,2,2,d_1913,0.0,2016-04-24,11613,3,2,4,2016,0,0,0,0,0.0,0.0,0.0,1.01,5.0,1.964286,16


In [14]:
df.dropna(inplace = True)
df.shape

(19613851, 25)

In [15]:
df.__sizeof__()


4447446111

In [17]:
df_train = df

In [18]:
cat_feats = ['dept_id', 'item_id','store_id', 'cat_id', 'state_id'] + ["event_name_1", "event_name_2", "event_type_1", "event_type_2"] 
useless_cols = ["id", "date", "sales","d", "wm_yr_wk", "weekday"]
train_cols = df_train.columns[~df_train.columns.isin(useless_cols)]


In [19]:
X_train = df_train[train_cols]
y_train = df_train["sales"]

In [21]:
tscv = TimeSeriesSplit(n_splits=5)

In [20]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

Validation

In [22]:
def cv_rmse(df, params):
    rmses = []
    dates = df['date'].unique()
    dates.sort()
    for i, (train, test) in enumerate(tscv.split(dates)):
        print(i)
        df_train = df[df['date'].isin(dates[train])]
        df_test = df[df['date'].isin(dates[test])]
        
        X_train = df_train[train_cols]
        y_train = df_train["sales"]
        X_test = df_test[train_cols]
        y_test = df_test["sales"]
        train_data = lgb.Dataset(X_train, label=y_train, 
                         categorical_feature=cat_feats, free_raw_data=False)
        m_lgb = lgb.train(params, train_data, verbose_eval=300) 
        rmses.append(rmsle(y_test, m_lgb.predict(X_test)))
    return rmses

Ниже замерены метрики для разных запусков кода (разные настройки модели и разные признаки)

Запуски с разными num_iterations

In [170]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations":5000}
m_5000 = cv_rmse(df, params)

0




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3535
[LightGBM] [Info] Number of data points in the train set: 3243626, number of used features: 16
[LightGBM] [Info] Start training from score 1.276110
1




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 6430128, number of used features: 19
[LightGBM] [Info] Start training from score 1.223936
2




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3683
[LightGBM] [Info] Number of data points in the train set: 9682984, number of used features: 19
[LightGBM] [Info] Start training from score 1.220221
3




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3709
[LightGBM] [Info] Number of data points in the train set: 12977558, number of used features: 19
[LightGBM] [Info] Start training from score 1.243751
4




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3724
[LightGBM] [Info] Number of data points in the train set: 16292164, number of used features: 19
[LightGBM] [Info] Start training from score 1.243941


In [174]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations":1000}
m_1000 = cv_rmse(df, params)

0




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3535
[LightGBM] [Info] Number of data points in the train set: 3243626, number of used features: 16
[LightGBM] [Info] Start training from score 1.276110
1




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 6430128, number of used features: 19
[LightGBM] [Info] Start training from score 1.223936
2




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3683
[LightGBM] [Info] Number of data points in the train set: 9682984, number of used features: 19
[LightGBM] [Info] Start training from score 1.220221
3




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3709
[LightGBM] [Info] Number of data points in the train set: 12977558, number of used features: 19
[LightGBM] [Info] Start training from score 1.243751
4




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3724
[LightGBM] [Info] Number of data points in the train set: 16292164, number of used features: 19
[LightGBM] [Info] Start training from score 1.243941


In [23]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations":100}
m_100 = cv_rmse(df, params)

0




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3535
[LightGBM] [Info] Number of data points in the train set: 3243626, number of used features: 16
[LightGBM] [Info] Start training from score 1.276110
1




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3606
[LightGBM] [Info] Number of data points in the train set: 6430128, number of used features: 19
[LightGBM] [Info] Start training from score 1.223936
2




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3683
[LightGBM] [Info] Number of data points in the train set: 9682984, number of used features: 19
[LightGBM] [Info] Start training from score 1.220221
3




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3709
[LightGBM] [Info] Number of data points in the train set: 12977558, number of used features: 19
[LightGBM] [Info] Start training from score 1.243751
4




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3724
[LightGBM] [Info] Number of data points in the train set: 16292164, number of used features: 19
[LightGBM] [Info] Start training from score 1.243941


In [24]:
m_100

[2.3957720618906446,
 2.3920395632371325,
 2.3576851400222094,
 2.2486271335858015,
 2.3145692603414867]

In [171]:
m_5000

[2.614818517306548,
 2.483452716331412,
 2.4037493017315286,
 2.2477892523776757,
 2.4571822063581386]

In [175]:
m_1000

[2.50480625599695,
 2.4273986873206863,
 2.3570338156865223,
 2.224890839791744,
 2.3697548279155174]

Запуски с разными признаками (в подготовку данных были добавлены дополнительные лаги и тп)

In [212]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations":1000}
m_lags_3 = cv_rmse(df, params)

0




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3526
[LightGBM] [Info] Number of data points in the train set: 2926228, number of used features: 16
[LightGBM] [Info] Start training from score 1.223715
1




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3608
[LightGBM] [Info] Number of data points in the train set: 5841366, number of used features: 19
[LightGBM] [Info] Start training from score 1.206581
2




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3669
[LightGBM] [Info] Number of data points in the train set: 8813494, number of used features: 19
[LightGBM] [Info] Start training from score 1.216705
3




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3707
[LightGBM] [Info] Number of data points in the train set: 11824544, number of used features: 19
[LightGBM] [Info] Start training from score 1.244203
4




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3721
[LightGBM] [Info] Number of data points in the train set: 14861444, number of used features: 19
[LightGBM] [Info] Start training from score 1.242816


In [213]:
m_lags_3

[2.5296316529799037,
 2.370505275792068,
 2.292491923744742,
 2.2327775297640344,
 2.3492986899856714]

In [187]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations":1000}
m_lags_10 = cv_rmse(df, params)

0




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3527
[LightGBM] [Info] Number of data points in the train set: 3128038, number of used features: 16
[LightGBM] [Info] Start training from score 1.260186
1




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3608
[LightGBM] [Info] Number of data points in the train set: 6282785, number of used features: 19
[LightGBM] [Info] Start training from score 1.219085
2




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3677
[LightGBM] [Info] Number of data points in the train set: 9502749, number of used features: 19
[LightGBM] [Info] Start training from score 1.216590
3




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3707
[LightGBM] [Info] Number of data points in the train set: 12765073, number of used features: 19
[LightGBM] [Info] Start training from score 1.242274
4




You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3722
[LightGBM] [Info] Number of data points in the train set: 16048494, number of used features: 19
[LightGBM] [Info] Start training from score 1.242007


In [188]:
m_lags_10

[2.497974360787198,
 2.3719357224897117,
 2.35041134712565,
 2.2255835949435054,
 2.3806507460681554]

Различные добавления признаков, отдельное OHE категориальных признаков не дали нормальных результатов, поэтому в качестве итогового оставлен простой вариант

In [25]:
train_data = lgb.Dataset(X_train, label=y_train,
                         categorical_feature=cat_feats, free_raw_data=False)

In [26]:
params = {"nthread" : 15, "metric" :"rmse", "num_iterations": 1000}
m_lgb = lgb.train(params, train_data)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3729
[LightGBM] [Info] Number of data points in the train set: 19613851, number of used features: 19
[LightGBM] [Info] Start training from score 1.263883


In [35]:
%time

te = create_dt(False)
te.shape

CPU times: user 3 µs, sys: 1e+03 ns, total: 4 µs
Wall time: 5.01 µs


(1737930, 22)

In [36]:
te.head()

Unnamed: 0,id,item_id,dept_id,store_id,cat_id,state_id,d,sales,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI,sell_price
0,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1885,1.0,2016-03-27,11609,3,2,3,2016,5,1,0,0,0.0,0.0,0.0,8.27
1,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1886,1.0,2016-03-28,11609,1,3,3,2016,0,0,0,0,0.0,0.0,0.0,8.27
2,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1887,0.0,2016-03-29,11609,5,4,3,2016,0,0,0,0,0.0,0.0,0.0,8.27
3,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1888,0.0,2016-03-30,11609,6,5,3,2016,0,0,0,0,0.0,0.0,0.0,8.27
4,HOBBIES_1_001_CA_1_evaluation,0,0,0,0,0,d_1889,0.0,2016-03-31,11609,4,6,3,2016,0,0,0,0,0.0,0.0,0.0,8.27


In [37]:
for i in range(0, 28):
    day = fday + timedelta(days=i)
    print(i, day)
    tst = te[(te.date >= day - timedelta(days=max_lags)) & (te.date <= day)].copy()
    create_fea(tst)
    tst = tst.loc[tst.date == day , train_cols]
    te.loc[te.date == day, "sales"] = m_lgb.predict(tst)

0 2016-04-25 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


1 2016-04-26 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


2 2016-04-27 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


3 2016-04-28 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


4 2016-04-29 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


5 2016-04-30 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


6 2016-05-01 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


7 2016-05-02 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


8 2016-05-03 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


9 2016-05-04 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


10 2016-05-05 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


11 2016-05-06 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


12 2016-05-07 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


13 2016-05-08 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


14 2016-05-09 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


15 2016-05-10 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


16 2016-05-11 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


17 2016-05-12 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


18 2016-05-13 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


19 2016-05-14 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


20 2016-05-15 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


21 2016-05-16 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


22 2016-05-17 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


23 2016-05-18 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


24 2016-05-19 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


25 2016-05-20 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


26 2016-05-21 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


27 2016-05-22 00:00:00


  dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")


In [38]:
sample_sub = pd.read_csv('m5-accuracy-goes-again-csc-22/sample_submission (2).csv')


In [39]:
te = te[te['d'].isin([f"d_{i}" for i in range(1914, 1942)])]

In [40]:
te_sub = pd.pivot_table(te, index='id', columns='d', values='sales')
te_sub = te_sub.reset_index().set_index('id').reindex(sample_sub['id']).reset_index()
te_sub.to_csv('simple_model_submission1000', index=False)