In [1]:
# import packages

import numpy as np
import pandas as pd
from datetime import datetime, date, timedelta
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc    # garbage collector
import warnings

warnings.filterwarnings('ignore')    # ignore warning

In [2]:
# read files

df_train = pd.read_csv(
    'data/favorita-grocery-sales-train.csv', usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': lambda u: np.log1p(
        float(u)) if float(u) > 0 else 0},
    parse_dates=["date"],
    skiprows=range(1, 66458909)  # 2016-01-01
)

df_test = pd.read_csv(
    "data/favorita-grocery-sales-test.csv", usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"]  # , date_parser=parser
).set_index(
    ['store_nbr', 'item_nbr', 'date']
)

items = pd.read_csv(
    "data/favorita-grocery-sales-items.csv",
).set_index("item_nbr")

stores = pd.read_csv(
    "data/favorita-grocery-sales-stores.csv",
).set_index("store_nbr")

In [3]:
# sneak peek

print(df_train.info(), end='\n\n')
print(df_test.info(), end='\n\n')
print(items.info(), end='\n\n')
print(stores.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59038132 entries, 0 to 59038131
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   date         datetime64[ns]
 1   store_nbr    int64         
 2   item_nbr     int64         
 3   unit_sales   float64       
 4   onpromotion  bool          
dtypes: bool(1), datetime64[ns](1), float64(1), int64(2)
memory usage: 1.8 GB
None

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 3370464 entries, (1, 96995, Timestamp('2017-08-16 00:00:00')) to (54, 2134244, Timestamp('2017-08-31 00:00:00'))
Data columns (total 2 columns):
 #   Column       Dtype
---  ------       -----
 0   id           int64
 1   onpromotion  bool 
dtypes: bool(1), int64(1)
memory usage: 41.9 MB
None

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4100 entries, 96995 to 2134244
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   family      4100 non-null   

In [4]:
df_train.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
0,2016-01-01,25,105574,2.564949,False
1,2016-01-01,25,105575,2.302585,False
2,2016-01-01,25,105857,1.386294,False
3,2016-01-01,25,108634,1.386294,False
4,2016-01-01,25,108701,1.098612,True


In [5]:
df_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,id,onpromotion
store_nbr,item_nbr,date,Unnamed: 3_level_1,Unnamed: 4_level_1
1,96995,2017-08-16,125497040,False
1,99197,2017-08-16,125497041,False
1,103501,2017-08-16,125497042,False
1,103520,2017-08-16,125497043,False
1,103665,2017-08-16,125497044,False


In [6]:
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103501,CLEANING,3008,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1


In [7]:
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
2,Quito,Pichincha,D,13
3,Quito,Pichincha,D,8
4,Quito,Pichincha,D,9
5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [8]:
# data cleaning

le = LabelEncoder()
items['family'] = le.fit_transform(items['family'].values)

stores['city'] = le.fit_transform(stores['city'].values)
stores['state'] = le.fit_transform(stores['state'].values)
stores['type'] = le.fit_transform(stores['type'].values)

df_2017 = df_train.loc[df_train.date >= datetime(2017,1,1)]
del df_train

promo_2017_train = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["onpromotion"]].unstack(
        level=-1).fillna(False)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)
promo_2017_test = df_test[["onpromotion"]].unstack(level=-1).fillna(False)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

df_2017 = df_2017.set_index(
    ["store_nbr", "item_nbr", "date"])[["unit_sales"]].unstack(
        level=-1).fillna(0)
df_2017.columns = df_2017.columns.get_level_values(1)

items = items.reindex(df_2017.index.get_level_values(1))
stores = stores.reindex(df_2017.index.get_level_values(0))


df_2017_item = df_2017.groupby('item_nbr')[df_2017.columns].sum()
promo_2017_item = promo_2017.groupby('item_nbr')[promo_2017.columns].sum()

df_2017_store_class = df_2017.reset_index()
df_2017_store_class['class'] = items['class'].values
df_2017_store_class_index = df_2017_store_class[['class', 'store_nbr']]
df_2017_store_class = df_2017_store_class.groupby(['class', 'store_nbr'])[df_2017.columns].sum()

df_2017_promo_store_class = promo_2017.reset_index()
df_2017_promo_store_class['class'] = items['class'].values
df_2017_promo_store_class_index = df_2017_promo_store_class[['class', 'store_nbr']]
df_2017_promo_store_class = df_2017_promo_store_class.groupby(['class', 'store_nbr'])[promo_2017.columns].sum()

In [9]:
# a function for preprocessing

def get_timespan(df, dt, minus, periods, freq='D'):
    return df[pd.date_range(dt - timedelta(days=minus), periods=periods, freq=freq)]

def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    X = {
        "promo_14_2017": get_timespan(promo_df, t2017, 14, 14).sum(axis=1).values,
        "promo_60_2017": get_timespan(promo_df, t2017, 60, 60).sum(axis=1).values,
        "promo_140_2017": get_timespan(promo_df, t2017, 140, 140).sum(axis=1).values,
        "promo_3_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, t2017 + timedelta(days=16), 15, 14).sum(axis=1).values,
    }

    for i in [3, 7, 14, 30, 60, 140]:
        tmp1 = get_timespan(df, t2017, i, i)
        tmp2 = (get_timespan(promo_df, t2017, i, i) > 0) * 1

        X['has_promo_mean_%s' % i] = (tmp1 * tmp2.replace(0, np.nan)).mean(axis=1).values
        X['has_promo_mean_%s_decay' % i] = (tmp1 * tmp2.replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values

        X['no_promo_mean_%s' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan)).mean(axis=1).values
        X['no_promo_mean_%s_decay' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan) * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values

    for i in [3, 7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017, i, i)
        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        X['mean_%s_decay' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X['mean_%s' % i] = tmp.mean(axis=1).values
        X['median_%s' % i] = tmp.median(axis=1).values
        X['min_%s' % i] = tmp.min(axis=1).values
        X['max_%s' % i] = tmp.max(axis=1).values
        X['std_%s' % i] = tmp.std(axis=1).values

    for i in [3, 7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017 + timedelta(days=-7), i, i)
        X['diff_%s_mean_2' % i] = tmp.diff(axis=1).mean(axis=1).values
        X['mean_%s_decay_2' % i] = (tmp * np.power(0.9, np.arange(i)[::-1])).sum(axis=1).values
        X['mean_%s_2' % i] = tmp.mean(axis=1).values
        X['median_%s_2' % i] = tmp.median(axis=1).values
        X['min_%s_2' % i] = tmp.min(axis=1).values
        X['max_%s_2' % i] = tmp.max(axis=1).values
        X['std_%s_2' % i] = tmp.std(axis=1).values

    for i in [7, 14, 30, 60, 140]:
        tmp = get_timespan(df, t2017, i, i)
        X['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_sales_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

        tmp = get_timespan(promo_df, t2017, i, i)
        X['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_promo_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    tmp = get_timespan(promo_df, t2017 + timedelta(days=16), 15, 15)
    X['has_promo_days_in_after_15_days'] = (tmp > 0).sum(axis=1).values
    X['last_has_promo_day_in_after_15_days'] = i - ((tmp > 0) * np.arange(15)).max(axis=1).values
    X['first_has_promo_day_in_after_15_days'] = ((tmp > 0) * np.arange(15, 0, -1)).max(axis=1).values

    for i in range(1, 16):
        X['day_%s_2017' % i] = get_timespan(df, t2017, i, 1).values.ravel()

    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, t2017, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, t2017, 140-i, 20, freq='7D').mean(axis=1).values

    for i in range(-16, 16):
        X["promo_{}".format(i)] = promo_df[str(t2017 + timedelta(days=i))].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if is_train:
        y = df[
            pd.date_range(t2017, periods=16)
        ].values
        return X, y
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
    return X

In [10]:
# data preprocessing

print("Preparing dataset...")
t2017 = date(2017, 6, 14)
num_days = 6
X_l, y_l = [], []
for i in range(num_days):
    delta = timedelta(days=7 * i)
    X_tmp, y_tmp = prepare_dataset(df_2017, promo_2017, t2017 + delta)

    X_tmp2 = prepare_dataset(df_2017_item, promo_2017_item, t2017 + delta, is_train=False, name_prefix='item')
    X_tmp2.index = df_2017_item.index
    X_tmp2 = X_tmp2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

    X_tmp3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, t2017 + delta, is_train=False, name_prefix='store_class')
    X_tmp3.index = df_2017_store_class.index
    X_tmp3 = X_tmp3.reindex(axis=df_2017_store_class_index).reset_index(drop=True)

    X_tmp = pd.concat([X_tmp, X_tmp2, X_tmp3, items.reset_index(), stores.reset_index()], axis=1)
    X_l.append(X_tmp)
    y_l.append(y_tmp)

    del X_tmp2
    gc.collect()

X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

del X_l, y_l
X_val, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 26))

X_val2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 7, 26), is_train=False, name_prefix='item')
X_val2.index = df_2017_item.index
X_val2 = X_val2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

X_val3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 7, 26), is_train=False, name_prefix='store_class')
X_val3.index = df_2017_store_class.index
X_val3 = X_val3.reindex(axis=df_2017_store_class_index).reset_index(drop=True)

X_val = pd.concat([X_val, X_val2, X_val3, items.reset_index(), stores.reset_index()], axis=1)

X_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 16), is_train=False)

X_test2 = prepare_dataset(df_2017_item, promo_2017_item, date(2017, 8, 16), is_train=False, name_prefix='item')
X_test2.index = df_2017_item.index
X_test2 = X_test2.reindex(df_2017.index.get_level_values(1)).reset_index(drop=True)

X_test3 = prepare_dataset(df_2017_store_class, df_2017_promo_store_class, date(2017, 8, 16), is_train=False, name_prefix='store_class')
X_test3.index = df_2017_store_class.index
X_test3 = X_test3.reindex(axis=df_2017_store_class_index).reset_index(drop=True)

X_test = pd.concat([X_test, X_test2, X_test3, items.reset_index(), stores.reset_index()], axis=1)

del X_test2, X_val2, df_2017_item, promo_2017_item, df_2017_store_class, df_2017_promo_store_class, df_2017_store_class_index
gc.collect()

Preparing dataset...


0

In [11]:
# make a list of data chunk for each X_train & y_train

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape)

n = X_val.shape[0]

list_X_train = [X_train[i : i+n] for i in range(0, X_train.shape[0], n)]
list_y_train = [y_train[i : i+n] for i in range(0, y_train.shape[0], n)]

(1005090, 633) (1005090, 16)
(167515, 633) (167515, 16)
(167515, 633)


In [12]:
# train & predict

print("Training and predicting models...")
params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 2,
    'verbose': -1
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []
for i in range(16):
    print("\n" + "=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    
    bst = None
    for X_train_chunk, y_train_chunk in zip(list_X_train, list_y_train):
        dtrain = lgb.Dataset(
            X_train_chunk, label=y_train_chunk[:, i],
            categorical_feature=cate_vars,
            weight=items["perishable"] * 0.25 + 1
        )
        dval = lgb.Dataset(
            X_val, label=y_val[:, i], reference=dtrain,
            weight=items["perishable"] * 0.25 + 1,
            categorical_feature=cate_vars)

        bst = lgb.train(
            params, dtrain, num_boost_round=MAX_ROUNDS, init_model=bst, keep_training_booster=True,
            valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
        )
    
    val_pred.append(bst.predict(X_val, num_iteration=bst.best_iteration or MAX_ROUNDS))
    test_pred.append(bst.predict(X_test, num_iteration=bst.best_iteration or MAX_ROUNDS))

Training and predicting models...

Step 1
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.415819	valid_1's l2: 0.406823
[100]	training's l2: 0.310791	valid_1's l2: 0.310133
[150]	training's l2: 0.288704	valid_1's l2: 0.29398
[200]	training's l2: 0.279292	valid_1's l2: 0.289951
[250]	training's l2: 0.272803	valid_1's l2: 0.288225
[300]	training's l2: 0.267978	valid_1's l2: 0.287679
[350]	training's l2: 0.263767	valid_1's l2: 0.287434
[400]	training's l2: 0.259897	valid_1's l2: 0.287217
[450]	training's l2: 0.256266	valid_1's l2: 0.287157
[500]	training's l2: 0.252881	valid_1's l2: 0.287117
[550]	training's l2: 0.249609	valid_1's l2: 0.287113
[600]	training's l2: 0.246432	valid_1's l2: 0.286988
[650]	training's l2: 0.243448	valid_1's l2: 0.287049
[700]	training's l2: 0.240553	valid_1's l2: 0.287129
Early stopping, best iteration is:
[590]	training's l2: 0.247061	valid_1's l2: 0.286979
Training until validation scores don't improve for 125 rounds
[750]	

[1250]	training's l2: 0.298887	valid_1's l2: 0.33319
[1300]	training's l2: 0.29354	valid_1's l2: 0.333162
[1350]	training's l2: 0.28875	valid_1's l2: 0.333162
[1400]	training's l2: 0.284376	valid_1's l2: 0.333309
Early stopping, best iteration is:
[1320]	training's l2: 0.291574	valid_1's l2: 0.333078
Training until validation scores don't improve for 125 rounds
[1450]	training's l2: 0.315862	valid_1's l2: 0.332756
[1500]	training's l2: 0.304958	valid_1's l2: 0.331357
[1550]	training's l2: 0.298261	valid_1's l2: 0.331049
[1600]	training's l2: 0.292825	valid_1's l2: 0.331043
[1650]	training's l2: 0.288069	valid_1's l2: 0.330993
[1700]	training's l2: 0.283741	valid_1's l2: 0.330932
[1750]	training's l2: 0.279712	valid_1's l2: 0.330942
[1800]	training's l2: 0.275964	valid_1's l2: 0.331003
Early stopping, best iteration is:
[1722]	training's l2: 0.281948	valid_1's l2: 0.330893
Training until validation scores don't improve for 125 rounds
[1850]	training's l2: 0.309825	valid_1's l2: 0.330823

[850]	training's l2: 0.340005	valid_1's l2: 0.352149
[900]	training's l2: 0.325987	valid_1's l2: 0.356254
[950]	training's l2: 0.317932	valid_1's l2: 0.358687
Early stopping, best iteration is:
[837]	training's l2: 0.34683	valid_1's l2: 0.35168
Training until validation scores don't improve for 125 rounds
[1000]	training's l2: 0.334765	valid_1's l2: 0.3538
[1050]	training's l2: 0.325714	valid_1's l2: 0.351887
[1100]	training's l2: 0.319324	valid_1's l2: 0.35151
[1150]	training's l2: 0.313699	valid_1's l2: 0.351549
[1200]	training's l2: 0.308608	valid_1's l2: 0.351622
Early stopping, best iteration is:
[1109]	training's l2: 0.318304	valid_1's l2: 0.351494
Training until validation scores don't improve for 125 rounds
[1250]	training's l2: 0.344279	valid_1's l2: 0.350973
[1300]	training's l2: 0.332563	valid_1's l2: 0.351158
[1350]	training's l2: 0.324723	valid_1's l2: 0.352034
Early stopping, best iteration is:
[1263]	training's l2: 0.340448	valid_1's l2: 0.350745

Step 7
Training until v

[1100]	training's l2: 0.342338	valid_1's l2: 0.376254
[1150]	training's l2: 0.327609	valid_1's l2: 0.366893
[1200]	training's l2: 0.318722	valid_1's l2: 0.364923
[1250]	training's l2: 0.312197	valid_1's l2: 0.364365
[1300]	training's l2: 0.306573	valid_1's l2: 0.364338
[1350]	training's l2: 0.30164	valid_1's l2: 0.364317
[1400]	training's l2: 0.297095	valid_1's l2: 0.364688
Early stopping, best iteration is:
[1294]	training's l2: 0.307205	valid_1's l2: 0.364311
Training until validation scores don't improve for 125 rounds
[1450]	training's l2: 0.331255	valid_1's l2: 0.365984
[1500]	training's l2: 0.321461	valid_1's l2: 0.367335
Early stopping, best iteration is:
[1420]	training's l2: 0.340319	valid_1's l2: 0.364662

Step 10
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.446642	valid_1's l2: 0.491745
[100]	training's l2: 0.345354	valid_1's l2: 0.390843
[150]	training's l2: 0.32149	valid_1's l2: 0.37227
[200]	training's l2: 0.310249	valid_1's l2: 0.36

[1150]	training's l2: 0.328707	valid_1's l2: 0.38061
Early stopping, best iteration is:
[1038]	training's l2: 0.355759	valid_1's l2: 0.375379
Training until validation scores don't improve for 125 rounds
[1200]	training's l2: 0.353503	valid_1's l2: 0.374064
[1250]	training's l2: 0.341555	valid_1's l2: 0.371211
[1300]	training's l2: 0.332766	valid_1's l2: 0.370574
[1350]	training's l2: 0.326192	valid_1's l2: 0.370093
[1400]	training's l2: 0.320477	valid_1's l2: 0.369828
[1450]	training's l2: 0.315278	valid_1's l2: 0.369819
[1500]	training's l2: 0.310493	valid_1's l2: 0.369737
[1550]	training's l2: 0.306051	valid_1's l2: 0.369782
[1600]	training's l2: 0.301856	valid_1's l2: 0.369696
[1650]	training's l2: 0.297775	valid_1's l2: 0.369833
[1700]	training's l2: 0.293958	valid_1's l2: 0.369814
Early stopping, best iteration is:
[1599]	training's l2: 0.301934	valid_1's l2: 0.369674

Step 13
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.459929	valid_1's l2:


Step 16
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.428263	valid_1's l2: 0.466181
[100]	training's l2: 0.343478	valid_1's l2: 0.388548
[150]	training's l2: 0.322929	valid_1's l2: 0.375358
[200]	training's l2: 0.312265	valid_1's l2: 0.372904
[250]	training's l2: 0.304018	valid_1's l2: 0.372222
[300]	training's l2: 0.297877	valid_1's l2: 0.372485
[350]	training's l2: 0.292794	valid_1's l2: 0.372626
Early stopping, best iteration is:
[242]	training's l2: 0.30521	valid_1's l2: 0.372158
Training until validation scores don't improve for 125 rounds
[400]	training's l2: 0.349719	valid_1's l2: 0.367507
[450]	training's l2: 0.337365	valid_1's l2: 0.366054
[500]	training's l2: 0.329124	valid_1's l2: 0.365871
[550]	training's l2: 0.322248	valid_1's l2: 0.365953
[600]	training's l2: 0.316583	valid_1's l2: 0.366045
Early stopping, best iteration is:
[492]	training's l2: 0.33033	valid_1's l2: 0.365847
Training until validation scores don't improve for 125 rou

In [13]:
# cross validation output

print("Validation mse:", mean_squared_error(
    y_val, np.array(val_pred).transpose()))

weight = items["perishable"] * 0.25 + 1
err = (y_val - np.array(val_pred).transpose())**2
err = err.sum(axis=1) * weight
err = np.sqrt(err.sum() / weight.sum() / 16)
print('nwrmsle = {}'.format(err))

y_val = np.array(val_pred).transpose()
df_preds = pd.DataFrame(
    y_val, index=df_2017.index,
    columns=pd.date_range("2017-07-26", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
df_preds["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
df_preds.reset_index().to_csv('lgb_cv.csv', index=False)

Validation mse: 0.3520074451285224
nwrmsle = 0.5931245655523382


In [14]:
# submit prediction

print("Making submission...")
y_test = np.array(test_pred).transpose()
df_preds = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-16", periods=16)
).stack().to_frame("unit_sales")
df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
submission.to_csv('lgb_sub.csv', float_format='%.4f', index=None)

Making submission...
