In [1]:
from datetime import date, timedelta
import gc
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder

import lightgbm as lgb

def _log1p_positive(raw_value):
    """Parse a raw unit_sales field: log1p of the value when positive, else 0."""
    sales = float(raw_value)
    return np.log1p(sales) if sales > 0 else 0


# Data rows to skip in train.csv (row 0, the header, is kept). Per the original
# note, the rows that remain start at 2016-01-01.
rows_before_2016 = range(1, 66458909)

df_train = pd.read_csv(
    '../input/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _log1p_positive},
    parse_dates=["date"],
    skiprows=rows_before_2016,
)

# The test set is keyed by (store, item, date) so it can be unstacked by date later.
df_test = pd.read_csv(
    "../input/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

In [2]:
# Item metadata, indexed by item number.
items = pd.read_csv("../input/items.csv", index_col="item_nbr")

# Store metadata, indexed by store number.
stores = pd.read_csv("../input/stores.csv", index_col="store_nbr")

In [3]:
items.head()

Unnamed: 0_level_0,family,class,perishable
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
96995,GROCERY I,1093,0
99197,GROCERY I,1067,0
103501,CLEANING,3008,0
103520,GROCERY I,1028,0
103665,BREAD/BAKERY,2712,1


In [4]:
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
2,Quito,Pichincha,D,13
3,Quito,Pichincha,D,8
4,Quito,Pichincha,D,9
5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [5]:
# Keep only rows dated 2017-01-01 or later. `pd.datetime` was a deprecated alias
# of `datetime.datetime` and is gone in pandas 2.x, so compare against a Timestamp.
df_2017 = df_train.loc[df_train.date >= pd.Timestamp(2017, 1, 1)]
# Only the 2017 slice is used from here on; drop the reference to the full frame.
del df_train

In [6]:
df_2017.head()

Unnamed: 0,date,store_nbr,item_nbr,unit_sales,onpromotion
35229871,2017-01-01,25,99197,0.693147,False
35229872,2017-01-01,25,103665,2.079442,False
35229873,2017-01-01,25,105574,0.693147,False
35229874,2017-01-01,25,105857,1.609438,False
35229875,2017-01-01,25,106716,1.098612,False


In [7]:
# Training-period promotions in wide form: one row per (store, item), one column
# per date. A pair with no record on a given date is treated as not promoted.
promo_2017_train = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_train.columns = promo_2017_train.columns.get_level_values(1)

# Same layout for the test period (df_test is already indexed by store/item/date).
promo_2017_test = (
    df_test[["onpromotion"]]
    .unstack(level=-1)
    .fillna(False)
)
promo_2017_test.columns = promo_2017_test.columns.get_level_values(1)

# Align the test rows with the training (store, item) pairs; pairs missing from
# the test set become all-False rows.
promo_2017_test = promo_2017_test.reindex(promo_2017_train.index).fillna(False)

# Append the test-period date columns to the right of the training-period ones.
promo_2017 = pd.concat([promo_2017_train, promo_2017_test], axis=1)
del promo_2017_test, promo_2017_train

In [8]:
promo_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-22 00:00:00,2017-08-23 00:00:00,2017-08-24 00:00:00,2017-08-25 00:00:00,2017-08-26 00:00:00,2017-08-27 00:00:00,2017-08-28 00:00:00,2017-08-29 00:00:00,2017-08-30 00:00:00,2017-08-31 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,99197,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103520,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,103665,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,105574,False,False,True,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [9]:
# Pivot the 2017 sales to wide form: one row per (store, item), one column per
# date. Dates with no record for a pair are filled with 0.
df_2017 = (
    df_2017
    .set_index(["store_nbr", "item_nbr", "date"])[["unit_sales"]]
    .unstack(level=-1)
    .fillna(0)
)
# Drop the outer "unit_sales" column level so the columns are just the dates.
df_2017.columns = df_2017.columns.get_level_values(1)

In [10]:
df_2017.head()

Unnamed: 0_level_0,date,2017-01-01 00:00:00,2017-01-02 00:00:00,2017-01-03 00:00:00,2017-01-04 00:00:00,2017-01-05 00:00:00,2017-01-06 00:00:00,2017-01-07 00:00:00,2017-01-08 00:00:00,2017-01-09 00:00:00,2017-01-10 00:00:00,...,2017-08-06 00:00:00,2017-08-07 00:00:00,2017-08-08 00:00:00,2017-08-09 00:00:00,2017-08-10 00:00:00,2017-08-11 00:00:00,2017-08-12 00:00:00,2017-08-13 00:00:00,2017-08-14 00:00:00,2017-08-15 00:00:00
store_nbr,item_nbr,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1,96995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.098612,1.098612,0.0,0.0,0.693147,0.0,0.0,0.0,0.0,0.0
1,99197,0.0,0.0,1.386294,0.693147,0.693147,0.693147,1.098612,0.0,0.0,0.693147,...,0.0,1.098612,0.0,1.098612,0.0,0.0,0.0,0.0,0.0,0.0
1,103520,0.0,0.693147,1.098612,0.0,1.098612,1.386294,0.693147,0.0,0.693147,0.693147,...,0.0,0.0,1.386294,0.0,1.386294,0.693147,0.693147,0.693147,0.0,0.0
1,103665,0.0,0.0,0.0,1.386294,1.098612,1.098612,0.693147,1.098612,0.0,2.079442,...,0.693147,1.098612,0.0,2.079442,2.302585,1.098612,0.0,0.0,0.693147,0.693147
1,105574,0.0,0.0,1.791759,2.564949,2.302585,1.94591,1.609438,1.098612,1.386294,2.302585,...,0.0,1.791759,2.079442,1.94591,2.397895,1.791759,1.791759,0.0,1.386294,1.609438


In [11]:
items['class'] = items['class'].astype('category')

In [12]:
items = pd.get_dummies(items)

In [13]:
items.head()

Unnamed: 0_level_0,perishable,family_AUTOMOTIVE,family_BABY CARE,family_BEAUTY,family_BEVERAGES,family_BOOKS,family_BREAD/BAKERY,family_CELEBRATION,family_CLEANING,family_DAIRY,...,class_6920,class_6922,class_6924,class_6936,class_6954,class_6960,class_7002,class_7016,class_7034,class_7780
item_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
96995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99197,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103501,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
103520,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
103665,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Repeat each item's one-hot row once per (store, item) row of df_2017, in the
# same row order, so it can be concatenated positionally with the engineered
# features later (after reset_index(drop=True)).
items = items.reindex(df_2017.index.get_level_values(1))

In [15]:
items.shape

(167515, 371)

In [16]:
stores.head()

Unnamed: 0_level_0,city,state,type,cluster
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Quito,Pichincha,D,13
2,Quito,Pichincha,D,13
3,Quito,Pichincha,D,8
4,Quito,Pichincha,D,9
5,Santo Domingo,Santo Domingo de los Tsachilas,D,4


In [17]:
stores['cluster'] = stores.cluster.astype('category')

In [18]:
stores = pd.get_dummies(stores)

In [19]:
# Repeat each store's one-hot row once per (store, item) row of df_2017, in the
# same row order, so it can be concatenated positionally with the engineered
# features later (after reset_index(drop=True)).
stores = stores.reindex(df_2017.index.get_level_values(0))

In [20]:
stores.head()

Unnamed: 0_level_0,city_Ambato,city_Babahoyo,city_Cayambe,city_Cuenca,city_Daule,city_El Carmen,city_Esmeraldas,city_Guaranda,city_Guayaquil,city_Ibarra,...,cluster_8,cluster_9,cluster_10,cluster_11,cluster_12,cluster_13,cluster_14,cluster_15,cluster_16,cluster_17
store_nbr,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [21]:
stores.shape

(167515, 60)

In [22]:
# Per-item daily totals: add up the rows of every store for the same item, one
# column per day. unit_sales was log1p-transformed at load time, so these are
# sums of log1p values, not of raw units.
daily_sales_columns = df_2017.columns
df_2017_item = df_2017.groupby(level='item_nbr')[daily_sales_columns].sum()

In [23]:
df_2017_item.shape

(4018, 227)

In [24]:
# Per-item daily promotion counts: for each day, the number of stores in which
# the item is flagged as on promotion.
daily_promo_columns = promo_2017.columns
promo_2017_item = promo_2017.groupby(level='item_nbr')[daily_promo_columns].sum()

In [25]:
def get_timespan(df, dt, minus, periods, freq='D'):
    """Select the date columns of `df` for a window positioned relative to `dt`.

    The window starts `minus` days before `dt` and contains `periods` dates
    spaced `freq` apart (daily by default). `df` must have a column for every
    date in that window.
    """
    window_start = dt - timedelta(days=minus)
    window = pd.date_range(start=window_start, periods=periods, freq=freq)
    return df[window]

In [31]:
def prepare_dataset(df, promo_df, t2017, is_train=True, name_prefix=None):
    """Build the feature matrix (and optionally the targets) for one forecast origin.

    Parameters
    ----------
    df : DataFrame
        Wide sales table with one column per date. Needs columns from 140 days
        before `t2017` and, when `is_train` is true, through `t2017` + 14 days.
    promo_df : DataFrame
        Wide promotion table with the same rows as `df` and one column per
        date, covering 30 days before through 14 days after `t2017`.
    t2017 : date-like
        Forecast origin, i.e. the first day of the 15-day target window. Sales
        features only use dates strictly before it; promotion features also
        look up to 14 days ahead.
    is_train : bool
        When true, also return the targets.
    name_prefix : str or None
        Prefix added to every feature name. Only applied when `is_train` is
        false, because the training branch returns before the renaming.

    Returns
    -------
    (X, y) when `is_train` is true, otherwise X. `X` is a DataFrame with one
    row per row of `df`; `y` is an array of shape (rows, 15) holding `df` for
    `t2017` .. `t2017` + 14 days.
    """
    # Normalise the origin once so that both date arithmetic and column lookups
    # use Timestamps (plain datetime.date keys are not accepted by every pandas
    # version when indexing a DatetimeIndex).
    origin = pd.Timestamp(t2017)
    # get_timespan(promo_df, future_anchor, 14, n) starts at origin + 1 day.
    future_anchor = origin + timedelta(days=15)
    future_days = 14

    X = {
        "promo_7_2017": get_timespan(promo_df, origin, 7, 7).sum(axis=1).values,
        "promo_14_2017": get_timespan(promo_df, origin, 14, 14).sum(axis=1).values,
        "promo_30_2017": get_timespan(promo_df, origin, 30, 30).sum(axis=1).values,
        "promo_3_2017_aft": get_timespan(promo_df, future_anchor, 14, 3).sum(axis=1).values,
        "promo_7_2017_aft": get_timespan(promo_df, future_anchor, 14, 7).sum(axis=1).values,
        "promo_14_2017_aft": get_timespan(promo_df, future_anchor, 14, 14).sum(axis=1).values,
    }

    # Mean sales over promoted vs. non-promoted days in each trailing window.
    # Days of the other kind are masked with NaN so they are ignored by mean().
    for i in [3, 7, 14, 30]:
        tmp1 = get_timespan(df, origin, i, i)
        tmp2 = (get_timespan(promo_df, origin, i, i) > 0) * 1

        X['has_promo_mean_%s' % i] = (tmp1 * tmp2.replace(0, np.nan)).mean(axis=1).values
        X['no_promo_mean_%s' % i] = (tmp1 * (1 - tmp2).replace(0, np.nan)).mean(axis=1).values

    # Summary statistics of sales over each trailing window.
    for i in [3, 7, 14, 30]:
        tmp = get_timespan(df, origin, i, i)
        X['diff_%s_mean' % i] = tmp.diff(axis=1).mean(axis=1).values
        X['mean_%s' % i] = tmp.mean(axis=1).values
        X['median_%s' % i] = tmp.median(axis=1).values
        X['min_%s' % i] = tmp.min(axis=1).values
        X['max_%s' % i] = tmp.max(axis=1).values
        X['std_%s' % i] = tmp.std(axis=1).values

    # Counts and positions of days with sales / promotions in each trailing window.
    for i in [7, 14, 30]:
        tmp = get_timespan(df, origin, i, i)
        X['has_sales_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_sales_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_sales_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

        tmp = get_timespan(promo_df, origin, i, i)
        X['has_promo_days_in_last_%s' % i] = (tmp > 0).sum(axis=1).values
        X['last_has_promo_day_in_last_%s' % i] = i - ((tmp > 0) * np.arange(i)).max(axis=1).values
        X['first_has_promo_day_in_last_%s' % i] = ((tmp > 0) * np.arange(i, 0, -1)).max(axis=1).values

    # Same promotion features for the 14 days after the origin. The window
    # length is used explicitly here: the previous code subtracted from the
    # leaked loop variable `i` (30 after the loop above), shifting
    # 'last_has_promo_day_in_after_14_days' by 16.
    tmp = get_timespan(promo_df, future_anchor, future_days, future_days)
    X['has_promo_days_in_after_14_days'] = (tmp > 0).sum(axis=1).values
    X['last_has_promo_day_in_after_14_days'] = (
        future_days - ((tmp > 0) * np.arange(future_days)).max(axis=1).values)
    X['first_has_promo_day_in_after_14_days'] = (
        (tmp > 0) * np.arange(future_days, 0, -1)).max(axis=1).values

    # Raw sales of each of the 14 days before the origin.
    for i in range(1, 15):
        X['day_%s_2017' % i] = get_timespan(df, origin, i, 1).values.ravel()

    # Mean sales at a 7-day stride (same weekday) over the last 4 and 20 weeks,
    # one feature per weekday offset.
    for i in range(7):
        X['mean_4_dow{}_2017'.format(i)] = get_timespan(df, origin, 28-i, 4, freq='7D').mean(axis=1).values
        X['mean_20_dow{}_2017'.format(i)] = get_timespan(df, origin, 140-i, 20, freq='7D').mean(axis=1).values

    # Promotion flag for every day from 14 days before to 14 days after the origin.
    for i in range(-14, 15):
        X["promo_{}".format(i)] = promo_df[origin + timedelta(days=i)].values.astype(np.uint8)

    X = pd.DataFrame(X)

    if is_train:
        y = df[
            pd.date_range(origin, periods=15)
        ].values
        return X, y
    if name_prefix is not None:
        X.columns = ['%s_%s' % (name_prefix, c) for c in X.columns]
    return X

In [27]:
from tqdm import tqdm

In [32]:
print("Preparing dataset...")
t2017 = date(2017, 6, 13)
num_days = 4

# Build one feature/target block per forecast origin; the origins are spaced
# 7 days apart starting at t2017. The item and store one-hot columns are the
# same for every origin and are appended positionally.
X_l, y_l = [], []
for week in tqdm(range(num_days)):
    origin = t2017 + timedelta(days=7 * week)
    X_week, y_week = prepare_dataset(df_2017, promo_2017, origin)
    X_l.append(pd.concat(
        [X_week, items.reset_index(drop=True), stores.reset_index(drop=True)],
        axis=1,
    ))
    y_l.append(y_week)

# Stack the per-origin blocks row-wise.
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)

del X_l, y_l
gc.collect()



  0%|          | 0/4 [00:00<?, ?it/s][A[A

Preparing dataset...




 25%|██▌       | 1/4 [00:16<00:49, 16.59s/it][A[A

 50%|█████     | 2/4 [00:33<00:33, 16.59s/it][A[A

 75%|███████▌  | 3/4 [00:49<00:16, 16.55s/it][A[A

100%|██████████| 4/4 [01:06<00:00, 16.64s/it][A[A

[A[A

89

### Build the validation and test sets for the project dates

In [34]:
# Validation set: forecast origin 2017-07-11.
val_features, y_val = prepare_dataset(df_2017, promo_2017, date(2017, 7, 11))
X_val = pd.concat(
    [val_features, items.reset_index(drop=True), stores.reset_index(drop=True)],
    axis=1,
)

# Test set: forecast origin 2017-08-01. y_test holds the true targets for it.
test_features, y_test = prepare_dataset(df_2017, promo_2017, date(2017, 8, 1))
X_test = pd.concat(
    [test_features, items.reset_index(drop=True), stores.reset_index(drop=True)],
    axis=1,
)

In [33]:
X_train.shape, y_train.shape

((670060, 547), (670060, 15))

In [35]:
X_val.shape, y_val.shape

((167515, 547), (167515, 15))

In [36]:
X_test.shape,y_test.shape

((167515, 547), (167515, 15))

In [37]:
import datetime

In [38]:
print("Training and predicting models...")

params = {
    'num_leaves': 80,
    'objective': 'regression',
    'min_data_in_leaf': 200,
    'learning_rate': 0.02,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'metric': 'l2',
    'num_threads': 16
}

MAX_ROUNDS = 5000
val_pred = []
test_pred = []
cate_vars = []

# Row weights: 1.25 for rows whose `perishable` flag is 1, 1.0 otherwise. They
# do not depend on the forecast step, so they are built once. The training
# weights repeat the per-row weights once per training origin.
train_weight = pd.concat([items["perishable"]] * num_days) * 0.25 + 1
val_weight = items["perishable"] * 0.25 + 1

# One independent model per forecast day (column of y_train / y_val).
for step in range(15):
    start = datetime.datetime.now()
    banner = "=" * 50
    print(banner)
    print("Step %d" % (step+1))
    print(banner)

    dtrain = lgb.Dataset(
        X_train, label=y_train[:, step],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    dval = lgb.Dataset(
        X_val, label=y_val[:, step], reference=dtrain,
        weight=val_weight,
        categorical_feature=cate_vars)
    bst = lgb.train(
        params, dtrain, num_boost_round=MAX_ROUNDS,
        valid_sets=[dtrain, dval], early_stopping_rounds=125, verbose_eval=50
    )

    # Print every feature with its gain importance, largest first.
    gain_by_feature = zip(X_train.columns, bst.feature_importance("gain"))
    ranked = sorted(gain_by_feature, key=lambda pair: pair[1], reverse=True)
    print("\n".join(("%s: %.2f" % pair) for pair in ranked))

    # Predict with the early-stopped iteration when there is one, otherwise
    # with all MAX_ROUNDS rounds.
    best_rounds = bst.best_iteration or MAX_ROUNDS
    val_pred.append(bst.predict(X_val, num_iteration=best_rounds))
    test_pred.append(bst.predict(X_test, num_iteration=best_rounds))

    end = datetime.datetime.now()
    print(f'Run Time: {end-start}')


Training and predicting models...
Step 1




Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.423573	valid_1's l2: 0.410645
[100]	training's l2: 0.325874	valid_1's l2: 0.319626
[150]	training's l2: 0.308468	valid_1's l2: 0.305588
[200]	training's l2: 0.30351	valid_1's l2: 0.302621
[250]	training's l2: 0.300738	valid_1's l2: 0.30143
[300]	training's l2: 0.298525	valid_1's l2: 0.300532
[350]	training's l2: 0.296801	valid_1's l2: 0.300017
[400]	training's l2: 0.29527	valid_1's l2: 0.299678
[450]	training's l2: 0.294017	valid_1's l2: 0.299518
[500]	training's l2: 0.292806	valid_1's l2: 0.299311
[550]	training's l2: 0.291642	valid_1's l2: 0.299204
[600]	training's l2: 0.290534	valid_1's l2: 0.299086
[650]	training's l2: 0.289456	valid_1's l2: 0.298989
[700]	training's l2: 0.288416	valid_1's l2: 0.298924
[750]	training's l2: 0.28741	valid_1's l2: 0.298861
[800]	training's l2: 0.286465	valid_1's l2: 0.298831
[850]	training's l2: 0.285528	valid_1's l2: 0.298782
[900]	training's l2: 0.28463	valid_1's l2:

Run Time: 0:02:57.422397
Step 2
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.423626	valid_1's l2: 0.40972
[100]	training's l2: 0.323462	valid_1's l2: 0.316471
[150]	training's l2: 0.305771	valid_1's l2: 0.302134
[200]	training's l2: 0.300358	valid_1's l2: 0.298721
[250]	training's l2: 0.297451	valid_1's l2: 0.297333
[300]	training's l2: 0.295052	valid_1's l2: 0.296332
[350]	training's l2: 0.2933	valid_1's l2: 0.295752
[400]	training's l2: 0.291865	valid_1's l2: 0.29543
[450]	training's l2: 0.290595	valid_1's l2: 0.295229
[500]	training's l2: 0.289375	valid_1's l2: 0.295072
[550]	training's l2: 0.288242	valid_1's l2: 0.294965
[600]	training's l2: 0.287176	valid_1's l2: 0.294855
[650]	training's l2: 0.286095	valid_1's l2: 0.294728
[700]	training's l2: 0.285137	valid_1's l2: 0.294699
[750]	training's l2: 0.284153	valid_1's l2: 0.294669
[800]	training's l2: 0.283234	valid_1's l2: 0.294656
[850]	training's l2: 0.282314	valid_1's l2: 0.29464
[900]	train

Run Time: 0:02:32.446015
Step 3
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.419072	valid_1's l2: 0.41755
[100]	training's l2: 0.335904	valid_1's l2: 0.340633
[150]	training's l2: 0.321125	valid_1's l2: 0.328992
[200]	training's l2: 0.316447	valid_1's l2: 0.32607
[250]	training's l2: 0.313733	valid_1's l2: 0.324841
[300]	training's l2: 0.3116	valid_1's l2: 0.324073
[350]	training's l2: 0.309886	valid_1's l2: 0.32362
[400]	training's l2: 0.308443	valid_1's l2: 0.3234
[450]	training's l2: 0.307141	valid_1's l2: 0.323211
[500]	training's l2: 0.305926	valid_1's l2: 0.323036
[550]	training's l2: 0.304738	valid_1's l2: 0.322928
[600]	training's l2: 0.303613	valid_1's l2: 0.322871
[650]	training's l2: 0.302483	valid_1's l2: 0.322763
[700]	training's l2: 0.301446	valid_1's l2: 0.322729
[750]	training's l2: 0.300404	valid_1's l2: 0.322674
[800]	training's l2: 0.299383	valid_1's l2: 0.322639
[850]	training's l2: 0.298422	valid_1's l2: 0.322647
[900]	trainin

Run Time: 0:02:31.547946
Step 4
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.442645	valid_1's l2: 0.441355
[100]	training's l2: 0.342601	valid_1's l2: 0.348988
[150]	training's l2: 0.324041	valid_1's l2: 0.333846
[200]	training's l2: 0.317904	valid_1's l2: 0.329555
[250]	training's l2: 0.314362	valid_1's l2: 0.327332
[300]	training's l2: 0.311708	valid_1's l2: 0.325849
[350]	training's l2: 0.309711	valid_1's l2: 0.324969
[400]	training's l2: 0.308111	valid_1's l2: 0.324472
[450]	training's l2: 0.306644	valid_1's l2: 0.3242
[500]	training's l2: 0.305364	valid_1's l2: 0.324014
[550]	training's l2: 0.304137	valid_1's l2: 0.323851
[600]	training's l2: 0.30295	valid_1's l2: 0.323656
[650]	training's l2: 0.301842	valid_1's l2: 0.323592
[700]	training's l2: 0.300778	valid_1's l2: 0.323515
[750]	training's l2: 0.299712	valid_1's l2: 0.323477
[800]	training's l2: 0.298671	valid_1's l2: 0.323373
[850]	training's l2: 0.297695	valid_1's l2: 0.323309
[900]	tra

Run Time: 0:02:58.354795
Step 5
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.486875	valid_1's l2: 0.466006
[100]	training's l2: 0.372678	valid_1's l2: 0.366773
[150]	training's l2: 0.352785	valid_1's l2: 0.353331
[200]	training's l2: 0.346794	valid_1's l2: 0.350644
[250]	training's l2: 0.343256	valid_1's l2: 0.349082
[300]	training's l2: 0.340383	valid_1's l2: 0.347795
[350]	training's l2: 0.338073	valid_1's l2: 0.346999
[400]	training's l2: 0.336249	valid_1's l2: 0.346607
[450]	training's l2: 0.334666	valid_1's l2: 0.346409
[500]	training's l2: 0.333205	valid_1's l2: 0.34625
[550]	training's l2: 0.331872	valid_1's l2: 0.34611
[600]	training's l2: 0.330602	valid_1's l2: 0.34604
[650]	training's l2: 0.329413	valid_1's l2: 0.346016
[700]	training's l2: 0.328213	valid_1's l2: 0.345929
[750]	training's l2: 0.327038	valid_1's l2: 0.345861
[800]	training's l2: 0.325965	valid_1's l2: 0.345771
[850]	training's l2: 0.324892	valid_1's l2: 0.345746
[900]	tra

Run Time: 0:02:29.779271
Step 6
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.501171	valid_1's l2: 0.504201
[100]	training's l2: 0.381479	valid_1's l2: 0.38593
[150]	training's l2: 0.359831	valid_1's l2: 0.36514
[200]	training's l2: 0.352962	valid_1's l2: 0.359623
[250]	training's l2: 0.34896	valid_1's l2: 0.35707
[300]	training's l2: 0.345694	valid_1's l2: 0.355321
[350]	training's l2: 0.343257	valid_1's l2: 0.354422
[400]	training's l2: 0.341208	valid_1's l2: 0.353842
[450]	training's l2: 0.339442	valid_1's l2: 0.353472
[500]	training's l2: 0.337864	valid_1's l2: 0.353214
[550]	training's l2: 0.336387	valid_1's l2: 0.352991
[600]	training's l2: 0.334999	valid_1's l2: 0.352915
[650]	training's l2: 0.333684	valid_1's l2: 0.352796
[700]	training's l2: 0.332434	valid_1's l2: 0.352715
[750]	training's l2: 0.331187	valid_1's l2: 0.352628
[800]	training's l2: 0.330032	valid_1's l2: 0.352554
[850]	training's l2: 0.328864	valid_1's l2: 0.352483
[900]	trai

Run Time: 0:03:03.062650
Step 7
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.467694	valid_1's l2: 0.469285
[100]	training's l2: 0.371602	valid_1's l2: 0.374854
[150]	training's l2: 0.354336	valid_1's l2: 0.359122
[200]	training's l2: 0.348681	valid_1's l2: 0.354791
[250]	training's l2: 0.345253	valid_1's l2: 0.35273
[300]	training's l2: 0.34241	valid_1's l2: 0.351199
[350]	training's l2: 0.340332	valid_1's l2: 0.350509
[400]	training's l2: 0.338586	valid_1's l2: 0.350168
[450]	training's l2: 0.337039	valid_1's l2: 0.349971
[500]	training's l2: 0.335558	valid_1's l2: 0.349729
[550]	training's l2: 0.334211	valid_1's l2: 0.34968
[600]	training's l2: 0.332932	valid_1's l2: 0.349596
[650]	training's l2: 0.331665	valid_1's l2: 0.349558
[700]	training's l2: 0.330498	valid_1's l2: 0.349553
[750]	training's l2: 0.329314	valid_1's l2: 0.349493
[800]	training's l2: 0.328174	valid_1's l2: 0.349482
[850]	training's l2: 0.327082	valid_1's l2: 0.349441
[900]	tra

Run Time: 0:02:27.463419
Step 8
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.455918	valid_1's l2: 0.449765
[100]	training's l2: 0.362935	valid_1's l2: 0.362211
[150]	training's l2: 0.345454	valid_1's l2: 0.348019
[200]	training's l2: 0.339681	valid_1's l2: 0.344966
[250]	training's l2: 0.336315	valid_1's l2: 0.343528
[300]	training's l2: 0.333595	valid_1's l2: 0.342486
[350]	training's l2: 0.331555	valid_1's l2: 0.342021
[400]	training's l2: 0.329819	valid_1's l2: 0.341714
[450]	training's l2: 0.328302	valid_1's l2: 0.341565
[500]	training's l2: 0.326888	valid_1's l2: 0.341478
[550]	training's l2: 0.32557	valid_1's l2: 0.341376
[600]	training's l2: 0.324326	valid_1's l2: 0.341339
[650]	training's l2: 0.323097	valid_1's l2: 0.341279
[700]	training's l2: 0.32193	valid_1's l2: 0.341204
[750]	training's l2: 0.320827	valid_1's l2: 0.341186
[800]	training's l2: 0.319741	valid_1's l2: 0.341162
[850]	training's l2: 0.318662	valid_1's l2: 0.341158
[900]	tr

Run Time: 0:02:06.610254
Step 9
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.44586	valid_1's l2: 0.451148
[100]	training's l2: 0.347187	valid_1's l2: 0.355692
[150]	training's l2: 0.329051	valid_1's l2: 0.339466
[200]	training's l2: 0.323089	valid_1's l2: 0.335005
[250]	training's l2: 0.319802	valid_1's l2: 0.333111
[300]	training's l2: 0.317277	valid_1's l2: 0.331983
[350]	training's l2: 0.315181	valid_1's l2: 0.331104
[400]	training's l2: 0.313618	valid_1's l2: 0.330779
[450]	training's l2: 0.312187	valid_1's l2: 0.330548
[500]	training's l2: 0.310847	valid_1's l2: 0.330321
[550]	training's l2: 0.309613	valid_1's l2: 0.330203
[600]	training's l2: 0.308447	valid_1's l2: 0.330095
[650]	training's l2: 0.307323	valid_1's l2: 0.330019
[700]	training's l2: 0.306263	valid_1's l2: 0.329951
[750]	training's l2: 0.305211	valid_1's l2: 0.329916
[800]	training's l2: 0.304187	valid_1's l2: 0.32989
[850]	training's l2: 0.303209	valid_1's l2: 0.329899
[900]	tr

Run Time: 0:02:02.151233
Step 10
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.436676	valid_1's l2: 0.458686
[100]	training's l2: 0.355238	valid_1's l2: 0.370835
[150]	training's l2: 0.33992	valid_1's l2: 0.35427
[200]	training's l2: 0.334822	valid_1's l2: 0.349787
[250]	training's l2: 0.331665	valid_1's l2: 0.3479
[300]	training's l2: 0.329051	valid_1's l2: 0.346641
[350]	training's l2: 0.327039	valid_1's l2: 0.345948
[400]	training's l2: 0.325417	valid_1's l2: 0.345566
[450]	training's l2: 0.323985	valid_1's l2: 0.345302
[500]	training's l2: 0.322596	valid_1's l2: 0.345149
[550]	training's l2: 0.321324	valid_1's l2: 0.34505
[600]	training's l2: 0.320123	valid_1's l2: 0.344918
[650]	training's l2: 0.318927	valid_1's l2: 0.344781
[700]	training's l2: 0.317833	valid_1's l2: 0.344778
[750]	training's l2: 0.31678	valid_1's l2: 0.344734
[800]	training's l2: 0.315734	valid_1's l2: 0.344758
[850]	training's l2: 0.314706	valid_1's l2: 0.344756
[900]	train

Run Time: 0:02:24.911352
Step 11
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.459173	valid_1's l2: 0.464969
[100]	training's l2: 0.362701	valid_1's l2: 0.369843
[150]	training's l2: 0.343857	valid_1's l2: 0.352484
[200]	training's l2: 0.337225	valid_1's l2: 0.347332
[250]	training's l2: 0.333211	valid_1's l2: 0.344731
[300]	training's l2: 0.330062	valid_1's l2: 0.342935
[350]	training's l2: 0.327594	valid_1's l2: 0.341683
[400]	training's l2: 0.325776	valid_1's l2: 0.341088
[450]	training's l2: 0.324135	valid_1's l2: 0.340631
[500]	training's l2: 0.322645	valid_1's l2: 0.34036
[550]	training's l2: 0.321282	valid_1's l2: 0.340086
[600]	training's l2: 0.32002	valid_1's l2: 0.3399
[650]	training's l2: 0.318833	valid_1's l2: 0.339733
[700]	training's l2: 0.31768	valid_1's l2: 0.339638
[750]	training's l2: 0.316594	valid_1's l2: 0.339591
[800]	training's l2: 0.31552	valid_1's l2: 0.339503
[850]	training's l2: 0.314476	valid_1's l2: 0.339427
[900]	train

Run Time: 0:03:07.361332
Step 12
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.503986	valid_1's l2: 0.485358
[100]	training's l2: 0.394058	valid_1's l2: 0.390557
[150]	training's l2: 0.373617	valid_1's l2: 0.377187
[200]	training's l2: 0.366748	valid_1's l2: 0.374199
[250]	training's l2: 0.362504	valid_1's l2: 0.373012
[300]	training's l2: 0.359005	valid_1's l2: 0.371532
[350]	training's l2: 0.356259	valid_1's l2: 0.370726
[400]	training's l2: 0.354124	valid_1's l2: 0.370162
[450]	training's l2: 0.352303	valid_1's l2: 0.369836
[500]	training's l2: 0.350695	valid_1's l2: 0.369606
[550]	training's l2: 0.349269	valid_1's l2: 0.369512
[600]	training's l2: 0.347851	valid_1's l2: 0.369365
[650]	training's l2: 0.346532	valid_1's l2: 0.369286
[700]	training's l2: 0.345233	valid_1's l2: 0.36919
[750]	training's l2: 0.344025	valid_1's l2: 0.369156
[800]	training's l2: 0.342819	valid_1's l2: 0.369098
[850]	training's l2: 0.34161	valid_1's l2: 0.369042
[900]	t

Run Time: 0:02:23.822133
Step 13
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.527013	valid_1's l2: 0.505117
[100]	training's l2: 0.407054	valid_1's l2: 0.400757
[150]	training's l2: 0.38418	valid_1's l2: 0.38552
[200]	training's l2: 0.37622	valid_1's l2: 0.382356
[250]	training's l2: 0.371476	valid_1's l2: 0.381067
[300]	training's l2: 0.367684	valid_1's l2: 0.379815
[350]	training's l2: 0.364634	valid_1's l2: 0.37897
[400]	training's l2: 0.362394	valid_1's l2: 0.378632
[450]	training's l2: 0.360304	valid_1's l2: 0.378273
[500]	training's l2: 0.35842	valid_1's l2: 0.378039
[550]	training's l2: 0.356678	valid_1's l2: 0.377855
[600]	training's l2: 0.355122	valid_1's l2: 0.377874
[650]	training's l2: 0.353644	valid_1's l2: 0.377742
[700]	training's l2: 0.352246	valid_1's l2: 0.377749
[750]	training's l2: 0.350914	valid_1's l2: 0.377683
[800]	training's l2: 0.349637	valid_1's l2: 0.377638
[850]	training's l2: 0.348391	valid_1's l2: 0.377574
[900]	trai

Run Time: 0:02:22.940212
Step 14
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.488912	valid_1's l2: 0.490503
[100]	training's l2: 0.393772	valid_1's l2: 0.40144
[150]	training's l2: 0.375743	valid_1's l2: 0.387409
[200]	training's l2: 0.369658	valid_1's l2: 0.384373
[250]	training's l2: 0.365638	valid_1's l2: 0.382958
[300]	training's l2: 0.362589	valid_1's l2: 0.381958
[350]	training's l2: 0.360227	valid_1's l2: 0.381301
[400]	training's l2: 0.358279	valid_1's l2: 0.380949
[450]	training's l2: 0.356481	valid_1's l2: 0.380661
[500]	training's l2: 0.354881	valid_1's l2: 0.380553
[550]	training's l2: 0.353383	valid_1's l2: 0.380366
[600]	training's l2: 0.351992	valid_1's l2: 0.380303
[650]	training's l2: 0.350632	valid_1's l2: 0.380266
[700]	training's l2: 0.349312	valid_1's l2: 0.380148
[750]	training's l2: 0.34806	valid_1's l2: 0.380033
[800]	training's l2: 0.346879	valid_1's l2: 0.380004
[850]	training's l2: 0.345687	valid_1's l2: 0.379981
[900]	t

Run Time: 0:02:05.363536
Step 15
Training until validation scores don't improve for 125 rounds
[50]	training's l2: 0.470656	valid_1's l2: 0.465708
[100]	training's l2: 0.381054	valid_1's l2: 0.383166
[150]	training's l2: 0.363963	valid_1's l2: 0.370282
[200]	training's l2: 0.358095	valid_1's l2: 0.367423
[250]	training's l2: 0.354314	valid_1's l2: 0.366318
[300]	training's l2: 0.351312	valid_1's l2: 0.365353
[350]	training's l2: 0.349103	valid_1's l2: 0.364782
[400]	training's l2: 0.347255	valid_1's l2: 0.364425
[450]	training's l2: 0.345579	valid_1's l2: 0.364163
[500]	training's l2: 0.344059	valid_1's l2: 0.364055
[550]	training's l2: 0.342658	valid_1's l2: 0.363928
[600]	training's l2: 0.341339	valid_1's l2: 0.363822
[650]	training's l2: 0.340104	valid_1's l2: 0.363721
[700]	training's l2: 0.338869	valid_1's l2: 0.363629
[750]	training's l2: 0.337696	valid_1's l2: 0.363632
[800]	training's l2: 0.3365	valid_1's l2: 0.363618
[850]	training's l2: 0.335347	valid_1's l2: 0.363595
[900]	t

Run Time: 0:02:53.774235


In [40]:
# Predictions are collected per forecast day; transpose to (rows, 15) like y_val.
val_matrix = np.array(val_pred).transpose()
print("Validation mse:", mean_squared_error(y_val, val_matrix))

# Weighted RMSE over all rows and the 15 forecast days, in the log1p space the
# targets are stored in. Rows flagged perishable get weight 1.25, others 1.0.
weight = items["perishable"] * 0.25 + 1
val_err = ((y_val - val_matrix) ** 2).sum(axis=1) * weight
val_err = np.sqrt(val_err.sum() / weight.sum() / 15)
print('nwrmsle = {}'.format(val_err))

Validation mse: 0.34224540976658513
nwrmsle = 0.5847986873442191


In [42]:
# Predictions are collected per forecast day; transpose to (rows, 15) like y_test.
test_matrix = np.array(test_pred).transpose()
print("Test mse:", mean_squared_error(y_test, test_matrix))

# Weighted RMSE over all rows and the 15 forecast days, in the log1p space the
# targets are stored in. Rows flagged perishable get weight 1.25, others 1.0.
weight = items["perishable"] * 0.25 + 1
test_err = ((y_test - test_matrix) ** 2).sum(axis=1) * weight
test_err = np.sqrt(test_err.sum() / weight.sum() / 15)
print('nwrmsle = {}'.format(test_err))

Test mse: 0.3570734314951243
nwrmsle = 0.5971471217194578


In [None]:
# y_val = np.array(val_pred).transpose()
# df_val = pd.DataFrame(
#     y_val, index=df_2017.index,
#     columns=pd.date_range("2017-07-26", periods=16)
# ).stack().to_frame("unit_sales")
# df_val.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)
# df_val["unit_sales"] = np.clip(np.expm1(df_preds["unit_sales"]), 0, 1000)
# df_val.reset_index().to_csv('lgb_cv.csv', index=False)

In [None]:
# #print("Making submission...")
# y_test = np.array(test_pred).transpose()
# df_preds = pd.DataFrame(
#     y_test, index=df_2017.index,
#     columns=pd.date_range("2017-08-16", periods=16)
# ).stack().to_frame("unit_sales")
# df_preds.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

In [None]:
# NOTE: this rebinds y_test from the true targets (returned by prepare_dataset)
# to the model predictions. Re-running the "Test mse" cell after this one would
# compare the predictions with themselves.
y_test = np.array(test_pred).transpose()
y_test.shape

In [None]:
df_preds_test = pd.DataFrame(
    y_test, index=df_2017.index,
    columns=pd.date_range("2017-08-01", periods=15)
).stack().to_frame("unit_sales")

In [None]:
df_preds_test.index.set_names(["store_nbr", "item_nbr", "date"], inplace=True)

In [None]:
df_preds_test.head()

In [None]:
df_preds_test.loc[(1,96995),:]

In [None]:
df_2017.head()

In [None]:
# Reload the predictions from the CSV that a later cell writes with
# df_preds_test.to_csv(...). On a fresh kernel that file must already exist from
# a previous run. No parse_dates is given, so 'date' is presumably read back as
# plain strings.
df_preds_test = pd.read_csv('lgb_new_fe_project_date_2019-11-06.csv')

In [None]:
df_preds_test.set_index(['store_nbr','item_nbr','date'],inplace=True)

In [None]:
df_preds_test.head()

In [None]:
df_true = df_2017[pd.date_range('2017-08-01','2017-08-15')].stack().to_frame('unit_sales')

In [None]:
df_true_agg = df_true.groupby(['store_nbr','item_nbr'])['unit_sales'].sum().sort_values(ascending=False)

In [None]:
df_true_agg[:20]

In [None]:
store_item_list = df_true_agg.index[0:10]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
date_range = pd.date_range('2017-08-01',periods=15)

# Saturdays and Sundays inside the forecast window, drawn as dotted red lines.
# They are passed as Timestamps so they land on the datetime x-axis.
weekend_marks = pd.to_datetime(
    ['2017-08-05', '2017-08-06', '2017-08-12', '2017-08-13'])

# Two panels per row. plt.subplot() needs integer grid sizes, so use ceiling
# division instead of the float that `len(...) / 2` produces.
n_cols = 2
n_rows = -(-len(store_item_list) // n_cols)

plt.figure(figsize=(15,20))
for i, (store_nbr, item_nbr) in enumerate(store_item_list, 1):
    # Select the 'unit_sales' column so each series is 1-D, then undo the
    # log1p transform to plot real units.
    true_sales = np.expm1(
        df_true.loc[(store_nbr, item_nbr), 'unit_sales'].values)
    predicted_sales = np.expm1(
        df_preds_test.loc[(store_nbr, item_nbr), 'unit_sales'].values)

    temp_df = pd.DataFrame(
        {'true': true_sales, 'predicted': predicted_sales}, index=date_range)

    plt.subplot(n_rows, n_cols, i)
    # Explicit labels so the legend does not depend on matplotlib inferring them.
    plt.plot(temp_df['true'], label='true')
    plt.plot(temp_df['predicted'], label='predicted')
    plt.legend()
    plt.xticks(rotation=90)
    plt.ylim(0,)
    for weekend_day in weekend_marks:
        plt.axvline(weekend_day, ls=':', c='r')
    plt.title(f'Store Number: {store_nbr} Item Number: {item_nbr}')
plt.tight_layout()
plt.show()
    
    

In [None]:
store_nbr = 45
item_nbr = 2042941

# Sales of one (store, item) pair from 2017-07-01 to 2017-08-15, converted back
# from log1p to units for plotting.
sales_window = df_2017.loc[(store_nbr, item_nbr), '2017-07-01':'2017-08-15']
df_2017_plot = sales_window.to_frame('unit_sales')
plt.plot(np.expm1(df_2017_plot))
plt.xticks(rotation=90)
# Dashed line at the test forecast origin.
plt.axvline('2017-08-01',ls='--',c='r')

In [None]:
# Parse the 'date' column while reading. `parse_dates=True` only asks pandas to
# parse the index, and no index column is set here, so it parsed nothing.
holidays = pd.read_csv('../input/holidays_events.csv', parse_dates=['date'])

In [None]:
holidays['date'] = pd.to_datetime(holidays['date'])

In [None]:
holidays.set_index('date',inplace=True)

In [None]:
holidays.loc['2017-07-01':'2017-08-20']

In [None]:
df_preds_test.to_csv('lgb_new_fe_project_date_2019-11-06.csv', float_format='%.4f')

In [None]:
# submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# submission["unit_sales"] = np.clip(np.expm1(submission["unit_sales"]), 0, 1000)
# submission.to_csv('lgb_new_fe_project_date_2019-11-06.csv', float_format='%.4f', index=None)