Original is [here](https://www.kaggle.com/ceshine/lgbm-starter)

This is a sloppy revision of my earlier notebook (which was substantively identical to Ceshine's script) to include validation data in the training set.  That's normally what you would do before making a final submission (at least when you use holdout validation, as in this case).  Obviously it would be better to improve the model first, since it was billed as a "starter" that was "watered down."  I was just curious to see how it would do.  Turns out it does perform a little bit better (same 3-decimal-place public score, but it sorts higher).  It also happens to reduce the score of my ensemble, which either means that it's producing more redundant information or that my ensemble weights are overfit, most likely the latter.

In [None]:
from datetime import date, timedelta

import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [None]:
def _to_log_sales(raw):
    """Convert a raw ``unit_sales`` CSV field to log1p scale.

    Positive values are mapped through ``np.log1p``; zero and negative
    values become the integer 0.
    """
    units = float(raw)
    if units > 0:
        return np.log1p(units)
    return 0


# Training data: only five of the CSV columns are loaded, and the leading
# data rows are skipped while the header row (row 0) is kept.
df_train = pd.read_csv(
    '../input/train.csv',
    usecols=[1, 2, 3, 4, 5],
    dtype={'onpromotion': bool},
    converters={'unit_sales': _to_log_sales},
    parse_dates=["date"],
    skiprows=range(1, 66458909),  # 2016-01-01
)

# Test data, keyed by (store, item, date) so it can later be unstacked by
# date and joined against the predictions.
df_test = pd.read_csv(
    "../input/test.csv",
    usecols=[0, 1, 2, 3, 4],
    dtype={'onpromotion': bool},
    parse_dates=["date"],
)
df_test = df_test.set_index(['store_nbr', 'item_nbr', 'date'])

# Item metadata, keyed by item number.
items = pd.read_csv("../input/items.csv")
items = items.set_index("item_nbr")

In [None]:
df_train.shape

In [None]:
df_train.head()

In [None]:
df_test.shape

In [None]:
df_test.head()

In [None]:
items.shape

In [None]:
items.head()

In [None]:
# Keep only the rows whose date falls in the 77 consecutive days (11 weeks)
# starting 2017-05-31, then release the full training frame.
recent_days = pd.date_range("2017-05-31", periods=7 * 11)
in_window = df_train["date"].isin(recent_days)
df_2017 = df_train.loc[in_window].copy()
del df_train, in_window

In [None]:
df_2017.shape

In [None]:
df_2017.head()

In [None]:
def _promo_by_date(indexed):
    """Pivot the ``onpromotion`` column so that dates become columns.

    ``indexed`` must be indexed by (store_nbr, item_nbr, date); the innermost
    index level is unstacked, missing cells are filled with False, and the
    resulting two-level column index is flattened to its second level.
    """
    wide = indexed[["onpromotion"]].unstack(level=-1).fillna(False)
    wide.columns = wide.columns.get_level_values(1)
    return wide


train_promo = _promo_by_date(
    df_2017.set_index(["store_nbr", "item_nbr", "date"]))
# Align the test-period flags to the training rows; store-item pairs absent
# from the test set are treated as not on promotion.
test_promo = _promo_by_date(df_test).reindex(train_promo.index).fillna(False)
promo_2017 = pd.concat([train_promo, test_promo], axis=1)
del test_promo, train_promo

In [None]:
promo_2017.shape

In [None]:
promo_2017.head()

In [None]:
promo_2017.columns

<code>promo_2017</code> is just a big data frame to tell which items were on promotion in which stores on which dates.  Rows represent store-item combinations.  Columns represent dates.

In [None]:
# Pivot the long sales table: one row per (store, item), one column per
# date, with 0 wherever there is no record for that day.
sales_long = df_2017.set_index(["store_nbr", "item_nbr", "date"])
df_2017 = sales_long[["unit_sales"]].unstack(level=-1).fillna(0)
del sales_long
# Flatten the ("unit_sales", date) column labels down to the date alone.
df_2017.columns = df_2017.columns.get_level_values(1)
df_2017.shape

In [None]:
df_2017.head()

So the rows in <code>df_2017</code> now correspond to store-item combinations, the columns represent dates (isomorphic to <code>promo_2017</code>), and the data cells contain the target variable.

In [None]:
# Repeat the item metadata so there is one row per row of df_2017, looked up
# by the second index level of df_2017 (the item number).
item_of_row = df_2017.index.get_level_values(1)
items = items.reindex(item_of_row)
del item_of_row
items.head()

In [None]:
items.shape

<code>items</code> now lines up with <code>df_2017</code>, which means everything is repeated for each store.

In [None]:
def get_timespan(df, dt, minus, periods):
    """Select a window of consecutive daily columns from ``df``.

    The window starts ``minus`` days before ``dt`` and covers ``periods``
    days; the columns of ``df`` labelled with those dates are returned.
    """
    window_start = dt - timedelta(days=minus)
    window_days = pd.date_range(window_start, periods=periods)
    return df[window_days]

In [None]:
def prepare_dataset(t2017, is_train=True):
    """Build the feature matrix (and optionally targets) anchored at ``t2017``.

    Features, one row per row of ``df_2017``:
      * ``mean_3_2017`` / ``mean_7_2017`` / ``mean_14_2017``: row-wise mean of
        ``df_2017`` over the 3 / 7 / 14 days immediately before ``t2017``.
      * ``promo_14_2017``: row-wise sum of ``promo_2017`` over the 14 days
        immediately before ``t2017``.
      * ``promo_0`` .. ``promo_15``: the ``promo_2017`` flag for ``t2017`` and
        each of the following 15 days, cast to uint8.

    Returns ``(X, y)`` when ``is_train`` is true, where ``y`` holds the
    ``df_2017`` values for the 16 days starting at ``t2017`` (one column per
    day); otherwise returns ``X`` alone.
    """
    sales_3d = get_timespan(df_2017, t2017, 3, 3)
    sales_7d = get_timespan(df_2017, t2017, 7, 7)
    sales_14d = get_timespan(df_2017, t2017, 14, 14)
    promo_14d = get_timespan(promo_2017, t2017, 14, 14)

    X = pd.DataFrame({
        "mean_3_2017": sales_3d.mean(axis=1).values,
        "mean_7_2017": sales_7d.mean(axis=1).values,
        "mean_14_2017": sales_14d.mean(axis=1).values,
        "promo_14_2017": promo_14d.sum(axis=1).values
    })

    # Promotion flags for the anchor day and the 15 days after it.
    for offset in range(16):
        day = t2017 + timedelta(days=offset)
        X["promo_{}".format(offset)] = promo_2017[day].values.astype(np.uint8)

    if not is_train:
        return X

    horizon = pd.date_range(t2017, periods=16)
    y = df_2017[horizon].values
    return X, y

In [None]:
print("Preparing dataset...")
t2017 = date(2017, 6, 21)

# Six training windows anchored one week apart, starting at t2017. The last
# anchor is 2017-07-26, so no separate validation set is built.
anchors = [t2017 + timedelta(days=7 * week) for week in range(6)]
X_l, y_l = zip(*(prepare_dataset(anchor) for anchor in anchors))
X_train = pd.concat(X_l, axis=0)
y_train = np.concatenate(y_l, axis=0)
del X_l, y_l

# Features only for the 16-day period being predicted.
X_test = prepare_dataset(date(2017, 8, 16), is_train=False)

In [None]:
X_train.shape

In [None]:
X_train.head()

In [None]:
y_train.shape

In [None]:
y_train

In [None]:
#X_val.shape

In [None]:
#X_val.head()

In [None]:
#y_val.shape

In [None]:
#y_val

In [None]:
X_test.shape

In [None]:
X_test.head()

In [None]:
print("Training and predicting models...")

# LightGBM hyper-parameters shared by every per-day model.
params = dict(
    num_leaves=2**5 - 1,
    objective='regression_l2',
    max_depth=8,
    min_data_in_leaf=50,
    learning_rate=0.05,
    feature_fraction=0.75,
    bagging_fraction=0.75,
    bagging_freq=1,
    metric='l2',
    num_threads=4,
)

In [None]:
# MAX_ROUNDS and val_pred are only referenced by the disabled hold-out
# validation path; they are kept so that path can be re-enabled.
MAX_ROUNDS = 1000
val_pred = []
# Fixed number of boosting rounds, used for both training and prediction
# (there is no validation set to early-stop on).
NUM_BOOST_ROUND = 700
test_pred = []
cate_vars = []

# Per-row sample weight: 1 + 0.25 * perishable (presumably a 0/1 flag --
# verify against items.csv). X_train is several copies of the store-item rows
# stacked vertically, so the per-item weights are tiled that many times. The
# weights do not depend on the forecast day, so they are built once here
# rather than inside the loop.
n_windows = len(X_train) // len(items)
train_weight = pd.concat([items["perishable"]] * n_windows) * 0.25 + 1

# One independent model per forecast day (column i of y_train).
for i in range(16):
    print("=" * 50)
    print("Step %d" % (i+1))
    print("=" * 50)
    dtrain = lgb.Dataset(
        X_train, label=y_train[:, i],
        categorical_feature=cate_vars,
        weight=train_weight
    )
    # The training set doubles as the "validation" set purely so that the
    # training loss is logged every 50 rounds.
    bst = lgb.train(
        params, dtrain, num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[dtrain], verbose_eval=50
    )
    # Report features ordered by total gain, highest first.
    print("\n".join(("%s: %.2f" % x) for x in sorted(
        zip(X_train.columns, bst.feature_importance("gain")),
        key=lambda x: x[1], reverse=True
    )))
    test_pred.append(bst.predict(
        X_test, num_iteration=NUM_BOOST_ROUND))

In [None]:
#print("Validation mse:", mean_squared_error(
#    y_val, np.array(val_pred).transpose()))

In [None]:
print("Making submission...")
# test_pred holds one prediction vector per forecast day; transpose so that
# rows are store-item pairs and columns are days.
y_test = np.array(test_pred).transpose()
forecast_days = pd.date_range("2017-08-16", periods=16)
wide_preds = pd.DataFrame(y_test, index=df_2017.index, columns=forecast_days)

# Back to long format, keyed the same way as df_test.
df_preds = wide_preds.stack().to_frame("unit_sales")
del wide_preds
df_preds.index.names = ["store_nbr", "item_nbr", "date"]

# Test rows without a prediction get 0 (still on the log1p scale here).
submission = df_test[["id"]].join(df_preds, how="left").fillna(0)
# Undo the log1p transform and bound the result to [0, 1000].
unlogged_sales = np.expm1(submission["unit_sales"])
submission["unit_sales"] = np.clip(unlogged_sales, 0, 1000)
del unlogged_sales

In [None]:
# Write only the data columns; the (store_nbr, item_nbr, date) index is
# omitted. `index` is documented as a bool, so pass False explicitly rather
# than relying on None being falsy.
submission.to_csv('lgb6w.csv', float_format='%.4f', index=False)