Do some basic ensembling here

In [45]:
import gc
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
import futuresalesutility as fu

In [46]:
matrix = pd.read_pickle("matrix_reduced.pkl")
items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")

In [48]:
unique_feats = ['name_group_cat_unique_month',
 'name_group_new_unique_month',
 'name_group_new_cat_unique_month',
 'first_word_unique_month',
 'first_word_cat_unique_month',
 'first_word_new_unique_month',
 'first_word_new_cat_unique_month',
 'name_group_new_proportion_month',
 'unique_items_cat',
 'cat_items_proportion',
 'new_items_month',
 'item_category_id_new_items_cat_mean_rolling_mean_win_12',
 'item_name_group_name_group_new_unique_month_mean_rolling_mean_win_12',
 'new_items_cat_1_12_ratio']
matrix = matrix.drop(columns=unique_feats)

## Split train, validation, test sets from feature matrix

In [49]:
def train_test_x_y(matrix, test_month, keep_from_month=3):
    def split_train_test(matrix, test_month=33):
        # Split the matrix into train and test sets.
        test_month = fu.list_if_not(test_month, int)
        test = matrix.loc[matrix.date_block_num.isin(test_month), :]
        train = matrix.loc[matrix.date_block_num < min(test_month), :]
        return train, test

    def xysplit(matrix):
        # Split a train and test set into into x and y sets, with item_cnt as the target y variable
        y = matrix.item_cnt_month
        X = matrix.drop(columns=["item_cnt_month"])
        return (X, y)

    matrix = matrix.drop(
        columns=["item_revenue_month", "item_price", "item_cnt_month_original", "item_cnt_day_avg",], errors="ignore",
    )

    train, test = split_train_test(matrix, test_month)
    train = train[train.date_block_num >= keep_from_month]
    X_train, y_train = xysplit(train)
    X_test, y_test = xysplit(test)
    return (X_train, y_train, X_test, y_test)

In [50]:
import warnings

warnings.filterwarnings("ignore", module="lightgbm")

import lightgbm as lgbm

def fit_booster(
    X_train,
    y_train,
    X_test=None,
    y_test=None,
    params=None,
    categoricals=[],
    dropcols=[],
    early_stopping_rounds=None,
):
    # Regular booster fitting function
    if params is None:
        params = {"learning_rate": 0.1, "subsample_for_bin": 300000, "n_estimators": 5000}

    if X_test is not None:
        eval_set = [(X_train, y_train), (X_test, y_test)]
    else:
        eval_set = [(X_train, y_train)]

    booster = lgbm.LGBMRegressor(**params)
    categoricals = [c for c in categoricals if c in X_train.columns]
    booster.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        eval_metric=["rmse"],
        verbose=50,
        categorical_feature=categoricals,
        early_stopping_rounds=early_stopping_rounds,
    )

    return booster

In [51]:
if (matrix == float("inf")).any().any():
    raise ValueError("Dataframe contains inf entries! This can crash some models!")

In [52]:
best_params = [{'num_leaves': 966,
  'cat_smooth': 45.01680827234465,
  'min_child_samples': 27,
  'min_child_weight': 0.021144950289224463,
  'max_bin': 214,
  'n_estimators': 500},
#  {'num_leaves': 940,
#   'cat_smooth': 43.418286701105615,
#   'min_child_samples': 29,
#   'min_child_weight': 0.003944267312494195,
#   'max_bin': 133,
#   'n_estimators': 572},
#  {'num_leaves': 971,
#   'cat_smooth': 40.103611531065525,
#   'min_child_samples': 30,
#   'min_child_weight': 0.03951287458923346,
#   'max_bin': 212,
#   'n_estimators': 828},
#  {'num_leaves': 965,
#   'cat_smooth': 40.05144976454027,
#   'min_child_samples': 27,
#   'min_child_weight': 0.029220951478909872,
#   'max_bin': 211,
#   'n_estimators': 870},
#  {'num_leaves': 961,
#   'cat_smooth': 40.013529776221134,
#   'min_child_samples': 29,
#   'min_child_weight': 0.026526521644599493,
#   'max_bin': 210,
#   'n_estimators': 897}
]

Create predictions as the mean of booster predictions with different parameters

In [57]:
best_params =     [{'num_leaves': 1023,
    'min_data_in_leaf':10,
    'feature_fraction':0.7,
    'learning_rate': 0.01,
    'num_rounds': 1000,
    'early_stopping_rounds': 30,
        }]

In [58]:
X_train, y_train, X_test, y_test = train_test_x_y(matrix, [33], keep_from_month=2)
boosterstore = []
for i, params in enumerate(best_params):
    params.update(
        {
            "n_jobs": 11,
#             "learning_rate": 0.01,
#             "subsample_for_bin": 300000,
#             "min_data_in_bin": 7,
#             "colsample_bytree": 0.8,
#             "subsample": 0.6,
#             "subsample_freq": 5,
#             'n_estimators': 500
        }
    )
    booster = fit_booster(
        X_train, y_train, X_test, y_test,
        params=params,
        categoricals=["item_category_id", "month"],
        dropcols=["shop_id", "digital"],
        early_stopping_rounds=10,
    )
    boosterstore.append(booster)

Training until validation scores don't improve for 30 rounds
[50]	training's rmse: 0.968386	training's l2: 0.937772	valid_1's rmse: 0.898216	valid_1's l2: 0.806792
[100]	training's rmse: 0.830554	training's l2: 0.68982	valid_1's rmse: 0.816243	valid_1's l2: 0.666253
[150]	training's rmse: 0.758335	training's l2: 0.575072	valid_1's rmse: 0.780877	valid_1's l2: 0.609769
[200]	training's rmse: 0.717255	training's l2: 0.514455	valid_1's rmse: 0.765201	valid_1's l2: 0.585533
[250]	training's rmse: 0.690625	training's l2: 0.476962	valid_1's rmse: 0.758236	valid_1's l2: 0.574922
[300]	training's rmse: 0.67156	training's l2: 0.450993	valid_1's rmse: 0.75455	valid_1's l2: 0.569346
[350]	training's rmse: 0.656569	training's l2: 0.431082	valid_1's rmse: 0.753019	valid_1's l2: 0.567038
[400]	training's rmse: 0.644466	training's l2: 0.415337	valid_1's rmse: 0.751246	valid_1's l2: 0.564371
[450]	training's rmse: 0.634133	training's l2: 0.402124	valid_1's rmse: 0.750622	valid_1's l2: 0.563433
[500]	t

In [39]:
def mean_prediction(boosterstore, X_test):
    predframe = X_test.loc[:, ["shop_id", "item_id", "digital"]]
    for i, booster in enumerate(boosterstore):
        predframe["prediction"] = booster.predict(X_test.drop(columns=["prediction", "shop_id", "digital"], errors="ignore"))
        predframe["prediction"] = predframe["prediction"].clip(0, 20)
        predframe.loc[(predframe.shop_id == 55) & (predframe.digital != 1), "prediction"] = 0
        predframe.loc[(predframe.shop_id != 55) & (predframe.digital == 1), "prediction"] = 0
        predframe['prediction'] = predframe["prediction"]
        predframe = predframe.rename(columns={"prediction": i})
    predframe["prediction"] = predframe.drop(columns=["shop_id", "item_id", "digital"]).mean(axis=1)
    return predframe["prediction"]

In [None]:
X_train, y_train, X_test, y_test = train_test_x_y(matrix.drop(columns=["shop_id", "digital"]), 34, keep_from_month=2)
X_test['prediction'] = mean_prediction(boosterstore, X_test)

In [87]:
X_test['item_cnt_month'] = y_test
X_test[['item_cnt_month', 'prediction']]

In [165]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(y_test, X_test['prediction'], squared=False)
print(rmse)

0.9026308520020633


In [None]:
_ = lgbm.plot_importance(booster, figsize=(10,70), height=0.5, importance_type="gain", ignore_zero=False)

Create predictions as the mean of booster predictions with different parameters

## Data cleaning

In [None]:
booster = fit_booster(matrix, params=None, test_run = False, categoricals=[], dropcols=[], new_item_weight_factor=1)

In [None]:
# Assume that data to be predicted from is in X_test
X_test['item_cnt_month'] = booster.predict(X_test)
# Clip the values as with the true values
X_test['item_cnt_month'] = X_test['item_cnt_month'].clip(0,20)

Sales of digital items should be zero for shops other than 55, and vice-versa, force set these values to zero if not

In [169]:
print(f"Mean predicted sales of digital items in non-digital shops is {X_test[(X_test.shop_id!=55) & (X_test.digital==1)].prediction.mean()}")
print(f"Mean predicted sales of non-digital items in digital shop 55 is {X_test[(X_test.shop_id==55) & (X_test.digital==0)].prediction.mean()}")

Mean predicted sales of digital items in non-digital shops is 0.0
Mean predicted sales of non-digital items in digital shop 55 is 0.0


In [None]:
X_test.loc[(X_test.shop_id==55) & (X_test.digital!=1), 'item_cnt_day'] = 0
X_test.loc[(X_test.shop_id!=55) & (X_test.digital==1), 'item_cnt_day'] = 0

Optional: replace the predictions for shop 36 (if any) with the predictions for shop 37 from the same city.

In [149]:
shop37 = X_test.loc[X_test.shop_id==37,:]
X_test = X_test.loc[X_test.shop_id!=36,:]
shop37.loc[:,'shop_id'] = 36
X_test = pd.concat([X_test,shop37])

In [170]:
# Merge the predictions with the provided template
test_orig = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
test = test_orig.merge(X_test[['shop_id','item_id','prediction']], on=['shop_id','item_id'], how='inner', copy=True)
test = test.rename(columns={'prediction':'item_cnt_month'})
# Check that the indices math the original
assert test_orig.equals(test[['ID','shop_id','item_id']])
test[['ID','item_cnt_month']].to_csv('submission.csv', index=False)

In [142]:
test['prediction'].isna().any()

False