In [13]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from itertools import product

import lightgbm as lgb
from tqdm import tqdm_notebook

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/competitive-data-science-predict-future-sales/items.csv
/kaggle/input/competitive-data-science-predict-future-sales/sample_submission.csv
/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv
/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv
/kaggle/input/competitive-data-science-predict-future-sales/shops.csv
/kaggle/input/competitive-data-science-predict-future-sales/test.csv


In [14]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df


In [15]:
item_categories = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
sales_train = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
sample_submission = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")
shops = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [16]:
sales = sales_train

# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
# Fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
# Join it to the grid
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
all_data2 = all_data.copy()
del grid, gb 
gc.collect();

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


In [17]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [24]:
dates = all_data['date_block_num']

last_block = dates.max()

X_train = all_data.loc[dates <=  last_block].drop(to_drop_cols, axis=1)
X_train_part = all_data.loc[dates <  last_block].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <=  last_block, 'target'].values
y_train_part = all_data.loc[dates <  last_block, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

# print(X_train_part.shape)
# print(y_train_part.shape)

X_train.to_pickle("X_train.pkl")
X_train_part.to_pickle("X_train_part.pkl")
X_test.to_pickle("X_test.pkl")

np.save("y_train.npy", y_train)
np.save("y_train_part.npy", y_train_part)
np.save("y_test.npy", y_test)

In [None]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data2.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

test_data = test.copy()
test_data['date_block_num'] = 34

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data2[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    test_data = pd.merge(test_data, train_shift, on=index_cols, how='left').fillna(0)

test_data = pd.merge(test_data, item_category_mapping, how='left', on='item_id')
test_data = downcast_dtypes(test_data)
gc.collect();


In [27]:
def lgbm_train(X_train,y_train,X_test,y_test,num_iter):
    lgb_params = {
                   'feature_fraction': 0.75,
                   'metric': 'rmse',
                   'nthread':1, 
                   'min_data_in_leaf': 2**7, 
                   'bagging_fraction': 0.75, 
                   'learning_rate': 0.03, 
                   'objective': 'mse', 
                   'bagging_seed': 2**7, 
                   'num_leaves': 2**7,
                   'bagging_freq':1,
                   'verbose':0 
                  }

    evals_result = {}
    
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(lgb_params, lgb_train, num_iter,
                      valid_sets=[lgb_train, lgb_valid],
                      evals_result=evals_result,
                      verbose_eval=10)
    return model,evals_result


In [25]:

X_train = pd.read_pickle("X_train.pkl")
X_train_part = pd.read_pickle("X_train_part.pkl")
X_test =  pd.read_pickle("X_test.pkl")

y_train = np.load("y_train.npy")
y_train_part = np.load("y_train_part.npy")
y_test =  np.load("y_test.npy")


In [30]:
y_train_part = y_train_part.clip(0,20)

print(X_train_part.shape)
print(y_train_part.shape)
model,evals_result  = lgbm_train(X_train_part, y_train_part, X_test, y_test, 200)

(6186922, 21)
(6186922,)
[10]	training's rmse: 1.06297	valid_1's rmse: 5.30067
[20]	training's rmse: 0.982795	valid_1's rmse: 5.27263
[30]	training's rmse: 0.933585	valid_1's rmse: 5.25393
[40]	training's rmse: 0.901946	valid_1's rmse: 5.24084
[50]	training's rmse: 0.881238	valid_1's rmse: 5.23152
[60]	training's rmse: 0.867942	valid_1's rmse: 5.22483
[70]	training's rmse: 0.857954	valid_1's rmse: 5.22012
[80]	training's rmse: 0.850646	valid_1's rmse: 5.21607
[90]	training's rmse: 0.844865	valid_1's rmse: 5.21276
[100]	training's rmse: 0.839959	valid_1's rmse: 5.21101
[110]	training's rmse: 0.83615	valid_1's rmse: 5.20904
[120]	training's rmse: 0.832414	valid_1's rmse: 5.20766
[130]	training's rmse: 0.829149	valid_1's rmse: 5.20669
[140]	training's rmse: 0.825837	valid_1's rmse: 5.20546
[150]	training's rmse: 0.823013	valid_1's rmse: 5.20503
[160]	training's rmse: 0.820613	valid_1's rmse: 5.20446
[170]	training's rmse: 0.818294	valid_1's rmse: 5.20394
[180]	training's rmse: 0.816221	va

In [None]:
# full model traininig for submission
y_train = y_train.clip(0,20)

print(X_train.shape)
print(y_train.shape)
model_full, evals_result  = lgbm_train(X_train, y_train, X_test, y_test, 200) 

(6425094, 21)
(6425094,)
[10]	training's rmse: 1.0627	valid_1's rmse: 5.29971


In [None]:
to_drop_cols = ['ID','date_block_num']
X_test =  test_data.drop(to_drop_cols, axis=1)
pred_lgb = model.predict(X_test)
#print(pred_lgb)

merged = test.copy()
merged['item_cnt_month'] = pred_lgb
merged['item_cnt_month']=merged['item_cnt_month'].clip(lower=0,upper=20)
merged=merged.drop(['shop_id','item_id'],axis=1)
merged['ID']=merged['ID'].astype('int')
merged.to_csv("lightgbm_moreiter.csv",index=False)
print(merged.head)