# Import libraries

In [4]:
import IPython.display

import os
import time

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# in this project, the metric is rmse, not mse
from sklearn.metrics import mean_squared_error

# models
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from sklearn.neighbors import KNeighborsRegressor

import lightgbm as lgb
import xgboost as xgb

# Load datasets

In [37]:
# Read the four competition tables, in a fixed order, from the local dataset folder
sales, shops, items, item_cats = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/sales_train.csv.gz',
        './dataset/shops.csv',
        './dataset/items.csv',
        './dataset/item_categories.csv',
    )
)

# Make utilities to submit

In [39]:
def make_submission_df(all_prediction, test=None, clip_min=0, clip_max=20):
    """Build the submission table (`ID`, `item_cnt_month`) from per-pair predictions.

    Parameters
    ----------
    all_prediction : pd.DataFrame
        Must contain the columns `shop_id`, `item_id` and `item_cnt_month`.
    test : pd.DataFrame, optional
        Table with the columns `ID`, `shop_id` and `item_id`. When omitted it is
        read from "./dataset/test.csv.gz" (the original behaviour).
    clip_min, clip_max : number, optional
        Bounds applied to the predicted values (defaults: 0 and 20).

    Returns
    -------
    pd.DataFrame
        One row per matching test row with the columns `ID` and `item_cnt_month`.
        Test pairs without a prediction get 0 before clipping.
    """
    if test is None:
        test = pd.read_csv("./dataset/test.csv.gz")

    # Left join keeps every test row, even when no prediction exists for it.
    # NOTE(review): duplicated (shop_id, item_id) pairs in `all_prediction`
    # would duplicate test rows here -- callers should pass unique pairs.
    merged = test.merge(all_prediction, on=["shop_id", "item_id"], how="left")
    df = merged[["ID", "item_cnt_month"]].copy()

    # Missing predictions become 0, then everything is clipped to the allowed range
    df["item_cnt_month"] = df["item_cnt_month"].fillna(0).clip(clip_min, clip_max)

    return df

def make_submission_file(df, comment="", add_time_stamp=True, out_dir="./submission"):
    """Write `df` as a CSV submission file.

    The file name is "submission", optionally followed by "_<unix time>" and
    "_<comment>", with a ".csv" extension.

    Parameters
    ----------
    df : pd.DataFrame
        Table to write (written without the index, comma separated).
    comment : str, optional
        Suffix appended to the file name when non-empty.
    add_time_stamp : bool, optional
        When true, the current `time.time()` (as an integer) is added to the name.
    out_dir : str, optional
        Target directory (default "./submission"); created if it does not exist.
    """
    name = "submission"

    if add_time_stamp:
        name = "%s_%d" % (name, time.time())

    if len(comment) > 0:
        name = "%s_%s" % (name, comment)

    # `to_csv` does not create missing directories, so make sure the target exists
    os.makedirs(out_dir, exist_ok=True)
    df.to_csv(os.path.join(out_dir, "%s.csv" % name), sep=",", index=False)
    
def make_submission(all_prediction, comment="", add_time_stamp=True):
    """Turn per-pair predictions into a submission table and write it to disk."""
    submission_df = make_submission_df(all_prediction)
    make_submission_file(submission_df, comment, add_time_stamp)

# Make benchmarks

To judge the quality of my predictions I need baselines to compare against, so I made a few very simple ones.

In [38]:
# Baseline 1: the sample submission, written back unchanged
sample = pd.read_csv('./dataset/sample_submission.csv.gz')
make_submission_file(sample, 'sample_value', False)

# Baseline 2: the same rows with every prediction set to zero
sample['item_cnt_month'] = 0
make_submission_file(sample, 'zero_value', False)

# Baseline 3: each shop/item pair repeats its total sales from month 33
last_month_sales = sales[sales["date_block_num"] == 33]
previous_month = (
    last_month_sales
    .groupby(["shop_id", "item_id"], as_index=False)
    .item_cnt_day.sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)
make_submission(previous_month, "previous_month_value", False)

* sample value: 1.23646
* zero value: 1.25011
* previous month value: 1.16777

In [None]:
def downcast_dtypes(df):
    """Shrink 64-bit numeric columns of `df` to 32 bits.

    `float64` columns become `float32` and `int64` columns become `int32`;
    all other columns are left alone. The frame is modified in place and
    the same object is returned.
    """
    wide_floats = []
    wide_ints = []

    # Sort the columns into the two groups that need converting
    for column in df:
        column_dtype = df[column].dtype
        if column_dtype == "float64":
            wide_floats.append(column)
        if column_dtype == "int64":
            wide_ints.append(column)

    # Convert each group in one assignment
    df[wide_floats] = df[wide_floats].astype(np.float32)
    df[wide_ints] = df[wide_ints].astype(np.int32)

    return df

# Get a feature matrix

In [5]:
# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

from itertools import product
# For every month we create a grid from all shops/items combinations from that month
grid = []
for block_num in sales['date_block_num'].unique():
    # Select the month's rows once and reuse them for both lookups
    cur_sales = sales.loc[sales['date_block_num'] == block_num]
    cur_shops = cur_sales['shop_id'].unique()
    cur_items = cur_sales['item_id'].unique()
    grid.append(np.array(list(product(cur_shops, cur_items, [block_num])), dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns=index_cols, dtype=np.int32)

# Monthly sums of `item_cnt_day` at three levels, each joined onto the grid:
# (group keys, name of the resulting column)
aggregates = [
    (index_cols, 'target'),                           # shop-item-month
    (['shop_id', 'date_block_num'], 'target_shop'),   # shop-month
    (['item_id', 'date_block_num'], 'target_item'),   # item-month
]

all_data = grid
for group_cols, target_name in aggregates:
    # Plain sum + rename instead of the nested-dict `agg` renamer, which is
    # deprecated and rejected by pandas >= 1.0
    gb = (
        sales.groupby(group_cols, as_index=False)['item_cnt_day'].sum()
        .rename(columns={'item_cnt_day': target_name})
    )
    # Grid rows with no matching sales get 0
    all_data = pd.merge(all_data, gb, how='left', on=group_cols).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)

del grid, gb

import gc
gc.collect();

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


After creating a grid, we can calculate some features. We will use lags from [1, 2, 3, 4, 5, 12] months ago.

In [6]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols))

shift_range = [1, 2, 3, 4, 5, 12]

import gc
from tqdm import tqdm_notebook

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()

    # Move every row `month_shift` months forward so that, after the merge,
    # month t carries the values observed in month t - month_shift
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    train_shift = train_shift.rename(columns=lag_names)

    # Rows without a value `month_shift` months earlier get 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Drop the earliest months: the longest lag (12 months here) cannot be
# computed for them, so their lag columns are only the zero fill from above
all_data = all_data[all_data['date_block_num'] >= max(shift_range)]

# List of all lagged features, matched on the full "_lag_<n>" suffix.
# Comparing only the last character would miss lags such as 10 or 20.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage
to_drop_cols = [
    col for col in all_data.columns
    if col not in fit_cols and col not in index_cols
] + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id', 'item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();




At this point we have created a feature matrix, stored in the `all_data` variable. Take a look:

In [7]:
# Show only the first rows; the full frame is far too large to display
all_data.head(10)

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,...,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,54,10297,12,4.0,8198.0,23.0,3.0,42.0,10055.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,54,10296,12,3.0,8198.0,17.0,0.0,24.0,10055.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38
2,54,10298,12,14.0,8198.0,182.0,21.0,369.0,10055.0,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
3,54,10300,12,3.0,8198.0,26.0,1.0,54.0,10055.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
4,54,10284,12,1.0,8198.0,3.0,0.0,4.0,10055.0,0.0,...,0.0,3.0,7827.0,0.0,10.0,7792.0,0.0,0.0,0.0,57
5,54,10292,12,9.0,8198.0,93.0,8.0,156.0,10055.0,16.0,...,38.0,445.0,7827.0,11.0,140.0,7792.0,0.0,0.0,0.0,40
6,54,10109,12,2.0,8198.0,17.0,1.0,19.0,10055.0,0.0,...,0.0,18.0,7827.0,1.0,28.0,7792.0,0.0,0.0,0.0,40
7,54,10107,12,1.0,8198.0,26.0,2.0,23.0,10055.0,6.0,...,6.0,67.0,7827.0,2.0,75.0,7792.0,3.0,32.0,9386.0,37
8,54,10121,12,1.0,8198.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,9386.0,37
9,54,10143,12,1.0,8198.0,12.0,1.0,18.0,10055.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55


# Train/test split

In [8]:
# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = all_data['date_block_num']

last_block = dates.max()
print('Test `date_block_num` is %d' % last_block)

Test `date_block_num` is 33


In [9]:
# Row masks: everything before the last month trains, the last month validates
is_train = dates < last_block
is_valid = dates == last_block

dates_train = dates[is_train]
dates_valid = dates[is_valid]

# Features: remove the columns collected in `to_drop_cols`
X_train = all_data.loc[is_train].drop(to_drop_cols, axis=1)
X_valid = all_data.loc[is_valid].drop(to_drop_cols, axis=1)

# Labels as plain numpy arrays
y_train = all_data.loc[is_train, 'target'].values
y_valid = all_data.loc[is_valid, 'target'].values

# First level models 

In [11]:
# Fit a plain linear regression and predict the held-out month
lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_valid.values)

# RMSE on the raw values
mse_raw = mean_squared_error(y_valid, pred_lr)
rmse = np.sqrt(mse_raw)
print('Test rmse for linreg is %f' % rmse)

# RMSE after clipping both labels and predictions to [0, 20]
y_valid_clipped = np.clip(y_valid, 0, 20)
pred_lr_clipped = np.clip(pred_lr, 0, 20)
rmse = np.sqrt(mean_squared_error(y_valid_clipped, pred_lr_clipped))
print('Clipped Test rmse for linreg is %f' % rmse)

Test rmse for linreg is 4.611500
Clipped Test rmse for linreg is 1.004226


In [12]:
# Fit a shallow random forest (fixed seed) and predict the held-out month
rf = RandomForestRegressor(max_depth=2, random_state=0)
rf.fit(X_train.values, y_train)
pred_rf = rf.predict(X_valid.values)

# RMSE on the raw values
mse_raw = mean_squared_error(y_valid, pred_rf)
rmse = np.sqrt(mse_raw)
print('Test rmse for RandomForest is %f' % rmse)

# RMSE after clipping both labels and predictions to [0, 20]
y_valid_clipped = np.clip(y_valid, 0, 20)
pred_rf_clipped = np.clip(pred_rf, 0, 20)
rmse = np.sqrt(mean_squared_error(y_valid_clipped, pred_rf_clipped))
print('Clipped Test rmse for RandomForest is %f' % rmse)

Test rmse for RandomForest is 4.860887
Clipped Test rmse for RandomForest is 1.108771


In [13]:
# LightGBM settings used for the first-level model
lgb_params = {
    'feature_fraction': 0.75,
    'metric': 'rmse',
    'nthread': 1,
    'min_data_in_leaf': 2**7,
    'bagging_fraction': 0.75,
    'learning_rate': 0.03,
    'objective': 'mse',
    'bagging_seed': 2**7,
    'num_leaves': 2**7,
    'bagging_freq': 1,
    'verbose': 0,
}

# Train for 100 boosting rounds and predict the held-out month
train_set = lgb.Dataset(X_train, label=y_train)
model = lgb.train(lgb_params, train_set, num_boost_round=100)
pred_lgb = model.predict(X_valid)

# RMSE on the raw values
mse_raw = mean_squared_error(y_valid, pred_lgb)
rmse = np.sqrt(mse_raw)

print('Test rmse for LightGBM is %f' % rmse)

# RMSE after clipping both labels and predictions to [0, 20]
y_valid_clipped = np.clip(y_valid, 0, 20)
pred_lgb_clipped = np.clip(pred_lgb, 0, 20)
rmse = np.sqrt(mean_squared_error(y_valid_clipped, pred_lgb_clipped))
print('Clipped Test rmse for LightGBM is %f' % rmse)

Test rmse for LightGBM is 4.573615
Clipped Test rmse for LightGBM is 1.078774
