# Import libraries

In [2]:
# you have to install ipython-autotime using 'pip install ipython-autotime'
%load_ext autotime

import IPython.display
import os
import time
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# in this project, the metric is rmse, not mse
from sklearn.metrics import mean_squared_error

# models
from sklearn.linear_model import LinearRegression
#SVR is too slow
#from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# Load datasets

In [3]:
# Read the raw competition tables from ./dataset, one DataFrame per file,
# in this order: sales, shops, items, item_cats, test.
sales, shops, items, item_cats, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        './dataset/sales_train.csv.gz',
        './dataset/shops.csv',
        './dataset/items.csv',
        './dataset/item_categories.csv',
        "./dataset/test.csv.gz",
    )
)

time: 2.59 s


# Analyze raw datasets

Let's start by analyzing basic information about the given datasets.

In [30]:
# Show floats as whole numbers with thousands separators. This is a global
# pandas option, so it also affects every DataFrame displayed afterwards.
pd.set_option('display.float_format', '{:,.0f}'.format)

# Summary statistics of the training sales, then of the test IDs.
for frame in (sales, test):
    display(frame.describe())

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849,2935849,2935849,2935849,2935849
mean,15,33,10197,891,1
std,9,16,6324,1730,3
min,0,0,0,-1,-22
25%,7,22,4476,249,1
50%,14,31,9343,399,1
75%,23,47,15684,999,1
max,33,59,22169,307980,2169


Unnamed: 0,ID,shop_id,item_id
count,214200,214200,214200
mean,107100,32,11019
std,61834,18,6253
min,0,2,30
25%,53550,16,5382
50%,107100,34,11203
75%,160649,47,16072
max,214199,59,22167


time: 859 ms


A simple calculation is useful here. The number of shop_id values is 60 and the number of item_id values is 22,170, so the total number of (shop_id, item_id) combinations is 1,330,200. However, the test set has only 214,200 IDs, which means this competition requires predictions for only 16.1% of all possible combinations.

We can use this fact in 3 ways.
1. Predict only the test IDs for the submission, while using the full data for both training and validation.
2. Predict only the test IDs for both validation and the submission, while using the full data for training.
3. Reduce the data before training to shorten training time.

I think we should take approach 2 or 3. With the first approach, the validation score may not match the test score, because validation would be computed on the full data while the test score only covers the test IDs. My strategy is to use approach 3 up to validation and approach 2 only for the submission. The full data contains other shops and other items, but it can still give some information about how prices are moving, especially if I use RNN algorithms.

Before trimming data, I want to make a useful helper

# Make utilities to submit

Utility functions keep the code simple, so it is worth defining them up front.

In [None]:
def make_submission_df(all_prediction):
    """Build the submission table for the IDs listed in the global `test` frame.

    `all_prediction` must have the columns "shop_id", "item_id" and
    "item_cnt_month". It is left-joined onto `test`, so every test ID is
    kept; pairs without a prediction get 0, and all values are clipped to
    the range [0, 20].

    Returns a DataFrame with the columns "ID" and "item_cnt_month".
    """
    merged = test.merge(all_prediction, on=["shop_id", "item_id"], how="left")
    submission = merged[["ID", "item_cnt_month"]]
    # Missing predictions become 0 before clipping to [0, 20].
    cleaned = submission["item_cnt_month"].fillna(0).clip(0, 20)
    return submission.assign(item_cnt_month=cleaned)

def make_submission_file(df, comment="", add_time_stamp=True):
    """Write `df` to a CSV file inside ./submission/.

    The file name starts with "submission"; "_<unix time>" is appended when
    `add_time_stamp` is true and "_<comment>" is appended when `comment` is
    non-empty. The frame is written comma-separated and without its index.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write.
    comment : str
        Optional suffix for the file name.
    add_time_stamp : bool
        Whether to add the current `time.time()` value (truncated to whole
        seconds by the "%d" format) to the file name.
    """
    name = "submission"

    if add_time_stamp:
        name = "%s_%d" % (name, time.time())

    if len(comment) > 0:
        name = "%s_%s" % (name, comment)

    # to_csv does not create missing directories, so make sure the target
    # folder exists; otherwise the first run on a fresh checkout fails.
    os.makedirs("./submission", exist_ok=True)
    df.to_csv("./submission/%s.csv" % name, sep=",", index=False)
    
def make_submission(all_prediction, comment="", add_time_stamp=True):
    """Turn predictions into a submission table and write it to disk.

    Passes `all_prediction` through `make_submission_df` and hands the
    result, together with `comment` and `add_time_stamp`, to
    `make_submission_file`.
    """
    submission_df = make_submission_df(all_prediction)
    make_submission_file(submission_df, comment, add_time_stamp)

# Make benchmarks

There should be benchmarks to measure the quality of my predictions, so I made some very simple ones. I think this should be done in the first phase.

In [None]:
# Benchmark 1: the sample submission exactly as provided.
sample = pd.read_csv('./dataset/sample_submission.csv.gz')
make_submission_file(sample, 'sample_value', False)

# Benchmark 2: predict zero for every test ID.
sample = sample.assign(item_cnt_month=0)
make_submission_file(sample, 'zero_value', False)

# Benchmark 3: reuse each (shop, item) pair's total sales from month block 33.
block_33_sales = sales[sales["date_block_num"] == 33]
previous_month = (
    block_33_sales
    .groupby(["shop_id", "item_id"], as_index=False)["item_cnt_day"]
    .sum()
    .rename(columns={"item_cnt_day": "item_cnt_month"})
)
make_submission(previous_month, "previous_month_value", False)

* sample value(all 0.5): 1.23646
* zero value: 1.25011
* previous month value: 1.16777

In [None]:
def downcast_dtypes(df):
    '''
        Narrow 64-bit numeric columns of the dataframe to 32 bits:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        The dataframe is modified in place and the same object is returned.
    '''

    # Handle floats first, then ints; columns of any other dtype are left alone.
    for wide_dtype, narrow_dtype in (("float64", np.float32), ("int64", np.int32)):
        wide_cols = [name for name in df if df[name].dtype == wide_dtype]
        df[wide_cols] = df[wide_cols].astype(narrow_dtype)

    return df

# Get base data form

The base table should contain 'shop_id', 'item_id' and 'date_block_num', because the competition asks for 'item_cnt_month' per 'ID', and each 'ID' is made of a 'shop_id' and an 'item_id'.

In [13]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Monthly sales per (shop, item, month). Only `item_cnt_day` is summed:
# summing every column would also try to aggregate the other columns of
# `sales`, which recent pandas versions no longer silently drop when they
# are non-numeric.
gb = (
    sales.groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'item_cnt_month'})
)

# Full grid of every shop x item x month seen in the sales data, built as a
# cross join on a constant dummy key.
df1 = pd.DataFrame({'shop_id': gb.shop_id.unique()}).assign(key=0)
df2 = pd.DataFrame({'item_id': gb.item_id.unique()}).assign(key=0)
df3 = pd.DataFrame({'date_block_num': gb.date_block_num.unique()}).assign(key=0)

df = df1.merge(df2, on='key').merge(df3, on='key')

df = df.drop('key', axis=1)
df = df.sort_values(by=index_cols)

# Attach the observed monthly counts; combinations with no sales get 0.
# The grid already contains every key present in `gb`, so a left join keeps
# the same rows as an outer join while preserving the sorted order.
df = df.merge(gb, on=index_cols, how='left').fillna(0)
del df1, df2, df3, gb

df.head()

Unnamed: 0,shop_id,item_id,date_block_num,item_cnt_month
0,0,0,0,0.0
1,0,0,1,0.0
2,0,0,2,0.0
3,0,0,3,0.0
4,0,0,4,0.0


time: 47.4 s


# Make reduced data to save time

In [None]:
# Keep only shops 1-4 so the following experiments run quickly.
shop_subset_mask = df["shop_id"].isin([1, 2, 3, 4])
reduced_df = df[shop_subset_mask]
reduced_df.head()

In [None]:
# List of columns that we will use to create lags
# (every column except the index columns).
cols_to_rename = list(reduced_df.columns.difference(index_cols))

# Lags of 1 to 12 months.
shift_range = list(range(1, 13))

lag_df = reduced_df.copy()

for month_shift in tqdm_notebook(shift_range):
    # Always shift the original monthly data, not the growing `lag_df`:
    # `lag_df` accumulates placeholder rows for future months, and shifting
    # those again only adds all-NaN rows further and further past the data.
    train_shift = reduced_df[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift

    # e.g. item_cnt_month -> item_cnt_month_lag_3
    train_shift = train_shift.rename(
        columns={col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}
    )

    # Outer join on the index columns: this also creates rows for months
    # after the last observed one, which are needed to predict block 34.
    lag_df = lag_df.merge(train_shift, on=index_cols, how='outer')

In [None]:
# Don't use old data from year 2013 (because we use 12 months lag data in the target)
# to make submission 33 -> 34
lag_df = lag_df[lag_df.date_block_num.between(12, 33)]

# List of all lagged features.
# Match the whole suffix after "_lag_": looking only at the last character
# misses "..._lag_10", whose last character "0" is not one of "1".."12".
lag_suffixes = {str(shift) for shift in shift_range}
fit_cols = [
    col for col in lag_df.columns
    if '_lag_' in col and col.rsplit('_lag_', 1)[1] in lag_suffixes
]
# We will drop these at fitting stage
to_drop_cols = list(set(lag_df.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

lag_df = downcast_dtypes(lag_df)

# Train/test split

In [None]:
# to make submission file, change 33 to 34
last_block = 33

# Rows created by the lag merge have gaps; treat every missing value as 0.
lag_df = lag_df.fillna(0)

# Keep `date_block_num` aside: it is not used as a feature, but it is needed
# to split the dataset into train and validation parts.
dates = lag_df['date_block_num']

print('Test `date_block_num` is %d' % last_block)

In [None]:
# Everything before `last_block` is training data; `last_block` itself is
# the validation month.
train_mask = dates < last_block
valid_mask = dates == last_block

dates_train = dates[train_mask]
dates_valid = dates[valid_mask]

# Features: all columns except the ones listed in `to_drop_cols`.
X_train = lag_df.loc[train_mask].drop(to_drop_cols, axis=1)
X_valid = lag_df.loc[valid_mask].drop(to_drop_cols, axis=1)

# Targets as plain numpy arrays.
y_train = lag_df.loc[train_mask, 'item_cnt_month'].values
y_valid = lag_df.loc[valid_mask, 'item_cnt_month'].values

# Define this competition metric as a function

In [None]:
def rmse(pred, valid):
    """Root mean squared error after clipping both inputs to [0, 20].

    `pred` and `valid` are array-likes of predictions and true values; both
    are clipped to the range [0, 20] before the error is computed.
    """
    clipped_pred = np.clip(pred, 0, 20)
    clipped_valid = np.clip(valid, 0, 20)
    return np.sqrt(mean_squared_error(clipped_pred, clipped_valid))

# First level models 

In [None]:
# Baseline model: ordinary least squares on the selected features.
lr = LinearRegression().fit(X_train.values, y_train)
pred_lr = lr.predict(X_valid.values)

lr_score = rmse(pred_lr, y_valid)
print('Clipped rmse for Linear Regression is %f' % lr_score)

In [None]:
from sklearn.linear_model import ElasticNet

# Linear model with default ElasticNet regularisation settings.
enet = ElasticNet().fit(X_train.values, y_train)
pred_enet = enet.predict(X_valid.values)

enet_score = rmse(pred_enet, y_valid)
print('Clipped rmse for ElasticNet is %f' % enet_score)

In [None]:
# k-nearest-neighbours regressor with default settings.
knr = KNeighborsRegressor().fit(X_train.values, y_train)
pred_knr = knr.predict(X_valid.values)

knr_score = rmse(pred_knr, y_valid)
print('Clipped rmse for KNR is %f' % knr_score)

In [None]:
# SVR is imported here because the import in the first cell is commented
# out; without it this cell fails with a NameError. The note beside that
# disabled import says SVR is too slow, so expect a long run time.
from sklearn.svm import SVR

svr = SVR()
svr.fit(X_train.values, y_train)
pred_svr = svr.predict(X_valid.values)

print('Clipped rmse for SVR is %f' % rmse(pred_svr, y_valid))

In [None]:
# `RandomForestRegressor` is the class imported in the first cell; the name
# `RandomForest` is not defined anywhere. A fixed seed keeps the score
# reproducible between runs.
rf = RandomForestRegressor(max_depth=5, random_state=0)
rf.fit(X_train.values, y_train)
pred_rf = rf.predict(X_valid.values)

print('Clipped rmse for Random Forest is %f' % rmse(pred_rf, y_valid))

In [None]:
# Pair the linear-regression predictions with the (shop, item) keys of the
# `last_block` rows. Building a new frame with .loc/.assign avoids writing
# into a slice of `lag_df` (chained assignment).
dd = (
    lag_df.loc[dates == last_block, ['shop_id', 'item_id']]
    .assign(item_cnt_month=pred_lr)
)
make_submission(dd, 'linereg_with_12month_lag')

# Submit to kaggle

This cell automatically submits the submission file to Kaggle. However, it should be executed carefully because the number of submissions is limited.
- remove '#' before submitting
- add a meaningful message to a submission

In [None]:
# Upload command, kept commented out on purpose: remove the leading '#' and
# update the file name (-f) and message (-m) before running it.
#!kaggle competitions submit -c competitive-data-science-final-project -f ./submission/submission_1529781197_linereg_with_12month_lag.csv -m "Linear Regression with 1 ~ 12 months lagged item sold record"
# Show the competition's submissions via the kaggle command-line tool.
!kaggle competitions submissions -c competitive-data-science-final-project

# Check public score

In [None]:
# Show the competition's submissions via the kaggle command-line tool
# (same command as at the end of the submit cell).
!kaggle competitions submissions -c competitive-data-science-final-project