# Import libraries

In [2]:
# you have to install ipython-autotime using 'pip install ipython-autotime'
%load_ext autotime

import gc
import IPython.display
import os
import datetime
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# in this project, the metric is rmse, not mse
from sklearn.metrics import mean_squared_error

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

# SVR and KNeighborsRegressor is too slow to apply this data
# from sklearn.svm import SVR
# from sklearn.neighbors import KNeighborsRegressor

#TODO: sklearn AdaBoost, GradientBoosting 사용
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

seed = 180718

# lag 양을 여기서 결정
shift_range = [1,2,3,12]

# Load datasets

In [2]:
sales = pd.read_csv('./dataset/sales_train.csv.gz')
shops = pd.read_csv('./dataset/shops.csv')
items = pd.read_csv('./dataset/items.csv')
#item_cats = pd.read_csv('./dataset/item_categories.csv')
test = pd.read_csv("./dataset/test.csv.gz")

time: 2.76 s


# Feature engineering

First, define a useful function to save a memory.

In [4]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

time: 8.77 ms


### 1. Get base data form

The form should have 'shop_id', 'item_id', 'date_block_num' because the required form of this competition is 'ID' made of 'shop_id' and 'item_id', and 'item_cnt_month'.

In [5]:
index_cols = ['shop_id', 'item_id', 'date_block_num']
gb = sales.groupby(index_cols, as_index=False).sum().rename(columns={'item_cnt_day':'month_sale'})

del sales

# 가격 정보를 사용할 것인지 아직 모름. 사용한다면 살려야 함.
gb = gb.drop('item_price', axis=1)

time: 1.59 s


In [6]:
df1 = pd.DataFrame({'shop_id': np.sort(shops.shop_id.unique()), 'key':np.zeros(len(shops.shop_id.unique()))})
df2 = pd.DataFrame({'item_id': np.sort(items.item_id.unique()), 'key':np.zeros(len(items.item_id.unique()))})
df3 = pd.DataFrame({'date_block_num': np.sort(gb.date_block_num.unique()), 'key':np.zeros(len(gb.date_block_num.unique()))})

df = df1.merge(df2).merge(df3)

del df1, df2, df3, shops

df = df.drop('key', axis=1)

time: 9.24 s


In [7]:
df = df.merge(gb, how='outer').fillna(0)

del gb

df.head()

df = downcast_dtypes(df)

gc.collect()

179

time: 19.5 s


# Clip (0, 20) before making something

In [8]:
df.month_sale = df.month_sale.values.clip(0,20)

time: 441 ms


# montly sale in the shop, item, category

In [9]:
shop_gb_sum = df.groupby(['shop_id', 'date_block_num'], as_index=False).sum()
shop_gb_sum = shop_gb_sum.rename(columns={'month_sale':'month_sale_shop_sum'})
shop_gb_sum = shop_gb_sum.drop(columns=['item_id'])

shop_gb_mean = df.groupby(['shop_id', 'date_block_num'], as_index=False).mean()
shop_gb_mean = shop_gb_mean.rename(columns={'month_sale':'month_sale_shop_mean'})
shop_gb_mean = shop_gb_mean.drop(columns=['item_id'])

item_gb_sum = df.groupby(['item_id', 'date_block_num'], as_index=False).sum()
item_gb_sum = item_gb_sum.rename(columns={'month_sale':'month_sale_item_sum'})
item_gb_sum = item_gb_sum.drop(columns=['shop_id'])

item_gb_mean = df.groupby(['item_id', 'date_block_num'], as_index=False).mean()
item_gb_mean = item_gb_mean.rename(columns={'month_sale':'month_sale_item_mean'})
item_gb_mean = item_gb_mean.drop(columns=['shop_id'])

category = items.drop(columns=['item_name'], axis=1)
category = category.rename(columns={'item_category_id':'category_id'})

item_sum_with_cat_gb = item_gb_sum.merge(category, how='left')

category_gb_sum = item_sum_with_cat_gb.groupby(['category_id', 'date_block_num'], as_index=False).sum()
category_gb_sum = category_gb_sum.rename(columns={'month_sale_item_sum':'month_sale_category_sum'})
category_gb_sum = category_gb_sum.drop(columns=['item_id'], axis=1)

category_gb_mean = item_sum_with_cat_gb.groupby(['category_id', 'date_block_num'], as_index=False).mean()
category_gb_mean = category_gb_mean.rename(columns={'month_sale_item_sum':'month_sale_category_mean'})
category_gb_mean = category_gb_mean.drop(columns=['item_id'], axis=1)

time: 22.6 s


In [10]:
df = df.merge(shop_gb_sum, how='left').fillna(0)
df = df.merge(shop_gb_mean, how='left').fillna(0)

df = df.merge(item_gb_sum, how='left').fillna(0)
df = df.merge(item_gb_mean, how='left').fillna(0)

df = df.merge(category, how='left').fillna(0)
df = df.merge(category_gb_sum, how='left').fillna(0)
df = df.merge(category_gb_mean, how='left').fillna(0)

time: 1min 46s


In [11]:
del shop_gb_sum, shop_gb_mean, item_gb_sum, item_gb_mean, category_gb_sum, category_gb_mean, category, item_sum_with_cat_gb, items
gc.collect()

458

time: 384 ms


# montly sale in the item

### 2. Make lag features

In [12]:
#12개월 전체를 할 수도 있고 일부를 할 수도 있다. 난 여기서 일부만 사용
#ensembling에선 1~5,12를 사용

index_cols = ['shop_id', 'item_id', 'date_block_num', 'category_id']

cols_to_rename = list(df.columns.difference(index_cols)) 
df = downcast_dtypes(df)
lag_df = df.copy()

del df
gc.collect()

32

time: 3.44 s


In [13]:
for month_shift in tqdm_notebook(shift_range):
    train_shift = lag_df[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    lag_df = lag_df.merge(train_shift, how='outer')
    
    del train_shift, foo
    gc.collect()


time: 13min 38s


In [14]:
# 이유는 모르겠는데 끝나고 나면 lag_df에서 index column들이 64비트로 변경됨
lag_df[index_cols] = downcast_dtypes(lag_df[index_cols])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


time: 2min 7s


### 3. Abort unnecessary data in lag_df

In [15]:
lag_df = lag_df[12 <= lag_df.date_block_num]
lag_df = lag_df[lag_df.date_block_num <= 34]

time: 1min 30s


### 4. Save and Load lag_df

In [16]:
#압축하면서 저장하는게 너무 오래걸린다. 나중에 시간 있을 때 한번 돌려보자.
#lag_df.to_csv("lag_df.csv.gz", index=False, compression='gzip')

# lag_df = pd.read_csv("lag_df.csv.gz")
# lag_df = downcast_dtypes(lag_df)

time: 9.1 ms


# Split train / valid dataset

In [17]:
dates = lag_df['date_block_num']
fit_cols = [col for col in lag_df.columns if col[-1] in [str(item) for item in shift_range]] 
to_drop_cols = list(set(list(lag_df.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

time: 32.9 ms


In [18]:
valid_block = 33

dates_train = dates[dates <  valid_block]
dates_valid  = dates[dates == valid_block]

X_train = lag_df.loc[dates <  valid_block].drop(to_drop_cols, axis=1).to_sparse()
y_train = lag_df.loc[dates <  valid_block, 'month_sale'].values.clip(0,20)

valid = lag_df.loc[dates == valid_block]
valid = test.merge(valid, how='left').fillna(0).drop('ID', axis=1).to_sparse()
X_valid =  valid.drop(to_drop_cols, axis=1).to_sparse()
y_valid =  valid['month_sale'].values.clip(0,20)

del valid, lag_df

gc.collect()

120

time: 1min 8s


# Make full train dataset

In [27]:
# test_block = 34

# dates_full_train = dates[dates < test_block]
# dates_test = dates[dates == test_block]

# X_full_train = lag_df.loc[dates <  test_block].drop(to_drop_cols, axis=1)
# y_full_train = lag_df.loc[dates <  test_block, 'month_sale'].values.clip(0,20)
# X_test = lag_df.loc[dates == test_block].drop(to_drop_cols, axis=1)

time: 2.98 ms


# Define this competition metric as a function

In [19]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.clip(y_true, 0, 20), np.clip(y_pred, 0, 20)))

def get_valid_rmse(reg):
    reg.fit(X_train.values, y_train)
    pred = reg.predict(X_valid.values)
    return rmse(pred, y_valid)

def get_prediction(reg):
    reg.fit(X_full_train.values, y_full_train)
    pred = reg.predict(X_test.values)
    return pred

def make_submission_from_prediction(pred, filename):
    sub = lag_df[dates == test_block]
    sub = sub.rename(columns={'month_sale':'item_cnt_month'})
    sub = sub[['shop_id', 'item_id', 'month_sale']]
    make_submission(sub, filename)

time: 35.8 ms


# First level models 

In [None]:
lr = LinearRegression()
get_valid_rmse(lr)

0.89983426883881978

time: 7min 56s


In [None]:
enet = ElasticNet()
get_valid_rmse(enet)

In [None]:
lgb = LGBMRegressor(random_state=seed)
get_valid_rmse(lgb)

In [53]:
rf = RandomForestRegressor(n_estimators=20, max_depth=13, random_state=seed)
get_valid_rmse(rf)

0.8821269853328018

time: 43min 3s


In [54]:
for i in range(len(X_train.columns.values)):
    print(str(X_train.columns.values[i]) + ": " + str(rf.feature_importances_[i]))

shop_id: 0.005938857500999187
item_id: 0.021326064336808163
category_id: 0.007972726863236742
month_sale_lag_1: 0.6465714500061555
month_sale_category_mean_lag_1: 0.008624141447710783
month_sale_category_sum_lag_1: 0.007536008250074994
month_sale_item_mean_lag_1: 0.039725975588308204
month_sale_item_sum_lag_1: 0.03251625320541881
month_sale_shop_mean_lag_1: 0.006472418298511968
month_sale_shop_sum_lag_1: 0.0070434549731058504
month_sale_lag_2: 0.03364581297228696
month_sale_category_mean_lag_2: 0.008653950221609997
month_sale_category_sum_lag_2: 0.008892765429332431
month_sale_item_mean_lag_2: 0.008119195875594955
month_sale_item_sum_lag_2: 0.007996687915974625
month_sale_shop_mean_lag_2: 0.006869937708972531
month_sale_shop_sum_lag_2: 0.0058250433257976495
month_sale_lag_3: 0.04117458427741591
month_sale_category_mean_lag_3: 0.00802149236251935
month_sale_category_sum_lag_3: 0.010712345698485195
month_sale_item_mean_lag_3: 0.004117669543735631
month_sale_item_sum_lag_3: 0.003854843900

In [55]:
xgb = XGBRegressor(max_depth=10, learning_rate=0.03, random_state=seed)
get_valid_rmse(xgb)

0.87428755

time: 37min 59s


In [None]:
make_submission_from_prediction(get_prediction(rf), 'rf_n20_m13')

In [None]:
make_submission_from_prediction(get_prediction(xgb), 'xgb_m10_l3')

In [None]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**7,
#                'bagging_freq':1,
#                'verbose':0 
#               }

# lgb = lightgbm.train(lgb_params, lightgbm.Dataset(X_train.values, label=y_train), 100)
# pred_lgb = lgb.predict(X_valid)
# rmse(pred_lgb, y_valid)

# Make utilities to submit

Utility function makes codes simple, so it's good to make these functions

In [3]:
def make_submission_df(all_prediction):
    df = test.merge(all_prediction, on=["shop_id", "item_id"], how="left")[["ID", "item_cnt_month"]]
    df["item_cnt_month"] = df["item_cnt_month"].fillna(0).clip(0, 20)    
    return df

def make_submission_file(df, filename):
    df.to_csv("./submission/%s.csv" % filename, index=False)
    
def make_submission(all_prediction, filename="no_name"):
    make_submission_file(make_submission_df(all_prediction), filename)

time: 8.97 ms


# Submit to kaggle

This cell automatically submits the submission file to kaggle. However, it should be carefully executed because the submitting opportunities are limited.
- remove '#' before submitting
- add a meaningful message to a submission
- my rule is using a filename for models and a message for data
    - ex) filename: rf_n20m13.csv, message: ot-sic-sm-20

In [None]:
!kaggle competitions submit -c competitive-data-science-final-project -f ./submission/rf_n20_m13.csv -m "ot-sic-sm-20"

# Check public score

In [None]:
!kaggle competitions submissions -c competitive-data-science-final-project