# Import libraries

In [2]:
# you have to install ipython-autotime using 'pip install ipython-autotime'
%load_ext autotime

import gc
import IPython.display
import os
import datetime
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# in this project, the metric is rmse, not mse
from sklearn.metrics import mean_squared_error

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

# SVR and KNeighborsRegressor are too slow to apply to this data
# from sklearn.svm import SVR
# from sklearn.neighbors import KNeighborsRegressor

# TODO: try sklearn AdaBoost and GradientBoosting
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Seed passed as random_state to the stochastic models below
seed = 180718

# Month offsets used for the lag features; the amount of lag is decided here
shift_range = [1,2,3,12]

# Load datasets

In [3]:
# Competition inputs, read from ./dataset relative to the working directory
sales = pd.read_csv('./dataset/sales_train.csv.gz')
shops = pd.read_csv('./dataset/shops.csv')
items = pd.read_csv('./dataset/items.csv')
# item_categories.csv is not loaded; category ids are taken from items.csv later on
#item_cats = pd.read_csv('./dataset/item_categories.csv')
test = pd.read_csv("./dataset/test.csv.gz")

time: 1.92 s


# Make utilities to submit

Utility functions keep the code simple, so it is worth defining them up front.

In [4]:
def make_submission_df(all_prediction):
    """Return the submission frame (`ID`, `item_cnt_month`) for the test set.

    `all_prediction` must carry `shop_id`, `item_id` and `item_cnt_month`.
    It is left-joined onto the global `test` frame, so every test ID is kept;
    pairs without a prediction get 0 and all values are clipped to [0, 20].
    """
    joined = test.merge(all_prediction, how="left", on=["shop_id", "item_id"])
    return joined[["ID", "item_cnt_month"]].assign(
        item_cnt_month=lambda frame: frame["item_cnt_month"].fillna(0).clip(0, 20)
    )

def make_submission_file(df, filename):
    """Write `df` to ./submission/<filename>.csv without the index column.

    The ./submission directory is created first when it is missing, so the
    first submission on a fresh checkout does not fail inside `to_csv`.
    """
    os.makedirs("./submission", exist_ok=True)
    df.to_csv("./submission/%s.csv" % filename, index=False)
    
def make_submission(all_prediction, filename="no_name"):
    """Build the submission frame from `all_prediction` and save it under `filename`."""
    submission = make_submission_df(all_prediction)
    make_submission_file(submission, filename)

time: 6.98 ms


# Make benchmarks

There should be benchmarks to measure the quality of my predictions, so I made some very simple ones. I think this should be done in the first phase.

In [None]:
# sample = pd.read_csv('./dataset/sample_submission.csv.gz')
# make_submission_file(sample, 'sample_value')

# sample['item_cnt_month'] = 0
# make_submission_file(sample, 'zero_value')

# previous_month = sales[sales["date_block_num"] == 33].groupby(["shop_id", "item_id"], as_index=False).item_cnt_day.sum().rename(columns={"item_cnt_day": "item_cnt_month"})
# make_submission(previous_month, "previous_month_value")

# del sample, previous_month

# Benchmark results

* sample value(all 0.5): 1.23646
* zero value: 1.25011
* previous month value: 1.16777

# Analyze raw datasets

Let's start by analyzing basic information about the given datasets.

In [None]:
# Display floats as integers (thousands separator, no decimals)
pd.options.display.float_format = '{:,.0f}'.format

sales.describe()

In [None]:
# Summary statistics of the test set, for comparison with the sales data
test.describe()

We need to do some simple calculations here. There are 60 shop_ids and 22,170 item_ids, so the total number of combinations is 1,330,200. However, there are only 214,200 IDs in the test set. This means the competition only asks for 16.1% of all shop_id and item_id combinations.

We can use this fact in 3 ways.
1. get a prediction of the test IDs in the submission using full data in the training and the validation.
2. get a prediction of the test IDs in the validation and the submission using full data in the training.
3. Reduce data before training to make training short.

I think we should take option 2 or 3. With the first option, the validation score may not match the test score. My strategy is to use option 3 up to the validation step and option 2 only for the submission. The full data contains other shops and other items, but it can still give some information about how prices move, especially if I use RNN algorithms.

# Plan EDA(Exploratory data analysis)

I think item_price and item_cnt_day have interesting quantiles and min-max values. First of all, item_cnt_day should never be zero, because the sales data is a record of something that actually occurred. However, the target is item_cnt_month, so it is better to analyze the monthly item_cnt data. For item_price, the maximum is far higher than the other values. I'm not sure, but it may be possible to use the extreme prices for prediction.

My plan is as follows.

1. reduce data using test id combinations
2. aggregate the total item_cnt_month of shops month by month
3. aggregate the total item_cnt_month of items month by month
4. aggregate the total item_cnt_month month by month

The purpose of them is to know if there are correlations between them and if there are patterns in time flow.

# Reduce data using test id combinations

In [None]:
# Keep only the sales rows whose (shop_id, item_id) pair appears in the test set
reduced_sales = sales.merge(test).drop('ID', axis=1)
reduced_sales.describe()

Before reducing data, the total number of rows is 2,935,849. Now, the amount of data is reduced to 41.7%.
It means that test data is not randomly picked in all combinations of shop_id and item_id.
One of the possible scenarios is that the host of this competition chose test targets in combinations that appeared in the sales data, not in all combinations.

# Analyze combinations of shop_id and item_id

In [None]:
# Distinct (shop_id, item_id) pairs in the complete sales data
full_comb = sales[['shop_id', 'item_id']].drop_duplicates()
display(full_comb.describe())
for id_col in ('shop_id', 'item_id'):
    display('unique value of ' + id_col + ': ' + str(len(full_comb[id_col].unique())))

In [None]:
# Distinct (shop_id, item_id) pairs in the sales data restricted to test pairs
reduced_comb = reduced_sales[['shop_id', 'item_id']].drop_duplicates()
display(reduced_comb.describe())
for id_col in ('shop_id', 'item_id'):
    display('unique value of ' + id_col + ': ' + str(len(reduced_comb[id_col].unique())))

In [None]:
# Same statistics for the test set itself
display(test.describe())
for id_col in ('shop_id', 'item_id'):
    display('unique value of ' + id_col + ': ' + str(len(test[id_col].unique())))

| data | full | reduced | test |
|------|------| ------- | ---- |
| shop_id | 60 | 42 | 42 |
| item_id | 21,807 | 4,716 | 5,100 |
| total | 424,124 | 111,404 | 214,200 |
| possible | 1,308,420 | 198,072 | 214,200 |
| ratio | 32.4% | 56.2% | 100% |

Our target is 214,200 combinations. However, the reduced data contains only 4,716 unique item_ids, which means that 384 of the test items were not sold at all in that period. At the combination level there are even more zero-sale cases: almost half of the test combinations. In the full data set, the zero-sale combination ratio is about 1/3. This is not such a big gap between them. I think the important point is the number of item_ids. The number of unique item_ids in the test set is almost 1/4 of that in the full data. If we use one-hot encoding for item_id, we need only 1/4 of the memory.

I focused on something else as well. The total number of combinations in the test set is 214,200, but only 111,404 of them appear in the reduced dataset. This means that only about half of the test combinations exist in the sales data. One more data-selection option is to keep only the data for those 111,404 combinations. I'm going to use the test combinations first, and then try the smaller and the bigger sets.

In [None]:
# Free the EDA-only frames and switch the float display to five decimals
del full_comb, reduced_comb, reduced_sales
pd.options.display.float_format = '{:,.5f}'.format

# Get applicable dataset to models

First, define a useful function to save memory.

In [5]:
def downcast_dtypes(df):
    '''
        Narrows 64-bit numeric columns of the dataframe, in place:

                `float64` columns become `float32`
                `int64`   columns become `int32`

        Columns of any other dtype are left alone. The same dataframe
        object that was passed in is returned.
    '''
    narrower = {"float64": np.float32, "int64": np.int32}

    for name in list(df.columns):
        target = narrower.get(str(df[name].dtype))
        if target is not None:
            df[name] = df[name].astype(target)

    return df

time: 8.98 ms


### 1. Get base data form

The form should have 'shop_id', 'item_id', 'date_block_num' because the required form of this competition is 'ID' made of 'shop_id' and 'item_id', and 'item_cnt_month'.

In [6]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# Sum only the daily counts per (shop, item, month). Summing the whole frame
# would also add up every other column (e.g. `item_price`, which then had to be
# dropped) and relies on pandas discarding non-numeric columns.
gb = (
    sales.groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'month_sale'})
)

del sales

# Price information is not carried into `gb`. It is not decided yet whether it
# will be used; if it is, `item_price` has to be aggregated here as well.

time: 1.27 s


In [7]:
# Build the full (shop, item, month) grid as a cross join on a constant key
shop_ids = np.sort(shops.shop_id.unique())
item_ids = np.sort(items.item_id.unique())
month_ids = np.sort(gb.date_block_num.unique())

df1 = pd.DataFrame({'shop_id': shop_ids, 'key': np.zeros(len(shop_ids))})
df2 = pd.DataFrame({'item_id': item_ids, 'key': np.zeros(len(item_ids))})
df3 = pd.DataFrame({'date_block_num': month_ids, 'key': np.zeros(len(month_ids))})

# `key` is the only shared column, so each merge pairs every row with every row
df = df1.merge(df2).merge(df3).drop('key', axis=1)

del df1, df2, df3, shops, shop_ids, item_ids, month_ids

time: 4.88 s


In [8]:
# Attach the observed monthly sales to the grid; cells without a value become 0
df = df.merge(gb, how='outer')
df = df.fillna(0)

del gb

df = downcast_dtypes(df)

gc.collect()

179

time: 11.4 s


# Clip (0, 20) before making something

In [9]:
# Clip the monthly target into [0, 20] before deriving any feature from it
df['month_sale'] = np.clip(df['month_sale'].values, 0, 20)
df.head()

Unnamed: 0,shop_id,item_id,date_block_num,month_sale
0,0,0,0,0.0
1,0,0,1,0.0
2,0,0,2,0.0
3,0,0,3,0.0
4,0,0,4,0.0


time: 250 ms


# monthly sale in the shop, item, category

In [10]:
def _monthly_agg(frame, keys, value_col, how, out_col):
    """Aggregate `value_col` of `frame` per `keys` with `how` ('sum' or 'mean'), naming the result `out_col`."""
    # Only the value column is aggregated, so id columns are not summed or
    # averaged and nothing has to be dropped afterwards.
    grouped = frame.groupby(keys, as_index=False)[[value_col]].agg(how)
    return grouped.rename(columns={value_col: out_col})


shop_keys = ['shop_id', 'date_block_num']
item_keys = ['item_id', 'date_block_num']
category_keys = ['category_id', 'date_block_num']

# Per-shop and per-item monthly totals / means of the clipped target
shop_gb_sum = _monthly_agg(df, shop_keys, 'month_sale', 'sum', 'month_sale_shop_sum')
shop_gb_mean = _monthly_agg(df, shop_keys, 'month_sale', 'mean', 'month_sale_shop_mean')

item_gb_sum = _monthly_agg(df, item_keys, 'month_sale', 'sum', 'month_sale_item_sum')
item_gb_mean = _monthly_agg(df, item_keys, 'month_sale', 'mean', 'month_sale_item_mean')

# item_id -> category_id lookup taken from items.csv
category = items.drop(columns=['item_name']).rename(columns={'item_category_id': 'category_id'})

item_sum_with_cat_gb = item_gb_sum.merge(category, how='left')

# Category features are the sum / mean of the per-item monthly totals
category_gb_sum = _monthly_agg(
    item_sum_with_cat_gb, category_keys, 'month_sale_item_sum', 'sum', 'month_sale_category_sum'
)
category_gb_mean = _monthly_agg(
    item_sum_with_cat_gb, category_keys, 'month_sale_item_sum', 'mean', 'month_sale_category_mean'
)

time: 20.5 s


In [11]:
# Left-join every aggregate onto the grid, in order; each merge uses the
# columns the two frames share as keys, and missing values are filled with 0.
for extra in (shop_gb_sum, shop_gb_mean,
              item_gb_sum, item_gb_mean,
              category, category_gb_sum, category_gb_mean):
    df = df.merge(extra, how='left').fillna(0)

# Drop the loop reference so it does not keep the last aggregate alive
del extra

time: 1min 2s


In [12]:
# The aggregates are merged into `df` now; release them and the raw item table
del shop_gb_sum, shop_gb_mean, item_gb_sum, item_gb_mean, category_gb_sum, category_gb_mean, category, item_sum_with_cat_gb, items
gc.collect()

135

time: 112 ms


# monthly sale in the item

### 2. Make lag features

In [13]:
# All twelve months could be lagged, or only some of them; only a subset is used here.
# (The ensembling experiments used lags 1-5 and 12.)

index_cols = ['shop_id', 'item_id', 'date_block_num', 'category_id']

# Every non-index column gets lagged copies
cols_to_rename = list(df.columns.difference(index_cols)) 

# Rebind instead of `df.copy()`: `df` is deleted right below, so a copy would
# only hold a second full-size frame in memory for a moment.
lag_df = downcast_dtypes(df)

del df
gc.collect()

time: 1.83 s


In [14]:
for month_shift in tqdm_notebook(shift_range):
    # Values observed in month m are re-labelled as month m + month_shift, so
    # after the merge they sit next to the row they are a lag of.
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    train_shift = lag_df[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] += month_shift
    train_shift = train_shift.rename(columns=lagged_names)

    # Outer join: shifted rows for months not yet in lag_df are kept as new rows
    lag_df = lag_df.merge(train_shift, how='outer')

    del train_shift, lagged_names
    gc.collect()


time: 5min 32s


In [15]:
# For a reason not yet understood, the index columns of lag_df are 64-bit again
# once the merges are done, so they are downcast once more. The explicit copy
# keeps downcast_dtypes from writing into a slice of lag_df, which is what
# raised the SettingWithCopyWarning.
lag_df[index_cols] = downcast_dtypes(lag_df[index_cols].copy())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


time: 1min 22s


### 3. Abort unnecessary data in lag_df

In [16]:
# Keep months 12..34 only (both ends inclusive)
in_window = lag_df.date_block_num.between(12, 34)
lag_df = lag_df[in_window]
del in_window

time: 53.4 s


### 4. Save and Load lag_df

In [17]:
# Saving with compression takes too long. Try running it later when there is time.
#lag_df.to_csv("lag_df.csv.gz", index=False, compression='gzip')

# lag_df = pd.read_csv("lag_df.csv.gz")
# lag_df = downcast_dtypes(lag_df)

time: 17 ms


# train / valid / test split

In [18]:
gc.collect()

dates = lag_df['date_block_num']

# Feature columns are exactly the lagged ones, i.e. names ending in "_lag_<shift>".
# Comparing only the last character cannot match a two-digit shift such as 12
# and would also pick up any unrelated column that ends in one of the digits.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in lag_df.columns if col.endswith(lag_suffixes)]

# Everything that is neither a lag feature nor an index column is dropped for
# fitting, and so is the month index itself.
keep_cols = set(fit_cols) | set(index_cols)
to_drop_cols = [col for col in lag_df.columns if col not in keep_cols] + ['date_block_num']

time: 834 ms


In [19]:
valid_block = 33

is_train = dates < valid_block
is_valid = dates == valid_block

dates_train = dates[is_train]
dates_valid = dates[is_valid]

# Training part: every month before the validation month
X_train = lag_df.loc[is_train].drop(to_drop_cols, axis=1)
y_train = np.clip(lag_df.loc[is_train, 'month_sale'].values, 0, 20)

# Validation part: only the (shop_id, item_id) pairs listed in the test set
valid = test.merge(lag_df.loc[is_valid], how='left').fillna(0).drop('ID', axis=1)
X_valid = valid.drop(to_drop_cols, axis=1)
y_valid = np.clip(valid['month_sale'].values, 0, 20)

del valid, is_train, is_valid

time: 9.91 s


In [25]:
test_block = 34

is_full_train = dates < test_block
is_test = dates == test_block

dates_full_train = dates[is_full_train]
dates_test = dates[is_test]

# Full training window: every month before the test month
X_full_train = lag_df.loc[is_full_train].drop(to_drop_cols, axis=1)
y_full_train = np.clip(lag_df.loc[is_full_train, 'month_sale'].values, 0, 20)
# Test features: every lag_df row of the test month
X_test = lag_df.loc[is_test].drop(to_drop_cols, axis=1)

del is_full_train, is_test

time: 40 s


# Define this competition metric as a function

In [20]:
def rmse(y_true, y_pred):
    """Root mean squared error after clipping both inputs to [0, 20]."""
    clipped_true = np.clip(y_true, 0, 20)
    clipped_pred = np.clip(y_pred, 0, 20)
    return np.sqrt(mean_squared_error(clipped_true, clipped_pred))

def get_valid_rmse(reg):
    """Fit `reg` on the training split and return its RMSE on the validation split.

    `reg` is fitted in place, so the caller can inspect the fitted estimator
    afterwards. The truth is passed first and the prediction second, matching
    the `rmse(y_true, y_pred)` signature.
    """
    reg.fit(X_train.values, y_train)
    pred = reg.predict(X_valid.values)
    return rmse(y_valid, pred)

def get_prediction(reg):
    """Fit `reg` on the full training window and return its predictions for the test month."""
    features, target = X_full_train.values, y_full_train
    reg.fit(features, target)
    return reg.predict(X_test.values)

def make_submission_from_prediction(pred, filename):
    """Write the test-month predictions `pred` to a submission file.

    `pred` must hold one value per `lag_df` row of the test month, in row
    order (which is what `get_prediction` returns, since `X_test` is built
    from the same rows). Clipping and the join onto the test IDs are done by
    `make_submission`.
    """
    sub = lag_df.loc[dates == test_block, ['shop_id', 'item_id']].copy()
    # Assign positionally so a Series index on `pred` cannot misalign the rows
    sub['item_cnt_month'] = np.asarray(pred)
    make_submission(sub, filename)

time: 19.9 ms


# First level models 

In [39]:
# Baseline: linear regression on the lag features, scored on the validation month
lr = LinearRegression(n_jobs=4)
get_valid_rmse(lr)

0.8998342661018064

time: 58.1 s


In [40]:
# ElasticNet's constructor takes no `n_jobs` argument (only the CV variant
# does), so the estimator is created with its defaults.
enet = ElasticNet()
get_valid_rmse(enet)

0.9970715380115442

time: 6min 50s


In [42]:
# LightGBM with library defaults; only the thread count and the seed are set
lgb = LGBMRegressor(n_jobs=4, random_state=seed)
get_valid_rmse(lgb)

0.8867281599288455

time: 1min 54s


In [21]:
# Random forest: 20 trees with depth capped at 13; get_valid_rmse fits `rf` in
# place, so its feature importances can be read afterwards
rf = RandomForestRegressor(n_estimators=20, max_depth=13, n_jobs=4, random_state=seed)
get_valid_rmse(rf)

0.8821269853328018

time: 40min 15s


In [22]:
# One "<feature>: <importance>" line per training column, in column order
for feature_name, importance in zip(X_train.columns.values, rf.feature_importances_):
    print(str(feature_name) + ": " + str(importance))

shop_id: 0.005938857500999187
item_id: 0.021326064336808163
category_id: 0.007972726863236742
month_sale_lag_1: 0.6465714500061555
month_sale_category_mean_lag_1: 0.008624141447710783
month_sale_category_sum_lag_1: 0.007536008250074994
month_sale_item_mean_lag_1: 0.039725975588308204
month_sale_item_sum_lag_1: 0.03251625320541881
month_sale_shop_mean_lag_1: 0.006472418298511968
month_sale_shop_sum_lag_1: 0.0070434549731058504
month_sale_lag_2: 0.03364581297228696
month_sale_category_mean_lag_2: 0.008653950221609997
month_sale_category_sum_lag_2: 0.008892765429332431
month_sale_item_mean_lag_2: 0.008119195875594955
month_sale_item_sum_lag_2: 0.007996687915974625
month_sale_shop_mean_lag_2: 0.006869937708972531
month_sale_shop_sum_lag_2: 0.0058250433257976495
month_sale_lag_3: 0.04117458427741591
month_sale_category_mean_lag_3: 0.00802149236251935
month_sale_category_sum_lag_3: 0.010712345698485195
month_sale_item_mean_lag_3: 0.004117669543735631
month_sale_item_sum_lag_3: 0.003854843900

In [23]:
# XGBoost: depth 10, learning rate 0.03; all other settings are left at their defaults
xgb = XGBRegressor(max_depth=10, learning_rate=0.03, n_jobs=4, random_state=seed)
get_valid_rmse(xgb)

0.87428755

time: 36min 21s


In [26]:
# Refit the random forest on the full training window and write
# ./submission/rf_n20_m13.csv (the recorded run of this cell ended in a MemoryError)
make_submission_from_prediction(get_prediction(rf), 'rf_n20_m13')

MemoryError: 

time: 10.2 s


In [None]:
# Refit XGBoost on the full training window and write ./submission/xgb_m10_l3.csv
make_submission_from_prediction(get_prediction(xgb), 'xgb_m10_l3')

In [None]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**7,
#                'bagging_freq':1,
#                'verbose':0 
#               }

# lgb = lightgbm.train(lgb_params, lightgbm.Dataset(X_train.values, label=y_train), 100)
# pred_lgb = lgb.predict(X_valid)
# rmse(pred_lgb, y_valid)

# Simple ensembling structure - data는.. cat전


- ensemble two model
    1. Linear Regression(0.90268626150226106), lightgbm(0.87875613368405925) =>
        1. last 3
            1. Linear Regression: 0.879674
            2. lightgbm: 0.947700
            3. Random Forest: 0.881135
        2. last 6
            1. Linear Regression: 0.877625
            2. lightgbm: 0.959168
            3. Random Forest: 0.878719
        3. last 6 + X_train
            1. Linear Regression: 0.876767 => 1.00935
            2. lightgbm: 0.882532
            3. **Random Forest: 0.877934 => 1.00924**
            4. ElasticNet(alpha=0.01): 0.888654
            5. xgb: ??? => 1.01291
- ensemble three model
    1. Linear Regression, lightgbm, KnnRegressor
        - winner in two models
- ensemble five models
    1. Linear Regression, ElasticNet, lightgbm, Random Forest, KnnRegressor
        - ?

In [None]:
# First-level predictions for the test month are the meta-features of the test
# set. `pred_lr` / `pred_lgb` were not defined by any earlier cell, so they are
# produced here from the first-level estimators refit on the full training window.
pred_lr = get_prediction(LinearRegression(n_jobs=4))
pred_lgb = get_prediction(LGBMRegressor(n_jobs=4, random_state=seed))

X_test_level2 = np.c_[pred_lr, pred_lgb]

In [None]:
# Variant for validation (kept for reference):
#level2_date_blocks = [27,28,29,30,31,32]
# dates_train_level2 = dates_train[dates_train.isin(level2_date_blocks)]
#y_train_level2 = y_train[dates_train.isin(level2_date_blocks)]


# for prediction
level2_date_blocks = [28,29,30,31,32,33]
in_level2 = dates_full_train.isin(level2_date_blocks)
dates_train_level2 = dates_full_train[in_level2]
y_train_level2 = y_full_train[in_level2]
del in_level2


In [None]:
# Second-level feature matrix: one column per first-level model, initialised with zeros
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# Fill `X_train_level2` with meta-features: for every level-2 month, fit the
# first-level models on all earlier months and predict that month.
# (For the validation variant, use X_train / y_train / dates_train instead of
# the *_full_train objects.)
for cur_block_num in tqdm_notebook(level2_date_blocks):

    gc.collect()

    past = (dates_full_train < cur_block_num).values
    current = (dates_full_train == cur_block_num).values
    target_rows = (dates_train_level2 == cur_block_num).values

    lr = LinearRegression()
    lr.fit(X_full_train.loc[past].values, y_full_train[past])
    pred_lr_level2 = lr.predict(X_full_train.loc[current].values)

    # The earlier `lightgbm.train(lgb_params, ..., 100)` call needed the
    # `lightgbm` module and an `lgb_params` dict, neither of which is defined in
    # this notebook; the already imported LGBMRegressor is configured like the
    # first-level model instead.
    lgb = LGBMRegressor(n_jobs=4, random_state=seed)
    lgb.fit(X_full_train.loc[past].values, y_full_train[past])
    pred_lgb_level2 = lgb.predict(X_full_train.loc[current].values)

    X_train_level2[target_rows, 0] = pred_lr_level2
    X_train_level2[target_rows, 1] = pred_lgb_level2

In [None]:
# Compare the two meta-features: column 0 (linear regression) against column 1 (LightGBM)
plt.scatter(X_train_level2[:,0], X_train_level2[:,1])

In [None]:
# Variant for validation (kept for reference):
# X_train_plus_level2 = np.c_[X_full_train[dates_train.isin(level2_date_blocks)].values, X_train_level2]

# Original lag features of the level-2 months with the two meta-features appended
X_train_plus_level2 = np.c_[X_full_train[dates_full_train.isin(level2_date_blocks)].values, X_train_level2]

In [None]:
# Sanity check: rows are the level-2 samples, columns are the lag features plus 2 meta-features
X_train_plus_level2.shape

In [None]:
# meta_model = lightgbm.train(lgb_params, lightgbm.Dataset(X_train_plus_level2, label=y_train_level2), 100)

In [None]:
# Meta-model option 1: a shallow random forest. The seed is fixed so that
# re-running the cell gives the same model, as for the first-level models.
meta_model = RandomForestRegressor(max_depth=5, random_state=seed)

meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:
# Meta-model option 2: linear regression (rebinds `meta_model`, replacing any earlier choice)
meta_model = LinearRegression()
meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:

# Meta-model option 3: XGBoost (rebinds `meta_model`, replacing any earlier choice)
meta_model = XGBRegressor(max_depth=5,learning_rate=0.03,n_jobs=-1,random_state=seed)
meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:
# Test-month lag features followed by the first-level test predictions,
# the same column layout as X_train_plus_level2
X_test_plus = np.c_[X_test.values, X_test_level2]

In [None]:
 # valid_preds = meta_model.predict(valid_plus)
# rmse_valid_stacking = rmse(y_valid, valid_preds)

# Score the meta-model on its own training data; `rmse_train_stacking` was
# printed below without ever being computed.
train_preds = meta_model.predict(X_train_plus_level2)
rmse_train_stacking = rmse(y_train_level2, train_preds)

print('Train rmse for stacking is %f' % rmse_train_stacking)
# print('Test  rmse for stacking is %f' % rmse_valid_stacking)

In [None]:
# Stacked prediction for the test month
# NOTE(review): no visible cell writes `ensemble_pred` to a submission file
ensemble_pred = meta_model.predict(X_test_plus)

# Submit to kaggle

This cell automatically submits the submission file to kaggle. However, it should be carefully executed because the submitting opportunities are limited.
- remove '#' before submitting
- add a meaningful message to a submission
- my rule is using a filename for models and a message for data
    - ex) filename: rf_n20m13.csv, message: ot-sic-sm-20

In [None]:
# Submits the random-forest file through the kaggle CLI; this uses up one submission
!kaggle competitions submit -c competitive-data-science-final-project -f ./submission/rf_n20_m13.csv -m "ot-sic-sm-20"

In [None]:
!kaggle competitions submit -c competitive-data-science-final-project -f ./submission/xgb_m10_eta003.csv -m "ot-sic-sm-20"

# Check public score

In [None]:
# Lists the submissions made to the competition through the kaggle CLI
!kaggle competitions submissions -c competitive-data-science-final-project