# Import libraries

In [1]:
# you have to install ipython-autotime using 'pip install ipython-autotime'
%load_ext autotime

import gc
import IPython.display
import os
import datetime
from tqdm import tqdm_notebook

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# in this project, the metric is rmse, not mse
from sklearn.metrics import mean_squared_error

# models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet

#SVR and KNeighborsRegressor is too slow
#from sklearn.svm import SVR
#from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
#TODO: sklearn AdaBoost, GradientBoosting 사용
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

seed = 180718

# Load datasets

In [2]:
sales = pd.read_csv('./dataset/sales_train.csv.gz')
shops = pd.read_csv('./dataset/shops.csv')
items = pd.read_csv('./dataset/items.csv')
item_cats = pd.read_csv('./dataset/item_categories.csv')
test = pd.read_csv("./dataset/test.csv.gz")

time: 1.98 s


# Analyze raw datasets

Let's start to anylyze basic information about give datasets.

In [3]:
# make float data looks integer data
pd.options.display.float_format = '{:,.0f}'.format

sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,2935849,2935849,2935849,2935849,2935849
mean,15,33,10197,891,1
std,9,16,6324,1730,3
min,0,0,0,-1,-22
25%,7,22,4476,249,1
50%,14,31,9343,399,1
75%,23,47,15684,999,1
max,33,59,22169,307980,2169


time: 857 ms


In [4]:
test.describe()

Unnamed: 0,ID,shop_id,item_id
count,214200,214200,214200
mean,107100,32,11019
std,61834,18,6253
min,0,2,30
25%,53550,16,5382
50%,107100,34,11203
75%,160649,47,16072
max,214199,59,22167


time: 46.9 ms


We need to do simple calculations here. The number of shop_id is 60, and the number of item_id is 22,170. Therefore, the total number of combinations of them is 1,330,200. However, there are only 214,200 IDs in the test. It means that this competition only requires 16.1% of the all shop_id and item_id combinations.

We can use this fact in 3 ways.
1. get a prediction of the test IDs in the submission using full data in the training and the validation.
2. get a prediction of the test IDs in the validation and the submission using full data in the training.
3. Reduce data before training to make training short.

I think we should take 2 or 3. In the first way, the validation score can not be fitting to the test score. My strategy is using 3 till the validation and using 2 in the submission only. I think full data has other shops or other items, but it can give some information about how the price is going especially if I use RNN algorithms.

# Plan EDA(Exploratory data analysis)

I think item_price and item_cnt_day have interesting qualtiles and min-max values. First of all, item_cnt_day must not be zero value because sales data is record of something that occured in sales. However, the target is item_cnt_month, so it would be better to analyze monthly data of item_cnt. In the item, the max price is so much higher than others. I'm not sure but, it's possible to use the extream price for prediction.

My plans is as below.

1. reduce data using test id combinations
2. aggregate the total item_cnt_month of shops month by month
3. aggregate the total item_cnt month of items month by month
4. aggregate the total item_cnt_month month by month

The purpose of them is to know if there are correlations between them and if there are patterns in time flow.

# Make utilities to submit

Utility function makes codes simple, so it's good to make these functions

In [5]:
def make_submission_df(all_prediction):
    df = test.merge(all_prediction, on=["shop_id", "item_id"], how="left")[["ID", "item_cnt_month"]]
    df["item_cnt_month"] = df["item_cnt_month"].fillna(0).clip(0, 20)
    
    return df

def make_submission_file(df, name=""):
    df.to_csv("./submission/%s.csv" % name, sep=",", index=False)
    
def make_submission(all_prediction, name=""):
    make_submission_file(make_submission_df(all_prediction), name)

time: 7.01 ms


# Make benchmarks

There should be benchmarks to measure my prediction's quality, so I made very simple ones. I think it should be done in first phase.

In [6]:
# sample = pd.read_csv('./dataset/sample_submission.csv.gz')
# make_submission_file(sample, 'sample_value')

# sample['item_cnt_month'] = 0
# make_submission_file(sample, 'zero_value')

# previous_month = sales[sales["date_block_num"] == 33].groupby(["shop_id", "item_id"], as_index=False).item_cnt_day.sum().rename(columns={"item_cnt_day": "item_cnt_month"})
# make_submission(previous_month, "previous_month_value")

# del sample, previous_month

time: 4.02 ms


# Benchmark results

* sample value(all 0.5): 1.23646
* zero value: 1.25011
* previous month value: 1.16777

# Reduce data using test id combinations

In [7]:
reduced_sales = sales.merge(test)
reduced_sales = reduced_sales.drop('ID', axis=1)
reduced_sales.describe()

Unnamed: 0,date_block_num,shop_id,item_id,item_price,item_cnt_day
count,1224439,1224439,1224439,1224439,1224439
mean,19,32,9615,1031,1
std,9,16,6300,1827,3
min,0,2,30,0,-16
25%,12,19,4181,299,1
50%,21,31,7856,549,1
75%,27,46,15229,1199,1
max,33,59,22167,59200,2169


time: 846 ms


Before reducing data, the total number of rows is 2,935,849. Now, the amount of data is reduced to 41.7%.
It means that test data is not randomly picked in all combinations of shop_id and item_id.
One of the possible scenarios is that the host of this competition chose test targets in combinations that appeared in the sales data, not in all combinations.

# Analyze combinations of shop_id and item_id

In [8]:
full_comb = sales[['shop_id', 'item_id']]
full_comb = full_comb.drop_duplicates()
display(full_comb.describe())
display('unique value of shop_id: ' + str(len(full_comb.shop_id.unique())))
display('unique value of item_id: ' + str(len(full_comb.item_id.unique())))

Unnamed: 0,shop_id,item_id
count,424124,424124
mean,31,11458
std,17,6133
min,0,0
25%,18,6244
50%,30,11614
75%,46,16662
max,59,22169


'unique value of shop_id: 60'

'unique value of item_id: 21807'

time: 276 ms


In [9]:
reduced_comb = reduced_sales[['shop_id', 'item_id']]
reduced_comb = reduced_comb.drop_duplicates()
display(reduced_comb.describe())
display('unique value of shop_id: ' + str(len(reduced_comb.shop_id.unique())))
display('unique value of item_id: ' + str(len(reduced_comb.item_id.unique())))

Unnamed: 0,shop_id,item_id
count,111404,111404
mean,31,10884
std,17,6154
min,2,30
25%,16,5241
50%,31,10889
75%,47,16028
max,59,22167


'unique value of shop_id: 42'

'unique value of item_id: 4716'

time: 104 ms


In [10]:
display(test.describe())
display('unique value of shop_id: ' + str(len(test.shop_id.unique())))
display('unique value of item_id: ' + str(len(test.item_id.unique())))

Unnamed: 0,ID,shop_id,item_id
count,214200,214200,214200
mean,107100,32,11019
std,61834,18,6253
min,0,2,30
25%,53550,16,5382
50%,107100,34,11203
75%,160649,47,16072
max,214199,59,22167


'unique value of shop_id: 42'

'unique value of item_id: 5100'

time: 50.9 ms


| data | full | reduced | test |
|------|------| ------- | ---- |
| shop_id | 60 | 42 | 42 |
| item_id | 21,807 | 4,716 | 5100 |
| total | 424,124 | 111,404 | 214,200 |
| possible | 1,308,420 | 198,072 | 214,200 |
| ratio | 32.4% | 56.2% | 100% |

Our target is 214,200 combination. However, in the reduced data, there is only 4,716 unique item_ids. It means that 385 item was not sold in that period. In the combination, there is more zero sold combinations. It's almost half of the test combinations. In the full data set, zero sold combination ratios is abount 1/3. This is not so big gap between them. I think the important is the number of item_id. The test unique item_id is almost 1/4 of full data item_id. If we use one hot encoding for item_id, we can use only 1/4 of memory.

I focused on something else. The total number of the combinations in the test is 214,200, but 111,404 in the reduced dataset. It means that only about half of combinations exists in sales data. One more data selection options is selecting only data in 111,404 combinations. I'm going to use the test dataset first, and then I'll use smaller and bigger one.

In [11]:
del full_comb, reduced_comb, reduced_sales

time: 997 µs


# Get applicable dataset to models

First, define a useful function to save a memory.

In [12]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df

time: 7.01 ms


### 1. Get base data form

The form should have 'shop_id', 'item_id', 'date_block_num' because the required form of this competition is 'ID' made of 'shop_id' and 'item_id', and 'item_cnt_month'.

In [13]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

time: 2.99 ms


In [14]:
gb = sales.groupby(index_cols, as_index=False).sum().rename(columns={'item_cnt_day':'month_sale'})
# 가격 정보를 사용할 것인지 아직 모름. 사용한다면 살려야 함.
gb = gb.drop('item_price', axis=1)

df1 = pd.DataFrame({'shop_id': np.sort(shops.shop_id.unique()), 'key':np.zeros(len(shops.shop_id.unique()))})
df2 = pd.DataFrame({'item_id': np.sort(items.item_id.unique()), 'key':np.zeros(len(items.item_id.unique()))})
df3 = pd.DataFrame({'date_block_num': np.sort(gb.date_block_num.unique()), 'key':np.zeros(len(gb.date_block_num.unique()))})

df = df1.merge(df2).merge(df3)

del df1, df2, df3

df = df.drop('key', axis=1)

time: 6.15 s


In [15]:
df = df.merge(gb, how='outer').fillna(0)

del gb

df.head()

df = downcast_dtypes(df)

gc.collect()

168

time: 12.8 s


In [16]:
pd.options.display.float_format = '{:,.3f}'.format

time: 997 µs


# Clip (0, 20) before making something

In [17]:
df.month_sale = df.month_sale.values.clip(0,20)
df.head()

Unnamed: 0,shop_id,item_id,date_block_num,month_sale
0,0,0,0,0.0
1,0,0,1,0.0
2,0,0,2,0.0
3,0,0,3,0.0
4,0,0,4,0.0


time: 270 ms


# montly sale in the shop, item, category

In [18]:
shop_gb = df.groupby(['shop_id', 'date_block_num'], as_index=False).mean()
shop_gb = shop_gb.rename(columns={'month_sale':'month_sale_shop_mean'})
shop_gb = shop_gb.drop(columns=['item_id'])

item_gb = df.groupby(['item_id', 'date_block_num'], as_index=False).mean()
item_gb = item_gb.rename(columns={'month_sale':'month_sale_item_mean'})
item_gb = item_gb.drop(columns=['shop_id'])
item_gb.head()

category = items.drop(columns=['item_name'], axis=1)
category = category.rename(columns={'item_category_id':'category_id'})

cat_gb = item_gb.merge(category, how='left')
cgb = cat_gb.groupby(['category_id', 'date_block_num'], as_index=False).mean()
cgb = cgb.rename(columns={'month_sale_item':'month_sale_category_mean'})
cgb = cgb.drop(columns=['item_id'], axis=1)

time: 10.8 s


In [19]:
df = df.merge(shop_gb, how='left').fillna(0)
df = df.merge(item_gb, how='left').fillna(0)
df = df.merge(category, how='left').fillna(0)
df = df.merge(cgb, how='left').fillna(0)

time: 45.5 s


In [20]:
shop_gb = df.groupby(['shop_id', 'date_block_num'], as_index=False).sum()
shop_gb = shop_gb.rename(columns={'month_sale':'month_sale_shop_sum'})
shop_gb = shop_gb.drop(columns=['item_id'])

item_gb = df.groupby(['item_id', 'date_block_num'], as_index=False).sum()
item_gb = item_gb.rename(columns={'month_sale':'month_sale_item_sum'})
item_gb = item_gb.drop(columns=['shop_id'])
item_gb.head()

category = items.drop(columns=['item_name'], axis=1)
category = category.rename(columns={'item_category_id':'category_id'})

cat_gb = item_gb.merge(category, how='left')
cgb = cat_gb.groupby(['category_id', 'date_block_num'], as_index=False).sum()
cgb = cgb.rename(columns={'month_sale_item':'month_sale_category_sum'})
cgb = cgb.drop(columns=['item_id'], axis=1)

time: 13.9 s


In [21]:
df = df.merge(shop_gb, how='left').fillna(0)
df = df.merge(item_gb, how='left').fillna(0)
df = df.merge(category, how='left').fillna(0)
df = df.merge(cgb, how='left').fillna(0)

time: 12min 37s


In [22]:
del shop_gb, item_gb, category, cat_gb, cgb

time: 5.98 ms


# montly sale in the item

### 2. Make lag features

In [23]:
#12개월 전체를 할 수도 있고 일부를 할 수도 있다. 난 여기서 일부만 사용
#ensembling에선 1~5,12를 사용

shift_range = [1,2,3,12]

time: 11 ms


In [24]:
index_cols = ['shop_id', 'item_id', 'date_block_num', 'category_id']

time: 4.99 ms


In [25]:
# List of columns that we will use to create lags

cols_to_rename = list(df.columns.difference(index_cols)) 

lag_df = df

for month_shift in tqdm_notebook(shift_range):
    train_shift = lag_df[index_cols + cols_to_rename].copy()
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    lag_df = lag_df.merge(train_shift, how='outer')
    del train_shift
    lag_df = downcast_dtypes(lag_df)
    gc.collect()


time: 4min 58s


- 위 작업이 시간이 오래걸리므로 csv나 npy파일로 만든 후에 저장했다가 부르는건 어떨까?
- validation용 traing set과 test용 traing set을 분리시킨 후에 파이프 라인을 만들어서 코드를 깔끔하게 만들자
- 작업이 끝나면 Knn feature와 mean encoding 방법을 적용할 방법을 생각해보자
- 마지막은 ensemble을 해보자

### 3. Trim lag_df

In [26]:
# Don't use old data from year 2013(because we use 12 months lag data in the target)
# to make submission 33 -> 34

valid_last = 33
test_last = 34

lag_df = lag_df[12 <= lag_df.date_block_num]
lag_df = lag_df[lag_df.date_block_num <= test_last]
lag_df = lag_df.fillna(0)
lag_df = downcast_dtypes(lag_df)

time: 32.5 s


### 4. Save lag_df

In [27]:
# lag_df.to_csv("full_lag_df.csv", sep=",", index=False)

time: 11 ms


### 5. load lag_df

In [28]:
# lag_df = pd.read_csv("full_lag_df.csv")
# lag_df = downcast_dtypes(lag_df)

time: 31.9 ms


In [29]:
#shift_range = [1,2,3,12]
# List of all lagged features
fit_cols = [col for col in lag_df.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(lag_df.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

time: 12 ms


In [30]:
lag_df.head()

Unnamed: 0,shop_id,item_id,date_block_num,month_sale,month_sale_shop_mean,month_sale_item_mean,category_id,month_sale_shop_sum,month_sale_item_sum,month_sale_lag_1,...,month_sale_lag_3,month_sale_item_mean_lag_3,month_sale_item_sum_lag_3,month_sale_shop_mean_lag_3,month_sale_shop_sum_lag_3,month_sale_lag_12,month_sale_item_mean_lag_12,month_sale_item_sum_lag_12,month_sale_shop_mean_lag_12,month_sale_shop_sum_lag_12
12,0,0,12,0.0,0.0,0.0,40,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.244,0.0
13,0,0,13,0.0,0.0,0.0,40,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.263,0.0
14,0,0,14,0.0,0.0,0.0,40,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0,0,15,0.0,0.0,0.0,40,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0,0,16,0.0,0.0,0.0,40,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


time: 69.8 ms


# train / valid / test split

In [None]:
gc.collect()

# Save `date_block_num`, as we can't use them as features, but will need them to split the dataset into parts 
dates = lag_df['date_block_num']


valid_block = 33

dates_train = dates[dates <  valid_block]
dates_valid  = dates[dates == valid_block]

X_train = lag_df.loc[dates <  valid_block].drop(to_drop_cols, axis=1)
y_train = lag_df.loc[dates <  valid_block, 'month_sale'].values.clip(0,20)

valid = lag_df.loc[dates == valid_block]
valid = test.merge(valid, how='left').fillna(0).drop('ID', axis=1)
X_valid =  valid.drop(to_drop_cols, axis=1)
y_valid =  valid['month_sale'].values.clip(0,20)

test_block = 34

dates_full_train = dates[dates < test_block]
dates_test = dates[dates == test_block]

X_full_train = lag_df.loc[dates <  test_block].drop(to_drop_cols, axis=1)
y_full_train = lag_df.loc[dates <  test_block, 'month_sale'].values.clip(0,20)
X_test = lag_df.loc[dates == test_block].drop(to_drop_cols, axis=1)


time: 38.9 s


# Define this competition metric as a function

# First level models 

validation score를 믿을만하다고 생각했었는데 feature변경하면서 하다보니 안맞는 부분이 있군..
- 이름 정보 활용?

In [32]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(np.clip(y_true, 0, 20), np.clip(y_pred, 0, 20)))

def get_valid_rmse(reg):
    reg.fit(X_train.values, y_train)
    pred = reg.predict(X_valid.values)
    print(rmse(pred, y_valid))

def get_prediction(reg):
    reg.fit(X_full_train.values, y_full_train)
    pred = reg.predict(X_test.values)
    return pred

time: 10 ms


In [33]:
lr = LinearRegression()
lr.fit(X_train.values, y_train)
pred_lr = lr.predict(X_valid.values)

rmse(pred_lr, y_valid)

# lr = LinearRegression()
# lr.fit(X_full_train.values, y_full_train)
# pred_lr = lr.predict(X_test.values)

0.9026512320815283

time: 35.3 s


In [34]:
enet = ElasticNet()
enet.fit(X_train.values, y_train)
pred_enet = enet.predict(X_valid.values)

rmse(pred_enet, y_valid)

1.1063908214041736

time: 24.6 s


In [35]:
# lgb_params = {
#                'feature_fraction': 0.75,
#                'metric': 'rmse',
#                'nthread':1, 
#                'min_data_in_leaf': 2**7, 
#                'bagging_fraction': 0.75, 
#                'learning_rate': 0.03, 
#                'objective': 'mse', 
#                'bagging_seed': 2**7, 
#                'num_leaves': 2**7,
#                'bagging_freq':1,
#                'verbose':0 
#               }

# lgb = lightgbm.train(lgb_params, lightgbm.Dataset(X_train.values, label=y_train), 100)
# pred_lgb = lgb.predict(X_valid)
# rmse(pred_lgb, y_valid)

time: 11 ms


In [36]:
lgb = LGBMRegressor(n_jobs=2, random_state=seed)
lgb.fit(X_train.values, y_train)
pred_lgb = lgb.predict(X_valid.values)

rmse(pred_lgb, y_valid)

0.8760807496834856

time: 1min 32s


In [37]:
rf = RandomForestRegressor(n_estimators=20, max_depth=13, n_jobs=2, random_state=seed)
rf.fit(X_train.values, y_train)
pred_rf = rf.predict(X_valid.values)

rmse(pred_rf, y_valid)

0.8745548685151435

time: 23min 20s


In [38]:
xgb = XGBRegressor(max_depth=10,learning_rate=0.03,n_jobs=2,random_state=seed)
xgb.fit(X_train.values, y_train)
pred_xgb = xgb.predict(X_valid.values)

rmse(pred_xgb, y_valid)

0.8709845

time: 39min 10s


In [None]:
rf = RandomForestRegressor(n_estimators=20, max_depth=13, n_jobs=2)
rf.fit(X_full_train.values, y_full_train)
pred_rf = rf.predict(X_test.values)

In [None]:
xgb = XGBRegressor(max_depth=10,learning_rate=0.03,n_jobs=2,random_state=seed)
xgb.fit(X_full_train.values, y_full_train)
pred_xgb = xgb.predict(X_test.values)

In [None]:
#TODO: pred만 보내서 submission 만들도록 함수 만들기

pred = pred_rf
dd = lag_df[dates == test_block]
dd.month_sale = pred
dd = dd[['shop_id', 'item_id', 'month_sale']]
dd = dd.rename(columns={'month_sale':'item_cnt_month'})
make_submission(dd, 'rf_n20m13_summean')

In [None]:
#TODO: pred만 보내서 submission 만들도록 함수 만들기

pred = pred_xgb
dd = lag_df[dates == test_block]
dd.month_sale = pred
dd = dd[['shop_id', 'item_id', 'month_sale']]
dd = dd.rename(columns={'month_sale':'item_cnt_month'})
make_submission(dd, 'xgb_summean')

# XGB hyper parameter tuning

lgbm도 스킷런 wrapper가 있음. 이용해보기. 이걸 이용해서 한번 하이퍼 파라미터 튜닝!

그전에 할일
- meta cat 넣기
- mean coding average 로 해보기
- half로 다시 도전..?
- Scaler 이용해보기
- Pipeline 만들어보기

# Simple ensembling structure - data는.. cat전


- ensemble two model
    1. Linear Regression(0.90268626150226106), lightgbm(0.87875613368405925) =>
        1. last 3
            1. Linear Regression: 0.879674
            2. lightgbm: 0.947700
            3. Random Forest: 0.881135
        2. last 6
            1. Linear Regression: 0.877625
            2. lightgbm: 0.959168
            3. Random Forest: 0.878719
        3. last 6 + X_train
            1. Linear Regression: 0.876767 => 1.00935
            2. lightgbm: 0.882532
            3. **Random Forest: 0.877934 => 1.00924**
            4. ElasticNet(alpha=0.01): 0.888654
            5. xgb: ??? => 1.01291
- ensemble three model
    1. Linear Regression, lightgbm, KnnRegressor
        - winner in two models
- ensemble five models
    1. Linear Regression, ElasticNet, lightgbm, Random Forest, KnnRegressor
        - ?

In [None]:
X_test_level2 = np.c_[pred_lr, pred_lgb]

In [None]:
#level2_date_blocks = [27,28,29,30,31,32]
# dates_train_level2 = dates_train[dates_train.isin(level2_date_blocks)]
#y_train_level2 = y_train[dates_train.isin(level2_date_blocks)]


# for prediction
level2_date_blocks = [28,29,30,31,32,33]
dates_train_level2 = dates_full_train[dates_full_train.isin(level2_date_blocks)]
y_train_level2 = y_full_train[dates_full_train.isin(level2_date_blocks)]


In [None]:
# And here we create 2nd level feeature matrix, init it with zeros first
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

# Now fill `X_train_level2` with metafeatures
for cur_block_num in tqdm_notebook(level2_date_blocks): 
    
     gc.collect()
#     lr = LinearRegression()
#     lr.fit(X_train[dates_train < cur_block_num].values, y_train[dates_train < cur_block_num])
#     pred_lr_level2 = lr.predict(X_train[dates_train == cur_block_num].values)
    
#     lgb = lightgbm.train(lgb_params, lightgbm.Dataset(X_train[dates_train < cur_block_num], label=y_train[dates_train < cur_block_num]), 100)
#     pred_lgb_level2 = lgb.predict(X_train[dates_train == cur_block_num].values)
    
#     X_train_level2[dates_train_level2.isin([cur_block_num]), 0] = pred_lr_level2.copy()
#     X_train_level2[dates_train_level2.isin([cur_block_num]), 1] = pred_lgb_level2.copy()
    
    lr = LinearRegression()
    lr.fit(X_full_train[dates_full_train < cur_block_num].values, y_full_train[dates_full_train < cur_block_num])
    pred_lr_level2 = lr.predict(X_full_train[dates_full_train == cur_block_num].values)
    
    lgb = lightgbm.train(lgb_params, lightgbm.Dataset(X_full_train[dates_full_train < cur_block_num], label=y_full_train[dates_full_train < cur_block_num]), 100)
    pred_lgb_level2 = lgb.predict(X_full_train[dates_full_train == cur_block_num].values)
    
    X_train_level2[dates_train_level2.isin([cur_block_num]), 0] = pred_lr_level2.copy()
    X_train_level2[dates_train_level2.isin([cur_block_num]), 1] = pred_lgb_level2.copy()

In [None]:
plt.scatter(X_train_level2[:,0], X_train_level2[:,1])

In [None]:
# X_train_plus_level2 = np.c_[X_full_train[dates_train.isin(level2_date_blocks)].values, X_train_level2]

X_train_plus_level2 = np.c_[X_full_train[dates_full_train.isin(level2_date_blocks)].values, X_train_level2]

In [None]:
X_train_plus_level2.shape

In [None]:
# meta_model = lightgbm.train(lgb_params, lightgbm.Dataset(X_train_plus_level2, label=y_train_level2), 100)

In [None]:
meta_model = RandomForestRegressor(max_depth=5)

meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:
meta_model = LinearRegression()
meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:

meta_model = XGBRegressor(max_depth=5,learning_rate=0.03,n_jobs=-1,random_state=seed)
meta_model.fit(X_train_plus_level2, y_train_level2)

In [None]:
X_test_plus = np.c_[X_test.values, X_test_level2]

In [None]:
 # valid_preds = meta_model.predict(valid_plus)
# rmse_valid_stacking = rmse(y_valid, valid_preds)

print('Train rmse for stacking is %f' % rmse_train_stacking)
# print('Test  rmse for stacking is %f' % rmse_valid_stacking)

In [None]:
ensemble_pred = meta_model.predict(X_test_plus)

In [None]:
#TODO: pred만 보내서 submission 만들도록 함수 만들기

pred = pred_lgb
dd = lag_df[dates == test_block]
dd.month_sale = pred
dd = dd[['shop_id', 'item_id', 'month_sale']]
dd = dd.rename(columns={'month_sale':'item_cnt_month'})
make_submission(dd, 'lgb_with_category_sum')

# Submit to kaggle

This cell automatically submits the submission file to kaggle. However, it should be carefully executed because the submitting opportunities are limited.
- remove '#' before submitting
- add a meaningful message to a submission

In [None]:
!kaggle competitions submit -c competitive-data-science-final-project -f ./submission/rf_n20m13_summean.csv -m ""

# Check public score

In [None]:
!kaggle competitions submissions -c competitive-data-science-final-project