In [124]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
from itertools import product

import lightgbm as lgb
from tqdm import tqdm_notebook

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [125]:
def downcast_dtypes(df):
    '''
        Changes column types in the dataframe: 
                
                `float64` type to `float32`
                `int64`   type to `int32`
    '''
    
    # Select columns to downcast
    float_cols = [c for c in df if df[c].dtype == "float64"]
    int_cols =   [c for c in df if df[c].dtype == "int64"]
    
    # Downcast
    df[float_cols] = df[float_cols].astype(np.float32)
    df[int_cols]   = df[int_cols].astype(np.int32)
    
    return df


In [126]:
item_categories = pd.read_csv("../input/competitive-data-science-predict-future-sales/item_categories.csv")
items = pd.read_csv("../input/competitive-data-science-predict-future-sales/items.csv")
sales_train = pd.read_csv("../input/competitive-data-science-predict-future-sales/sales_train.csv")
sample_submission = pd.read_csv("../input/competitive-data-science-predict-future-sales/sample_submission.csv")
shops = pd.read_csv("../input/competitive-data-science-predict-future-sales/shops.csv")
test = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")

In [127]:
print(sales_train.head())
print(test.head())
print(items.head())

         date  date_block_num  shop_id  item_id  item_price  item_cnt_day
0  02.01.2013               0       59    22154      999.00           1.0
1  03.01.2013               0       25     2552      899.00           1.0
2  05.01.2013               0       25     2552      899.00          -1.0
3  06.01.2013               0       25     2554     1709.05           1.0
4  15.01.2013               0       25     2555     1099.00           1.0
   ID  shop_id  item_id
0   0        5     5037
1   1        5     5320
2   2        5     5233
3   3        5     5232
4   4        5     5268
                                           item_name  item_id  \
0          ! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.)         D        0   
1  !ABBYY FineReader 12 Professional Edition Full...        1   
2      ***В ЛУЧАХ СЛАВЫ   (UNV)                    D        2   
3    ***ГОЛУБАЯ ВОЛНА  (Univ)                      D        3   
4        ***КОРОБКА (СТЕКЛО)                       D        4   

   item_category_id  

In [128]:
test_temp = test.copy()
test_temp['date_block_num'] = 34
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()
test_temp = pd.merge(test_temp, item_category_mapping, how='left', on='item_id')
test_temp = test_temp.drop('ID',axis = 1)
sales_train = sales_train.drop('date', axis = 1)

In [129]:
sales_train_full = pd.concat([sales_train,test_temp], ignore_index = True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [131]:
sales = sales_train_full

# Create "grid" with columns
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales.loc[sales['date_block_num'] == block_num, 'shop_id'].unique()
    cur_items = sales.loc[sales['date_block_num'] == block_num, 'item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

# Turn the grid into a dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)

# Groupby data to get shop-item-month aggregates
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
# Fix column names
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values] 
# Join it to the grid
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month aggregates
gb = sales.groupby(['shop_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_shop':'sum'}})
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

# Same as above but with item-month aggregates
gb = sales.groupby(['item_id', 'date_block_num'],as_index=False).agg({'item_cnt_day':{'target_item':'sum'}})
gb.columns = [col[0] if col[-1] == '' else col[-1] for col in gb.columns.values]
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
#all_data2 = all_data.copy()
del grid, gb 
gc.collect();

in a future version.

For column-specific groupby renaming, use named aggregation

    >>> df.groupby(...).agg(name=('column', aggfunc))

  return super().aggregate(arg, *args, **kwargs)


In [132]:
# List of columns that we will use to create lags
cols_to_rename = list(all_data.columns.difference(index_cols)) 

shift_range = [1, 2, 3, 4, 5, 12]

for month_shift in tqdm_notebook(shift_range):
    train_shift = all_data[index_cols + cols_to_rename].copy()
    
    train_shift['date_block_num'] = train_shift['date_block_num'] + month_shift
    
    foo = lambda x: '{}_lag_{}'.format(x, month_shift) if x in cols_to_rename else x
    train_shift = train_shift.rename(columns=foo)

    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

# Don't use old data from year 2013
all_data = all_data[all_data['date_block_num'] >= 12] 

# List of all lagged features
fit_cols = [col for col in all_data.columns if col[-1] in [str(item) for item in shift_range]] 
# We will drop these at fitting stage
to_drop_cols = list(set(list(all_data.columns)) - (set(fit_cols)|set(index_cols))) + ['date_block_num'] 

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




In [133]:
all_data

Unnamed: 0,shop_id,item_id,date_block_num,target,target_shop,target_item,target_lag_1,target_item_lag_1,target_shop_lag_1,target_lag_2,...,target_lag_4,target_item_lag_4,target_shop_lag_4,target_lag_5,target_item_lag_5,target_shop_lag_5,target_lag_12,target_item_lag_12,target_shop_lag_12,item_category_id
0,54,10297,12,4.0,8198.0,23.0,3.0,42.0,10055.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
1,54,10296,12,3.0,8198.0,17.0,0.0,24.0,10055.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38
2,54,10298,12,14.0,8198.0,182.0,21.0,369.0,10055.0,119.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40
3,54,10300,12,3.0,8198.0,26.0,1.0,54.0,10055.0,31.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,37
4,54,10284,12,1.0,8198.0,3.0,0.0,4.0,10055.0,0.0,...,0.0,3.0,7827.0,0.0,10.0,7792.0,0.0,0.0,0.0,57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6639289,45,18454,34,0.0,0.0,0.0,1.0,2.0,702.0,0.0,...,0.0,12.0,675.0,0.0,19.0,622.0,0.0,0.0,0.0,55
6639290,45,16188,34,0.0,0.0,0.0,0.0,1.0,702.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64
6639291,45,15757,34,0.0,0.0,0.0,0.0,5.0,702.0,0.0,...,0.0,4.0,675.0,0.0,8.0,622.0,0.0,9.0,1251.0,55
6639292,45,19648,34,0.0,0.0,0.0,0.0,2.0,702.0,0.0,...,0.0,2.0,675.0,0.0,4.0,622.0,0.0,0.0,0.0,40


In [134]:
dates = all_data['date_block_num']

last_block = dates.max()
#print(last_block)

X_train = all_data.loc[dates < last_block - 1].drop(to_drop_cols, axis=1)
X_cv = all_data.loc[dates ==  last_block - 1].drop(to_drop_cols, axis=1)
X_test =  all_data.loc[dates == last_block].drop(to_drop_cols, axis=1)

y_train = all_data.loc[dates <  last_block - 1, 'target'].values
y_cv = all_data.loc[dates ==  last_block - 1, 'target'].values
y_test =  all_data.loc[dates == last_block, 'target'].values

# print(X_train_part.shape)
# print(y_train_part.shape)

# X_train.to_pickle("X_train.pkl")
# X_train_part.to_pickle("X_train_part.pkl")
# X_test.to_pickle("X_test.pkl")

# np.save("y_train.npy", y_train)
# np.save("y_train_part.npy", y_train_part)
# np.save("y_test.npy", y_test)

In [135]:
def lgbm_train(X_train,y_train,X_test,y_test,num_iter):
    lgb_params = {
                   'feature_fraction': 0.75,
                   'metric': 'rmse',
                   'nthread':1, 
                   'min_data_in_leaf': 2**7, 
                   'bagging_fraction': 0.75, 
                   'learning_rate': 0.03, 
                   'objective': 'mse', 
                   'bagging_seed': 2**7, 
                   'num_leaves': 2**7,
                   'bagging_freq':1,
                   'verbose':0 
                  }

    evals_result = {}
    
    lgb_train = lgb.Dataset(X_train, label=y_train)
    lgb_valid = lgb.Dataset(X_test, label=y_test)
    model = lgb.train(lgb_params, lgb_train, num_iter,
                      valid_sets=[lgb_train, lgb_valid],
                      evals_result=evals_result,
                      verbose_eval=10)
    return model,evals_result


In [136]:
y_train_clip = y_train.clip(0,20)
model,evals_result  = lgbm_train(X_train, y_train_clip, X_cv, y_cv, 200)

(6186922, 21)
(6186922,)
[10]	training's rmse: 1.0627	valid_1's rmse: 5.29971
[20]	training's rmse: 0.983723	valid_1's rmse: 5.27079
[30]	training's rmse: 0.935232	valid_1's rmse: 5.25121
[40]	training's rmse: 0.904167	valid_1's rmse: 5.23739
[50]	training's rmse: 0.88396	valid_1's rmse: 5.22687
[60]	training's rmse: 0.870733	valid_1's rmse: 5.21974
[70]	training's rmse: 0.861056	valid_1's rmse: 5.2142
[80]	training's rmse: 0.854165	valid_1's rmse: 5.20985
[90]	training's rmse: 0.848822	valid_1's rmse: 5.20624
[100]	training's rmse: 0.843917	valid_1's rmse: 5.20335
[110]	training's rmse: 0.840232	valid_1's rmse: 5.20135
[120]	training's rmse: 0.836431	valid_1's rmse: 5.19959
[130]	training's rmse: 0.832962	valid_1's rmse: 5.19848
[140]	training's rmse: 0.829669	valid_1's rmse: 5.19733
[150]	training's rmse: 0.826843	valid_1's rmse: 5.19628
[160]	training's rmse: 0.824436	valid_1's rmse: 5.19543
[170]	training's rmse: 0.822381	valid_1's rmse: 5.19466
[180]	training's rmse: 0.820437	vali

In [137]:
model.save_model('lgbm_model_full_clip20.txt')

<lightgbm.basic.Booster at 0x7f3b1e25d0f0>

In [138]:
# model = lgb.Booster(model_file='/kaggle/input/predict-future-sales/lgbm_model.txt')

In [139]:
y_train_clip = y_train.clip(0,20)

In [140]:
#linear regression in preparation for ensembling
lr = LinearRegression()

lr.fit(X_train.values, y_train_clip)
pred_lr = lr.predict(X_cv)

print('Test RMSE for linreg is %f' % np.sqrt(mean_squared_error(y_cv, pred_lr)))

Test RMSE for linreg is 5.229523


In [141]:
pred_lgb_cv = model.predict(X_cv)
lr.fit(X_train.values, y_train_clip)
pred_lr_cv = lr.predict(X_cv)
#Clip predictions for ensembling
pred_lr_cv = pred_lr_cv.clip(0,20)
pred_lgb_cv = pred_lgb_cv.clip(0,20)
X_cv_level2 = np.c_[pred_lr_cv, pred_lgb_cv]
np.save("X_cv_level2.npy", X_cv_level2)

In [142]:
pred_lgb_test = model.predict(X_test)
lr.fit(X_train.values, y_train_clip)
pred_lr_test = lr.predict(X_test)
pred_lr_test = pred_lr_test.clip(0,20)
pred_lgb_test = pred_lgb_test.clip(0,20)
X_test_level2 = np.c_[pred_lr_test, pred_lgb_test]
np.save("X_test_level2.npy", X_test_level2)

In [145]:
#First level predictions for ensembling

dates_train = dates[dates <  last_block - 1]
dates_train_level2 = dates_train[dates_train.isin([28, 29, 30, 31, 32])]

# That is how we get target for the 2nd level dataset
y_train_level2 = y_train[dates_train.isin([28, 29, 30, 31, 32])]

# And here we create 2nd level feeature matrix, init it with zeros first
X_train_level2 = np.zeros([y_train_level2.shape[0], 2])

count = 0
# Now fill `X_train_level2` with metafeatures
for cur_block_num in [28, 29, 30, 31, 32]:
    
    print(cur_block_num)
    
    '''
        1. Split `X_train` into parts
           Remember, that corresponding dates are stored in `dates_train` 
        2. Fit linear regression 
        3. Fit LightGBM and put predictions          
        4. Store predictions from 2. and 3. in the right place of `X_train_level2`. 
           You can use `dates_train_level2` for it
           Make sure the order of the meta-features is the same as in `X_test_level2`
    '''
    X_train_cur = X_train[dates_train.isin(range(12,cur_block_num))]
    y_train_cur = y_train_clip[dates_train.isin(range(12,cur_block_num))]
    X_test_cur = X_train[dates_train.isin([cur_block_num])]
    lr_temp = LinearRegression()
    lr_temp.fit(X_train_cur.values, y_train_cur)
    pred_lr_temp = lr_temp.predict(X_test_cur.values)
    
    model_temp,_ = lgbm_train(X_train_cur, y_train_cur, X_train_cur, y_train_cur, 200)
    pred_lgb_temp = model_temp.predict(X_test_cur)
    
    for i in range(len(pred_lr_temp)):
        X_train_level2[count+i][0]=pred_lr_temp[i]
        X_train_level2[count+i][1]=pred_lgb_temp[i]
    count+=len(pred_lr_temp)
np.save("X_train_full_level2.npy", X_train_level2)
np.save("y_train_full_level2.npy", y_train_level2)

28
[10]	training's rmse: 1.08358	valid_1's rmse: 1.08358
[20]	training's rmse: 1.00086	valid_1's rmse: 1.00086
[30]	training's rmse: 0.950159	valid_1's rmse: 0.950159
[40]	training's rmse: 0.91737	valid_1's rmse: 0.91737
[50]	training's rmse: 0.895866	valid_1's rmse: 0.895866
[60]	training's rmse: 0.881815	valid_1's rmse: 0.881815
[70]	training's rmse: 0.871556	valid_1's rmse: 0.871556
[80]	training's rmse: 0.863844	valid_1's rmse: 0.863844
[90]	training's rmse: 0.857774	valid_1's rmse: 0.857774
[100]	training's rmse: 0.852633	valid_1's rmse: 0.852633
[110]	training's rmse: 0.848298	valid_1's rmse: 0.848298
[120]	training's rmse: 0.844331	valid_1's rmse: 0.844331
[130]	training's rmse: 0.840536	valid_1's rmse: 0.840536
[140]	training's rmse: 0.837106	valid_1's rmse: 0.837106
[150]	training's rmse: 0.833922	valid_1's rmse: 0.833922
[160]	training's rmse: 0.831191	valid_1's rmse: 0.831191
[170]	training's rmse: 0.828993	valid_1's rmse: 0.828993
[180]	training's rmse: 0.826931	valid_1's r

In [147]:
print('Test RMSE for linreg is %f' % np.sqrt(mean_squared_error(y_cv, pred_lr_cv)))
print('Test RMSE for lgbm is %f' % np.sqrt(mean_squared_error(y_cv, pred_lgb_cv)))

Test RMSE for linreg is 5.247907
Test RMSE for lgbm is 5.193346


In [180]:

#Ensembling with averaging implementation

from sklearn.metrics import r2_score
#X_train_level2[:,0].shape
print(r2_score(y_train_level2.clip(0,20), X_train_level2[:,0]))
print(r2_score(y_train_level2.clip(0,20), X_train_level2[:,1]))
alphas_to_try = np.linspace(0, 1, 1001)
best_r2 = 0.0
best_alpha = -1
for alpha in alphas_to_try:
    curr_r2 = r2_score(y_train_level2.clip(0,20), np.dot(X_train_level2, [alpha,1-alpha]))
    if curr_r2 > best_r2:
        best_r2 = curr_r2
        best_alpha = alpha
# YOUR CODE GOES HERE
best_alpha = best_alpha
r2_train_simple_mix = best_r2

print('Best alpha: %f; Corresponding r2 score on train: %f' % (best_alpha, r2_train_simple_mix))

0.1638807190308701
0.4273387216055792
Best alpha: 0.000000; Corresponding r2 score on train: 0.427339


In [159]:
#Ensembling with stacking predictions
X_train_level2 = np.load("X_train_full_level2.npy")
y_train_level2 = np.load("y_train_full_level2.npy")


lr.fit(X_train_level2.clip(0,20), y_train_level2.clip(0,20))

cv_preds = lr.predict(X_cv_level2)
rmse_test_stacking = np.sqrt(mean_squared_error(y_cv, cv_preds))

test_preds = lr.predict(X_test_level2)
rmse_test_stacking

5.194295736604871

In [158]:
#predictions on the test set
merged = test.copy()
merged['item_cnt_month'] = test_preds
merged['item_cnt_month']=merged['item_cnt_month'].clip(lower=0,upper=20)
merged=merged.drop(['shop_id','item_id'],axis=1)
merged['ID']=merged['ID'].astype('int')
merged.to_csv("lightgbm_clipto20_ens_full_l2full.csv",index=False)
print(merged.head)

<bound method NDFrame.head of             ID  item_cnt_month
0            0        0.454706
1            1        0.229680
2            2        0.810350
3            3        0.321987
4            4        1.631143
...        ...             ...
214195  214195        0.138287
214196  214196        0.067720
214197  214197        0.046541
214198  214198        0.043305
214199  214199        0.043210

[214200 rows x 2 columns]>


In [182]:
np.save("X_test.npy", X_test)

In [191]:
#Code to run to get the submitted result

model_lgb = lgb.Booster(model_file='lgbm_model_full_clip20.txt')

X_test_fin = np.load('X_test.npy')
lgb_pred_test_fin = model_lgb.predict(X_test_fin)
lgb_pred_test_fin = lgb_pred_test_fin.clip(0,20)

test_fin = pd.read_csv("../input/competitive-data-science-predict-future-sales/test.csv")
test_fin['item_cnt_month'] = test_preds
test_fin['item_cnt_month']=merged['item_cnt_month'].clip(lower=0,upper=20)
test_fin = test_fin.drop(['shop_id','item_id'],axis=1)
test_fin['ID'] = test_fin['ID'].astype('int')
test_fin.to_csv("final_submission.csv",index=False)
print(test_fin.head)

<bound method NDFrame.head of             ID  item_cnt_month
0            0        0.454706
1            1        0.229680
2            2        0.810350
3            3        0.321987
4            4        1.631143
...        ...             ...
214195  214195        0.138287
214196  214196        0.067720
214197  214197        0.046541
214198  214198        0.043305
214199  214199        0.043210

[214200 rows x 2 columns]>
