In [1]:
# Import libraries
import numpy as np
import pandas as pd

import os
import gc

from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb
import catboost as cb

import datetime

import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw competition tables from DATA_FOLDER
DATA_FOLDER = '../Data/'

# One read per table, in the same order as the names on the left-hand side
transactions, items, item_categories, shops, test = (
    pd.read_csv(os.path.join(DATA_FOLDER, file_name))
    for file_name in ('sales_train.csv.gz',
                      'items.csv',
                      'item_categories.csv',
                      'shops.csv',
                      'test.csv.gz')
)

In [3]:
# Set constants and options
NMONTHS = 24  # Number of most recent months of history used as features

# Use the fully qualified option name. pandas resolves option names by pattern
# matching, so the bare 'max_columns' raises OptionError in pandas versions
# where more than one option name contains it (e.g. 'styler.render.max_columns').
pd.set_option('display.max_columns', 800)

In [4]:
# Derive the city feature: the first space-separated token of every shop name
# NOTE(review): tokens are kept verbatim, so a leading '!' (as in '!Якутск')
# stays part of the city value - confirm this is intended.
shops['city'] = pd.Series(
    [shop_name.split(' ')[0] for shop_name in shops.shop_name.values],
    index=shops.index,
)
shops.head(10)

Unnamed: 0,shop_name,shop_id,city
0,"!Якутск Орджоникидзе, 56 фран",0,!Якутск
1,"!Якутск ТЦ ""Центральный"" фран",1,!Якутск
2,"Адыгея ТЦ ""Мега""",2,Адыгея
3,"Балашиха ТРК ""Октябрь-Киномир""",3,Балашиха
4,"Волжский ТЦ ""Волга Молл""",4,Волжский
5,"Вологда ТРЦ ""Мармелад""",5,Вологда
6,"Воронеж (Плехановская, 13)",6,Воронеж
7,"Воронеж ТРЦ ""Максимир""",7,Воронеж
8,"Воронеж ТРЦ Сити-Парк ""Град""",8,Воронеж
9,Выездная Торговля,9,Выездная


In [5]:
# Derive a coarse category type for every item category.
# Each (type, run length) pair labels that many consecutive rows of
# item_categories, in row order; the run lengths add up to 84 labels.
category_type_runs = [
    ('PC_Headphone', 1), ('Accessory', 7), ('Ticket', 1), ('Delivery', 1),
    ('Game_Console', 8), ('Game', 7), ('Game_Accessory', 1), ('Game', 6),
    ('Pay_Card', 5), ('Cinema', 5), ('Book', 13), ('Music', 6),
    ('Gift', 12), ('Program', 6), ('Service', 2), ('Clean_Disk', 2),
    ('Charger', 1),
]
item_cat_types = np.array(
    [type_name for type_name, run_length in category_type_runs for _ in range(run_length)]
)
item_categories['item_category_type'] = pd.Series(item_cat_types, index=item_categories.index)
item_categories.tail()

Unnamed: 0,item_category_name,item_category_id,item_category_type
79,Служебные,79,Service
80,Служебные - Билеты,80,Service
81,Чистые носители (шпиль),81,Clean_Disk
82,Чистые носители (штучные),82,Clean_Disk
83,Элементы питания,83,Charger


In [6]:
# Build the base train / validation / test frames (monthly counts and mean prices)
def construct_base_traintest_with_price(months=NMONTHS):
    """
    Build month-wide sales matrices for train, validation and test.

    Reads the module-level `transactions` and `test` frames. Transactions are
    grouped by (date_block_num, shop_id, item_id): 'item_cnt_day' is summed and
    'item_price' is averaged. The last `months + 2` months are then joined, one
    pair of columns per month ('item_cnt_day_<m>', 'item_price_<m>'), onto the
    (shop_id, item_id) pairs of `test`. Pairs without sales in a month get 0 for
    both the count and the price of that month.

    With F = first joined month and L = last month present in `transactions`:
      x_train - months F   .. L-2, target is the count of month L-1
      x_val   - months F+1 .. L-1, target is the count of month L
      x_test  - months F+2 .. L,   no target

    months - number of months of history kept in each returned feature frame
    Returns (x_train, x_val, x_test, train_target, val_target); the frames are
    indexed by (shop_id, item_id).
    """
    def month_cols(month):
        # Names of the count and price columns that belong to one month
        return ['item_cnt_day_' + str(month), 'item_price_' + str(month)]

    # Group transactions by month, shop and item (monthly count and mean price)
    grp_trans = transactions.groupby(['date_block_num', 'shop_id', 'item_id'], as_index=False).agg(
        {'item_cnt_day': 'sum', 'item_price': 'mean'})

    # Month numbers are derived from the data instead of being hard-coded
    max_month = int(grp_trans.date_block_num.max()) + 1
    first_month = max_month - months - 2
    last_month = max_month - 1

    # Join every month's data onto the (shop_id, item_id) pairs of the test set
    all_data = test[['shop_id', 'item_id']].set_index(['shop_id', 'item_id'])
    for month in range(first_month, max_month):
        cnt_col, price_col = month_cols(month)
        # rename() returns a new frame, so the filtered slice is never mutated
        month_df = grp_trans[grp_trans.date_block_num == month].rename(
            columns={'item_cnt_day': cnt_col, 'item_price': price_col})
        all_data = all_data.join(
            month_df[['shop_id', 'item_id', cnt_col, price_col]].set_index(['shop_id', 'item_id']))

    # Replace NaNs (no sales of that pair in that month) by 0
    all_data = all_data.fillna(0)

    # Validation target is the last month, train target the month before it
    val_target = all_data['item_cnt_day_' + str(last_month)]
    train_target = all_data['item_cnt_day_' + str(last_month - 1)]

    # Every feature frame keeps `months` consecutive months, shifted by one month each
    x_train = all_data.drop(month_cols(last_month - 1) + month_cols(last_month), axis=1)
    x_val = all_data.drop(month_cols(first_month) + month_cols(last_month), axis=1)
    x_test = all_data.drop(month_cols(first_month) + month_cols(first_month + 1), axis=1)

    return x_train, x_val, x_test, train_target, val_target

In [7]:
def _add_sliding_means(frame, months):
    """Add 'mean_cnt_<i>' / 'mean_price_<i>' columns (i = 1..months) to `frame` in place."""
    # Snapshot the month columns first so the new mean columns never feed back in
    cnt_values = np.array(frame[[col for col in frame if col.startswith('item_cnt_day')]])
    price_values = np.array(frame[[col for col in frame if col.startswith('item_price')]])

    for window in range(1, months + 1):
        # [:, -window:] selects the last `window` columns by position
        frame['mean_cnt_' + str(window)] = pd.Series(
            cnt_values[:, -window:].mean(axis=1), index=frame.index)
        frame['mean_price_' + str(window)] = pd.Series(
            price_values[:, -window:].mean(axis=1), index=frame.index)


def add_means_with_price(train, val, test, months=NMONTHS):
    """
    train - base train dataframe
    val - base validation dataframe (train, shifted by 1 month further)
    test - base test dataframe (val, shifted by 1 month further)
    months - largest sliding window length

    Adds sliding means over the last n month columns to every frame: for each
    n in 1..months, 'mean_cnt_<n>' is the row-wise mean of the last n
    'item_cnt_day*' columns and 'mean_price_<n>' the row-wise mean of the last
    n 'item_price*' columns. "Last" is by column position, so the month columns
    are expected to be in chronological order.

    The frames are modified in place and also returned as (train, val, test).
    """
    for frame in (train, val, test):
        _add_sliding_means(frame, months)

    return train, val, test

In [8]:
def _cumulative_month_means(frame, key, value_cols, flatten_columns=False):
    """
    Return, for every value of `key`, the running mean over the month columns.

    The columns in `value_cols` are summed per `key`, accumulated from left to
    right and divided by the number of columns accumulated so far (1, 2, 3, ...).
    `key` is returned as a regular column.
    """
    cum_sums = frame.groupby([key])[value_cols].agg(['sum']).cumsum(axis=1)

    # Divide by column position rather than by MultiIndex.labels: that attribute
    # no longer exists in current pandas and only matched the position while the
    # level happened to be stored in column order.
    months_so_far = np.arange(1, cum_sums.shape[1] + 1, dtype=float)
    cum_means = cum_sums / months_so_far

    if flatten_columns:
        # Replace the (column, 'sum') labels by flat, collision-free strings
        cum_means.columns = [key + '_cummean_' + col for col, _ in cum_means.columns]

    return cum_means.reset_index()


def _attach_month_mean_encodings(frame, flatten_columns=False):
    """Left-merge the shop/item running means of counts and prices onto `frame`."""
    cnt_cols = [col for col in frame if col.startswith('item_cnt_day')]
    price_cols = [col for col in frame if col.startswith('item_price')]

    extended = frame
    # Merge order is fixed: shop counts, item counts, shop prices, item prices
    for key, value_cols in (('shop_id', cnt_cols), ('item_id', cnt_cols),
                            ('shop_id', price_cols), ('item_id', price_cols)):
        # The aggregates are always computed from the untouched input frame
        encoding = _cumulative_month_means(frame, key, value_cols, flatten_columns)
        extended = pd.merge(extended, encoding, how='left', left_on=key, right_on=key)
    return extended


def get_month_mean_encodings(train, val, test, flatten_columns=False):
    """
    train - base train dataframe
    val - base validation dataframe (train, shifted by 1 month further)
    test - base test dataframe (val, shifted by 1 month further)
    flatten_columns - if True, the added columns get flat string names of the
        form '<key>_cummean_<month column>' (e.g. 'shop_id_cummean_item_cnt_day_8')

    For each frame separately, the 'item_cnt_day*' and 'item_price*' month
    columns are summed per shop and per item, accumulated over the months and
    divided by the number of months accumulated so far (a running mean of the
    monthly totals). The four resulting tables (shop counts, item counts, shop
    prices, item prices) are left-merged onto the frame; the inputs are not
    modified.

    With flatten_columns=False (default) the aggregated tables keep their
    two-level (column, 'sum') labels when merged, exactly as before, so the
    resulting labels depend on how the installed pandas merges frames with
    different numbers of column levels.
    NOTE(review): recent pandas releases appear to reject such merges with a
    MergeError - confirm against the target pandas version; flatten_columns=True
    avoids the mixed-level merge altogether.

    Returns (train_ext, val_ext, test_ext).
    """
    train_ext = _attach_month_mean_encodings(train, flatten_columns)
    val_ext = _attach_month_mean_encodings(val, flatten_columns)
    test_ext = _attach_month_mean_encodings(test, flatten_columns)

    return train_ext, val_ext, test_ext

In [133]:
# Construct basic frames
tr_train, tr_val, tr_test, tr_target, val_target = construct_base_traintest_with_price()
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)
# (214200, 48) (214200, 48) (214200, 48) (214200,) (214200,)

# Add means with prices
tr_train, tr_val, tr_test = add_means_with_price(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 96) (214200, 96) (214200, 96)

# Add shop and item id features. Transform indexes into df columns
tr_train, tr_val, tr_test = [frame.reset_index() for frame in (tr_train, tr_val, tr_test)]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 98) (214200, 98) (214200, 98)

# Add mean encodings
tr_train, tr_val, tr_test = get_month_mean_encodings(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 196) (214200, 196) (214200, 196)

# Add the item category id (and, temporarily, the item name) from the items table
tr_train, tr_val, tr_test = [
    pd.merge(frame, items, how='left', left_on='item_id', right_on='item_id')
    for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# Drop unnecessary item_name column
tr_train, tr_val, tr_test = [
    frame.drop(['item_name'], axis=1) for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 198) (214200, 198) (214200, 198)
# (214200, 197) (214200, 197) (214200, 197)

# Add item category type
tr_train, tr_val, tr_test = [
    pd.merge(frame, item_categories, how='left',
             left_on='item_category_id', right_on='item_category_id')
    for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# Drop unnecessary item_category_name column
tr_train, tr_val, tr_test = [
    frame.drop(['item_category_name'], axis=1) for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 199) (214200, 199) (214200, 199)
# (214200, 198) (214200, 198) (214200, 198)

# Add city feature
tr_train, tr_val, tr_test = [
    pd.merge(frame, shops, how='left', left_on='shop_id', right_on='shop_id')
    for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# Drop unnecessary shop_name column
tr_train, tr_val, tr_test = [
    frame.drop(['shop_name'], axis=1) for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 200) (214200, 200) (214200, 200)
# (214200, 199) (214200, 199) (214200, 199)

# Transform Item Category Type and City to the int categorical feature.
# Each frame is factorized on its own, so the codes only agree between the
# frames while all three hold the same rows in the same order.
for frame in (tr_train, tr_val, tr_test):
    for categorical_col in ('item_category_type', 'city'):
        frame[categorical_col] = frame[categorical_col].factorize()[0]

# Drop duplicate shop_id_x and item_id_x column
tr_train, tr_val, tr_test = [
    frame.drop(['shop_id_x', 'item_id_x'], axis=1) for frame in (tr_train, tr_val, tr_test)
]
print(tr_train.shape, tr_val.shape, tr_test.shape)
# (214200, 197) (214200, 197) (214200, 197)

# Prepare for modeling
np_train_matrix = np.array(tr_train)
np_val_matrix = np.array(tr_val)
np_test_matrix = np.array(tr_test)
print (np_train_matrix.shape, np_val_matrix.shape, np_test_matrix.shape)
# (214200, 197) (214200, 197) (214200, 197)

# Clip the targets to the [0, 20] range
tr_target_clip = np.clip(np.array(tr_target), 0, 20)
val_target_clip = np.clip(np.array(val_target), 0, 20)

(214200, 48) (214200, 48) (214200, 48) (214200,) (214200,)
(214200, 96) (214200, 96) (214200, 96)
(214200, 98) (214200, 98) (214200, 98)
(214200, 196) (214200, 196) (214200, 196)
(214200, 198) (214200, 198) (214200, 198)
(214200, 197) (214200, 197) (214200, 197)
(214200, 199) (214200, 199) (214200, 199)
(214200, 198) (214200, 198) (214200, 198)
(214200, 200) (214200, 200) (214200, 200)
(214200, 199) (214200, 199) (214200, 199)
(214200, 197) (214200, 197) (214200, 197)
(214200, 197) (214200, 197) (214200, 197)


In [134]:
# Make every column label a plain string in all three frames
for frame in (tr_train, tr_val, tr_test):
    frame.columns = [str(label) for label in frame.columns]

# Summary statistics for one of the datasets
tr_train.describe()

Unnamed: 0,item_id,shop_id,item_cnt_day_8_x,item_price_8_x,item_cnt_day_9_x,item_price_9_x,item_cnt_day_10_x,item_price_10_x,item_cnt_day_11_x,item_price_11_x,item_cnt_day_12_x,item_price_12_x,item_cnt_day_13_x,item_price_13_x,item_cnt_day_14_x,item_price_14_x,item_cnt_day_15_x,item_price_15_x,item_cnt_day_16_x,item_price_16_x,item_cnt_day_17_x,item_price_17_x,item_cnt_day_18_x,item_price_18_x,item_cnt_day_19_x,item_price_19_x,item_cnt_day_20_x,item_price_20_x,item_cnt_day_21_x,item_price_21_x,item_cnt_day_22_x,item_price_22_x,item_cnt_day_23_x,item_price_23_x,item_cnt_day_24_x,item_price_24_x,item_cnt_day_25_x,item_price_25_x,item_cnt_day_26_x,item_price_26_x,item_cnt_day_27_x,item_price_27_x,item_cnt_day_28_x,item_price_28_x,item_cnt_day_29_x,item_price_29_x,item_cnt_day_30_x,item_price_30_x,item_cnt_day_31_x,item_price_31_x,mean_cnt_1,mean_price_1,mean_cnt_2,mean_price_2,mean_cnt_3,mean_price_3,mean_cnt_4,mean_price_4,mean_cnt_5,mean_price_5,mean_cnt_6,mean_price_6,mean_cnt_7,mean_price_7,mean_cnt_8,mean_price_8,mean_cnt_9,mean_price_9,mean_cnt_10,mean_price_10,mean_cnt_11,mean_price_11,mean_cnt_12,mean_price_12,mean_cnt_13,mean_price_13,mean_cnt_14,mean_price_14,mean_cnt_15,mean_price_15,mean_cnt_16,mean_price_16,mean_cnt_17,mean_price_17,mean_cnt_18,mean_price_18,mean_cnt_19,mean_price_19,mean_cnt_20,mean_price_20,mean_cnt_21,mean_price_21,mean_cnt_22,mean_price_22,mean_cnt_23,mean_price_23,mean_cnt_24,mean_price_24,"('item_cnt_day_8', 'sum')_x","('item_cnt_day_9', 'sum')_x","('item_cnt_day_10', 'sum')_x","('item_cnt_day_11', 'sum')_x","('item_cnt_day_12', 'sum')_x","('item_cnt_day_13', 'sum')_x","('item_cnt_day_14', 'sum')_x","('item_cnt_day_15', 'sum')_x","('item_cnt_day_16', 'sum')_x","('item_cnt_day_17', 'sum')_x","('item_cnt_day_18', 'sum')_x","('item_cnt_day_19', 'sum')_x","('item_cnt_day_20', 'sum')_x","('item_cnt_day_21', 'sum')_x","('item_cnt_day_22', 'sum')_x","('item_cnt_day_23', 'sum')_x","('item_cnt_day_24', 'sum')_x","('item_cnt_day_25', 
'sum')_x","('item_cnt_day_26', 'sum')_x","('item_cnt_day_27', 'sum')_x","('item_cnt_day_28', 'sum')_x","('item_cnt_day_29', 'sum')_x","('item_cnt_day_30', 'sum')_x","('item_cnt_day_31', 'sum')_x","('item_cnt_day_8_y', 'sum')","('item_cnt_day_9_y', 'sum')","('item_cnt_day_10_y', 'sum')","('item_cnt_day_11_y', 'sum')","('item_cnt_day_12_y', 'sum')","('item_cnt_day_13_y', 'sum')","('item_cnt_day_14_y', 'sum')","('item_cnt_day_15_y', 'sum')","('item_cnt_day_16_y', 'sum')","('item_cnt_day_17_y', 'sum')","('item_cnt_day_18_y', 'sum')","('item_cnt_day_19_y', 'sum')","('item_cnt_day_20_y', 'sum')","('item_cnt_day_21_y', 'sum')","('item_cnt_day_22_y', 'sum')","('item_cnt_day_23_y', 'sum')","('item_cnt_day_24_y', 'sum')","('item_cnt_day_25_y', 'sum')","('item_cnt_day_26_y', 'sum')","('item_cnt_day_27_y', 'sum')","('item_cnt_day_28_y', 'sum')","('item_cnt_day_29_y', 'sum')","('item_cnt_day_30_y', 'sum')","('item_cnt_day_31_y', 'sum')","('item_price_8', 'sum')_x","('item_price_9', 'sum')_x","('item_price_10', 'sum')_x","('item_price_11', 'sum')_x","('item_price_12', 'sum')_x","('item_price_13', 'sum')_x","('item_price_14', 'sum')_x","('item_price_15', 'sum')_x","('item_price_16', 'sum')_x","('item_price_17', 'sum')_x","('item_price_18', 'sum')_x","('item_price_19', 'sum')_x","('item_price_20', 'sum')_x","('item_price_21', 'sum')_x","('item_price_22', 'sum')_x","('item_price_23', 'sum')_x","('item_price_24', 'sum')_x","('item_price_25', 'sum')_x","('item_price_26', 'sum')_x","('item_price_27', 'sum')_x","('item_price_28', 'sum')_x","('item_price_29', 'sum')_x","('item_price_30', 'sum')_x","('item_price_31', 'sum')_x","('item_price_8_y', 'sum')","('item_price_9_y', 'sum')","('item_price_10_y', 'sum')","('item_price_11_y', 'sum')","('item_price_12_y', 'sum')","('item_price_13_y', 'sum')","('item_price_14_y', 'sum')","('item_price_15_y', 'sum')","('item_price_16_y', 'sum')","('item_price_17_y', 'sum')","('item_price_18_y', 'sum')","('item_price_19_y', 'sum')","('item_price_20_y', 
'sum')","('item_price_21_y', 'sum')","('item_price_22_y', 'sum')","('item_price_23_y', 'sum')","('item_price_24_y', 'sum')","('item_price_25_y', 'sum')","('item_price_26_y', 'sum')","('item_price_27_y', 'sum')","('item_price_28_y', 'sum')","('item_price_29_y', 'sum')","('item_price_30_y', 'sum')","('item_price_31_y', 'sum')",item_category_id,item_category_type,city
count,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0,214200.0
mean,11019.398627,31.642857,0.205892,33.591996,0.185588,37.275864,0.207502,45.341926,0.318478,63.777567,0.195868,54.408096,0.200019,52.407441,0.229104,58.779629,0.172759,55.393558,0.199748,59.982127,0.208492,63.222773,0.206004,63.987861,0.247241,71.29272,0.225037,75.227587,0.248922,87.838922,0.3238,107.108804,0.485752,156.296332,0.315481,129.466676,0.254888,116.456312,0.256083,121.759841,0.275196,111.487412,0.263193,110.473665,0.241839,113.741981,0.244188,117.210324,0.278137,128.605106,0.278137,128.605106,0.261162,122.907715,0.254721,119.85247,0.256839,117.507769,0.260511,116.303698,0.259773,117.213055,0.259075,117.104949,0.266126,118.650165,0.290529,122.833072,0.293856,121.260645,0.289771,118.222307,0.284376,114.639414,0.28152,111.305053,0.276126,107.925253,0.271617,104.945088,0.267125,102.134903,0.261574,99.385412,0.25977,97.129535,0.256625,94.775741,0.253588,92.757358,0.256678,91.377368,0.254442,89.284848,0.251449,87.023588,0.24955,84.797272,1050.047619,998.27381,1018.269841,1169.761905,1135.595238,1116.345238,1123.785714,1093.446429,1085.142857,1082.959524,1080.019481,1095.095238,1099.141026,1111.309524,1147.314286,1230.440476,1252.705882,1255.329365,1257.997494,1265.272619,1268.939909,1267.323593,1266.36853,1272.707341,8.647451,8.221078,8.385752,9.633333,9.351961,9.193431,9.254706,9.004853,8.936471,8.91849,8.894278,9.018431,9.05175,9.151961,9.448471,10.133039,10.316401,10.338007,10.359979,10.419892,10.450093,10.436783,10.428917,10.481119,171319.177541,180713.043052,197556.635448,229483.875118,239083.357516,243782.456068,251781.549306,255622.249022,261209.648635,267332.298013,272696.461241,280271.162338,288224.203315,299635.224797,316076.536437,346141.20873,364619.963948,377359.254421,390181.145687,399101.378471,406925.869646,414796.698518,422752.174788,432466.085828,1410.863815,1488.22506,1626.936998,1889.867207,1968.921768,2007.620226,2073.495112,2105.124404,2151.138283,2201.560101,2245.735563,2308.115455,2373.611086,2467.584204,2602.983241,2850.57466,3002.752
644,3107.664448,3213.256494,3286.717234,3351.154221,3415.972811,3481.488498,3561.485413,46.309608,2.876863,13.5
std,6252.64459,17.561933,4.714035,226.321949,3.830216,237.247931,4.10294,346.030364,5.56751,432.286038,3.112597,410.509388,3.127541,368.013354,3.482804,405.275669,2.464995,416.490823,2.763012,428.302041,3.120215,435.132034,2.655728,423.930544,2.83356,415.58594,2.998698,510.535819,3.040342,549.925044,4.229684,605.685384,5.561023,749.236478,4.079211,703.399033,1.879339,645.400649,1.723709,687.503163,4.119686,649.741034,3.828952,649.675734,2.286223,661.364885,2.143116,617.779298,2.149646,719.66038,2.149646,719.66038,2.063654,545.702379,2.080682,497.736949,2.314889,481.513456,2.439491,469.681284,2.169533,469.330189,2.039557,461.945985,2.163736,465.007349,2.478821,474.768118,2.589392,468.326745,2.593996,457.793563,2.597347,446.690018,2.58243,431.265212,2.558093,419.476156,2.565551,410.403758,2.542855,402.658327,2.512294,395.620755,2.524531,388.949779,2.525511,381.703772,2.533591,375.93552,2.6464,372.843836,2.690725,366.380521,2.714866,353.273754,2.74532,341.112908,899.429429,852.426538,857.836455,984.704801,950.788303,925.262775,916.064998,891.567213,871.834461,864.509609,850.890516,853.982917,851.764962,859.666682,880.701088,945.684313,961.949792,961.649375,961.487686,967.958582,969.521966,969.649352,967.423888,966.464098,130.142507,113.768245,106.646256,115.974689,108.778267,102.926802,100.353812,95.648386,92.30409,91.011616,88.972803,87.72303,86.813164,86.018761,85.974175,89.33813,89.645979,86.963551,83.968638,82.663727,81.18358,79.687425,78.259739,77.151816,95359.065499,99260.515584,106017.597338,118551.771626,121742.933154,123444.111187,124082.102026,124303.743404,123885.526841,125054.238288,125764.294032,126783.24992,128307.773087,131642.714225,137008.515643,147075.781699,153799.811961,160170.970337,166252.399574,171416.74562,175534.44513,179448.738536,183484.118004,187593.107699,6264.300876,6273.159319,6967.148895,8499.338902,9265.636769,9701.526033,10201.351702,10659.94724,11029.622781,11352.309058,11569.229293,11672.362455,11802.024736,11962.461167,12112.587888,
12433.997938,12714.910824,12826.058264,12973.650018,13016.431762,13163.719223,13320.832009,13180.848481,13045.92517,16.716581,2.630397,8.071869
min,30.0,2.0,-1.0,0.0,-1.0,0.0,-4.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-2.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-1.0,0.0,-0.5,0.0,-0.333333,0.0,-0.25,0.0,-0.2,0.0,-0.166667,0.0,-0.142857,0.0,-0.125,0.0,0.0,0.0,-0.1,0.0,-0.090909,0.0,-0.083333,0.0,-0.076923,0.0,-0.071429,0.0,-0.066667,0.0,-0.0625,0.0,-0.058824,0.0,-0.055556,0.0,-0.052632,0.0,-0.05,0.0,-0.047619,0.0,-0.045455,0.0,-0.043478,0.0,-0.041667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5381.5,16.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,469.0,546.0,602.0,672.5,653.2,642.5,656.285714,641.875,645.555556,654.4,660.727273,661.083333,670.769231,679.357143,714.066667,766.3125,775.705882,776.055556,775.894737,786.5,789.904762,786.636364,794.347826,811.708333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.045455,0.173913,0.291667,127410.189804,133669.428829,153102.285886,183275.038891,200241.618676,202489.610699,211511.068006,211880.805339,214190.131702,220171.313715,224313.436332,232668.470327,239321.211153,249397.649642,259064.284459,284188.641807,299008.768106,307524.626578,314245.889002,315203.272451,320602.337034,326698.648151,335193.214657,341921.346524,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.170455,50.451087,99.65625,37.0,1.0,7.0
50%,11203.0,34.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,897.5,876.75,912.5,1066.5,1018.9,994.5,988.5,970.0,974.333333,985.5,991.590909,991.5,996.153846,1004.214286,1028.033333,1071.78125,1068.235294,1068.111111,1074.842105,1089.5,1094.833333,1091.090909,1082.152174,1086.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.214286,0.466667,0.875,1.058824,1.222222,1.473684,1.75,1.952381,2.181818,2.391304,2.583333,172881.382749,184671.066334,208952.928499,247796.356882,258842.27754,262408.848228,268230.420345,269749.640459,270422.703889,271974.461415,277668.470233,285490.736199,293852.155452,304874.003327,321816.806972,351445.244552,368225.041276,382819.542131,395946.896812,404853.241852,413462.462152,420415.879264,426219.517679,432761.116834,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,55.909821,139.733333,261.1875,322.911765,389.163056,485.886184,561.15,641.351429,711.050076,767.261739,839.629167,43.0,2.0,13.5
75%,16071.5,47.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,44.79375,0.2,59.4,0.166667,58.166667,0.142857,63.857143,0.125,68.625,0.222222,73.0,0.2,74.8,0.181818,72.636364,0.166667,74.75,0.153846,71.846154,0.214286,71.285714,0.2,69.069635,0.1875,68.625,0.176471,70.235294,0.166667,66.5,0.157895,63.105263,0.15,62.9,0.142857,64.095238,0.181818,63.454545,0.173913,60.826087,0.166667,60.854167,1253.0,1158.0,1149.666667,1318.5,1298.8,1262.166667,1283.428571,1218.75,1188.111111,1169.4,1156.272727,1171.5,1164.769231,1161.071429,1194.133333,1274.625,1313.0,1308.555556,1300.157895,1295.1,1289.619048,1283.136364,1279.73913,1289.25,0.0,1.0,1.333333,2.25,2.4,2.666667,3.0,3.375,3.666667,4.1,4.363636,4.833333,5.153846,5.642857,6.2,7.125,7.485294,7.680556,8.0,8.2625,8.428571,8.556818,8.695652,8.885417,212817.451053,216337.323167,236269.033673,267983.475655,283923.992834,287568.089446,294215.808709,294984.017065,304806.36801,312887.904255,321678.150266,334465.357085,342145.334656,352885.58696,370990.986915,399893.937412,419281.333048,431484.983588,447007.057815,464935.585992,477022.015927,488933.406509,500072.226107,510558.852229,0.0,249.0,399.666667,672.75,693.645,783.708333,854.285714,934.53125,1048.666667,1106.3,1199.001136,1300.941667,1388.490385,1526.010714,1695.384167,1939.179688,2058.763078,2162.416667,2261.353344,2345.064375,2442.148214,2519.924867,2650.131268,2731.173958,58.0,5.0,21.0
max,22167.0,59.0,950.0,14200.0,978.0,14200.0,989.0,18427.810945,1305.0,27990.0,899.0,27990.0,941.0,19434.444444,776.0,22790.0,597.0,23115.0,602.0,22990.0,771.0,22990.0,563.0,22990.0,591.0,20990.0,639.0,20999.0,634.0,20999.0,772.0,27490.0,1209.0,27490.0,1000.0,30490.0,257.0,33490.0,174.0,33490.0,813.0,28490.0,742.0,27990.0,444.0,27990.0,482.0,29990.0,436.0,29990.0,436.0,29990.0,459.0,28240.0,450.666667,26990.111111,453.0,26827.5625,458.6,27018.671429,382.166667,26931.708333,353.285714,26613.765306,402.0,26567.044643,491.666667,26095.521164,519.7,25472.506548,530.090909,24996.846861,539.166667,24546.309623,543.153846,24075.38196,542.071429,23826.46182,547.6,23770.697698,551.0,23721.904092,553.705882,23678.85091,566.055556,23562.931522,585.789474,23322.250916,601.45,23468.74616,634.952381,23407.29649,651.045455,23169.031538,665.26087,22161.682341,677.125,21238.27891,4596.0,4474.5,4542.666667,5188.0,5039.6,4940.5,4923.285714,4828.0,4737.888889,4677.8,4596.363636,4617.416667,4641.846154,4694.142857,4828.2,5182.375,5258.705882,5256.277778,5248.263158,5278.75,5273.0,5264.590909,5253.913043,5248.041667,6233.0,6216.0,6413.0,7354.5,6984.4,6655.166667,6561.0,6290.875,6098.888889,6046.4,5928.818182,5857.166667,5817.692308,5782.857143,5789.133333,6035.1875,6063.823529,5876.444444,5664.473684,5572.8,5472.571429,5372.727273,5278.695652,5205.958333,411998.120067,423159.156612,450726.581662,497810.616984,519959.843201,531629.87374,538869.949494,552956.043819,559728.384583,572557.061332,579780.703153,595012.752985,613133.511928,636698.37514,672293.390412,728547.956784,762514.626878,793421.338327,823919.008966,855948.917883,878848.70241,902300.643593,920948.17106,943521.196964,93002.985,93020.578115,206391.247664,370571.208428,441689.031462,485040.565691,531109.309808,576381.427904,611840.934248,640179.803323,659357.408837,665871.946625,673875.081226,682349.959893,687299.551798,695111.012103,702170.660784,706462.575631,713112.656868,713994.649025,725015.721634,736897.01828,720347.3756
01,702757.429395,83.0,13.0,27.0


In [135]:
# Make all elements non-negative.
# One shared offset (the smallest value found in any of the three matrices) is subtracted from
# every split, so that a given raw value is mapped to the same shifted value in train,
# validation and test. Shifting each split by its own minimum would make the splits inconsistent
# whenever their minima differ.
min_tr = np_train_matrix.min()
min_val = np_val_matrix.min()
min_test = np_test_matrix.min()
shared_min = min(min_tr, min_val, min_test)

np_train_matrix -= shared_min
np_val_matrix -= shared_min
np_test_matrix -= shared_min

# Check ourselves: no split contains a negative value and the overall minimum is exactly zero
assert np_train_matrix.min() >= 0
assert np_val_matrix.min() >= 0
assert np_test_matrix.min() >= 0
assert min(np_train_matrix.min(), np_val_matrix.min(), np_test_matrix.min()) == 0

In [136]:
# Get the columns that will be log-transformed: every column whose name contains one of the
# markers below (prices, 'sum' aggregates, monthly item counts, sliding count means), plus item_id
log_markers = ('price', "'sum')", 'item_cnt_day', 'mean_cnt')
price_cols = [col for col in tr_train.columns
              if any(marker in col for marker in log_markers)] + ['item_id']
print(price_cols[:10], len(price_cols))

# Get the positional indexes of these columns in tr_train
index_list = [tr_train.columns.get_loc(col) for col in price_cols]
print(index_list[:10], len(index_list))

['item_cnt_day_8_x', 'item_price_8_x', 'item_cnt_day_9_x', 'item_price_9_x', 'item_cnt_day_10_x', 'item_price_10_x', 'item_cnt_day_11_x', 'item_price_11_x', 'item_cnt_day_12_x', 'item_price_12_x'] 193
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11] 193


In [137]:
# Replace the columns listed in index_list by log(1 + x) in every split
# (done to decrease the standard deviation of these features)
for split_matrix in (np_train_matrix, np_val_matrix, np_test_matrix):
    split_matrix[:, index_list] = np.log1p(split_matrix[:, index_list])

In [138]:
# Sanity checks: largest value and number of NaNs in each split (train, validation, test)
all_matrices = (np_train_matrix, np_val_matrix, np_test_matrix)
print(*[np.max(matrix) for matrix in all_matrices])
print(*[np.isnan(matrix).sum() for matrix in all_matrices])

87.0 87.0 87.0
0 0 0


In [132]:
# Debug probe for column 50: its name in tr_test and the largest value of that column in
# np_test_matrix (assumes the matrix keeps tr_test's column order - TODO confirm).
# Both values are returned as one tuple because a cell only displays its last expression.
tr_test.columns[50], np_test_matrix[:, 50].max()

'mean_cnt_1'

In [139]:
# Refit our best previous model on the transformed data
cbr_log = cb.CatBoostRegressor(iterations=500, depth=8, random_seed=26)
cbr_log.fit(np_train_matrix, tr_target, verbose=100)

# Predict the test set and clip the predictions to [0, 20]
test_cbr_log = np.clip(cbr_log.predict(np_test_matrix), 0, 20)
print(test_cbr_log[:10])

# Reference run (roughly 3 minutes): learn metric 2.4830036 at iteration 0, 1.3900709 at 100,
# 1.2585904 at 200, 1.2143731 at 300, 1.1558271 at 400 and 1.1144856 at 499.
# First predictions:
# [ 0.53768322  0.11340991  0.88697125  0.24082134  0.78908626  0.42586568
#   1.137824    0.15323412  0.93935205  0.47286205]

0:	learn: 2.4830036	total: 391ms	remaining: 3m 14s
100:	learn: 1.3900709	total: 31s	remaining: 2m 2s
200:	learn: 1.2585904	total: 1m 1s	remaining: 1m 31s
300:	learn: 1.2143731	total: 1m 32s	remaining: 1m
400:	learn: 1.1558271	total: 2m 10s	remaining: 32.1s
499:	learn: 1.1144856	total: 2m 52s	remaining: 0us
[ 0.53768322  0.11340991  0.88697125  0.24082134  0.78908626  0.42586568
  1.137824    0.15323412  0.93935205  0.47286205]


In [142]:
# Predict the validation set and clip the predictions to [0, 20]
val_cbr_log = np.clip(cbr_log.predict(np_val_matrix), 0, 20)

# Evaluate model on validation set.
# scikit-learn metrics take (y_true, y_pred); the order does not change the MSE value, but
# keeping it correct avoids wrong results if an asymmetric metric is used here later.
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_log))
rmse
# Earlier run: 0.92821027298022363

0.92819604478203943

### It doesn't work: the predictions are practically the same as before (validation RMSE ≈ 0.9282 either way). Let's add std features to the data.

## Add std features

In [6]:
# New function to construct base dataframes for train, validation and test
def construct_base_datasets_with_std(months=NMONTHS):
    """
    Construct the month-by-month base dataframes for train, validation and test.

    Transactions are grouped by (month, shop, item) into four statistics: sum and std of
    item_cnt_day, mean and std of item_price. The statistics of the last `months` + 2 months are
    laid out as columns next to every (shop_id, item_id) pair of the test set. Missing values
    (pairs without sales in a month, undefined std) are replaced by 0.

    The three feature tables are windows of `months` consecutive months, each shifted one month
    further than the previous one:
      train      - first window, target is the second-to-last available month
      validation - second window, target is the last available month
      test       - last window, no target

    months - number of last months to construct base train/test matrix dataset

    Returns x_train, x_val, x_test, train_target, val_target. The x_* dataframes are indexed by
    (shop_id, item_id); the targets are the item_cnt_day_sum columns of the target months.
    """
    key_cols = ['shop_id', 'item_id']
    stat_cols = ['item_cnt_day_sum', 'item_cnt_day_std', 'item_price_mean', 'item_price_std']

    def month_columns(month):
        # Names of the four statistic columns that belong to one month
        return ['{}_{}'.format(col, month) for col in stat_cols]

    # Group transactions by month, shop and item (add stds and price mean here)
    agg_func = {'item_cnt_day': ['sum', 'std'], 'item_price': ['mean', 'std']}
    grp_trans = transactions.groupby(['date_block_num'] + key_cols, as_index=False).agg(agg_func)
    # Flatten the two-level column names produced by the aggregation
    grp_trans.columns = ['date_block_num'] + key_cols + stat_cols

    # Month numbers are derived from the data instead of being hard-coded
    max_month = int(grp_trans.date_block_num.max()) + 1
    first_month = max_month - months - 2   # first month of the train window
    train_month = max_month - 2            # target month of the train window
    val_month = max_month - 1              # target month of the validation window

    # Join every month's statistics to the (shop_id, item_id) pairs of the test set
    all_data = test[key_cols].set_index(key_cols)
    for month in range(first_month, max_month):
        month_df = (grp_trans.loc[grp_trans.date_block_num == month, key_cols + stat_cols]
                    .rename(columns=dict(zip(stat_cols, month_columns(month))))
                    .set_index(key_cols))
        all_data = all_data.join(month_df)

    # Replace NaNs by 0
    all_data = all_data.fillna(0)

    # Get train target and validation target column
    val_target = all_data['item_cnt_day_sum_' + str(val_month)]
    train_target = all_data['item_cnt_day_sum_' + str(train_month)]

    # Calculate x_train, x_val, x_test: each one drops the two months outside its window
    x_train = all_data.drop(month_columns(train_month) + month_columns(val_month), axis=1)
    x_val = all_data.drop(month_columns(first_month) + month_columns(val_month), axis=1)
    x_test = all_data.drop(month_columns(first_month) + month_columns(first_month + 1), axis=1)

    return x_train, x_val, x_test, train_target, val_target

In [35]:
%%time
tr_train, tr_val, tr_test, tr_target, val_target = construct_base_datasets_with_std()
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)

(214200, 96) (214200, 96) (214200, 96) (214200,) (214200,)
Wall time: 10.5 s


In [7]:
def _append_sliding_means(df, months):
    """
    Add sliding-mean columns to `df` in place and return it.

    For each of the four monthly statistics, the mean over the last i month columns is stored
    for every i in 1..months. New columns are appended in the order mean_cnt_sum_i,
    mean_cnt_std_i, mean_price_mean_i, mean_price_std_i for i = 1, 2, ...
    """
    # (prefix of the source month columns, prefix of the generated columns)
    feature_map = (('item_cnt_day_sum', 'mean_cnt_sum_'),
                   ('item_cnt_day_std', 'mean_cnt_std_'),
                   ('item_price_mean', 'mean_price_mean_'),
                   ('item_price_std', 'mean_price_std_'))

    # Snapshot the monthly values as numpy arrays before any new column is added
    monthly_values = [
        (target_prefix, np.array(df[[col for col in df if col.startswith(source_prefix)]]))
        for source_prefix, target_prefix in feature_map
    ]

    for i in range(1, months + 1):
        for target_prefix, values in monthly_values:
            # Mean of the last i columns; the slice covers all columns once i exceeds their number
            df[target_prefix + str(i)] = pd.Series(values[:, -i:].mean(axis=1), index=df.index)
    return df


def add_means_with_std(train, val, test, months=NMONTHS):
    """
    train - base train dataframe
    val - base validation dataframe (train, shifted by 1 month further)
    test - base test dataframe (val, shifted by 1 month further)
    months - largest window size; windows of 1..months last columns are used
    This function add sliding means by last n column values (last n months) for the
    item_cnt_day_sum, item_cnt_day_std, item_price_mean and item_price_std columns.
    The columns are added to the passed dataframes themselves, which are also returned
    as (train, val, test).
    """
    return tuple(_append_sliding_means(df, months) for df in (train, val, test))

In [36]:
%%time
tr_train, tr_val, tr_test = add_means_with_std(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)

(214200, 192) (214200, 192) (214200, 192)
Wall time: 1.97 s


In [8]:
def _expanding_group_means(df, key, prefix):
    """
    Expanding per-`key` mean over the month columns whose name starts with `prefix`.

    The month columns are summed within each `key` group, accumulated from the first month
    column to the last one and divided by the number of months accumulated so far.
    Returns a dataframe with `key` as a regular column followed by the month columns, which
    keep their original names.
    """
    month_cols = [col for col in df if col.startswith(prefix)]
    running_totals = df.groupby(key)[month_cols].sum().cumsum(axis=1)
    # Arithmetic progression 1, 2, 3, ... with months length. It is built from the column count
    # rather than from MultiIndex internals (`MultiIndex.labels` is gone from current pandas and
    # its values depend on how the column level happens to be ordered).
    months_seen = np.arange(1, len(month_cols) + 1, dtype=float)
    return (running_totals / months_seen).reset_index()


def _with_month_mean_encodings(df):
    """
    Return a copy of `df` with its index moved to columns and all mean encodings merged in.

    The merge order below is significant because it determines the final column names: the
    first (shop-level) merge of a column family collides with the raw month columns, so the raw
    columns get the '_x' suffix and the shop-level encodings the '_y' suffix, while the later
    item-level encodings keep the plain column names.
    """
    merge_plan = (('shop_id', 'item_cnt_day_sum'), ('shop_id', 'item_cnt_day_std'),
                  ('item_id', 'item_cnt_day_sum'), ('item_id', 'item_cnt_day_std'),
                  ('shop_id', 'item_price_mean'), ('shop_id', 'item_price_std'),
                  ('item_id', 'item_price_mean'), ('item_id', 'item_price_std'))

    # Work on a new frame so that the caller's dataframe is left untouched
    base = df.reset_index()
    extended = base
    for key, prefix in merge_plan:
        # Encodings are always computed from the raw columns of `base`, never from merged ones
        encodings = _expanding_group_means(base, key, prefix)
        extended = pd.merge(extended, encodings, how='left', on=key)
    return extended


def get_month_mean_encodings_with_std(train, val, test):
    """
    train - base train dataframe
    val - base validation dataframe (train, shifted by 1 month further)
    test - base test dataframe (val, shifted by 1 month further)
    Function calculate cumsum divided by cumcount for current shop and current item saled
    quantities (sum and std columns), also it makes the same transformations for current month
    price (mean and std columns). We aggregate info by months.

    Every dataframe is processed independently. The index of each dataframe is moved to regular
    columns first (shop_id and item_id are expected to come from it or to be present already).
    The passed dataframes are not modified.

    Returns train_ext, val_ext, test_ext: the input data followed by the shop-level and
    item-level encodings. Raw month columns carry the '_x' suffix, shop-level encodings the '_y'
    suffix and item-level encodings the plain month column name.
    """
    train_ext = _with_month_mean_encodings(train)
    val_ext = _with_month_mean_encodings(val)
    test_ext = _with_month_mean_encodings(test)
    return train_ext, val_ext, test_ext

In [37]:
%%time
tr_train, tr_val, tr_test = get_month_mean_encodings_with_std(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)

(214200, 386) (214200, 386) (214200, 386)
Wall time: 13.4 s


In [9]:
# Join all this data together: rebuild the train / validation / test feature
# frames from scratch, attach the categorical lookups and convert everything
# to numpy matrices for modeling.

# Construct basic dataframes
tr_train, tr_val, tr_test, tr_target, val_target = construct_base_datasets_with_std()
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)

# Add month means
tr_train, tr_val, tr_test = add_means_with_std(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)

# Add mean encodings
tr_train, tr_val, tr_test = get_month_mean_encodings_with_std(tr_train, tr_val, tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape)

# Attach the lookup tables one after another. Each entry is
# (lookup table, join key, text column that is dropped right after the join):
#   items           -> brings item_category_id
#   item_categories -> brings item_category_type
#   shops           -> brings city
# Shapes are printed after every merge and after every drop.
for lookup, key, helper_column in ((items, 'item_id', 'item_name'),
                                   (item_categories, 'item_category_id', 'item_category_name'),
                                   (shops, 'shop_id', 'shop_name')):
    tr_train, tr_val, tr_test = [
        pd.merge(frame, lookup, how='left', left_on=key, right_on=key)
        for frame in (tr_train, tr_val, tr_test)]
    print(tr_train.shape, tr_val.shape, tr_test.shape)

    tr_train, tr_val, tr_test = [
        frame.drop([helper_column], axis=1)
        for frame in (tr_train, tr_val, tr_test)]
    print(tr_train.shape, tr_val.shape, tr_test.shape)

# Transform Item Category Type and City to int categorical features.
# The codes come from ONE shared vocabulary (learned on the training frame), so
# a given string maps to the same integer in every split. Independent
# factorize() calls only agree when the values first appear in the same order
# in each frame. Values missing from the vocabulary get -1, which is the same
# sentinel factorize() uses for missing values.
for column in ('item_category_type', 'city'):
    vocabulary = pd.Index(tr_train[column].factorize()[1])
    tr_train[column] = vocabulary.get_indexer(tr_train[column])
    tr_val[column] = vocabulary.get_indexer(tr_val[column])
    tr_test[column] = vocabulary.get_indexer(tr_test[column])

# Drop duplicate shop_id_x and item_id_x columns if a merge produced them
# (errors='ignore' makes this a no-op when they are absent)
duplicate_columns = ['shop_id_x', 'item_id_x']
tr_train = tr_train.drop(duplicate_columns, axis=1, errors='ignore')
tr_val = tr_val.drop(duplicate_columns, axis=1, errors='ignore')
tr_test = tr_test.drop(duplicate_columns, axis=1, errors='ignore')
print(tr_train.shape, tr_val.shape, tr_test.shape)

# Prepare for modeling
np_train_matrix = np.array(tr_train)
np_val_matrix = np.array(tr_val)
np_test_matrix = np.array(tr_test)
print(np_train_matrix.shape, np_val_matrix.shape, np_test_matrix.shape)
# Targets clipped to the [0, 20] range
tr_target_clip = np.clip(np.array(tr_target), 0, 20)
val_target_clip = np.clip(np.array(val_target), 0, 20)

# Clean unnecessary data: the frames are no longer needed once the matrices exist
del tr_train, tr_val, tr_test
gc.collect()

(214200, 96) (214200, 96) (214200, 96) (214200,) (214200,)
(214200, 192) (214200, 192) (214200, 192)
(214200, 386) (214200, 386) (214200, 386)
(214200, 388) (214200, 388) (214200, 388)
(214200, 387) (214200, 387) (214200, 387)
(214200, 389) (214200, 389) (214200, 389)
(214200, 388) (214200, 388) (214200, 388)
(214200, 390) (214200, 390) (214200, 390)
(214200, 389) (214200, 389) (214200, 389)
(214200, 389) (214200, 389) (214200, 389)
(214200, 389) (214200, 389) (214200, 389)


7

In [44]:
# Start modeling: CatBoost regressor, 500 trees of depth 8, seed 26
cbr_1 = cb.CatBoostRegressor(iterations=500, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_1.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_1 = np.clip(cbr_1.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_1))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_1 = np.clip(cbr_1.predict(np_test_matrix), 0, 20)

print(test_cbr_1[:10])

# The values below differ from the output recorded under this cell; presumably
# they were pasted from an earlier run and are kept only for reference.
# RMSE: 0.92821027298022363
# 0:	learn: 2.4830036	total: 519ms	remaining: 4m 19s
# 100:	learn: 1.3900709	total: 35.8s	remaining: 2m 21s
# 200:	learn: 1.2585904	total: 1m 7s	remaining: 1m 40s
# 300:	learn: 1.2143731	total: 1m 40s	remaining: 1m 6s
# 400:	learn: 1.1558271	total: 2m 11s	remaining: 32.4s
# 499:	learn: 1.1144856	total: 2m 41s	remaining: 0us
# [ 0.53768322  0.11340991  0.88697125  0.24082134  0.78908626  0.42586568
#   1.137824    0.15323412  0.93935205  0.47286205]

0:	learn: 2.4811675	total: 594ms	remaining: 4m 56s
100:	learn: 1.3866946	total: 1m 32s	remaining: 6m 4s
200:	learn: 1.2402473	total: 3m 40s	remaining: 5m 28s
300:	learn: 1.1875005	total: 5m 29s	remaining: 3m 38s
400:	learn: 1.1548856	total: 6m 54s	remaining: 1m 42s
499:	learn: 1.1065775	total: 8m 27s	remaining: 0us
0.936815271257
[ 0.65188365  0.12369213  0.96031408  0.21938315  0.85269272  0.51262843
  1.03439953  0.13105007  0.8321506   0.23543497]


In [45]:
# Submission table: test IDs next to the clipped test predictions of cbr_1
df_test_pred_1 = test[['ID']].assign(item_cnt_month=test_cbr_1)
# Save without the index column
df_test_pred_1.to_csv('../Data/Submissions/6_1_1_catboost_500_trees_with_std.csv', index=False)

### Public LB = 0.98043. Let's try other parameters to get a better (lower) score

### Let's try 300 trees model

In [11]:
# Start modeling: CatBoost regressor, 300 trees of depth 8, seed 26
cbr_2 = cb.CatBoostRegressor(iterations=300, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_2.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_2 = np.clip(cbr_2.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_2 = np.clip(cbr_2.predict(np_test_matrix), 0, 20)

print(test_cbr_2[:10])

0:	learn: 2.4811675	total: 589ms	remaining: 2m 56s
100:	learn: 1.3866946	total: 1m 2s	remaining: 2m 2s
200:	learn: 1.2402473	total: 2m 22s	remaining: 1m 10s
299:	learn: 1.1875902	total: 3m 40s	remaining: 0us
0.923104927428
[ 0.66713479  0.12859667  0.97032946  0.21985688  0.82738231  0.53527883
  1.07556569  0.13085928  0.79946759  0.1055324 ]


In [12]:
# Submission table: test IDs next to the clipped test predictions of cbr_2
df_test_pred_2 = test[['ID']].assign(item_cnt_month=test_cbr_2)
# Save without the index column
df_test_pred_2.to_csv('../Data/Submissions/6_1_2_catboost_300_trees_with_std.csv', index=False)

### 0.99804, not so good. Let's increase the number of trees to 400

In [13]:
# Start modeling: CatBoost regressor, 400 trees of depth 8, seed 26
cbr_3 = cb.CatBoostRegressor(iterations=400, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_3.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_3 = np.clip(cbr_3.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_3))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_3 = np.clip(cbr_3.predict(np_test_matrix), 0, 20)

print(test_cbr_3[:10])

0:	learn: 2.4811675	total: 556ms	remaining: 3m 41s
100:	learn: 1.3866946	total: 57.5s	remaining: 2m 50s
200:	learn: 1.2402473	total: 1m 57s	remaining: 1m 56s
300:	learn: 1.1875005	total: 3m 15s	remaining: 1m 4s
399:	learn: 1.1549750	total: 4m 34s	remaining: 0us
0.928266289222
[ 0.66219706  0.12594924  0.97110104  0.21793089  0.86120795  0.52200437
  1.07835636  0.12855339  0.82240608  0.18040219]


In [14]:
# Submission table: test IDs next to the clipped test predictions of cbr_3
df_test_pred_3 = test[['ID']].assign(item_cnt_month=test_cbr_3)
# Save without the index column
df_test_pred_3.to_csv('../Data/Submissions/6_1_3_catboost_400_trees_with_std.csv', index=False)

### 0.99002. What about 600 trees?

In [15]:
# Start modeling: CatBoost regressor, 600 trees of depth 8, seed 26
cbr_4 = cb.CatBoostRegressor(iterations=600, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_4.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_4 = np.clip(cbr_4.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_4))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_4 = np.clip(cbr_4.predict(np_test_matrix), 0, 20)

print(test_cbr_4[:10])

0:	learn: 2.4811675	total: 551ms	remaining: 5m 29s
100:	learn: 1.3866946	total: 56.6s	remaining: 4m 39s
200:	learn: 1.2402473	total: 2m 14s	remaining: 4m 26s
300:	learn: 1.1875005	total: 3m 36s	remaining: 3m 35s
400:	learn: 1.1548856	total: 4m 55s	remaining: 2m 26s
500:	learn: 1.1065067	total: 6m 24s	remaining: 1m 16s
599:	learn: 1.0798208	total: 7m 38s	remaining: 0us
0.943756857271
[ 0.68043912  0.1211013   0.9667588   0.20697038  0.83116084  0.4886558
  1.10632647  0.13563199  0.87667531  0.32718629]


In [16]:
# Submission table: test IDs next to the clipped test predictions of cbr_4
df_test_pred_4 = test[['ID']].assign(item_cnt_month=test_cbr_4)
# Save without the index column
df_test_pred_4.to_csv('../Data/Submissions/6_1_4_catboost_600_trees_with_std.csv', index=False)

### 0.97549, bingo! Your submission scored 0.97549, which is an improvement of your previous score of 0.97944. Great job!

### What about 700 trees?

In [17]:
# Start modeling: CatBoost regressor, 700 trees of depth 8, seed 26
cbr_5 = cb.CatBoostRegressor(iterations=700, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_5.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_5 = np.clip(cbr_5.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_5))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_5 = np.clip(cbr_5.predict(np_test_matrix), 0, 20)

print(test_cbr_5[:10])

0:	learn: 2.4811675	total: 552ms	remaining: 6m 25s
100:	learn: 1.3866946	total: 56.8s	remaining: 5m 36s
200:	learn: 1.2402473	total: 1m 58s	remaining: 4m 53s
300:	learn: 1.1875005	total: 3m 13s	remaining: 4m 16s
400:	learn: 1.1548856	total: 4m 37s	remaining: 3m 26s
500:	learn: 1.1065067	total: 5m 59s	remaining: 2m 22s
600:	learn: 1.0795650	total: 7m 31s	remaining: 1m 14s
699:	learn: 1.0513323	total: 9m 13s	remaining: 0us
0.946389557467
[ 0.70331324  0.11855691  0.97203454  0.19484174  0.79797864  0.47567052
  1.108428    0.13809008  0.8985481   0.32084839]


In [18]:
# Submission table: test IDs next to the clipped test predictions of cbr_5
df_test_pred_5 = test[['ID']].assign(item_cnt_month=test_cbr_5)
# Save without the index column
df_test_pred_5.to_csv('../Data/Submissions/6_1_5_catboost_700_trees_with_std.csv', index=False)

### Wow, it still works! Your submission scored 0.97295, which is an improvement of your previous score of 0.97549. Great job!

### Let's try 800 trees with depth 8

In [19]:
# Start modeling: CatBoost regressor, 800 trees of depth 8, seed 26
cbr_6 = cb.CatBoostRegressor(iterations=800, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_6.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_6 = np.clip(cbr_6.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_6))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_6 = np.clip(cbr_6.predict(np_test_matrix), 0, 20)

print(test_cbr_6[:10])

0:	learn: 2.4811675	total: 559ms	remaining: 7m 26s
100:	learn: 1.3866946	total: 1m 11s	remaining: 8m 16s
200:	learn: 1.2402473	total: 2m 41s	remaining: 8m 2s
300:	learn: 1.1875005	total: 4m 32s	remaining: 7m 31s
400:	learn: 1.1548856	total: 6m 16s	remaining: 6m 14s
500:	learn: 1.1065067	total: 8m 6s	remaining: 4m 50s
600:	learn: 1.0795650	total: 10m 1s	remaining: 3m 19s
700:	learn: 1.0512352	total: 11m 57s	remaining: 1m 41s
799:	learn: 1.0282497	total: 13m 59s	remaining: 0us
0.950335552221
[ 0.7187036   0.11147701  0.98165522  0.18469531  0.80615558  0.46833731
  1.12303327  0.14041333  0.91575907  0.33699294]


In [20]:
# Submission table: test IDs next to the clipped test predictions of cbr_6
df_test_pred_6 = test[['ID']].assign(item_cnt_month=test_cbr_6)
# Save without the index column
df_test_pred_6.to_csv('../Data/Submissions/6_1_6_catboost_800_trees_with_std.csv', index=False)

### Your submission scored 0.97118, which is an improvement of your previous score of 0.97295. Great job!

### Let's increase number of trees to 900

In [21]:
# Start modeling: CatBoost regressor, 900 trees of depth 8, seed 26
cbr_7 = cb.CatBoostRegressor(iterations=900, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_7.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_7 = np.clip(cbr_7.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_7))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_7 = np.clip(cbr_7.predict(np_test_matrix), 0, 20)

print(test_cbr_7[:10])

0:	learn: 2.4811675	total: 570ms	remaining: 8m 32s
100:	learn: 1.3866946	total: 1m 19s	remaining: 10m 30s
200:	learn: 1.2402473	total: 2m 59s	remaining: 10m 22s
300:	learn: 1.1875005	total: 4m 33s	remaining: 9m 3s
400:	learn: 1.1548856	total: 6m 3s	remaining: 7m 32s
500:	learn: 1.1065067	total: 7m 30s	remaining: 5m 58s
600:	learn: 1.0795650	total: 9m 4s	remaining: 4m 31s
700:	learn: 1.0512352	total: 10m 36s	remaining: 3m
800:	learn: 1.0281156	total: 12m 8s	remaining: 1m 30s
899:	learn: 1.0080754	total: 13m 40s	remaining: 0us
0.953205177444
[ 0.75076969  0.11070633  1.00159364  0.19839734  0.7928251   0.46822974
  1.14535782  0.14156475  0.93061459  0.34352648]


In [22]:
# Submission table: test IDs next to the clipped test predictions of cbr_7
df_test_pred_7 = test[['ID']].assign(item_cnt_month=test_cbr_7)
# Save without the index column
df_test_pred_7.to_csv('../Data/Submissions/6_1_7_catboost_900_trees_with_std.csv', index=False)

### Great, it works very well. Your submission scored 0.96992, which is an improvement of your previous score of 0.97118. Great job!

### Now we try 1000 trees

In [10]:
# Start modeling: CatBoost regressor, 1000 trees of depth 8, seed 26
cbr_8 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=26)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_8.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_8 = np.clip(cbr_8.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_8))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_8 = np.clip(cbr_8.predict(np_test_matrix), 0, 20)

print(test_cbr_8[:10])

0:	learn: 2.4811675	total: 633ms	remaining: 10m 32s
100:	learn: 1.3866946	total: 58.1s	remaining: 8m 37s
200:	learn: 1.2402473	total: 2m 10s	remaining: 8m 40s
300:	learn: 1.1875005	total: 3m 29s	remaining: 8m 6s
400:	learn: 1.1548856	total: 4m 57s	remaining: 7m 23s
500:	learn: 1.1065067	total: 6m 25s	remaining: 6m 23s
600:	learn: 1.0795650	total: 8m 11s	remaining: 5m 26s
700:	learn: 1.0512352	total: 9m 53s	remaining: 4m 12s
800:	learn: 1.0281156	total: 11m 39s	remaining: 2m 53s
900:	learn: 1.0080474	total: 13m 3s	remaining: 1m 26s
999:	learn: 0.9817443	total: 14m 30s	remaining: 0us
0.957047371383
[ 0.75401679  0.10525834  1.01743752  0.19334837  0.75145986  0.46197127
  1.1445893   0.14182811  0.94045474  0.34839846]


In [11]:
# Submission table: test IDs next to the clipped test predictions of cbr_8
df_test_pred_8 = test[['ID']].assign(item_cnt_month=test_cbr_8)
# Save without the index column
df_test_pred_8.to_csv('../Data/Submissions/6_1_8_catboost_1000_trees_with_std.csv', index=False)

### 0.97055 - not the best score. Let's change random seed

In [13]:
# Start modeling: CatBoost regressor, 900 trees of depth 8, seed 4
cbr_9 = cb.CatBoostRegressor(iterations=900, depth=8, random_seed=4)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_9.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_9 = np.clip(cbr_9.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_9))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_9 = np.clip(cbr_9.predict(np_test_matrix), 0, 20)

print(test_cbr_9[:10])

0:	learn: 2.4847328	total: 555ms	remaining: 8m 18s
100:	learn: 1.3856467	total: 1m 11s	remaining: 9m 25s
200:	learn: 1.2445882	total: 2m 42s	remaining: 9m 24s
300:	learn: 1.1742307	total: 4m 9s	remaining: 8m 15s
400:	learn: 1.1397619	total: 5m 33s	remaining: 6m 55s
500:	learn: 1.1060752	total: 6m 53s	remaining: 5m 29s
600:	learn: 1.0798327	total: 8m 15s	remaining: 4m 6s
700:	learn: 1.0559542	total: 9m 52s	remaining: 2m 48s
800:	learn: 1.0269983	total: 11m 31s	remaining: 1m 25s
899:	learn: 1.0017091	total: 13m 34s	remaining: 0us
0.945586213119
[ 0.72965716  0.11246311  0.94176006  0.19296657  0.64430349  0.45657939
  1.06922135  0.14091511  0.88963909  0.26886789]


In [15]:
# Submission table: test IDs next to the clipped test predictions of cbr_9
df_test_pred_9 = test[['ID']].assign(item_cnt_month=test_cbr_9)
# Save without the index column
df_test_pred_9.to_csv('../Data/Submissions/6_1_9_catboost_900_trees_with_std.csv', index=False)

### 0.96500, wow! Your submission scored 0.96500, which is an improvement of your previous score of 0.96992. Great job! 15 places up on the leaderboard

### Let us try 1000 trees with this random seed

In [16]:
# Start modeling: CatBoost regressor, 1000 trees of depth 8, seed 4
cbr_10 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=4)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_10.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_10 = np.clip(cbr_10.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_10))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_10 = np.clip(cbr_10.predict(np_test_matrix), 0, 20)

print(test_cbr_10[:10])

0:	learn: 2.4847328	total: 576ms	remaining: 9m 35s
100:	learn: 1.3856467	total: 1m 1s	remaining: 9m 3s
200:	learn: 1.2445882	total: 2m 22s	remaining: 9m 25s
300:	learn: 1.1742307	total: 3m 44s	remaining: 8m 42s
400:	learn: 1.1397619	total: 5m 5s	remaining: 7m 36s
500:	learn: 1.1060752	total: 6m 23s	remaining: 6m 22s
600:	learn: 1.0798327	total: 7m 41s	remaining: 5m 6s
700:	learn: 1.0559542	total: 9m 4s	remaining: 3m 52s
800:	learn: 1.0269983	total: 10m 21s	remaining: 2m 34s
900:	learn: 1.0007516	total: 11m 51s	remaining: 1m 18s
999:	learn: 0.9827573	total: 13m 23s	remaining: 0us
0.948047033706
[ 0.73844426  0.11441228  0.94531768  0.21117039  0.61258524  0.45203371
  1.12077449  0.14240582  0.93278232  0.27371671]


In [17]:
# Submission table: test IDs next to the clipped test predictions of cbr_10
df_test_pred_10 = test[['ID']].assign(item_cnt_month=test_cbr_10)
# Save without the index column
df_test_pred_10.to_csv('../Data/Submissions/6_1_10_catboost_1000_trees_with_std.csv', index=False)

### Public LB score = 0.96449, slightly better

### ... And 1100 trees

In [18]:
# Start modeling: CatBoost regressor, 1100 trees of depth 8, seed 4
cbr_11 = cb.CatBoostRegressor(iterations=1100, depth=8, random_seed=4)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_11.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_11 = np.clip(cbr_11.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_11))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_11 = np.clip(cbr_11.predict(np_test_matrix), 0, 20)

print(test_cbr_11[:10])

0:	learn: 2.4847328	total: 597ms	remaining: 10m 55s
100:	learn: 1.3856467	total: 1m 2s	remaining: 10m 21s
200:	learn: 1.2445882	total: 2m 24s	remaining: 10m 44s
300:	learn: 1.1742307	total: 3m 42s	remaining: 9m 50s
400:	learn: 1.1397619	total: 5m 1s	remaining: 8m 45s
500:	learn: 1.1060752	total: 6m 14s	remaining: 7m 27s
600:	learn: 1.0798327	total: 7m 31s	remaining: 6m 14s
700:	learn: 1.0559542	total: 8m 51s	remaining: 5m 2s
800:	learn: 1.0269983	total: 10m 18s	remaining: 3m 51s
900:	learn: 1.0007516	total: 11m 51s	remaining: 2m 37s
1000:	learn: 0.9820265	total: 13m 14s	remaining: 1m 18s
1099:	learn: 0.9554568	total: 14m 45s	remaining: 0us
0.951226225687
[ 0.7275049   0.10843564  0.94671598  0.21936311  0.58171137  0.45407823
  1.13758443  0.14389279  0.94910729  0.27597356]


In [19]:
# Submission table: test IDs next to the clipped test predictions of cbr_11
df_test_pred_11 = test[['ID']].assign(item_cnt_month=test_cbr_11)
# Save without the index column
df_test_pred_11.to_csv('../Data/Submissions/6_1_11_catboost_1100_trees_with_std.csv', index=False)

### 0.96434 - A bit better

### Maybe 1200 trees will give a slightly better result...

In [10]:
# Start modeling: CatBoost regressor, 1200 trees of depth 8, seed 4
cbr_12 = cb.CatBoostRegressor(iterations=1200, depth=8, random_seed=4)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_12.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_12 = np.clip(cbr_12.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_12))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_12 = np.clip(cbr_12.predict(np_test_matrix), 0, 20)

print(test_cbr_12[:10])

0:	learn: 2.4847328	total: 824ms	remaining: 16m 27s
100:	learn: 1.3856467	total: 1m 2s	remaining: 11m 15s
200:	learn: 1.2445882	total: 2m 37s	remaining: 13m 1s
300:	learn: 1.1742307	total: 3m 50s	remaining: 11m 27s
400:	learn: 1.1397619	total: 5m 15s	remaining: 10m 28s
500:	learn: 1.1060752	total: 6m 47s	remaining: 9m 28s
600:	learn: 1.0798327	total: 8m 11s	remaining: 8m 10s
700:	learn: 1.0559542	total: 9m 32s	remaining: 6m 47s
800:	learn: 1.0269983	total: 10m 51s	remaining: 5m 24s
900:	learn: 1.0007516	total: 12m 17s	remaining: 4m 4s
1000:	learn: 0.9820265	total: 13m 35s	remaining: 2m 42s
1100:	learn: 0.9553976	total: 15m 5s	remaining: 1m 21s
1199:	learn: 0.9388954	total: 16m 38s	remaining: 0us
0.952740843671
[ 0.73219795  0.10475316  0.93288725  0.22332731  0.56471019  0.44947479
  1.12917542  0.14257811  0.99051767  0.27549427]


In [11]:
# Submission table: test IDs next to the clipped test predictions of cbr_12
df_test_pred_12 = test[['ID']].assign(item_cnt_month=test_cbr_12)
# Save without the index column
df_test_pred_12.to_csv('../Data/Submissions/6_1_12_catboost_1200_trees_with_std.csv', index=False)

### Public LB Score = 0.96456. The model starts overfitting

### Let's try a third random seed

In [10]:
# Start modeling: CatBoost regressor, 700 trees of depth 8, seed 1989
cbr_13 = cb.CatBoostRegressor(iterations=700, depth=8, random_seed=1989)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_13.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_13 = np.clip(cbr_13.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_13))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_13 = np.clip(cbr_13.predict(np_test_matrix), 0, 20)

print(test_cbr_13[:10])

0:	learn: 2.4793616	total: 662ms	remaining: 7m 42s
100:	learn: 1.3780073	total: 57.7s	remaining: 5m 41s
200:	learn: 1.2378002	total: 2m 3s	remaining: 5m 6s
300:	learn: 1.1965830	total: 3m 24s	remaining: 4m 31s
400:	learn: 1.1521588	total: 5m	remaining: 3m 43s
500:	learn: 1.1121830	total: 6m 39s	remaining: 2m 38s
600:	learn: 1.0805617	total: 8m 14s	remaining: 1m 21s
699:	learn: 1.0542672	total: 9m 59s	remaining: 0us
0.940511039791
[ 0.75833833  0.08680165  0.98400483  0.22867623  0.64178663  0.53623793
  1.0372205   0.12677932  0.92658929  0.23552745]


In [11]:
# Submission table: test IDs next to the clipped test predictions of cbr_13
df_test_pred_13 = test[['ID']].assign(item_cnt_month=test_cbr_13)
# Save without the index column
df_test_pred_13.to_csv('../Data/Submissions/6_1_13_catboost_700_trees_with_std_third_seed.csv', index=False)

#### Public LB Score = 0.97767

#### And 800 trees

In [12]:
# Start modeling: CatBoost regressor, 800 trees of depth 8, seed 1989
cbr_14 = cb.CatBoostRegressor(iterations=800, depth=8, random_seed=1989)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_14.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_14 = np.clip(cbr_14.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_14))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_14 = np.clip(cbr_14.predict(np_test_matrix), 0, 20)

print(test_cbr_14[:10])

0:	learn: 2.4793616	total: 643ms	remaining: 8m 34s
100:	learn: 1.3780073	total: 1m 56s	remaining: 13m 28s
200:	learn: 1.2378002	total: 3m 49s	remaining: 11m 22s
300:	learn: 1.1965830	total: 5m 46s	remaining: 9m 34s
400:	learn: 1.1521588	total: 7m 46s	remaining: 7m 44s
500:	learn: 1.1121830	total: 9m 46s	remaining: 5m 49s
600:	learn: 1.0805617	total: 12m 3s	remaining: 3m 59s
700:	learn: 1.0526529	total: 14m 6s	remaining: 1m 59s
799:	learn: 1.0197211	total: 16m 2s	remaining: 0us
0.945577866457
[ 0.75711805  0.08185568  0.99344825  0.24454795  0.57706606  0.52544166
  1.05833928  0.12648436  0.95931206  0.22043181]


In [13]:
# Submission table: test IDs next to the clipped test predictions of cbr_14
df_test_pred_14 = test[['ID']].assign(item_cnt_month=test_cbr_14)
# Save without the index column
df_test_pred_14.to_csv('../Data/Submissions/6_1_14_catboost_800_trees_with_std_third_seed.csv', index=False)

#### 0.97229 - better

#### And 1000 trees

In [14]:
# Start modeling: CatBoost regressor, 1000 trees of depth 8, seed 1989
cbr_15 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=1989)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_15.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_15 = np.clip(cbr_15.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_15))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_15 = np.clip(cbr_15.predict(np_test_matrix), 0, 20)

print(test_cbr_15[:10])

0:	learn: 2.4793616	total: 873ms	remaining: 14m 32s
100:	learn: 1.3780073	total: 1m 37s	remaining: 14m 24s
200:	learn: 1.2378002	total: 3m 21s	remaining: 13m 21s
300:	learn: 1.1965830	total: 5m 6s	remaining: 11m 51s
400:	learn: 1.1521588	total: 6m 52s	remaining: 10m 15s
500:	learn: 1.1121830	total: 8m 32s	remaining: 8m 30s
600:	learn: 1.0805617	total: 9m 54s	remaining: 6m 34s
700:	learn: 1.0526529	total: 11m 23s	remaining: 4m 51s
800:	learn: 1.0185000	total: 12m 44s	remaining: 3m 10s
900:	learn: 0.9952696	total: 14m 8s	remaining: 1m 33s
999:	learn: 0.9766909	total: 15m 38s	remaining: 0us
0.95352983222
[ 0.78120532  0.08373945  1.0190648   0.2453445   0.49803631  0.48930308
  1.0529561   0.12979831  0.98102868  0.2069991 ]


In [15]:
# Submission table: test IDs next to the clipped test predictions of cbr_15
df_test_pred_15 = test[['ID']].assign(item_cnt_month=test_cbr_15)
# Save without the index column
df_test_pred_15.to_csv('../Data/Submissions/6_1_15_catboost_1000_trees_with_std_third_seed.csv', index=False)

#### 0.97171 - slightly better

#### ...And 1100 trees

In [16]:
# Start modeling: CatBoost regressor, 1100 trees of depth 8, seed 1989
cbr_16 = cb.CatBoostRegressor(iterations=1100, depth=8, random_seed=1989)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_16.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_16 = np.clip(cbr_16.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_16))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_16 = np.clip(cbr_16.predict(np_test_matrix), 0, 20)

print(test_cbr_16[:10])

0:	learn: 2.4793616	total: 572ms	remaining: 10m 28s
100:	learn: 1.3780073	total: 1m 19s	remaining: 13m 4s
200:	learn: 1.2378002	total: 2m 39s	remaining: 11m 54s
300:	learn: 1.1965830	total: 4m 2s	remaining: 10m 42s
400:	learn: 1.1521588	total: 5m 20s	remaining: 9m 19s
500:	learn: 1.1121830	total: 6m 44s	remaining: 8m 4s
600:	learn: 1.0805617	total: 8m 20s	remaining: 6m 55s
700:	learn: 1.0526529	total: 9m 48s	remaining: 5m 34s
800:	learn: 1.0185000	total: 11m 18s	remaining: 4m 13s
900:	learn: 0.9952696	total: 12m 57s	remaining: 2m 51s
1000:	learn: 0.9764891	total: 14m 30s	remaining: 1m 26s
1099:	learn: 0.9560757	total: 16m 7s	remaining: 0us
0.956596679886
[ 0.79306333  0.08226717  1.02475949  0.24356052  0.49364944  0.48665406
  1.18883754  0.12863289  0.9865919   0.38547622]


In [17]:
# Submission table: test IDs next to the clipped test predictions of cbr_16
df_test_pred_16 = test[['ID']].assign(item_cnt_month=test_cbr_16)
# Save without the index column
df_test_pred_16.to_csv('../Data/Submissions/6_1_16_catboost_1100_trees_with_std_third_seed.csv', index=False)

#### 0.97149 - a little bit better. Let's try a fourth seed

#### 1000 trees with the fourth seed (random_seed=1)

In [19]:
# Start modeling: CatBoost regressor, 1000 trees of depth 8, seed 1
cbr_17 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=1)

# Train model (progress is logged every 100 iterations)
# NOTE(review): the fit uses tr_target rather than tr_target_clip, while the
# validation score below is computed on val_target_clip - confirm this is intended.
cbr_17.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip to the [0, 20] range
val_cbr_17 = np.clip(cbr_17.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (scikit-learn expects y_true first, then y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_17))
print(rmse)

# Calculate predictions on test set, clipped to the same range
test_cbr_17 = np.clip(cbr_17.predict(np_test_matrix), 0, 20)

print(test_cbr_17[:10])

0:	learn: 2.4847175	total: 899ms	remaining: 14m 57s
100:	learn: 1.3804346	total: 1m 47s	remaining: 15m 59s
200:	learn: 1.2427810	total: 3m 9s	remaining: 12m 34s
300:	learn: 1.1836846	total: 4m 32s	remaining: 10m 33s
400:	learn: 1.1452198	total: 5m 55s	remaining: 8m 51s
500:	learn: 1.1054746	total: 7m 20s	remaining: 7m 18s
600:	learn: 1.0694536	total: 8m 49s	remaining: 5m 51s
700:	learn: 1.0449741	total: 10m 7s	remaining: 4m 19s
800:	learn: 1.0179989	total: 11m 31s	remaining: 2m 51s
900:	learn: 0.9967356	total: 12m 54s	remaining: 1m 25s
999:	learn: 0.9760779	total: 14m 12s	remaining: 0us
0.954106304523
[ 0.67325938  0.093028    1.05023568  0.24579249  0.55845966  0.53982457
  1.14190466  0.13477733  0.9249142   0.27903585]


In [20]:
# Submission table: test IDs next to the clipped test predictions of cbr_17
df_test_pred_17 = test[['ID']].assign(item_cnt_month=test_cbr_17)
# Save without the index column
df_test_pred_17.to_csv('../Data/Submissions/6_1_17_catboost_1000_trees_with_std_fourth_seed.csv', index=False)

#### 0.9717, not good

## Let us add the mean of `item_cnt_day`

In [6]:
# New function to construct base dataframes for train, validation and test
def construct_base_datasets_add_std_and_mean(months=NMONTHS, verbose=True):
    """
    Construct month-wide sales features for train, validation and test.

    Daily transactions are grouped by (month, shop, item) into the sum, std and mean of
    item_cnt_day and the mean and std of item_price. The last `months` + 2 months are then
    joined side by side, one group of five columns per month, onto the (shop_id, item_id)
    pairs of the test set. Every missing value (pair not sold in a month, undefined statistic)
    is replaced by 0.

    months - number of feature months in each returned matrix
    verbose - if True, function prints logs of its operations

    Returns x_train, x_val, x_test, train_target, val_target, all indexed by (shop_id, item_id).
    With `last` being the most recent month found in transactions:
      x_train - the `months` months ending at last - 2, train_target - item_cnt_day_sum of month last - 1
      x_val   - the `months` months ending at last - 1, val_target   - item_cnt_day_sum of month last
      x_test  - the `months` months ending at last
    """
    key_cols = ['shop_id', 'item_id']
    stat_cols = ['item_cnt_day_sum', 'item_cnt_day_std', 'item_cnt_day_mean',
                 'item_price_mean', 'item_price_std']
    log_prefix = '. construct_base_datasets_add_std_and_mean: '

    def month_cols(month):
        # Names of the five statistic columns of one month, e.g. 'item_cnt_day_sum_32'
        return [stat + '_' + str(month) for stat in stat_cols]

    # Group transactions by month, shop and item (add stds and price mean here)
    if verbose:
        print(str(datetime.datetime.now()) + ': start aggregating')

    agg_func = {'item_cnt_day': ['sum', 'std', 'mean'],
                'item_price': ['mean', 'std']}
    grp_trans = transactions.groupby(['date_block_num'] + key_cols, as_index=False).agg(agg_func)
    # Flatten the two-level column index; the order of stat_cols follows agg_func
    grp_trans.columns = ['date_block_num'] + key_cols + stat_cols

    # Month range used below: `months` feature months plus two extra months, so that the
    # train, validation and test windows can each be shifted by one month
    max_month = grp_trans.date_block_num.max() + 1
    first_month = max_month - months - 2
    last_month = max_month - 1
    prev_month = max_month - 2
    used_months = range(first_month, max_month)

    # Split dataset by months
    if verbose:
        print(str(datetime.datetime.now()) + log_prefix + 'start splitting by months')

    month_frames = {}
    for month in used_months:
        renames = dict(zip(stat_cols, month_cols(month)))
        # rename() returns a new frame, so the filtered slice is never modified in place
        month_frames[month] = grp_trans[grp_trans.date_block_num == month].rename(columns=renames)

    # Join data with test dataset
    if verbose:
        print(str(datetime.datetime.now()) + log_prefix + 'start joining with test data')

    all_data = test[key_cols].set_index(key_cols)
    for month in used_months:
        month_df = month_frames[month][key_cols + month_cols(month)]
        all_data = all_data.join(month_df.set_index(key_cols))

    # Delete unused dataframes
    del grp_trans, month_frames
    gc.collect()

    # Replace NaNs by 0
    all_data = all_data.fillna(0)

    # Get train target and validation target column
    if verbose:
        print(str(datetime.datetime.now()) + log_prefix + 'start calculating x_train, ' +
              'x_val, x_test, train_target, val_target')

    val_target = all_data['item_cnt_day_sum_' + str(last_month)]
    train_target = all_data['item_cnt_day_sum_' + str(prev_month)]

    # Calculate x_train, x_val, x_test: each one drops two months, which leaves `months`
    # consecutive months of features (drop() returns a new frame, all_data stays intact)
    x_train = all_data.drop(month_cols(prev_month) + month_cols(last_month), axis=1)
    x_val = all_data.drop(month_cols(first_month) + month_cols(last_month), axis=1)
    x_test = all_data.drop(month_cols(first_month) + month_cols(first_month + 1), axis=1)

    return x_train, x_val, x_test, train_target, val_target

In [16]:
%%time
tr_train, tr_val, tr_test, tr_target, val_target = construct_base_datasets_add_std_and_mean()
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)

2018-09-25 22:34:43.184345: start aggregating
2018-09-25 22:34:44.965806. construct_base_datasets_add_std_and_perc: start splitting by months
2018-09-25 22:34:51.077078. construct_base_datasets_add_std_and_perc: start joining with test data
2018-09-25 22:34:53.768383. construct_base_datasets_add_std_and_perc: start calculating x_train, x_val, x_test, train_target, val_target
(214200, 120) (214200, 120) (214200, 120) (214200,) (214200,)
Wall time: 11.2 s


In [9]:
# Preview the first three rows of the train feature frame
tr_train.head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,item_cnt_day_sum_8,item_cnt_day_std_8,item_cnt_day_mean_8,item_price_mean_8,item_price_std_8,item_cnt_day_sum_9,item_cnt_day_std_9,item_cnt_day_mean_9,item_price_mean_9,item_price_std_9,item_cnt_day_sum_10,item_cnt_day_std_10,item_cnt_day_mean_10,item_price_mean_10,item_price_std_10,item_cnt_day_sum_11,item_cnt_day_std_11,item_cnt_day_mean_11,item_price_mean_11,item_price_std_11,item_cnt_day_sum_12,item_cnt_day_std_12,item_cnt_day_mean_12,item_price_mean_12,item_price_std_12,item_cnt_day_sum_13,item_cnt_day_std_13,item_cnt_day_mean_13,item_price_mean_13,item_price_std_13,item_cnt_day_sum_14,item_cnt_day_std_14,item_cnt_day_mean_14,item_price_mean_14,item_price_std_14,item_cnt_day_sum_15,item_cnt_day_std_15,item_cnt_day_mean_15,item_price_mean_15,item_price_std_15,item_cnt_day_sum_16,item_cnt_day_std_16,item_cnt_day_mean_16,item_price_mean_16,item_price_std_16,item_cnt_day_sum_17,item_cnt_day_std_17,item_cnt_day_mean_17,item_price_mean_17,item_price_std_17,item_cnt_day_sum_18,item_cnt_day_std_18,item_cnt_day_mean_18,item_price_mean_18,item_price_std_18,item_cnt_day_sum_19,item_cnt_day_std_19,item_cnt_day_mean_19,item_price_mean_19,item_price_std_19,item_cnt_day_sum_20,item_cnt_day_std_20,item_cnt_day_mean_20,item_price_mean_20,item_price_std_20,item_cnt_day_sum_21,item_cnt_day_std_21,item_cnt_day_mean_21,item_price_mean_21,item_price_std_21,item_cnt_day_sum_22,item_cnt_day_std_22,item_cnt_day_mean_22,item_price_mean_22,item_price_std_22,item_cnt_day_sum_23,item_cnt_day_std_23,item_cnt_day_mean_23,item_price_mean_23,item_price_std_23,item_cnt_day_sum_24,item_cnt_day_std_24,item_cnt_day_mean_24,item_price_mean_24,item_price_std_24,item_cnt_day_sum_25,item_cnt_day_std_25,item_cnt_day_mean_25,item_price_mean_25,item_price_std_25,item_cnt_day_sum_26,item_cnt_day_std_26,item_cnt_day_mean_26,item_price_mean_26,item_price_std_26,item_cnt_day_sum_27,item_cnt_day_std_27,item_cnt_day_mean_27,item_price_mean_27,item_price_std_27,item_cnt_day
_sum_28,item_cnt_day_std_28,item_cnt_day_mean_28,item_price_mean_28,item_price_std_28,item_cnt_day_sum_29,item_cnt_day_std_29,item_cnt_day_mean_29,item_price_mean_29,item_price_std_29,item_cnt_day_sum_30,item_cnt_day_std_30,item_cnt_day_mean_30,item_price_mean_30,item_price_std_30,item_cnt_day_sum_31,item_cnt_day_std_31,item_cnt_day_mean_31,item_price_mean_31,item_price_std_31
shop_id,item_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 100_level_1,Unnamed: 
101_level_1,Unnamed: 102_level_1,Unnamed: 103_level_1,Unnamed: 104_level_1,Unnamed: 105_level_1,Unnamed: 106_level_1,Unnamed: 107_level_1,Unnamed: 108_level_1,Unnamed: 109_level_1,Unnamed: 110_level_1,Unnamed: 111_level_1,Unnamed: 112_level_1,Unnamed: 113_level_1,Unnamed: 114_level_1,Unnamed: 115_level_1,Unnamed: 116_level_1,Unnamed: 117_level_1,Unnamed: 118_level_1,Unnamed: 119_level_1,Unnamed: 120_level_1,Unnamed: 121_level_1
5,5037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2599.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,2599.0,0.0,2.0,0.0,1.0,1999.0,0.0,2.0,0.0,1.0,1999.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1299.0,0.0,1.0,0.0,1.0,1499.0,0.0,1.0,0.0,1.0,1499.0,0.0,3.0,0.0,1.0,999.166667,432.868437
5,5320,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,5233,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.707107,1.5,899.0,424.264069,2.0,0.0,1.0,599.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,599.0,0.0


In [7]:
def add_means_with_std_and_mean(df, months=NMONTHS):
    """
    Append sliding means of the monthly statistics to df (df itself is extended and returned).

    df - dataframe with the monthly 'item_cnt_day_sum*', 'item_cnt_day_std*',
         'item_cnt_day_mean*', 'item_price_mean*' and 'item_price_std*' columns
    months - largest window; means are taken over the last 1, 2, ..., months columns

    For every window size i the row-wise mean over the last i columns of each statistic is
    written to 'mean_cnt_sum_i', 'mean_cnt_std_i', 'mean_cnt_mean_i', 'mean_price_mean_i'
    and 'mean_price_std_i'.
    """
    # (stem of the new columns, prefix of the source columns), in the order the features are added
    feature_sources = [('mean_cnt_sum_', 'item_cnt_day_sum'),
                       ('mean_cnt_std_', 'item_cnt_day_std'),
                       ('mean_cnt_mean_', 'item_cnt_day_mean'),
                       ('mean_price_mean_', 'item_price_mean'),
                       ('mean_price_std_', 'item_price_std')]

    # Copy every statistic into its own NumPy matrix before any new column is added to df
    source_matrices = []
    for stem, prefix in feature_sources:
        selected = [name for name in df.columns if name.startswith(prefix)]
        source_matrices.append((stem, np.array(df[selected])))

    # Clean data
    gc.collect()

    # Add features to df: mean over the trailing `window` columns of every statistic
    for window in range(1, months + 1):
        for stem, matrix in source_matrices:
            trailing_mean = matrix[:, -window:].mean(axis=1)
            df[stem + str(window)] = pd.Series(trailing_mean, index=df.index)

    return df

In [18]:
%%time
tr_train = add_means_with_std_and_mean(tr_train)
tr_val = add_means_with_std_and_mean(tr_val)
tr_test = add_means_with_std_and_mean(tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)

(214200, 240) (214200, 240) (214200, 240) (214200,) (214200,)
Wall time: 3.98 s


In [8]:
def _running_mean_by_key(df, key, prefix):
    """Sum the `prefix` columns per `key`, then average those sums cumulatively across the columns."""
    cols = [col for col in df if col.startswith(prefix)]
    totals = df.groupby([key])[cols].sum()
    # Arithmetic progression (1, 2, 3, ...), one entry per column: dividing the cumulative sum
    # by it gives the running mean. It is built from the column count because
    # MultiIndex.labels, which was used for this before, no longer exists in current pandas.
    divisor = np.arange(1, len(cols) + 1, dtype=float)
    return totals.cumsum(axis=1).div(divisor, axis='columns').reset_index()


def get_month_mean_encodings_with_std_and_mean(df):
    """
    Add shop-level and item-level running-mean encodings of the monthly statistics.

    df - initial dataframe indexed by (shop_id, item_id) with the monthly 'item_cnt_day_sum*',
         'item_cnt_day_std*', 'item_cnt_day_mean*', 'item_price_mean*' and 'item_price_std*'
         columns. Its index is moved into columns IN PLACE (side effect on the caller's frame).

    For every statistic the monthly columns are summed per shop (and, separately, per item);
    the cumulative sum across the columns divided by the column position (cumsum / cumcount)
    is then merged back onto every row.

    Returns a new dataframe. Because of the default merge suffixes, the original monthly
    columns end in '_x', the shop-level encodings end in '_y' and the item-level encodings
    keep the plain column name.
    """
    # Drop base dataframes indeces
    df.reset_index(inplace=True)

    cnt_prefixes = ['item_cnt_day_sum', 'item_cnt_day_std', 'item_cnt_day_mean']
    price_prefixes = ['item_price_mean', 'item_price_std']

    # (grouping key, column prefix) in merge order; the order fixes the column layout of the result
    encoding_plan = (
        [('shop_id', prefix) for prefix in cnt_prefixes] +
        [('item_id', prefix) for prefix in cnt_prefixes] +
        [('shop_id', prefix) for prefix in price_prefixes] +
        [('item_id', prefix) for prefix in price_prefixes]
    )

    # Join all mean encoding features with initial data; every encoding is computed from the
    # initial columns of df, never from columns added by an earlier merge
    df_ext = df
    for key, prefix in encoding_plan:
        df_ext = pd.merge(df_ext, _running_mean_by_key(df, key, prefix), how='left', on=key)

    return df_ext

In [20]:
%%time
tr_train = get_month_mean_encodings_with_std_and_mean(tr_train)
tr_val = get_month_mean_encodings_with_std_and_mean(tr_val)
tr_test = get_month_mean_encodings_with_std_and_mean(tr_test)
print(tr_train.shape, tr_val.shape, tr_test.shape, tr_target.shape, val_target.shape)

(214200, 482) (214200, 482) (214200, 482) (214200,) (214200,)
Wall time: 22.8 s


In [9]:
# Data generating pipeline: rebuilds the train / validation / test matrices from scratch

def _log_step(title):
    """Print a pipeline step title followed by the current timestamp."""
    print(title + '. Time = ' + str(datetime.datetime.now()))


def _print_shapes(frames):
    """Print the shapes of the train and test frames."""
    print(frames['train'].shape, frames['test'].shape)


# Construct basic dataframes
_log_step('Construct base dataframes')
tr_train, tr_val, tr_test, tr_target, val_target = construct_base_datasets_add_std_and_mean()
print(tr_train.shape, tr_test.shape, tr_target.shape)

# Keep the three feature frames in one dict so that every step below is applied to each of
# them in exactly the same way. The standalone names are released immediately, so a frame
# that a later step replaces can be garbage collected.
frames = {'train': tr_train, 'val': tr_val, 'test': tr_test}
del tr_train, tr_val, tr_test

# Add month means
_log_step('Add month means')
for frame_name in frames:
    frames[frame_name] = add_means_with_std_and_mean(frames[frame_name])
_print_shapes(frames)

# Add mean encodings
_log_step('Add mean encodings')
for frame_name in frames:
    frames[frame_name] = get_month_mean_encodings_with_std_and_mean(frames[frame_name])
_print_shapes(frames)

# Add item category, item category type and city.
# Each entry: (log title, lookup table, join key, free-text name column dropped after the merge)
lookup_steps = [
    ('Add item category', items, 'item_id', 'item_name'),
    ('Add item category type', item_categories, 'item_category_id', 'item_category_name'),
    ('Add city', shops, 'shop_id', 'shop_name'),
]
for step_title, lookup_df, join_key, name_col in lookup_steps:
    _log_step(step_title)
    for frame_name in frames:
        frames[frame_name] = pd.merge(frames[frame_name], lookup_df, how='left', on=join_key)
    _print_shapes(frames)
    # Drop the unnecessary name column brought in by the lookup table
    for frame_name in frames:
        frames[frame_name] = frames[frame_name].drop([name_col], axis=1)
    _print_shapes(frames)

# Transform Item Category Type and City to the int categorical feature
_log_step('Factorize Item category type and City')
for cat_col in ['item_category_type', 'city']:
    # Learn the value -> code mapping once, on train, and apply it to every frame. Factorizing
    # each frame on its own would give matching codes only while all frames list their values
    # in the same order of first appearance. Values unseen in train are encoded as -1.
    categories = frames['train'][cat_col].factorize()[1]
    for frame_name in frames:
        frames[frame_name][cat_col] = categories.get_indexer(frames[frame_name][cat_col])

# Drop duplicate shop_id_x and item_id_x columns, if an earlier merge produced any
_log_step('Drop duplicate indexes')
for frame_name in frames:
    duplicate_cols = [dup for dup in ('shop_id_x', 'item_id_x') if dup in frames[frame_name].columns]
    if duplicate_cols:
        frames[frame_name] = frames[frame_name].drop(duplicate_cols, axis=1)
_print_shapes(frames)

# Prepare for modeling
_log_step('Create NumPy matrices')
np_train_matrix = np.array(frames['train'])
np_val_matrix = np.array(frames['val'])
np_test_matrix = np.array(frames['test'])
print(np_train_matrix.shape, np_test_matrix.shape)
tr_target_clip = np.clip(np.array(tr_target), 0, 20)
val_target_clip = np.clip(np.array(val_target), 0, 20)

# Clean unnecessary data: the frames are no longer needed once the matrices exist
_log_step('Clean memory')
del frames
gc.collect()

Construct base dataframes. Time = 2018-09-26 07:34:25.998455
2018-09-26 07:34:25.998455: start aggregating
2018-09-26 07:34:27.759369. construct_base_datasets_add_std_and_perc: start splitting by months
2018-09-26 07:34:33.626756. construct_base_datasets_add_std_and_perc: start joining with test data
2018-09-26 07:34:36.463008. construct_base_datasets_add_std_and_perc: start calculating x_train, x_val, x_test, train_target, val_target
(214200, 120) (214200, 120) (214200,)
Add month means. Time = 2018-09-26 07:34:37.130594
(214200, 240) (214200, 240)
Add mean encodings. Time = 2018-09-26 07:34:41.098275
(214200, 482) (214200, 482)
Add item category. Time = 2018-09-26 07:35:01.470015
(214200, 484) (214200, 484)
(214200, 483) (214200, 483)
Add item category type. Time = 2018-09-26 07:35:19.941719
(214200, 485) (214200, 485)
(214200, 484) (214200, 484)
Add city. Time = 2018-09-26 07:35:24.336711
(214200, 486) (214200, 486)
(214200, 485) (214200, 485)
Factorize Item category type and City. 

7

In [10]:
# Let's start modeling
cbr_2_1 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=4)

# Train model
# NOTE(review): the fit uses the raw tr_target, while the validation score below is computed
# against the clipped val_target_clip - confirm that training on the unclipped target is intended
cbr_2_1.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip the predictions to the [0, 20] range
val_cbr_2_1 = np.clip(cbr_2_1.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (arguments in sklearn's documented order: y_true, y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2_1))
print(rmse)

# Calculate predictions on test set, clipped to the same [0, 20] range
test_cbr_2_1 = np.clip(cbr_2_1.predict(np_test_matrix), 0, 20)

print(test_cbr_2_1[:10])
# Best before (presumably the log of the best earlier run, kept here for comparison)
# 0:	learn: 2.4847328	total: 597ms	remaining: 10m 55s
# 100:	learn: 1.3856467	total: 1m 2s	remaining: 10m 21s
# 200:	learn: 1.2445882	total: 2m 24s	remaining: 10m 44s
# 300:	learn: 1.1742307	total: 3m 42s	remaining: 9m 50s
# 400:	learn: 1.1397619	total: 5m 1s	remaining: 8m 45s
# 500:	learn: 1.1060752	total: 6m 14s	remaining: 7m 27s
# 600:	learn: 1.0798327	total: 7m 31s	remaining: 6m 14s
# 700:	learn: 1.0559542	total: 8m 51s	remaining: 5m 2s
# 800:	learn: 1.0269983	total: 10m 18s	remaining: 3m 51s
# 900:	learn: 1.0007516	total: 11m 51s	remaining: 2m 37s
# 1000:	learn: 0.9820265	total: 13m 14s	remaining: 1m 18s
# 1099:	learn: 0.9554568	total: 14m 45s	remaining: 0us
# 0.951226225687
# [ 0.7275049   0.10843564  0.94671598  0.21936311  0.58171137  0.45407823
#   1.13758443  0.14389279  0.94910729  0.27597356]

0:	learn: 2.4792342	total: 947ms	remaining: 15m 46s
100:	learn: 1.3846346	total: 1m 27s	remaining: 12m 59s
200:	learn: 1.2564225	total: 3m 29s	remaining: 13m 51s
300:	learn: 1.1873753	total: 5m 31s	remaining: 12m 49s
400:	learn: 1.1437810	total: 7m 44s	remaining: 11m 34s
500:	learn: 1.1176080	total: 10m 1s	remaining: 9m 58s
600:	learn: 1.0855270	total: 11m 55s	remaining: 7m 54s
700:	learn: 1.0520653	total: 13m 52s	remaining: 5m 55s
800:	learn: 1.0278101	total: 15m 43s	remaining: 3m 54s
900:	learn: 1.0000837	total: 17m 38s	remaining: 1m 56s
999:	learn: 0.9774637	total: 19m 48s	remaining: 0us
0.950454417285
[ 0.74228808  0.10534214  1.07961814  0.18394179  0.58284275  0.52286837
  1.43324291  0.15004683  0.8943047   0.43650018]


In [11]:
# Pair every test ID with its clipped prediction and write the submission file without an index column
df_test_pred_2_1 = test[['ID']].assign(item_cnt_month=test_cbr_2_1)
df_test_pred_2_1.to_csv('../Data/Submissions/6_2_1_catboost_1000_trees_with_std_and_mean.csv', index=None)

#### 0.96837, not bad, not good. Let's change random seed to 26

### Add 100 trees

In [12]:
# Let's start modeling
cbr_2_2 = cb.CatBoostRegressor(iterations=1100, depth=8, random_seed=4)

# Train model
# NOTE(review): the fit uses the raw tr_target, while the validation score below is computed
# against the clipped val_target_clip - confirm that training on the unclipped target is intended
cbr_2_2.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip the predictions to the [0, 20] range
val_cbr_2_2 = np.clip(cbr_2_2.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (arguments in sklearn's documented order: y_true, y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2_2))
print(rmse)

# Calculate predictions on test set, clipped to the same [0, 20] range
test_cbr_2_2 = np.clip(cbr_2_2.predict(np_test_matrix), 0, 20)

print(test_cbr_2_2[:10])
# Best before (presumably the log of the best earlier run, kept here for comparison)
# 0:	learn: 2.4847328	total: 597ms	remaining: 10m 55s
# 100:	learn: 1.3856467	total: 1m 2s	remaining: 10m 21s
# 200:	learn: 1.2445882	total: 2m 24s	remaining: 10m 44s
# 300:	learn: 1.1742307	total: 3m 42s	remaining: 9m 50s
# 400:	learn: 1.1397619	total: 5m 1s	remaining: 8m 45s
# 500:	learn: 1.1060752	total: 6m 14s	remaining: 7m 27s
# 600:	learn: 1.0798327	total: 7m 31s	remaining: 6m 14s
# 700:	learn: 1.0559542	total: 8m 51s	remaining: 5m 2s
# 800:	learn: 1.0269983	total: 10m 18s	remaining: 3m 51s
# 900:	learn: 1.0007516	total: 11m 51s	remaining: 2m 37s
# 1000:	learn: 0.9820265	total: 13m 14s	remaining: 1m 18s
# 1099:	learn: 0.9554568	total: 14m 45s	remaining: 0us
# 0.951226225687
# [ 0.7275049   0.10843564  0.94671598  0.21936311  0.58171137  0.45407823
#   1.13758443  0.14389279  0.94910729  0.27597356]

0:	learn: 2.4792342	total: 737ms	remaining: 13m 30s
100:	learn: 1.3846346	total: 1m 53s	remaining: 18m 44s
200:	learn: 1.2564225	total: 4m 39s	remaining: 20m 48s
300:	learn: 1.1873753	total: 7m 8s	remaining: 18m 56s
400:	learn: 1.1437810	total: 9m 35s	remaining: 16m 43s
500:	learn: 1.1176080	total: 11m 39s	remaining: 13m 56s
600:	learn: 1.0855270	total: 13m 43s	remaining: 11m 23s
700:	learn: 1.0520653	total: 15m 32s	remaining: 8m 50s
800:	learn: 1.0278101	total: 17m 20s	remaining: 6m 28s
900:	learn: 1.0000837	total: 19m 14s	remaining: 4m 15s
1000:	learn: 0.9774167	total: 21m 20s	remaining: 2m 6s
1099:	learn: 0.9632251	total: 23m 38s	remaining: 0us
0.953286733497
[ 0.73993178  0.10739321  1.07615588  0.23068727  0.63832856  0.51527324
  1.42513363  0.15250773  0.90494429  0.42279728]


In [13]:
# Pair every test ID with its clipped prediction and write the submission file without an index column
df_test_pred_2_2 = test[['ID']].assign(item_cnt_month=test_cbr_2_2)
df_test_pred_2_2.to_csv('../Data/Submissions/6_2_2_catboost_1100_trees_with_std_and_mean.csv', index=None)

#### Public LB Score = 0.96645

### And 1200 trees

In [14]:
# Let's start modeling
cbr_2_3 = cb.CatBoostRegressor(iterations=1200, depth=8, random_seed=4)

# Train model
# NOTE(review): the fit uses the raw tr_target, while the validation score below is computed
# against the clipped val_target_clip - confirm that training on the unclipped target is intended
cbr_2_3.fit(np_train_matrix, tr_target, verbose=100)

# Predict on validation and clip the predictions to the [0, 20] range
val_cbr_2_3 = np.clip(cbr_2_3.predict(np_val_matrix), 0, 20)

# Get RMSE error on validation set (arguments in sklearn's documented order: y_true, y_pred)
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2_3))
print(rmse)

# Calculate predictions on test set, clipped to the same [0, 20] range
test_cbr_2_3 = np.clip(cbr_2_3.predict(np_test_matrix), 0, 20)

print(test_cbr_2_3[:10])
# Best before (presumably the log of the best earlier run, kept here for comparison)
# 0:	learn: 2.4847328	total: 597ms	remaining: 10m 55s
# 100:	learn: 1.3856467	total: 1m 2s	remaining: 10m 21s
# 200:	learn: 1.2445882	total: 2m 24s	remaining: 10m 44s
# 300:	learn: 1.1742307	total: 3m 42s	remaining: 9m 50s
# 400:	learn: 1.1397619	total: 5m 1s	remaining: 8m 45s
# 500:	learn: 1.1060752	total: 6m 14s	remaining: 7m 27s
# 600:	learn: 1.0798327	total: 7m 31s	remaining: 6m 14s
# 700:	learn: 1.0559542	total: 8m 51s	remaining: 5m 2s
# 800:	learn: 1.0269983	total: 10m 18s	remaining: 3m 51s
# 900:	learn: 1.0007516	total: 11m 51s	remaining: 2m 37s
# 1000:	learn: 0.9820265	total: 13m 14s	remaining: 1m 18s
# 1099:	learn: 0.9554568	total: 14m 45s	remaining: 0us
# 0.951226225687
# [ 0.7275049   0.10843564  0.94671598  0.21936311  0.58171137  0.45407823
#   1.13758443  0.14389279  0.94910729  0.27597356]

0:	learn: 2.4792342	total: 754ms	remaining: 15m 3s
100:	learn: 1.3846346	total: 1m 33s	remaining: 17m
200:	learn: 1.2564225	total: 3m 42s	remaining: 18m 26s
300:	learn: 1.1873753	total: 6m 1s	remaining: 17m 59s
400:	learn: 1.1437810	total: 8m 8s	remaining: 16m 12s
500:	learn: 1.1176080	total: 10m 17s	remaining: 14m 22s
600:	learn: 1.0855270	total: 12m 38s	remaining: 12m 36s
700:	learn: 1.0520653	total: 15m 4s	remaining: 10m 43s
800:	learn: 1.0278101	total: 17m 10s	remaining: 8m 33s
900:	learn: 1.0000837	total: 19m 12s	remaining: 6m 22s
1000:	learn: 0.9774167	total: 21m 32s	remaining: 4m 16s
1100:	learn: 0.9625752	total: 23m 29s	remaining: 2m 6s
1199:	learn: 0.9409291	total: 25m 22s	remaining: 0us
0.957015161133
[ 0.74940783  0.10964817  1.076222    0.23134564  0.61735403  0.50313506
  1.41829848  0.15340271  0.89463034  0.45222298]


In [15]:
# Build the submission frame (ID + predicted item_cnt_month) for the
# 1200-tree model and save it as a CSV file.
pred_series_2_3 = pd.Series(test_cbr_2_3, index=test.index, name='item_cnt_month')
df_test_pred_2_3 = test.join(pred_series_2_3)[['ID', 'item_cnt_month']]
# index=False (the documented boolean form) keeps the row index out of the file
df_test_pred_2_3.to_csv('../Data/Submissions/6_2_3_catboost_1200_trees_with_std_and_mean.csv', index=False)

#### Public LB Score = 0.96655 (no better than 0.96645 with 1100 trees), so let's change the random seed

In [10]:
# CatBoost with 1000 trees (depth 8) and a different random seed (26)
cbr_2_4 = cb.CatBoostRegressor(iterations=1000, depth=8, random_seed=26)

# Train the model; verbose=100 logs the training loss every 100 iterations
cbr_2_4.fit(np_train_matrix, tr_target, verbose=100)

# Predict on the validation set and clip predictions to [0, 20]
val_cbr_2_4 = np.clip(cbr_2_4.predict(np_val_matrix), 0, 20)

# Validation RMSE; arguments follow sklearn's (y_true, y_pred) order
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2_4))
print(rmse)

# Predict on the test set, clipped the same way
test_cbr_2_4 = np.clip(cbr_2_4.predict(np_test_matrix), 0, 20)

print(test_cbr_2_4[:10])

# Best result before this run, kept for comparison (tail of an earlier log):
#   last iteration -> 1099: learn: 0.9554568, total: 14m 45s
#   printed RMSE   -> 0.951226225687
#   first 10 test predictions ->
#   [ 0.7275049   0.10843564  0.94671598  0.21936311  0.58171137  0.45407823
#     1.13758443  0.14389279  0.94910729  0.27597356]

0:	learn: 2.4857165	total: 782ms	remaining: 13m 1s
100:	learn: 1.3794306	total: 1m 12s	remaining: 10m 43s
200:	learn: 1.2399495	total: 2m 47s	remaining: 11m 6s
300:	learn: 1.1751860	total: 4m 37s	remaining: 10m 44s
400:	learn: 1.1407090	total: 6m 34s	remaining: 9m 49s
500:	learn: 1.1135165	total: 8m 42s	remaining: 8m 39s
600:	learn: 1.0797372	total: 10m 53s	remaining: 7m 13s
700:	learn: 1.0426744	total: 13m 13s	remaining: 5m 38s
800:	learn: 1.0209613	total: 15m 40s	remaining: 3m 53s
900:	learn: 0.9935827	total: 17m 49s	remaining: 1m 57s
999:	learn: 0.9691693	total: 19m 39s	remaining: 0us
0.951490357882
[ 0.84722459  0.12334822  0.96486916  0.19912346  0.49235855  0.46128827
  1.14017656  0.14163113  0.91939378  0.35470679]


In [11]:
# Build the submission frame (ID + predicted item_cnt_month) for the
# 1000-tree, second-seed model and save it as a CSV file.
pred_series_2_4 = pd.Series(test_cbr_2_4, index=test.index, name='item_cnt_month')
df_test_pred_2_4 = test.join(pred_series_2_4)[['ID', 'item_cnt_month']]
# index=False (the documented boolean form) keeps the row index out of the file
df_test_pred_2_4.to_csv('../Data/Submissions/6_2_4_catboost_1000_trees_with_std_and_mean_second_seed.csv', index=False)

#### Public LB Score = 0.97324 - bad (worse than with the previous seed)

... and now 1100 trees with the same random seed

In [12]:
# CatBoost with 1100 trees (depth 8), same random seed (26) as the 1000-tree run
cbr_2_5 = cb.CatBoostRegressor(iterations=1100, depth=8, random_seed=26)

# Train the model; verbose=100 logs the training loss every 100 iterations
cbr_2_5.fit(np_train_matrix, tr_target, verbose=100)

# Predict on the validation set and clip predictions to [0, 20]
val_cbr_2_5 = np.clip(cbr_2_5.predict(np_val_matrix), 0, 20)

# Validation RMSE; arguments follow sklearn's (y_true, y_pred) order
rmse = np.sqrt(mean_squared_error(val_target_clip, val_cbr_2_5))
print(rmse)

# Predict on the test set, clipped the same way
test_cbr_2_5 = np.clip(cbr_2_5.predict(np_test_matrix), 0, 20)

print(test_cbr_2_5[:10])

0:	learn: 2.4857165	total: 1.58s	remaining: 29m
100:	learn: 1.3794306	total: 1m 59s	remaining: 19m 42s
200:	learn: 1.2399495	total: 3m 45s	remaining: 16m 49s
300:	learn: 1.1751860	total: 5m 35s	remaining: 14m 50s
400:	learn: 1.1407090	total: 7m 31s	remaining: 13m 6s
500:	learn: 1.1135165	total: 9m 15s	remaining: 11m 4s
600:	learn: 1.0797372	total: 11m 11s	remaining: 9m 17s
700:	learn: 1.0426744	total: 13m 13s	remaining: 7m 31s
800:	learn: 1.0209613	total: 15m 1s	remaining: 5m 36s
900:	learn: 0.9935827	total: 16m 50s	remaining: 3m 43s
1000:	learn: 0.9678922	total: 18m 43s	remaining: 1m 51s
1099:	learn: 0.9527922	total: 20m 43s	remaining: 0us
0.954964320794
[ 0.84270883  0.12147589  0.96649961  0.20674829  0.4577002   0.45262749
  1.14466323  0.14444004  0.93928639  0.35725898]


In [13]:
# Build the submission frame (ID + predicted item_cnt_month) for the
# 1100-tree, second-seed model and save it as a CSV file.
pred_series_2_5 = pd.Series(test_cbr_2_5, index=test.index, name='item_cnt_month')
df_test_pred_2_5 = test.join(pred_series_2_5)[['ID', 'item_cnt_month']]
# index=False (the documented boolean form) keeps the row index out of the file
df_test_pred_2_5.to_csv('../Data/Submissions/6_2_5_catboost_1100_trees_with_std_and_mean_second_seed.csv', index=False)

### Public LB Score = 0.97453 - even worse. The mean item count feature doesn't help

## Let's add the 25th and 75th percentiles of price and drop the mean item count