# Load data

In [18]:
import pandas as pd
import numpy as np
import re
from collections import Counter

# Lookup tables read from the ./data-readonly directory.
df_Items = pd.read_csv('./data-readonly/items.csv')
df_Items_categories = pd.read_csv('./data-readonly/item_categories.csv')
df_Shops = pd.read_csv('./data-readonly/shops.csv')

# Training frame, stored under the 'train' key of the HDF file.
df_Train = pd.read_hdf('All_train.hdf', key='train')

# Bag of words

In [21]:
# generate some features from item_name
# Build one corpus string from every item name, blank out punctuation and
# split on single spaces to obtain the token list.
item_name = df_Items.item_name.values
item_name = ' '.join(item_name)
strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
item_name = strip.sub(' ', item_name)
item_name = item_name.split(' ')
item_name_counter = Counter(item_name)
# Runs of spaces make split(' ') emit empty strings; those are skipped below,
# so fewer than 101 columns may be created.
for name, _ in item_name_counter.most_common(101):
    if name:
        # Series.str.count interprets its argument as a regular expression.
        # The strip pattern does not remove every metacharacter (e.g. ? | $ ^ { } \),
        # so the token is escaped to be counted literally instead of raising
        # re.error or matching something else.
        # NOTE: this counts substring occurrences in the raw (unstripped) name,
        # not whole-word matches.
        df_Items['BoW_{}'.format(name)] = df_Items['item_name'].str.count(re.escape(name))


In [22]:
# generate some features from shop_name
# Build one corpus string from every shop name, blank out punctuation and
# split on single spaces to obtain the token list.
shop_name = df_Shops.shop_name.values
shop_name = ' '.join(shop_name)
strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
shop_name = strip.sub(' ', shop_name)
shop_name = shop_name.split(' ')
shop_name_counter = Counter(shop_name)
# Runs of spaces make split(' ') emit empty strings; those are skipped below,
# so fewer than 21 columns may be created.
for name, _ in shop_name_counter.most_common(21):
    if name:
        # Series.str.count interprets its argument as a regular expression.
        # The strip pattern does not remove every metacharacter (e.g. ? | $ ^ { } \),
        # so the token is escaped to be counted literally instead of raising
        # re.error or matching something else.
        # NOTE: this counts substring occurrences in the raw (unstripped) name,
        # not whole-word matches.
        df_Shops['BoW_{}'.format(name)] = df_Shops['shop_name'].str.count(re.escape(name))


In [23]:
# generate some features from item_category_name
# Build one corpus string from every category name, blank out punctuation and
# split on single spaces to obtain the token list.
item_category_name = df_Items_categories.item_category_name.values
item_category_name = ' '.join(item_category_name)
strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
item_category_name = strip.sub(' ', item_category_name)
item_category_name = item_category_name.split(' ')
item_category_name_counter = Counter(item_category_name)
# Runs of spaces make split(' ') emit empty strings; those are skipped below,
# so fewer than 21 columns may be created.
for name, _ in item_category_name_counter.most_common(21):
    if name:
        # Series.str.count interprets its argument as a regular expression.
        # The strip pattern does not remove every metacharacter (e.g. ? | $ ^ { } \),
        # so the token is escaped to be counted literally instead of raising
        # re.error or matching something else.
        # NOTE: this counts substring occurrences in the raw (unstripped) name,
        # not whole-word matches.
        df_Items_categories['BoW_{}'.format(name)] = df_Items_categories['item_category_name'].str.count(re.escape(name))


In [24]:
# save features
# Each lookup table goes into the same HDF file under its own key.
for hdf_key, frame in (
    ('items', df_Items),
    ('item_categories', df_Items_categories),
    ('shops', df_Shops),
):
    frame.to_hdf('features.hdf', key=hdf_key)

# Release date

In [None]:
# load release date
df_Release = pd.read_csv('items_with_release_date.csv')
df_Release['release_date'] = pd.to_datetime(df_Release['release_date'], format='%Y-%m-%d')

# Fractional month index counted from January 2013:
# whole months since 2013-01 plus day / 30 as the within-month fraction.
# Computed with the vectorized .dt accessors instead of a per-row Python lambda.
release_dates = df_Release['release_date']
release_block = (
    (release_dates.dt.year - 2013) * 12
    + release_dates.dt.month - 1
    + release_dates.dt.day / 30
)
# Everything earlier than block -1 is collapsed to -1; missing dates stay NaN
# through the clip and are then replaced by the -999 sentinel.
# The result is assigned back rather than using fillna(inplace=True) on a
# column selection, which is chained in-place mutation (warns in pandas 2.x
# and has no effect once copy-on-write is enabled).
df_Release['release_date_block_num'] = release_block.clip(lower=-1).fillna(-999)
df_Release.to_hdf('features.hdf', key='release_date')

# Training Set Feature Engineering

In [None]:
# lag features
index_cols = ['shop_id', 'item_id', 'date_block_num']
# Every column that is not part of the join key gets a lagged copy.
cols_to_rename = list(df_Train.columns.difference(index_cols))

shift_range = [1, 2, 3, 4, 6, 12]

for month_shift in shift_range:
    # Map each value column to its lagged name; key columns keep their names.
    lag_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Moving date_block_num forward by month_shift makes a row from block t
    # line up with block t + month_shift in the join below.
    train_shift = (
        df_Train[index_cols + cols_to_rename]
        .rename(columns=lag_names)
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
    )

    # Left join keeps every training row; all NaNs in the merged frame become 0.
    df_Train = df_Train.merge(train_shift, on=index_cols, how='left').fillna(0)

In [4]:
# get national holidays from https://www.officeholidays.com/countries/russia/2013.php
# Position in the list is the date_block_num it applies to; laid out 12 per row.
num_holidays = [
    6, 1, 1, 0, 5, 1, 0, 0, 0, 0, 1, 0,
    6, 1, 1, 0, 3, 2, 0, 0, 0, 0, 2, 0,
    7, 1, 1, 0, 4, 4, 0, 0, 0, 0, 1, 0,
]
for block_num, holiday_count in enumerate(num_holidays):
    in_block = df_Train['date_block_num'] == block_num
    df_Train.loc[in_block, 'num_holidays'] = holiday_count

In [7]:
# item / shop / category mean encoding using Expanding mean scheme
global_mean = df_Train.target.mean()
# Attach item_category_id only when it is missing, so re-running this cell does
# not produce item_category_id_x / item_category_id_y and break the groupby below.
if 'item_category_id' not in df_Train.columns:
    df_Train = df_Train.merge(df_Items[['item_id', 'item_category_id']], on='item_id')

# NOTE(review): an expanding mean depends on row order; this presumably expects
# df_Train to be ordered chronologically - confirm.
# The loop variable is named key_col rather than `id` so the builtin id() is not
# shadowed for the rest of the notebook.
for key_col in ['item_id', 'shop_id', 'item_category_id']:
    target_by_key = df_Train.groupby(key_col)['target']
    # Sum and count of the target over the *previous* rows of the same group:
    # the current row's own target is subtracted out, and cumcount starts at 0.
    cumsum = target_by_key.cumsum() - df_Train['target']
    cumcount = target_by_key.cumcount()

    # The first row of each group is 0 / 0 = NaN and falls back to the global mean.
    # Assigned in one step instead of fillna(inplace=True) on a column selection,
    # which is chained in-place mutation (no effect under pandas copy-on-write).
    df_Train['{}_target_enc'.format(key_col)] = (cumsum / cumcount).fillna(global_mean)

# Merge together

In [None]:
#load features
# read_hdf is a module-level pandas function that returns a new DataFrame;
# DataFrame objects have no read_hdf method, and the result must be bound to a
# name to be used. Keys match the ones written by the "save features" cell.
df_Items = pd.read_hdf('features.hdf', key='items')
df_Items_categories = pd.read_hdf('features.hdf', key='item_categories')
df_Shops = pd.read_hdf('features.hdf', key='shops')

In [8]:
# merge features
merge_list = [
    (df_Items, 'item_id'),
    (df_Shops, 'shop_id'),
    (df_Items_categories, 'item_category_id'),
    (df_Release, 'item_id')
]
# One suffix per merge_list entry, applied only to right-hand columns whose
# name already exists in df_Train.
# df_Train already carries item_category_id from the mean-encoding cell, so
# merging the full df_Items with the default suffixes would rename it to
# item_category_id_x / item_category_id_y, and the later merge on
# 'item_category_id' would raise KeyError. An empty left suffix keeps every
# existing df_Train column name (join keys included) unchanged.
merge_suffixes = ['_item', '_shop', '_category', '_release']
# `features` / `key` are used instead of `df` / `id` so the builtin id() is not
# shadowed.
for (features, key), suffix in zip(merge_list, merge_suffixes):
    # Default inner join: training rows without a match in `features` are dropped.
    df_Train = df_Train.merge(features, on=key, suffixes=('', suffix))
df_Train.to_hdf('All_train.hdf', key='train_with_features')

-2.0           6
-1.0         246
 19.0        570
 18.0        628
 17.0        761
 16.0        836
 15.0       1003
 14.0       1161
 13.0       1482
 12.0       1714
 11.0       2082
 10.0       2810
 9.0        3553
 8.0        4972
 20.0       6541
 7.0        6600
 6.0        9838
 5.0       15026
 4.0       24321
 3.0       45773
 2.0      109854
 1.0      359469
 0.0     6683554
Name: target, dtype: int64

minkowski


AssertionError: 