# Bag of words

In [1]:
# load requirements
import pandas as pd
import numpy as np
import re
from collections import Counter

# The item table is not loaded in this cell; the bag-of-words code for
# item_name further down in this cell is disabled and kept for reference only.
# df_Items = pd.read_csv('./data-readonly/items.csv')
# Category and shop lookup tables; the following cells derive BoW_* count
# columns from their item_category_name / shop_name text columns.
df_Items_categories = pd.read_csv('./data-readonly/item_categories.csv')
df_Shops = pd.read_csv('./data-readonly/shops.csv')

# Disabled: bag-of-words over item_name, following the same recipe as the
# shop_name / item_category_name cells (join names, replace punctuation with
# spaces, count tokens, add one BoW_<token> column per frequent token).
# item_name = df_Items.item_name.values
# item_name = ' '.join(item_name)
# strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
# item_name = strip.sub(' ',item_name)
# item_name = item_name.split(' ')
# item_name_counter = Counter(item_name)
# for name, _ in item_name_counter.most_common(101):
#     if name:
#         df_Items['BoW_{}'.format(name)] = df_Items['item_name'].str.count(name)


In [2]:
# generate some features from shop_name
# Recipe: join every shop name into one string, replace punctuation with
# spaces, split on single spaces and count how often each token occurs.
strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
shop_name = strip.sub(' ', ' '.join(df_Shops.shop_name.values))
shop_name = shop_name.split(' ')
shop_name_counter = Counter(shop_name)
for name, _ in shop_name_counter.most_common(21):
    # split(' ') produces '' for every run of consecutive spaces, so the empty
    # token can show up among the most common entries; it is skipped here.
    if name:
        # Series.str.count interprets its argument as a regular expression.
        # `strip` does not remove every regex metacharacter (e.g. ?, |, $, {),
        # so the token is escaped to make the count a literal substring count.
        df_Shops['BoW_{}'.format(name)] = df_Shops['shop_name'].str.count(re.escape(name))


In [3]:
# generate some features from item_category_name
# Recipe: join every category name into one string, replace punctuation with
# spaces, split on single spaces and count how often each token occurs.
strip = re.compile(r'[()\'\[\]!*,/\-+.«»:&"]')
item_category_name = strip.sub(' ', ' '.join(df_Items_categories.item_category_name.values))
item_category_name = item_category_name.split(' ')
item_category_name_counter = Counter(item_category_name)
for name, _ in item_category_name_counter.most_common(21):
    # split(' ') produces '' for every run of consecutive spaces, so the empty
    # token can show up among the most common entries; it is skipped here.
    if name:
        # Series.str.count interprets its argument as a regular expression.
        # `strip` does not remove every regex metacharacter (e.g. ?, |, $, {),
        # so the token is escaped to make the count a literal substring count.
        df_Items_categories['BoW_{}'.format(name)] = (
            df_Items_categories['item_category_name'].str.count(re.escape(name))
        )


In [4]:
# save features
# Each frame is written without its raw text column, under its own key of the
# shared features store.
item_category_features = df_Items_categories.drop(columns='item_category_name')
item_category_features.to_hdf('./HDF/features.hdf', key='item_categories')

shop_features = df_Shops.drop(columns='shop_name')
shop_features.to_hdf('./HDF/features.hdf', key='shops')

# TF-IDF

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import pandas as pd

# Item table; its item_name column is the text source for the TF-IDF features
# built in the next cell.
df_Items = pd.read_csv('./data-readonly/items.csv')

In [6]:
# TF-IDF over item names. The token pattern only accepts words that contain at
# least one Latin letter (case-insensitive); at most 100 features are kept.
tfidf = TfidfVectorizer(analyzer='word',
                        stop_words=stopwords.words('russian'),
                        token_pattern=u'(?ui)\\b\\w*[a-z]+\\w*\\b',
                        max_features=100)

# Flags that depend on the raw spelling must be read BEFORE the name is
# cleaned: the cleaning step deletes every 'B' and 'D' and lower-cases the
# text, so a later search for 'BD' could never match.
df_Items['star_count'] = df_Items['item_name'].str.count(r'\*')
has_bd_tag = df_Items['item_name'].str.contains('BD')

# Remove brackets, '*', '/', '!', '+', digits and the letters 'B' / 'D'.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would silently turn this into a no-op.
# NOTE(review): inside a character class 'BD' removes every single B and D,
# not only the "BD" tag — kept as-is to preserve the existing features.
df_Items['item_name'] = df_Items['item_name'].str.replace(r'[\(\)\*BD\/\!\d+]', '', regex=True)
df_Items['item_name'] = df_Items['item_name'].str.lower()

tfidf_values = tfidf.fit_transform(df_Items['item_name'])
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old name on versions that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    tfidf_columns = tfidf.get_feature_names_out()
else:
    tfidf_columns = tfidf.get_feature_names()
# Column-wise concat aligns on the index; this relies on df_Items still having
# the default 0..n-1 index it got from read_csv.
df_Items = pd.concat([df_Items,
                      pd.DataFrame(tfidf_values.toarray(), columns=tfidf_columns)],
                     axis=1)

# 'регион' is searched in the lower-cased name, so capitalised spellings match too.
df_Items['Regional'] = (df_Items['item_name'].str.contains('регион')).astype('int')
df_Items['BlueRay'] = has_bd_tag.astype('int')

df_Items.drop(columns='item_name').to_hdf('./HDF/features.hdf', key='items')

# Release date

In [7]:
# load release date
df_Release = pd.read_csv('items_with_release_date.csv')
df_Release['release_date'] = pd.to_datetime(df_Release['release_date'], format='%Y-%m-%d')

# Fractional month index counted from January 2013 (month 0), with the day of
# month adding day/30. Missing dates (NaT) propagate as NaN through the
# .dt accessors.
release_block = (
    (df_Release['release_date'].dt.year - 2013) * 12
    + df_Release['release_date'].dt.month - 1
    + df_Release['release_date'].dt.day / 30
)
# Everything released before the window collapses to -1; unknown dates get the
# sentinel -999. The result is assigned back explicitly: calling
# fillna(inplace=True) on df[col] mutates a temporary and does not reach the
# frame under pandas Copy-on-Write.
df_Release['release_date_block_num'] = release_block.clip(lower=-1).fillna(-999)

df_Release[['item_id', 'release_date_block_num']].to_hdf('./HDF/features.hdf', key='release_date')

# Load training data

In [8]:
import pandas as pd

# Base training table; later cells read its shop_id, item_id, date_block_num
# and target columns.
df_Train = pd.read_hdf('./HDF/All_train.hdf', key='train')


# Training Set Feature Engineering

In [9]:
# lag features
# For every lag k, the non-key columns of the table are re-attached to the row
# that lies k months later for the same (shop_id, item_id) pair.
index_cols = ['shop_id', 'item_id', 'date_block_num']
cols_to_rename = list(df_Train.columns.difference(index_cols))

shift_range = [1, 2, 3, 4, 6, 12]

for month_shift in shift_range:
    # Old name -> '<name>_lag_<k>' for every non-key column; keys keep their names.
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Moving date_block_num forward by k makes month m line up with month m + k.
    train_shift = (
        df_Train[index_cols + cols_to_rename]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lagged_names)
    )

    # Left join keeps every training row; the fillna(0) applies to the whole
    # merged frame, not only to the newly added lag columns.
    df_Train = df_Train.merge(train_shift, on=index_cols, how='left').fillna(0)

In [10]:
# get national holidays from https://www.officeholidays.com/countries/russia/2013.php
# One entry per date_block_num, position 0 first (36 entries in total).
num_holidays = [6, 1, 1, 0, 5, 1, 0, 0, 0, 0, 1, 0, 6, 1, 1, 0, 3, 2, 0, 0, 0, 0, 2, 0, 7, 1, 1, 0, 4, 4, 0, 0, 0, 0, 1, 0]
for block_num, holiday_count in enumerate(num_holidays):
    rows_in_block = df_Train['date_block_num'] == block_num
    df_Train.loc[rows_in_block, 'num_holidays'] = holiday_count

In [11]:
# item / shop / category mean encoding using Expanding mean scheme
# Each row is encoded with the mean target of the EARLIER rows that share its
# key, so a row never sees its own target.
# NOTE(review): "earlier" means earlier in the current row order of df_Train;
# this presumably expects the rows to be in chronological order — confirm.
df_Items = pd.read_csv('./data-readonly/items.csv')
global_mean = df_Train.target.mean()
df_Train = df_Train.merge(df_Items[['item_id','item_category_id']], on='item_id')

# The loop variable is not called `id`, to avoid shadowing the builtin.
for key_col in ['item_id', 'shop_id', 'item_category_id']:
    target_by_key = df_Train.groupby(key_col)['target']
    # Running sum minus the row's own target = sum over the previous rows only.
    cumsum = target_by_key.cumsum() - df_Train['target']
    # cumcount() is 0 for the first row of a key, i.e. the number of previous rows.
    cumcount = target_by_key.cumcount()

    # The first row of every key yields 0 / 0 = NaN and falls back to the global
    # mean. The filled series is assigned directly: fillna(inplace=True) on
    # df[col] mutates a temporary and is not written back under pandas
    # Copy-on-Write.
    df_Train['{}_target_enc'.format(key_col)] = (cumsum / cumcount).fillna(global_mean)

# Merge together

In [12]:
#load features
# The four feature tables are read back from the shared store, one key each,
# in the order items, item_categories, shops, release_date.
df_Items, df_Items_categories, df_Shops, df_Release = (
    pd.read_hdf('./HDF/features.hdf', key=table_key)
    for table_key in ('items', 'item_categories', 'shops', 'release_date')
)

In [13]:
# Keep only rows from date_block_num 12 onwards.
# NOTE(review): 12 equals the largest entry of shift_range in the lag-feature
# cell, so presumably this removes the months whose 12-month lag cannot exist
# — confirm.
df_Train = df_Train.loc[df_Train['date_block_num'].ge(12)]

In [14]:
# merge features
# (feature table, join key) pairs, applied to the training table in this order.
merge_list = [
    (df_Items_categories, 'item_category_id'),
    (df_Items, 'item_id'),
    (df_Shops, 'shop_id'),
    (df_Release, 'item_id')
]
# NOTE(review): merge() defaults to an inner join, so training rows whose key
# is absent from a feature table are dropped silently — confirm this is wanted.
# NOTE(review): the stored items table appears to still contain
# item_category_id (only item_name is dropped before saving), which would make
# pandas emit item_category_id_x / _y suffix columns here — verify.
for feature_frame, join_key in merge_list:
    df_Train = df_Train.merge(feature_frame, on=join_key)

# Features and target go to separate keys of the same HDF file.
drop_col = ['target']
train_features = df_Train.drop(columns=drop_col)
train_features.to_hdf('./HDF/Train_with_features.hdf', key='train_x')
df_Train['target'].to_hdf('./HDF/Train_with_features.hdf', key='train_y')