## Mean encoding

Here is a real example where mean encoding is useful.

In [3]:
import pandas as pd
import numpy as np
from itertools import product
sales = pd.read_csv('data/sales_train.csv.gz')
sales.head()

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day
0,02.01.2013,0,59,22154,999.0,1.0
1,03.01.2013,0,25,2552,899.0,1.0
2,05.01.2013,0,25,2552,899.0,-1.0
3,06.01.2013,0,25,2554,1709.05,1.0
4,15.01.2013,0,25,2555,1099.0,1.0


The main features that this will consider are
- date_block_num
- shop_id
- item_id

We can notice that not every item_id is on store 59.

In [4]:
sales.loc[(sales.shop_id == 59) & (sales.item_id == 2552),:]

Unnamed: 0,date,date_block_num,shop_id,item_id,item_price,item_cnt_day


A way to predict the value for these item_ids is to make fill them with item_cnt_day = 0, but here it does it by MAKING EVERY COMBINATION OF \['shop_id', 'item_id', 'date_block_num'\] 
and for each of them compute the AGGREGATED SUMMED item_cnt_day (not the mean, which is weird, but that is the example in the course)

In [5]:
index_cols = ['shop_id', 'item_id', 'date_block_num']

# For every month, we create a grid from all shops/items combinations from that month
grid = [] 
for block_num in sales['date_block_num'].unique():
    cur_shops = sales[sales['date_block_num']==block_num]['shop_id'].unique()
    cur_items = sales[sales['date_block_num']==block_num]['item_id'].unique()
    grid.append(np.array(list(product(*[cur_shops, cur_items, [block_num]])),dtype='int32'))

#turn the grid into pandas dataframe
grid = pd.DataFrame(np.vstack(grid), columns = index_cols,dtype=np.int32)
grid.head()

Unnamed: 0,shop_id,item_id,date_block_num
0,59,22154,0
1,59,2552,0
2,59,2554,0
3,59,2555,0
4,59,2564,0


In [13]:
#get aggregated values for (shop_id, item_id, month)
## agg --> https://pandas.pydata.org/pandas-docs/version/0.22/generated/pandas.core.groupby.DataFrameGroupBy.agg.html
gb = sales.groupby(index_cols,as_index=False).agg({'item_cnt_day':{'target':'sum'}})
print(gb.columns.values)

#fix column names (it simply chooses the column names)
gb.columns = [col[0] if col[-1]=='' else col[-1] for col in gb.columns.values]
#join aggregated data to the grid
all_data = pd.merge(grid,gb,how='left',on=index_cols).fillna(0)
all_data.head()

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


[('shop_id', '') ('item_id', '') ('item_cnt_day', 'target')]


Unnamed: 0,shop_id,item_id,date_block_num,target
0,59,22154,0,1.0
1,59,2552,0,0.0
2,59,2554,0,0.0
3,59,2555,0,0.0
4,59,2564,0,0.0


In [26]:
#sort the data
all_data.sort_values(['date_block_num','shop_id','item_id'],inplace=True)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target
139255,0,19,0,0.0
141495,0,27,0,0.0
144968,0,28,0,0.0
142661,0,29,0,0.0
138947,0,32,0,6.0


## Mean encodings without regularization

This will mean-encode `item_id`.

Here are two ways to implement mean encoding features *without* any regularization.

This is the global mean. We can fill the target blanks with this 0.3343.

In [29]:
all_data.target.mean()

0.3342730567123426

#### Method 1

In [28]:
# Calculate a mapping: {item_id: target_mean}
item_id_target_mean = all_data.groupby('item_id').target.mean()

# In our non-regularized case we just *map* the computed means to the `item_id`'s
all_data['item_target_enc'] = all_data['item_id'].map(item_id_target_mean)

# Fill NaNs
all_data['item_target_enc'].fillna(0.3343, inplace=True) 
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.022222
141495,0,27,0,0.0,0.056834
144968,0,28,0,0.0,0.141176
142661,0,29,0,0.0,0.037383
138947,0,32,0,6.0,1.319042


#### Method 2

In [31]:
'''
     Differently to `.target.mean()` function `transform` 
   will return a dataframe with an index like in `all_data`.
   Basically this single line of code is equivalent to the first two lines from of Method 1.
'''
all_data['item_target_enc'] = all_data.groupby('item_id')['target'].transform('mean')

# Fill NaNs
all_data['item_target_enc'].fillna(0.3343, inplace=True)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.022222
141495,0,27,0,0.0,0.056834
144968,0,28,0,0.0,0.141176
142661,0,29,0,0.0,0.037383
138947,0,32,0,6.0,1.319042


## Mean encodings with regularization

Only mean encoding may result in overfitting, so it is worth doing some examples doing regularization.

    1) Via KFold scheme;  
    2) Via Leave-one-out scheme;
    3) Via smoothing scheme;
    4) Via expanding mean scheme.
    
## 1. KFold scheme

In [32]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5, shuffle = False)
all_data['item_target_enc'] = np.nan

for tr_ind, val_ind in kf.split(all_data):
    X_tr, X_val = all_data.iloc[tr_ind], all_data.iloc[val_ind]
    all_data.loc[all_data.index[val_ind], 'item_target_enc'] = X_val['item_id'].map(X_tr.groupby('item_id').target.mean())
    
all_data['item_target_enc'].fillna(0.3343, inplace = True)
all_data.head() 

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.3343
141495,0,27,0,0.0,0.048523
144968,0,28,0,0.0,0.142424
142661,0,29,0,0.0,0.030303
138947,0,32,0,6.0,0.89402


## 2. Leave-one-out scheme

In [33]:
loo_sum = all_data['item_id'].map(all_data.groupby('item_id').target.sum())
loo_count = all_data['item_id'].map(all_data.groupby('item_id').target.count())
all_data['item_target_enc'] = (loo_sum - all_data['target']) / (loo_count - 1)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.022727
141495,0,27,0,0.0,0.056911
144968,0,28,0,0.0,0.141414
142661,0,29,0,0.0,0.0375
138947,0,32,0,6.0,1.316088


## 3. Smoothing

In [34]:
alpha = 100
globalmean = 0.3343
nrows = all_data.groupby('item_id')['target'].count()
means = all_data.groupby('item_id')['target'].mean()
score = (np.multiply(means, nrows) + globalmean*alpha) / (nrows + alpha)
all_data['item_target_enc'] = all_data['item_id'].map(score)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.237448
141495,0,27,0,0.0,0.089905
144968,0,28,0,0.0,0.168964
142661,0,29,0,0.0,0.10791
138947,0,32,0,6.0,1.260635


## 4. Expanding mean scheme

In [40]:
cumsum = all_data.groupby('item_id')['target'].cumsum() - all_data['target']
cumcnt = all_data.groupby('item_id').cumcount()
all_data['item_target_enc'] = cumsum/cumcnt
all_data['item_target_enc'].fillna(0.3343, inplace = True)
all_data.head()

Unnamed: 0,shop_id,item_id,date_block_num,target,item_target_enc
139255,0,19,0,0.0,0.3343
141495,0,27,0,0.0,0.3343
144968,0,28,0,0.0,0.3343
142661,0,29,0,0.0,0.3343
138947,0,32,0,6.0,0.3343


I may have to try out some of these methods in a model to submit to kaggle.