In [1]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting
from utils import *

%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import gc, os, string

from scipy.sparse import hstack, csr_matrix

In [2]:
def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
    """
    Smoothing is computed like in the following paper by Daniele Micci-Barreca
    https://kaggle2.blob.core.windows.net/forum-message-attachments/225952/7441/high%20cardinality%20categoricals.pdf
    trn_series : training categorical feature as a pd.Series
    tst_series : test categorical feature as a pd.Series
    target : target data as a pd.Series
    min_samples_leaf (int) : minimum samples to take category average into account
    smoothing (int) : smoothing effect to balance categorical average vs prior
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [9]:
if not os.path.exists(root+'features/tr_categorical_target_encoding.pkl'):
    with timeit('categorical target encoding'):
        target_enc_cols = ['region', 'image_top_1', 'category_name', 'user_type',
                           'parent_category_name', 'city', 'item_seq_number']
        
        use_cols = ['deal_probability', 'price', 'item_id'] + target_enc_cols
        X_tr = pd.read_csv(root + 'train.csv.zip', index_col='item_id', usecols=use_cols)
        y = X_tr['deal_probability'].copy()
        X_tr.drop(['deal_probability'], axis=1, inplace=True)
        
        use_cols = ['price', 'item_id'] + target_enc_cols
        X_te = pd.read_csv(root + 'test.csv.zip',  index_col='item_id', usecols=use_cols)
        
    
        
        print(X_tr.shape,X_te.shape)
        
        for c in target_enc_cols:
            tr_avg, te_avg = target_encode(X_tr[c].fillna('nan'),X_te[c].fillna('nan'), y)
            cc = 'mean_encoding_target_' + c
            X_tr[cc] = tr_avg.values
            X_te[cc] = te_avg.values
            
        y = np.log1p(X_tr['price'].fillna(X_tr['price'].median())).fillna(0)
        for c in target_enc_cols:
            tr_avg, te_avg = target_encode(X_tr[c].fillna('nan'),X_te[c].fillna('nan'), y)
            cc = 'mean_encoding_price_' + c
            X_tr[cc] = tr_avg.values
            X_te[cc] = te_avg.values
        
        X_tr.drop(use_cols, axis=1, inplace=True)
        X_te.drop(use_cols, axis=1, inplace=True)
        
        dump_obj(X_tr, 'tr_categorical_target_encoding')
        dump_obj(X_te, 'te_categorical_target_encoding')
        
        del X_tr, X_te, y
        gc.collect()

categorical target encoding started
(1503424, 8) (508438, 8)
categorical target encoding: 24.66254734992981


ValueError: labels ['item_id'] not contained in axis