# Supervised categorical encodings

In [1]:
import numpy as np
import pandas as pd
import os

In [3]:
from catboost.datasets import amazon
# Load the train/test frames returned by catboost's `amazon()` dataset helper.
train, test = amazon()
target = 'ACTION'
# Feature columns: every training column except the target and 'ROLE_TITLE'.
col4train = [col for col in train.columns
             if col != target and col != 'ROLE_TITLE']

In [4]:
# Target column of the training frame, taken as its underlying array of values.
y = train[target].values

In [5]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import ExtraTreesClassifier

In [8]:
def get_model():
    """Return a new, unfitted ExtraTreesClassifier with fixed hyperparameters.

    A new estimator is built on every call, so callers can fit one model
    per fold without sharing state between them.

    Returns:
        ExtraTreesClassifier: an instance configured with 300 trees,
        3 parallel jobs and a fixed random seed.
    """
    params = {
        "n_estimators": 300,
        "n_jobs": 3,
        "random_state": 5436,
    }
    # Instantiate with the parameters; returning the bare class would
    # silently discard them and hand callers a type instead of a model.
    return ExtraTreesClassifier(**params)

### Simple Target Encoding

In [10]:
from sklearn.base import BaseEstimator, TransformerMixin

In [15]:
class TargetEncoding(BaseEstimator, TransformerMixin):
    """Encode categorical columns by the mean target observed per category.

    For each configured column, ``fit`` stores the mean of ``y`` for every
    distinct value of that column. ``transform`` replaces each value with
    its stored mean; values with no stored mean (unseen at fit time or
    missing) are replaced with the overall mean of ``y``.

    Args:
        columns_names: list of column names to encode.

    Attributes set by ``fit``:
        learned_values: dict mapping column name to a DataFrame with two
            columns, the category value and ``"__target__"`` (its mean target).
        dataset_mean: mean of ``y`` over the whole fitting set.
    """

    def __init__(self, columns_names):
        self.columns_names = columns_names
        self.learned_values = {}
        self.dataset_mean = np.nan

    def fit(self, X, y, **fit_params):
        """Learn the per-category target means for every configured column.

        Args:
            X: DataFrame containing the columns to encode.
            y: target values, one per row of ``X``.
            **fit_params: accepted for API compatibility; unused.

        Returns:
            self, so calls can be chained.
        """
        X_ = X.copy()
        self.learned_values = {}
        X_["__target__"] = y
        for c in [x for x in X_.columns if x in self.columns_names]:
            self.learned_values[c] = (X_[[c, "__target__"]].groupby(c)["__target__"]
                                      .mean().reset_index())
        # Computed once, after the loop, so every column is learned and the
        # method returns self even when no configured column is present.
        self.dataset_mean = np.mean(y)
        return self

    def transform(self, X, **fit_params):
        """Replace each configured column with its learned target means.

        Args:
            X: DataFrame containing every column in ``columns_names``.
            **fit_params: accepted for API compatibility; unused.

        Returns:
            DataFrame with only the configured columns, indexed like ``X``,
            where each value is the learned mean target for its category or
            the dataset mean when no mean was learned for it.
        """
        transformed_X = X[self.columns_names].copy()
        for c in transformed_X.columns:
            encoded = (transformed_X[[c]]
                       .merge(self.learned_values[c], on=c, how='left')
                       )["__target__"]
            # The merge result carries a fresh 0..n-1 index; assign by
            # position so a non-default index on X cannot misalign rows.
            transformed_X[c] = encoded.values
        # Fill only after all columns are encoded.
        transformed_X = transformed_X.fillna(self.dataset_mean)
        return transformed_X

    def fit_transform(self, X, y, **fit_params):
        """Fit on ``(X, y)`` and return the encoding of the same ``X``."""
        self.fit(X, y)
        return self.transform(X)