# Обработка категориальных признаков

In [None]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer, load_boston
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import lightgbm
import catboost
from sklearn.base import TransformerMixin
from collections import Counter

## One hot encoding

![](images/one_hot_encoding.png)

In [None]:
# Toy design matrix: 4 samples x 3 integer-coded categorical features.
X_train = np.array([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]])

In [None]:
# Dense one-hot encoder; with categories='auto' the category set of every
# column is learned from X_train during fit.
encoder = preprocessing.OneHotEncoder(sparse=False, categories='auto')
encoder.fit(X_train)
encoder

In [None]:
encoder.transform([[0, 1, 3]])

In [None]:
encoder.categories_

Можем явно указать метки

In [None]:
# Same encoder, but with the category set of each column listed explicitly
# instead of being inferred from the data. No `sparse` flag is passed here,
# so transform() returns a sparse matrix.
encoder = preprocessing.OneHotEncoder(
    categories=[np.arange(3), np.arange(3), np.arange(4)],
)
encoder.fit(X_train)
encoder

In [None]:
encoder.transform([[0, 1, 3]]).todense()

Пример использования

In [None]:
# Load the mushrooms data; header=None means the file has no header row,
# so the columns are addressed by integer position (0, 1, 2, ...).
df = pd.read_csv('data/mushrooms.csv', header=None)

In [None]:
df.head()

Датасет с категориальными фичами

In [None]:
# Column 0 is the class label; every other column is a categorical feature.
y = np.array(df.loc[:, 0])
X_cat = np.array(df.loc[:, 1:])

In [None]:
X_cat

In [None]:
# Binary target: 1 where the label is 'p' (poisonous), 0 otherwise.
y = (y == 'p').astype(int)
y

In [None]:
y.mean()

### Можно сделать числа

In [None]:
label_encoder = preprocessing.LabelEncoder()
label_encoder

In [None]:
# Integer-code every column independently. The shared LabelEncoder is refit
# on each column, so afterwards it only holds the classes of the last one.
X_with_cat = X_cat.copy()
for i in range(X_with_cat.shape[1]):
    X_with_cat[:, i] = label_encoder.fit_transform(X_with_cat[:, i])
X_with_cat

In [None]:
# Mean 3-fold CV score of logistic regression fed the raw integer codes,
# i.e. the codes are treated as if they were ordinary numeric values.
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=1000), X_with_cat, y, cv=3).mean()

Это хорошее качество?

### OHE будет лучше

In [None]:
# Dense one-hot encoder fitted on the full categorical matrix.
encoder = preprocessing.OneHotEncoder(categories='auto', sparse=False)
encoder.fit(X_cat)
encoder

In [None]:
X = encoder.transform(X_cat)
X

In [None]:
# Mean 3-fold CV score of the same model on the one-hot encoded matrix X.
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=400), X, y, cv=3).mean()

## Mean encoding

![](images/mean_encoding.png)

In [None]:
# Naive mean (target) encoding: every category is replaced by the mean of y
# over the rows that carry it. The means are computed on the whole dataset,
# so the target leaks into the features; this cell is intentionally naive.
X = X_cat.copy()

for i in range(X.shape[1]):
    le = label_encoder.fit(X[:, i])
    # Keep the integer codes in a separate array. Comparing against the column
    # that is being overwritten is wrong: a mean of exactly 1.0 written for
    # code 0 compares equal to code 1 on the next iteration, so the two
    # categories would be merged and re-averaged.
    codes = le.transform(X[:, i])
    for j in range(len(le.classes_)):
        indices = codes == j
        X[indices, i] = y[indices].mean()

In [None]:
# NOTE: X was mean-encoded with y from all rows before the split, so every
# validation fold is scored on features that already contain its own targets.
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=400), X, y, cv=3).mean()

Но, как мы помним, это некорректные оценки. Почему?

Давайте честно оценим качество

In [None]:
# Start over from the raw categories and integer-code each column in place.
X = X_cat.copy()
for i in range(X_cat.shape[1]):
    X[:, i] = label_encoder.fit_transform(X[:, i])

In [None]:
# One-hot encoding inside the pipeline, so the encoder is refit on the
# training part of each of the 10 folds. handle_unknown is not set here.
cross_val_score(
    estimator=make_pipeline(
        preprocessing.OneHotEncoder(categories='auto', sparse=False),
        LogisticRegression(solver='lbfgs'),
    ),
    X=X,
    y=y,
    cv=10,
).mean()

In [None]:
# Same pipeline with 3 folds. handle_unknown='ignore' makes a category that
# is absent from the training fold encode as all zeros instead of raising.
cross_val_score(
    estimator=make_pipeline(
        preprocessing.OneHotEncoder(
            categories='auto',
            sparse=False,
            handle_unknown='ignore',
        ),
        LogisticRegression(solver='lbfgs'),
    ),
    X=X,
    y=y,
    cv=3,
).mean()

Если не указать поведение encoder-а для ранее не наблюдавшихся значений признака, получаем ошибку. Если указать поведение явно (`handle_unknown='ignore'`), то ошибки не будет.

**Почему при cv=10 ошибки не было?**

In [None]:
# Encoder with default settings apart from handle_unknown='ignore'
# (no `sparse` flag, no explicit categories), 3 folds.
cross_val_score(
    estimator=make_pipeline(
        preprocessing.OneHotEncoder(handle_unknown='ignore'),
        LogisticRegression(solver='lbfgs'),
    ),
    X=X,
    y=y,
    cv=3,
).mean()

Чтобы сделать pipeline, напишем собственный трансформер.

Чтобы не писать лишних методов, унаследуемся от базового класса TransformerMixin — теперь не нужно реализовывать fit_transform.

In [None]:
class MeanTransformer(TransformerMixin):
    """Mean (target) encoder for integer-coded categorical features.

    Every code ``j`` in column ``i`` is replaced by the mean of ``y`` over the
    training rows where ``X[:, i] == j``. Codes are expected to be
    non-negative integers, e.g. the output of ``LabelEncoder``.

    Attributes set by ``fit``:
        cnt: mapping ``(column index, code) -> mean target``.
        global_mean_: mean of ``y`` over all training rows; used for codes
            that have no training rows.
    """

    def fit(self, X, y):
        """Learn the per-category target means.

        Args:
            X: 2-D array (n_samples, n_features) of non-negative integer codes.
            y: 1-D numeric target aligned with the rows of ``X``.

        Returns:
            self, so that the call can be chained.
        """
        X = np.asarray(X)
        y = np.asarray(y, dtype=float)
        self.global_mean_ = y.mean()
        self.cnt = Counter()
        for i in range(X.shape[1]):
            column = X[:, i]
            # +1 so that the largest code is covered as well; a bare
            # range(max) silently skipped the last category.
            for j in range(int(np.max(column)) + 1):
                indices = column == j
                if indices.any():
                    val = y[indices].mean()
                else:
                    # Code lies inside the range but has no training rows.
                    val = self.global_mean_
                self.cnt[(i, j)] = val

        return self

    def transform(self, X):
        """Replace every code by the target mean learned in ``fit``.

        Codes that were not seen during ``fit`` are mapped to the global
        target mean instead of keeping their raw integer value.

        Args:
            X: 2-D array (n_samples, n_features) of integer codes.

        Returns:
            A new float array with the shape of ``X``; ``X`` is not modified.
        """
        X = np.asarray(X)
        # Build a float output instead of copying X: a copy would keep an
        # integer dtype and truncate the means on assignment.
        X_new = np.full(X.shape, self.global_mean_, dtype=float)
        for i in range(X.shape[1]):
            column = X[:, i]
            # Masks are taken from the untouched input column, so a written
            # mean can never be mistaken for a code.
            for j in np.unique(column):
                X_new[column == j, i] = self.cnt.get((i, j), self.global_mean_)
        return X_new


In [None]:
# Fresh integer-coded copy of the raw categories, column by column.
X = X_cat.copy()
for i in range(X_cat.shape[1]):
    X[:, i] = label_encoder.fit_transform(X[:, i])

In [None]:
# Mean encoding as a pipeline step: the category means are recomputed on the
# training part of each of the 3 folds.
cross_val_score(
    estimator=make_pipeline(
        MeanTransformer(),
        LogisticRegression(max_iter=400, solver='lbfgs'),
    ),
    X=X,
    y=y,
    cv=3,
).mean()

In [None]:
# Same mean-encoding pipeline, evaluated with 10 folds.
cross_val_score(
    estimator=make_pipeline(
        MeanTransformer(),
        LogisticRegression(max_iter=400, solver='lbfgs'),
    ),
    X=X,
    y=y,
    cv=10,
).mean()

Как видите, качество заметно ниже

Но если поиграться

In [None]:
# Same pipeline with different model settings: liblinear solver,
# L1 penalty and C=10.
cross_val_score(
    estimator=make_pipeline(
        MeanTransformer(),
        LogisticRegression(penalty='l1', C=10, solver='liblinear', max_iter=400),
    ),
    X=X,
    y=y,
    cv=3,
).mean()



### Напишите MeanTransformer, который бы при обучении считал счётчики не по всей обучающей выборке, а только по предыдущим объектам - придётся написать свой метод fit_transform

## Деревья умеют работать с категориальными признаками

In [None]:
# Mean 3-fold CV score of an unrestricted-depth decision tree trained
# directly on the integer-coded categories.
cross_val_score(DecisionTreeClassifier(max_depth=None), X_with_cat, y, cv=3).mean()

In [None]:
# The same tree, but with the integer codes mean-encoded inside each fold.
cross_val_score(
    estimator=make_pipeline(
        MeanTransformer(),
        DecisionTreeClassifier(max_depth=None),
    ),
    X=X_with_cat,
    y=y,
    cv=3,
).mean()

### CatBoost тоже

In [None]:
# All columns of X_cat hold raw category strings, so every one of them must
# be declared categorical. The index list is therefore built from X_cat (the
# array handed to the Pool), not from the 3-column toy X_train of the one-hot
# example, which would leave the remaining columns undeclared.
cat_features = list(range(X_cat.shape[1]))
catboost_pool = catboost.Pool(X_cat, y, cat_features=cat_features)

In [None]:
X_cat

In [None]:
# Deliberately tiny model: one boosting iteration of depth 6, scored by
# accuracy in a stratified 3-fold CV.
params = dict(
    iterations=1,
    depth=6,
    loss_function='Logloss',
    verbose=False,
    eval_metric='Accuracy',
)
catboost.cv(catboost_pool, params, fold_count=3, stratified=True)

Почему так?

In [None]:
# Fit a single model on the whole pool with the current `params`.
cb_model = catboost.train(catboost_pool, params)

In [None]:
cb_model.feature_importances_

In [None]:
# Keep only columns 4, 5 and 7 of X.
# NOTE(review): presumably the columns with non-zero CatBoost importance - confirm.
X_sel = X[:, [4, 5, 7]]

In [None]:
X_sel

In [None]:
# Mean-encoding pipeline restricted to the selected columns in X_sel.
cross_val_score(
    estimator=make_pipeline(
        MeanTransformer(),
        LogisticRegression(max_iter=400, solver='lbfgs'),
    ),
    X=X_sel,
    y=y,
    cv=3,
).mean()

In [None]:
# Same CatBoost setup with 10 boosting iterations instead of 1.
params = dict(
    iterations=10,
    depth=6,
    loss_function='Logloss',
    verbose=False,
    eval_metric='Accuracy',
)
catboost.cv(catboost_pool, params, fold_count=3, stratified=True)