# Обработка категориальных признаков

In [1]:
import numpy as np
import pandas as pd

from sklearn import preprocessing
from sklearn.datasets import load_breast_cancer, load_boston
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

import lightgbm
import catboost
from sklearn.base import TransformerMixin
from collections import Counter

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


## One hot encoding

![](images/one_hot_encoding.png)

In [2]:
X_train = np.array([
    [0, 0, 3], 
    [1, 1, 0], 
    [0, 2, 1], 
    [1, 0, 2]
])

In [3]:
encoder = preprocessing.OneHotEncoder(categories='auto', sparse=False).fit(X_train)
encoder

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [4]:
encoder.transform([[0, 1, 3]])

array([[1., 0., 0., 1., 0., 0., 0., 0., 1.]])

In [5]:
encoder.categories_

[array([0, 1]), array([0, 1, 2]), array([0, 1, 2, 3])]

Можем явно указать метки

In [6]:
encoder = preprocessing.OneHotEncoder(
    categories=[np.array([0, 1, 2]), np.array([0, 1, 2]), np.array([0, 1, 2, 3])]
).fit(X_train)
encoder

OneHotEncoder(categorical_features=None,
              categories=[array([0, 1, 2]), array([0, 1, 2]),
                          array([0, 1, 2, 3])],
              drop=None, dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=True)

In [7]:
encoder.transform([[0, 1, 3]]).todense()

matrix([[1., 0., 0., 0., 1., 0., 0., 0., 0., 1.]])

Пример использования

In [8]:
df = pd.read_csv('data/mushrooms.csv', header=None)

In [9]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g


Датасет с категориальными фичами

In [10]:
X_cat, y = np.array(df.loc[:, 1:]), np.array(df.loc[:, 0])

In [11]:
X_cat

array([['x', 's', 'n', ..., 'k', 's', 'u'],
       ['x', 's', 'y', ..., 'n', 'n', 'g'],
       ['b', 's', 'w', ..., 'n', 'n', 'm'],
       ...,
       ['f', 's', 'n', ..., 'b', 'c', 'l'],
       ['k', 'y', 'n', ..., 'w', 'v', 'l'],
       ['x', 's', 'n', ..., 'o', 'c', 'l']], dtype=object)

In [12]:
y = np.equal(y, 'p').astype(int) # poison
y

array([1, 0, 0, ..., 0, 1, 0])

In [13]:
y.mean()

0.48202855736090594

### Можно сделать числа

In [14]:
label_encoder = preprocessing.LabelEncoder()
label_encoder

LabelEncoder()

In [15]:
X_with_cat = X_cat.copy()
for i in range(X_cat.shape[1]):
    X_with_cat[:, i] = label_encoder.fit_transform(X_with_cat[:, i])
X_with_cat

array([[5, 2, 4, ..., 2, 3, 5],
       [5, 2, 9, ..., 3, 2, 1],
       [0, 2, 8, ..., 3, 2, 3],
       ...,
       [2, 2, 4, ..., 0, 1, 2],
       [3, 3, 4, ..., 7, 4, 2],
       [5, 2, 4, ..., 4, 1, 2]], dtype=object)

In [16]:
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=1000), X_with_cat, y, cv=3).mean()

0.8014663724195582

Это хорошее качество?

### OHE будет лучше

In [17]:
encoder = preprocessing.OneHotEncoder(sparse=False, categories='auto').fit(X_cat)
encoder

OneHotEncoder(categorical_features=None, categories='auto', drop=None,
              dtype=<class 'numpy.float64'>, handle_unknown='error',
              n_values=None, sparse=False)

In [18]:
X = encoder.transform(X_cat)
X

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [19]:
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=400), X, y, cv=3).mean()

0.9053192752042772

## Mean encoding

![](images/mean_encoding.png)

In [20]:
X = X_cat.copy()

for i in range(X.shape[1]):
    le = label_encoder.fit(X[:, i])
    X[:, i] = le.transform(X[:, i])
    for j in range(len(le.classes_)):
        indices = X[:, i] == j
        X[indices, i] = y[indices].mean()

In [21]:
cross_val_score(LogisticRegression(solver='lbfgs', max_iter=400), X, y, cv=3).mean()

0.9022420641917647

Но, как мы помним, это некорректные оценки. Почему?

Давайте честно оценим качество

In [22]:
X = X_cat.copy()
for i in range(X.shape[1]):
    X[:, i] = label_encoder.fit_transform(X[:, i])

In [23]:
cross_val_score(
    make_pipeline(
        preprocessing.OneHotEncoder(sparse=False, categories='auto'),
        LogisticRegression(solver='lbfgs')
    ),
    X, 
    y,
    cv=10,
).mean()

ValueError: Found unknown categories [6] in column 4 during transform

In [24]:
cross_val_score(
    make_pipeline(
        preprocessing.OneHotEncoder(sparse=False, categories='auto', handle_unknown='ignore'),
        LogisticRegression(solver='lbfgs')
    ),
    X, 
    y,
    cv=3
).mean()

0.9053192752042772

Мы не указали поведение encoder-a на ранее не наблюдаемых значениях признака, поэтому получаем ошибку, если указать поведение явно, то ошибки не будет

In [25]:
cross_val_score(
    make_pipeline(
        preprocessing.OneHotEncoder(handle_unknown='ignore'),
        LogisticRegression(solver='lbfgs')
    ),
    X, 
    y,
    cv=3
).mean()

0.9053192752042772

Чтобы сделать pipeline напишем собственный трансформер

Чтобы не писать лишних методов наседуемся от базового класса TransformerMixin - теперь не нужно реализовывать fit_transform

In [26]:
class MeanTransformer(TransformerMixin):
    
    def fit(self, X, y):
        self.cnt = Counter()
        for i in range(X.shape[1]):
            for j in range(np.max(X[:, i])):
                indices = X[:, i] == j
                if np.sum(indices) > 0:
                    val = y[indices].mean()
                else:
                    val = y.mean()
                self.cnt[(i, j)] = val
                
        return self
    
    def transform(self, X):
        X_new = np.copy(X)
        for i in range(X.shape[1]): 
            for j in range(np.max(X[:, i])):
                indices = X[:, i] == j
                if np.sum(indices) > 0:
                    X_new[indices, i] = self.cnt[(i, j)]
        return X_new


In [27]:
X = X_cat.copy()
for i in range(X.shape[1]):
    X[:, i] = label_encoder.fit_transform(X[:, i])

In [28]:
cross_val_score(
    make_pipeline(
        MeanTransformer(),
        LogisticRegression(solver='lbfgs', max_iter=400)
    ),
    X, 
    y,
    cv=3
).mean()

0.7606350795749437

In [29]:
cross_val_score(
    make_pipeline(
        MeanTransformer(),
        LogisticRegression(solver='lbfgs', max_iter=400)
    ),
    X, 
    y,
    cv=10
).mean()

0.7016294379367596

Как видите, качество заметно ниже

Но если поиграться

In [30]:
cross_val_score(
    make_pipeline(
        MeanTransformer(),
        LogisticRegression(solver='liblinear', max_iter=400, C=10, penalty='l1')
    ),
    X, 
    y,
    cv=3
).mean()

0.8851333556059116



### Напишите MeanTransformer, который бы при обучении считал счётчики не по всей обучающей выборке, а только по предыдущим объектам - придётся написать свой метод fit_transform

## Деревья умеют работать с категориальными признаками

In [31]:
cross_val_score(DecisionTreeClassifier(max_depth=None), X_with_cat, y, cv=3).mean()

0.8788568381940892

In [32]:
cross_val_score(
    make_pipeline(
        MeanTransformer(),
        DecisionTreeClassifier(max_depth=None)
    ),
    X_with_cat, 
    y,
    cv=3
).mean()

0.964056655639928

### CatBoost тоже

In [45]:
cat_features = list(range(X_cat.shape[1]))
catboost_pool = catboost.Pool(X_cat, y, cat_features)

In [46]:
X_cat

array([['x', 's', 'n', ..., 'k', 's', 'u'],
       ['x', 's', 'y', ..., 'n', 'n', 'g'],
       ['b', 's', 'w', ..., 'n', 'n', 'm'],
       ...,
       ['f', 's', 'n', ..., 'b', 'c', 'l'],
       ['k', 'y', 'n', ..., 'w', 'v', 'l'],
       ['x', 's', 'n', ..., 'o', 'c', 'l']], dtype=object)

In [47]:
params = {
    'iterations': 1, 
    'depth': 6, 
    'loss_function': 'Logloss', 
    'verbose': False,
    'eval_metric': 'Accuracy'
}
catboost.cv(catboost_pool, params, fold_count=3, stratified=True)

Unnamed: 0,iterations,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.985229,0.000736,0.984121,0.000491,0.615378,0.004342,0.615514,0.004312


Почему так?

In [48]:
cb_model = catboost.train(catboost_pool, params)

In [49]:
cb_model.feature_importances_

array([ 0.        ,  0.        ,  0.        ,  0.        , 28.57187027,
        0.        ,  0.        , 71.42812973,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ])

In [50]:
X_sel = X[:, [4, 5, 7]]

In [51]:
X_sel

array([[6, 1, 1],
       [0, 1, 0],
       [3, 1, 0],
       ...,
       [5, 0, 0],
       [8, 1, 1],
       [5, 0, 0]], dtype=object)

In [52]:
cross_val_score(
    make_pipeline(
        MeanTransformer(),
        LogisticRegression(solver='lbfgs', max_iter=400)
    ),
    X_sel, 
    y,
    cv=3
).mean()

0.98522886011058

In [53]:
params = {
    'iterations': 10, 
    'depth': 6, 
    'loss_function': 'Logloss', 
    'verbose': False,
    'eval_metric': 'Accuracy'
}
catboost.cv(catboost_pool, params, fold_count=3, stratified=True)

Unnamed: 0,iterations,test-Accuracy-mean,test-Accuracy-std,train-Accuracy-mean,train-Accuracy-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.985229,0.000736,0.984121,0.000491,0.615378,0.004342,0.615514,0.004312
1,1,0.985229,0.000736,0.984306,0.000807,0.543517,0.009675,0.544006,0.009513
2,2,0.985229,0.000736,0.984367,0.00065,0.482665,0.008292,0.483472,0.008286
3,3,0.985229,0.000736,0.984613,0.000596,0.431016,0.008791,0.432047,0.008917
4,4,0.985229,0.000736,0.984798,0.000467,0.386863,0.005557,0.387874,0.005418
5,5,0.985229,0.000736,0.984552,0.000651,0.349891,0.008396,0.351057,0.007996
6,6,0.985229,0.000736,0.984798,0.000387,0.307675,0.008384,0.308819,0.00765
7,7,0.985229,0.000736,0.985106,0.000213,0.278259,0.008003,0.279404,0.00735
8,8,0.985229,0.000736,0.985106,0.000213,0.253052,0.007135,0.254242,0.006519
9,9,0.985229,0.000736,0.984921,0.000284,0.230553,0.011031,0.231827,0.010485
