# Encoding Categoricals

## Setup

In [35]:
import numpy as np
import pandas as pd

## Tom Augspurger's [approach](https://tomaugspurger.github.io/categorical-pipelines.html)

### Look into data

In [4]:
url = ('http://vincentarelbundock.github.io/Rdatasets/csv/Stat2Data/Diamonds.csv')
df = pd.read_csv(url, index_col=0)
df.head()

Unnamed: 0,Carat,Color,Clarity,Depth,PricePerCt,TotalPrice
1,1.08,E,VS1,68.6,6693.3,7228.8
2,0.31,F,VVS1,61.9,3159.0,979.3
3,0.31,H,VS1,62.1,1755.0,544.1
4,0.32,F,VVS1,60.8,3159.0,1010.9
5,0.33,D,IF,60.8,4758.8,1570.4


In [72]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 351 entries, 1 to 351
Data columns (total 6 columns):
Carat         351 non-null float64
Color         351 non-null category
Clarity       349 non-null category
Depth         351 non-null float64
PricePerCt    351 non-null float64
TotalPrice    351 non-null float64
dtypes: category(2), float64(4)
memory usage: 25.1 KB


Clarity has 2 nulls

In [77]:
df.dropna(inplace=True)

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 349 entries, 1 to 351
Data columns (total 6 columns):
Carat         349 non-null float64
Color         349 non-null category
Clarity       349 non-null category
Depth         349 non-null float64
PricePerCt    349 non-null float64
TotalPrice    349 non-null float64
dtypes: category(2), float64(4)
memory usage: 15.1 KB


In [100]:
pd.get_dummies(df.Color).head()

Unnamed: 0,D,E,F,G,H,I,J
1,0,1,0,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0
5,1,0,0,0,0,0,0


In [101]:
pd.get_dummies(df.Clarity).head()

Unnamed: 0,I1,SI2,SI1,VS2,VS1,VVS2,VVS1,IF
1,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,1,0
3,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0
5,0,0,0,0,0,0,0,1


In [79]:
cat_columns = ['Color', 'Clarity']
color = list("DEFGHIJ")
clarity = ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"]

In [80]:
df['Color'] = pd.Categorical(df.Color, categories=color)
df['Clarity'] = pd.Categorical(df.Clarity, categories=clarity, ordered=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 349 entries, 1 to 351
Data columns (total 6 columns):
Carat         349 non-null float64
Color         349 non-null category
Clarity       349 non-null category
Depth         349 non-null float64
PricePerCt    349 non-null float64
TotalPrice    349 non-null float64
dtypes: category(2), float64(4)
memory usage: 15.1 KB


In [103]:
df['Color'].cat.categories

Index(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')

In [82]:
X = df.drop(['PricePerCt', 'TotalPrice'], axis=1)
y = df['TotalPrice']
X_train, X_test, y_train, y_test = cv.train_test_split(X, y)

### Original Code

In [230]:
class CategoricalTransformer(TransformerMixin):
    def __init__(self, cat_columns=None):
        self._cat_columns = cat_columns
        
    def fit(self, X, y=None, *args, **kwargs):
        self.columns_ = X.columns
        
        self.cat_columns_ = self._cat_columns if self._cat_columns else X.select_dtypes(include=['category']).columns
            
        self.non_cat_columns_ = X.columns.drop(self.cat_columns_)

        self.cat_map_ = {col: X[col].cat.categories
                         for col in self.cat_columns_}
        self.ordered_ = {col: X[col].cat.ordered
                         for col in self.cat_columns_}

        self.dummy_columns_ = {col: ["_".join([col, v])
                                     for v in self.cat_map_[col]]
                               for col in self.cat_columns_}
        self.transformed_columns_ = pd.Index(
            self.non_cat_columns_.tolist() +
            list(chain.from_iterable(self.dummy_columns_[k]
                                     for k in self.cat_columns_))
        )
        return self

    def transform(self, X, y=None, *args, **kwargs):
        return (pd.get_dummies(X)
                  .reindex(columns=self.transformed_columns_)
                  .fillna(0))

    def inverse_transform(self, X):
        X = np.asarray(X)
        series = []
        non_cat_cols = (self.transformed_columns_
                            .get_indexer(self.non_cat_columns_))
        non_cat = pd.DataFrame(X[:, non_cat_cols],
                               columns=self.non_cat_columns_)
        for col, cat_cols in self.dummy_columns_.items():
            locs = self.transformed_columns_.get_indexer(cat_cols)
            codes = X[:, locs].argmax(1)
            cats = pd.Categorical.from_codes(codes, self.cat_map_[col],
                                             ordered=self.ordered_[col])
            series.append(pd.Series(cats, name=col))
        # concats sorts, we want the original order
        df = (pd.concat([non_cat] + series, axis=1)
                .reindex(columns=self.columns_))
        return df

### Test it

In [231]:
import pandas.util.testing as tm

result = cat_transformer.inverse_transform(cat_transformer.transform(X))
result.index = X.index

tm.assert_frame_equal(result, X)

In [219]:
cat_transformer = CategoricalTransformer()

In [220]:
cat_transformer.fit(X)

<__main__.CategoricalTransformer at 0x7f3ac6911cf8>

In [221]:
X_transformed = cat_transformer.transform(X)

In [222]:
X_transformed.head()

Unnamed: 0,Carat,Depth,Color_D,Color_E,Color_F,Color_G,Color_H,Color_I,Color_J,Clarity_I1,Clarity_SI2,Clarity_SI1,Clarity_VS2,Clarity_VS1,Clarity_VVS2,Clarity_VVS1,Clarity_IF
1,1.08,68.6,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0
2,0.31,61.9,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
3,0.31,62.1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0
4,0.32,60.8,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0
5,0.33,60.8,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [223]:
row = X_transformed.loc[1]

In [224]:
cat_transformer.inverse_transform(row.to_frame().T)

Unnamed: 0,Carat,Color,Clarity,Depth
0,1.08,E,VS1,68.6


In [233]:
cat_trans = CategoricalTransformer(['Color'])
cat_trans.fit_transform(df).head()

Unnamed: 0,Carat,Clarity,Depth,PricePerCt,TotalPrice,Color_D,Color_E,Color_F,Color_G,Color_H,Color_I,Color_J
1,1.08,0.0,68.6,6693.3,7228.8,0,1,0,0,0,0,0
2,0.31,0.0,61.9,3159.0,979.3,0,0,1,0,0,0,0
3,0.31,0.0,62.1,1755.0,544.1,0,0,0,0,1,0,0
4,0.32,0.0,60.8,3159.0,1010.9,0,0,1,0,0,0,0
5,0.33,0.0,60.8,4758.8,1570.4,1,0,0,0,0,0,0


### Use it

In [226]:
from itertools import chain

from sklearn.linear_model import Lasso
import sklearn.model_selection as cv
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline, TransformerMixin

In [227]:
pipe = make_pipeline(CategoricalTransformer(), StandardScaler(), Lasso())
pipe.fit(X_train, y_train)
print("train", pipe.score(X_train, y_train))
print("test", pipe.score(X_test, y_test))

train 0.931300396464
test 0.929892372391


In [97]:
pipe.score?

### <font color='red'>Todo</font>

- Pass columns to Hot-Encode explicitly into constructor
- Memory inefficient, because dummies are dense and not SparseMatrix like Scikit-Learn's OneHotEncoder ..
- Consider other variants: approach from "Hands-on ML"
<br/>

Otherwise looks pretty good for practical use

## Approach from "Hands-On ML"

In [186]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, Imputer, MultiLabelBinarizer

In [194]:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.attribute_names].values

In [204]:
class SupervisionFriendlyMultiLabelBinarizer(MultiLabelBinarizer):
    def fit_transform(self, X, y=None):
        return super(SupervisionFriendlyMultiLabelBinarizer, self).fit_transform(X)

In [107]:
df.head()

Unnamed: 0,Carat,Color,Clarity,Depth,PricePerCt,TotalPrice
1,1.08,E,VS1,68.6,6693.3,7228.8
2,0.31,F,VVS1,61.9,3159.0,979.3
3,0.31,H,VS1,62.1,1755.0,544.1
4,0.32,F,VVS1,60.8,3159.0,1010.9
5,0.33,D,IF,60.8,4758.8,1570.4


In [209]:
num_columns = list(df.select_dtypes(include=['float']).columns)
cat_columns = list(set(df.columns) - set(num_columns))

num_columns, cat_columns

(['Carat', 'Depth', 'PricePerCt', 'TotalPrice'], ['Clarity', 'Color'])

In [197]:
num_pipe = Pipeline([
    ('selector', DataFrameSelector(num_columns)),
    ('imputer', Imputer(strategy='median')),
    ('std_scaler', StandardScaler())
])

In [205]:
cat_pipe = Pipeline([
    ('selector', DataFrameSelector(cat_columns)),
    ('binarizer', SupervisionFriendlyMultiLabelBinarizer())
])

In [207]:
preparation_pipe = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipe),
    ('cat_pipeline', cat_pipe)
])

In [208]:
preparation_pipe.fit_transform(df)

array([[ 0.16371142,  0.81795926,  0.14957714, ...,  0.        ,
         0.        ,  0.        ],
       [-1.39691891, -0.48812797, -1.0745251 , ...,  0.        ,
         1.        ,  0.        ],
       [-1.39691891, -0.44914029, -1.56079947, ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.50826617,  1.46125596, -0.22635036, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.05550018,  1.16884837,  0.4754918 , ...,  0.        ,
         0.        ,  0.        ],
       [ 1.46085871,  0.91542846,  0.63024066, ...,  0.        ,
         0.        ,  0.        ]])