# Category Encoding

In [1]:
!pip install category_encoders --upgrade --quiet

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
plt.style.use('ggplot')
%matplotlib inline

In [3]:
df = pd.DataFrame(
    {
        'Size': np.random.choice(['XS', 'S', 'M', 'L', 'XL', 'XXL'], 10),
        'Brand': np.random.choice(['Nike', 'Puma', 'Adidas', 'Le Coq', 'Reebok'], 10),
    }
)

df.head()

Unnamed: 0,Size,Brand
0,XS,Nike
1,M,Puma
2,XS,Puma
3,XL,Puma
4,S,Le Coq


## Splitting the Dataset

In [4]:
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=.5)

In [5]:
df_train.shape, df_test.shape

((5, 2), (5, 2))

In [6]:
df_train

Unnamed: 0,Size,Brand
0,XS,Nike
3,XL,Puma
4,S,Le Coq
2,XS,Puma
9,XXL,Nike


In [7]:
df_test

Unnamed: 0,Size,Brand
1,M,Puma
7,S,Le Coq
5,XL,Nike
8,XL,Adidas
6,XXL,Nike


In [8]:
import category_encoders
from category_encoders.one_hot import OneHotEncoder

encoder = OneHotEncoder(use_cat_names=True, handle_unknown='return_nan')
X_train = encoder.fit_transform(df_train)
X_test = encoder.transform(df_test)

In [9]:
X_train

Unnamed: 0,Size_XS,Size_XL,Size_S,Size_XXL,Brand_Nike,Brand_Puma,Brand_Le Coq
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,1.0,0.0,0.0,0.0,0.0,1.0,0.0
9,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [10]:
X_test

Unnamed: 0,Size_XS,Size_XL,Size_S,Size_XXL,Brand_Nike,Brand_Puma,Brand_Le Coq
1,,,,,0.0,1.0,0.0
7,0.0,0.0,1.0,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0,1.0,0.0,0.0
8,0.0,1.0,0.0,0.0,,,
6,0.0,0.0,0.0,1.0,1.0,0.0,0.0


## Ordinal Encoder


In [11]:
from category_encoders.ordinal import OrdinalEncoder

oencoder = OrdinalEncoder(
    mapping=[
             {
                 'col':'Size',
              'mapping': {'XS': 1, 'S': 2, 'M': 3, 'L': 4, 'XL': 5},
             }
    ]
)

df_train.loc[:, 'Size[Ordinal_Encoded]'] = oencoder.fit_transform(
    df_train['Size']
)['Size'].values

df_test.loc[:, 'Size[Ordinal_Encoded]'] = oencoder.transform(
    df_test['Size']
)['Size'].values

In [12]:
df_train

Unnamed: 0,Size,Brand,Size[Ordinal_Encoded]
0,XS,Nike,1.0
3,XL,Puma,5.0
4,S,Le Coq,2.0
2,XS,Puma,1.0
9,XXL,Nike,-1.0


In [13]:
df_test

Unnamed: 0,Size,Brand,Size[Ordinal_Encoded]
1,M,Puma,3.0
7,S,Le Coq,2.0
5,XL,Nike,5.0
8,XL,Adidas,5.0
6,XXL,Nike,-1.0
