# Sklearn One Hot Encode
This notebook is aimed at those who want to know a little more about the process of one-hot encoding. It's simple but useful (at least for me), and it helped me understand and consolidate the concept of categorical encoding.

The data is part of the competition ['30 Days of ML' from Kaggle](https://www.kaggle.com/c/30-days-of-ml/overview). 
Now, let's read the data and put our hands on it!

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Path to the competition's training CSV, relative to the working directory.
filepath = '../data/train.csv'
df = pd.read_csv(filepath_or_buffer=filepath)

# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,B,B,B,C,B,B,A,E,C,...,0.400361,0.160266,0.310921,0.38947,0.267559,0.237281,0.377873,0.322401,0.86985,8.113634
1,2,B,B,A,A,B,D,A,F,A,...,0.533087,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233
2,3,A,A,A,C,B,D,A,D,A,...,0.650609,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351
3,4,B,B,A,C,B,D,A,E,C,...,0.66898,0.239061,0.732948,0.679618,0.574844,0.34601,0.71461,0.54015,0.280682,8.049253
4,6,A,A,A,C,B,D,A,E,A,...,0.686964,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.97226


In [3]:
# Categorical features are the columns pandas stored with the generic `object` dtype.
# Compare every column dtype at once, then keep the names where the comparison holds.
is_object_col = df.dtypes == 'object'
cat_cols = is_object_col[is_object_col].index.tolist()

df[cat_cols]

Unnamed: 0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9
0,B,B,B,C,B,B,A,E,C,N
1,B,B,A,A,B,D,A,F,A,O
2,A,A,A,C,B,D,A,D,A,F
3,B,B,A,C,B,D,A,E,C,K
4,A,A,A,C,B,D,A,E,A,N
...,...,...,...,...,...,...,...,...,...,...
299995,B,B,A,A,B,D,A,E,A,I
299996,A,B,A,C,B,B,A,E,E,F
299997,B,B,A,C,B,C,A,E,G,F
299998,A,B,A,C,B,B,A,E,E,I


In [4]:
# For each categorical column, collect the distinct labels it contains.
# Series.unique() returns them in order of first appearance, not sorted.
encoder_categories = [df[name].unique() for name in cat_cols]

encoder_categories

[array(['B', 'A'], dtype=object),
 array(['B', 'A'], dtype=object),
 array(['B', 'A'], dtype=object),
 array(['C', 'A', 'D', 'B'], dtype=object),
 array(['B', 'C', 'A', 'D'], dtype=object),
 array(['B', 'D', 'C', 'A'], dtype=object),
 array(['A', 'B', 'C', 'H', 'D', 'I', 'G', 'E'], dtype=object),
 array(['E', 'F', 'D', 'B', 'G', 'C', 'A', 'I'], dtype=object),
 array(['C', 'A', 'G', 'E', 'F', 'D', 'B'], dtype=object),
 array(['N', 'O', 'F', 'K', 'M', 'I', 'G', 'H', 'L', 'B', 'A', 'J', 'D',
        'C', 'E'], dtype=object)]

In [5]:
# Build the encoder with the explicit per-column category lists collected above, so the
# one-hot columns follow that order. handle_unknown='ignore' makes labels that were not
# seen during fit encode as all zeros at transform time instead of raising.
ohe_params = dict(categories=encoder_categories, handle_unknown='ignore')

# The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and removed in
# 1.4. Try the current name first and fall back to the old one on earlier releases, so
# the cell runs on both. Either way we ask for a dense NumPy array, not a sparse matrix.
try:
    encoder_ohe = OneHotEncoder(sparse_output=False, **ohe_params)
except TypeError:
    encoder_ohe = OneHotEncoder(sparse=False, **ohe_params)

df_ohe = encoder_ohe.fit_transform(df[cat_cols])

In [6]:
# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2
# in favour of get_feature_names_out(). Keep using the original method where it still
# exists, because it produces the generic "x<i>_<category>" names this cell illustrates,
# and fall back to the replacement on newer releases.
# NOTE: called without arguments on an encoder fitted on a DataFrame, the replacement
# uses the DataFrame's column names as prefixes instead of "x<i>".
if hasattr(encoder_ohe, 'get_feature_names'):
    default_names = encoder_ohe.get_feature_names()
else:
    default_names = encoder_ohe.get_feature_names_out()

default_names

array(['x0_B', 'x0_A', 'x1_B', 'x1_A', 'x2_B', 'x2_A', 'x3_C', 'x3_A',
       'x3_D', 'x3_B', 'x4_B', 'x4_C', 'x4_A', 'x4_D', 'x5_B', 'x5_D',
       'x5_C', 'x5_A', 'x6_A', 'x6_B', 'x6_C', 'x6_H', 'x6_D', 'x6_I',
       'x6_G', 'x6_E', 'x7_E', 'x7_F', 'x7_D', 'x7_B', 'x7_G', 'x7_C',
       'x7_A', 'x7_I', 'x8_C', 'x8_A', 'x8_G', 'x8_E', 'x8_F', 'x8_D',
       'x8_B', 'x9_N', 'x9_O', 'x9_F', 'x9_K', 'x9_M', 'x9_I', 'x9_G',
       'x9_H', 'x9_L', 'x9_B', 'x9_A', 'x9_J', 'x9_D', 'x9_C', 'x9_E'],
      dtype=object)

In [7]:
# Passing the original column names gives "<column>_<category>" output names.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() is its
# replacement and returns the same names here, so prefer it and fall back to the old
# method only on releases that predate it.
if hasattr(encoder_ohe, 'get_feature_names_out'):
    named_features = encoder_ohe.get_feature_names_out(cat_cols)
else:
    named_features = encoder_ohe.get_feature_names(cat_cols)

named_features

array(['cat0_B', 'cat0_A', 'cat1_B', 'cat1_A', 'cat2_B', 'cat2_A',
       'cat3_C', 'cat3_A', 'cat3_D', 'cat3_B', 'cat4_B', 'cat4_C',
       'cat4_A', 'cat4_D', 'cat5_B', 'cat5_D', 'cat5_C', 'cat5_A',
       'cat6_A', 'cat6_B', 'cat6_C', 'cat6_H', 'cat6_D', 'cat6_I',
       'cat6_G', 'cat6_E', 'cat7_E', 'cat7_F', 'cat7_D', 'cat7_B',
       'cat7_G', 'cat7_C', 'cat7_A', 'cat7_I', 'cat8_C', 'cat8_A',
       'cat8_G', 'cat8_E', 'cat8_F', 'cat8_D', 'cat8_B', 'cat9_N',
       'cat9_O', 'cat9_F', 'cat9_K', 'cat9_M', 'cat9_I', 'cat9_G',
       'cat9_H', 'cat9_L', 'cat9_B', 'cat9_A', 'cat9_J', 'cat9_D',
       'cat9_C', 'cat9_E'], dtype=object)

In [8]:
# Show the encoder's raw output: one row per input row and one 0/1 column per
# (feature, category) pair. Despite the `df_` prefix this is the array returned by
# fit_transform, not a DataFrame.
df_ohe

array([[1., 0., 1., ..., 0., 0., 0.],
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 1., ..., 0., 0., 0.],
       [0., 1., 1., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]])

In [9]:
# Label the encoded array with "<column>_<category>" names so it reads as a DataFrame.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out() when
# the installed release provides it and fall back to the old method otherwise. Both
# return the same names when given the original column names.
if hasattr(encoder_ohe, 'get_feature_names_out'):
    encoded_columns = encoder_ohe.get_feature_names_out(cat_cols)
else:
    encoded_columns = encoder_ohe.get_feature_names(cat_cols)

df_encoded = pd.DataFrame(df_ohe, columns=encoded_columns)
df_encoded

Unnamed: 0,cat0_B,cat0_A,cat1_B,cat1_A,cat2_B,cat2_A,cat3_C,cat3_A,cat3_D,cat3_B,...,cat9_I,cat9_G,cat9_H,cat9_L,cat9_B,cat9_A,cat9_J,cat9_D,cat9_C,cat9_E
0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,1.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299996,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299997,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
299998,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
