# One Hot Encoding

### Data Set: https://www.kaggle.com/aditya1702/mercedes-benz-data-exploration/data


In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read only the feature columns X1-X6 that this walkthrough uses.
traindf = pd.read_csv(
    'data/Mercedes Benz/train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
print('Shape: ', traindf.shape)

Shape:  (4209, 6)


In [3]:
# Preview the first five rows of the loaded columns.
traindf.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


## Counting unique labels under all columns

In [4]:
# Running total of distinct labels over all columns, i.e. how many indicator
# columns a full one-hot encoding of every column would produce.
_colSize = 0
for col in traindf.columns:
    # Compute the distinct-label count once and reuse it for the report and the total.
    n_labels = len(traindf[col].unique())
    print(col, ': ', n_labels, ' labels')
    _colSize += n_labels
print('Total possible cols: ', _colSize)

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels
Total possible cols:  123


## One hot encoding using pandas

#### Column X2 is selected for one hot encoding.
#### X2 has 44 distinct labels, so the returned frame replaces X2 with 44 indicator columns (a net gain of 43). `traindf` itself is left unchanged.

In [5]:
# dtype=int keeps the indicators as 0/1 integers, matching the output shown below;
# pandas >= 2.0 would otherwise return boolean columns.
pd.get_dummies(traindf, columns=['X2'], prefix='X2', prefix_sep='_', dtype=int).head()

Unnamed: 0,X1,X3,X4,X5,X6,X2_a,X2_aa,X2_ac,X2_ae,X2_af,...,X2_n,X2_o,X2_p,X2_q,X2_r,X2_s,X2_t,X2_x,X2_y,X2_z
0,v,a,d,u,j,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,e,d,y,l,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,c,d,x,j,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,t,f,d,x,l,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,v,f,d,h,d,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## Defining a function to perform one hot encoding
 
#### Our function accepts
#### 1. `df` - the data frame to encode (it is modified in place),
#### 2. `columns` - a list of the column names to encode,
#### 3. `top` - how many of the most frequent labels to encode per column (the default, -1, encodes every label)

In [6]:
# one hot encoding function
def one_hot_encoding(df, columns: list, top=-1):
    """Add 0/1 indicator columns to ``df`` in place for the labels of ``columns``.

    For every column named in ``columns``, a new column ``"<column>_<label>"``
    is written to ``df`` for each selected label. It holds 1 where the row's
    value equals the label and 0 elsewhere. The source columns are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    columns : list of str, or a single str
        Name(s) of the columns to encode.
    top : int or None, default -1
        Number of most frequent labels to encode per column. ``-1`` (or
        ``None``) encodes every distinct non-missing label.

    Returns
    -------
    None
        The result is delivered by mutating ``df``.
    """
    if isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    for col in columns:
        if top is None or top == -1:
            # Missing values are not labels: an equality test against NaN never
            # matches, so they are dropped instead of being given a column.
            labels = df[col].dropna().unique()
        else:
            # value_counts() orders labels by frequency and skips missing values.
            labels = df[col].value_counts().head(top).index

        for label in labels:
            # str() lets non-string labels (e.g. integers) form a column name.
            df[str(col) + "_" + str(label)] = np.where(df[col] == label, 1, 0)
    

#### Encoding only the most frequent labels keeps the number of added columns small.

In [7]:
# Encode only the two most frequent labels of X3 and X4; traindf gains the columns in place.
one_hot_encoding(traindf, columns=["X3", "X4"], top=2)

In [8]:
# Preview the frame again to see the indicator columns appended by the previous cell.
traindf.head(n=5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X3_c,X3_f,X4_d,X4_a
0,v,at,a,d,u,j,0,0,1,0
1,t,av,e,d,y,l,0,0,1,0
2,w,n,c,d,x,j,1,0,1,0
3,t,n,f,d,x,l,0,1,1,0
4,v,n,f,d,h,d,0,1,1,0
