# One Hot Encoding - variables with many (multiple) categories


In [1]:
import pandas as pd
import numpy as np

# Load only the six columns X1..X6 from the CSV; the other columns are not needed here.
df = pd.read_csv('mercedesbenz.csv', usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [2]:
# Print the distinct labels found in every column, one array per column.
for col, labels in df.items():
    print(labels.unique())

['v' 't' 'w' 'b' 'r' 'l' 's' 'aa' 'c' 'a' 'e' 'h' 'z' 'j' 'o' 'u' 'p' 'n'
 'i' 'y' 'd' 'f' 'm' 'k' 'g' 'q' 'ab']
['at' 'av' 'n' 'e' 'as' 'aq' 'r' 'ai' 'ak' 'm' 'a' 'k' 'ae' 's' 'f' 'd'
 'ag' 'ay' 'ac' 'ap' 'g' 'i' 'aw' 'y' 'b' 'ao' 'al' 'h' 'x' 'au' 't' 'an'
 'z' 'ah' 'p' 'am' 'j' 'q' 'af' 'l' 'aa' 'c' 'o' 'ar']
['a' 'e' 'c' 'f' 'd' 'b' 'g']
['d' 'b' 'c' 'a']
['u' 'y' 'x' 'h' 'g' 'f' 'j' 'i' 'd' 'c' 'af' 'ag' 'ab' 'ac' 'ad' 'ae'
 'ah' 'l' 'k' 'n' 'm' 'p' 'q' 's' 'r' 'v' 'w' 'o' 'aa']
['j' 'l' 'd' 'h' 'i' 'a' 'g' 'c' 'k' 'e' 'f' 'b']


In [3]:
# Distinct labels of X1.
df['X1'].unique()

array(['v', 't', 'w', 'b', 'r', 'l', 's', 'aa', 'c', 'a', 'e', 'h', 'z',
       'j', 'o', 'u', 'p', 'n', 'i', 'y', 'd', 'f', 'm', 'k', 'g', 'q',
       'ab'], dtype=object)

In [4]:
# Number of distinct labels in X1 (dropna=False so a missing value would count as a label too).
df['X1'].nunique(dropna=False)

27

In [5]:
# Distinct labels of X2.
df['X2'].unique()

array(['at', 'av', 'n', 'e', 'as', 'aq', 'r', 'ai', 'ak', 'm', 'a', 'k',
       'ae', 's', 'f', 'd', 'ag', 'ay', 'ac', 'ap', 'g', 'i', 'aw', 'y',
       'b', 'ao', 'al', 'h', 'x', 'au', 't', 'an', 'z', 'ah', 'p', 'am',
       'j', 'q', 'af', 'l', 'aa', 'c', 'o', 'ar'], dtype=object)

In [6]:
# Number of distinct labels in X2 (dropna=False so a missing value would count as a label too).
df['X2'].nunique(dropna=False)

44

In [7]:
# Let's have a look at how many unique labels each variable has
for col in df.columns:
    n_labels = df[col].nunique(dropna=False)
    print(col, ': ', n_labels, ' labels')

X1 :  27  labels
X2 :  44  labels
X3 :  7  labels
X4 :  4  labels
X5 :  29  labels
X6 :  12  labels


In [8]:
# (rows, columns) of the frame before any encoding is applied.
df.shape

(4209, 6)

In [9]:
# let's examine how many columns we would obtain after one hot encoding all of these variables
# (drop_first=True drops one dummy per variable, so a variable with k labels yields k-1 columns)
pd.get_dummies(df, drop_first=True).shape

(4209, 117)

We can observe that, starting from just 6 categorical features, one hot encoding produces 117 new features.

What can we do instead?

## KDD Cup Orange Challenge

PDF Link:
http://proceedings.mlr.press/v7/niculescu09/niculescu09.pdf

The winning solution of the KDD Cup 2009 is described in "Winning the KDD Cup Orange Challenge with Ensemble Selection".

The team suggested taking the 10 most frequent labels of a variable and converting only those into dummy variables using one hot encoding.

In [10]:
# let's look at the 15 most frequent categories of the variable X2
# (only the first 10 of them are the ones turned into dummy variables later on)
df.X2.value_counts().sort_values(ascending=False).head(15)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
Name: X2, dtype: int64

In [11]:
# Collect the 10 most frequent categories of X1, most frequent first, as a plain list
top_10_labels = (
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
top_10_labels

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [12]:
# build the dummy variables for the most frequent labels of ONE categorical variable

def one_hot_encoding_top_x(df, variable, top_x_labels):
    """Add a 0/1 dummy column to ``df`` for each label in ``top_x_labels``.

    For every label a new column named ``<variable>_<label>`` is created,
    holding 1 in the rows where ``df[variable]`` equals that label and 0
    everywhere else. Rows whose label is not in ``top_x_labels`` therefore
    get 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified IN PLACE; the original ``variable``
        column is kept.
    variable : column label
        Name of the categorical column to encode.
    top_x_labels : iterable
        Labels to create dummy columns for (e.g. the N most frequent ones).
        The caller chooses how many labels are encoded.

    Returns
    -------
    None
    """
    # Look the source column up once; it does not change while dummies are added.
    values = df[variable]

    for label in top_x_labels:
        # str.format instead of ``+`` so that non-string labels or column
        # names (e.g. integer categories) do not raise TypeError.
        column_name = '{}_{}'.format(variable, label)
        df[column_name] = np.where(values == label, 1, 0)

In [13]:
# Read the six columns again so we start from a fresh frame
df = pd.read_csv(
    'mercedesbenz.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Add one 0/1 dummy column for each of the 10 most frequent X1 labels
one_hot_encoding_top_x(df, 'X1', top_10_labels)
df.head(10)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,1,0,0,0,0,0
5,b,e,c,d,g,h,0,0,1,0,0,0,0,0,0,0
6,r,e,f,d,f,h,0,0,0,0,0,1,0,0,0,0
7,l,as,f,d,f,j,0,0,0,1,0,0,0,0,0,0
8,s,as,e,d,f,i,0,1,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,1,0,0,0,0,0,0,0


In [14]:
# Repeat the procedure for X2: take its 10 most frequent labels and encode them
top_10_labels = (
    df['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
one_hot_encoding_top_x(df, 'X2', top_10_labels)
df.head(10)

# The same steps can be applied to the remaining variables X3, X4, X5 and X6

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
5,b,e,c,d,g,h,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
6,r,e,f,d,f,h,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,l,as,f,d,f,j,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
8,s,as,e,d,f,i,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
9,b,aq,c,d,f,a,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


## Merits and Demerits of this technique:

###### Merits:

    • Straightforward to implement.
    • Does not require extensive efforts of variable exploration.
    • Does not expand the feature space massively (the number of new columns per variable is limited to only the top 10).

###### Demerits:

    • Does not make variable more predictive.
    • Does not keep the information of the ignored values (labels).