# One hot encoding - variables with many categories

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Mercedes dataset from the working directory and preview its first rows
data = pd.read_csv('mercedes.csv')
data.head(5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# (number of rows, number of columns) of the loaded frame
data.shape

(4209, 378)

In [8]:
# Names of the object-dtype columns; the notebook treats these as the categorical features
categorical_fea = [col for col, dtype in data.dtypes.items() if dtype == 'O']
data[categorical_fea].head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [14]:
# Number of columns selected by categorical_fea
data[categorical_fea].shape[1]

8

In [12]:
# How many distinct labels does each categorical column contain?
for col in categorical_fea:
    n_labels = len(data[col].unique())
    print(col, ':', n_labels, 'labels')

X0 : 47 labels
X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels
X8 : 25 labels


In [16]:
# Shape of the frame that one-hot encoding all categorical columns would produce
# (drop_first=True omits the first dummy of every variable)
pd.get_dummies(data=data[categorical_fea], drop_first=True).shape

(4209, 187)

In [17]:
# Show the 20 most frequent categories of the X2 feature, largest count first
data.X2.value_counts().sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
z       19
ag      19
Name: X2, dtype: int64

In [20]:
# Labels of the 10 most frequent X2 categories, as a plain Python list

top_10 = list(data.X2.value_counts().sort_values(ascending=False).head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [23]:
# Add one 0/1 indicator column to `data` per label in top_10
# (1 where X2 equals that label, 0 otherwise); each column is named after its label

for label in top_10:
    is_label = data['X2'].eq(label)
    data[label] = np.where(is_label, 1, 0)

data[['X2', *top_10]].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [37]:
# Get the whole set of dummy variables for all the categorical variables

def one_hot_top_x(df,variable,top_x_labels):
    """Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every label a column named ``<variable>_<label>`` is created on ``df``
    in place. The column holds 1 where ``df[variable]`` equals the label and 0
    everywhere else, so rows whose label is not in ``top_x_labels`` get 0 in
    all of the new columns. The number of encoded labels is controlled by the
    length of ``top_x_labels``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column ``variable``; it is modified in place.
    variable : str
        Name of the categorical column to encode.
    top_x_labels : iterable of str
        Labels to create indicator columns for (labels are concatenated into
        the new column names, so they must be strings).

    Returns
    -------
    None
    """
    for label in top_x_labels:
        # Compare against the frame that was passed in rather than the
        # notebook-level `data`, so the result does not depend on global state.
        df[variable+'_'+label]=np.where(df[variable]==label,1,0)
        
# Work on an independent copy of the categorical columns, so that adding the
# indicator columns does not write to a slice of `data`
df=data[categorical_fea].copy()

#encode X2 into the 10 most frequent categories
one_hot_top_x(df,'X2',top_10)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X2_j,X2_s,X2_f,X2_n,X2_i,X2_e,X2_r,X2_a,X2_w,X2_v
0,k,v,at,a,d,u,j,o,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,0,1,0,0,0,0,0,0
3,az,t,n,f,d,x,l,e,0,0,0,1,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,0,1,0,0,0,0,0,0


In [39]:
# Repeat for X1: take its 10 most frequent labels and add their indicator columns to df
top_10 = list(data.X1.value_counts().sort_values(ascending=False).head(10).index)
print(top_10)

one_hot_top_x(df, 'X1', top_10)
df.head()

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X2_j,X2_s,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,k,v,at,a,d,u,j,o,0,0,...,0,0,0,0,1,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,0,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,0,1,0,0,0,0,0


___

In [47]:
# Re-read the CSV so that `data` holds only the columns stored in the file
data = pd.read_csv('mercedes.csv')
data.head(5)

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,k,v,at,a,d,u,j,o,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,k,t,av,e,d,y,l,o,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,az,w,n,c,d,x,j,x,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,az,t,n,f,d,x,l,e,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,az,v,n,f,d,h,d,n,...,0,0,0,0,0,0,0,0,0,0


In [58]:
# Keep only the object-dtype columns. The .copy() gives df1 its own data, so
# columns added to df1 afterwards are not written to a slice of `data`.
df1=data[data.dtypes[data.dtypes=='O'].index].copy()
df1.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [59]:
# Snapshot the column names before looping: the loop body adds columns to df1,
# and the set of columns to encode must not depend on what has been added so far.
for cols in list(df1.columns):
    # labels of the (up to) 10 most frequent categories of this column
    top_10=list(df1[cols].value_counts().sort_values(ascending=False).head(10).index)
    print(cols,':',top_10)
    one_hot_top_x(df1,cols,top_10)
df1.head()

X0 : ['z', 'ak', 'y', 'ay', 't', 'x', 'o', 'f', 'n', 'w']
X1 : ['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']
X2 : ['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']
X3 : ['c', 'f', 'a', 'd', 'g', 'e', 'b']
X4 : ['d', 'a', 'c', 'b']
X5 : ['w', 'v', 'q', 'r', 'd', 's', 'n', 'p', 'm', 'i']
X6 : ['g', 'j', 'd', 'i', 'l', 'a', 'h', 'k', 'c', 'b']
X8 : ['j', 's', 'f', 'n', 'i', 'e', 'r', 'a', 'w', 'v']


Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8,X0_z,X0_ak,...,X8_j,X8_s,X8_f,X8_n,X8_i,X8_e,X8_r,X8_a,X8_w,X8_v
0,k,v,at,a,d,u,j,o,0,0,...,0,0,0,0,0,0,0,0,0,0
1,k,t,av,e,d,y,l,o,0,0,...,0,0,0,0,0,0,0,0,0,0
2,az,w,n,c,d,x,j,x,0,0,...,0,0,0,0,0,0,0,0,0,0
3,az,t,n,f,d,x,l,e,0,0,...,0,0,0,0,0,1,0,0,0,0
4,az,v,n,f,d,h,d,n,0,0,...,0,0,0,1,0,0,0,0,0,0


In [61]:
# List every column of df1: the original columns followed by the generated <column>_<label> indicators
df1.columns

Index(['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X8', 'X0_z', 'X0_ak', 'X0_y',
       'X0_ay', 'X0_t', 'X0_x', 'X0_o', 'X0_f', 'X0_n', 'X0_w', 'X1_aa',
       'X1_s', 'X1_b', 'X1_l', 'X1_v', 'X1_r', 'X1_i', 'X1_a', 'X1_c', 'X1_o',
       'X2_as', 'X2_ae', 'X2_ai', 'X2_m', 'X2_ak', 'X2_r', 'X2_n', 'X2_s',
       'X2_f', 'X2_e', 'X3_c', 'X3_f', 'X3_a', 'X3_d', 'X3_g', 'X3_e', 'X3_b',
       'X4_d', 'X4_a', 'X4_c', 'X4_b', 'X5_w', 'X5_v', 'X5_q', 'X5_r', 'X5_d',
       'X5_s', 'X5_n', 'X5_p', 'X5_m', 'X5_i', 'X6_g', 'X6_j', 'X6_d', 'X6_i',
       'X6_l', 'X6_a', 'X6_h', 'X6_k', 'X6_c', 'X6_b', 'X8_j', 'X8_s', 'X8_f',
       'X8_n', 'X8_i', 'X8_e', 'X8_r', 'X8_a', 'X8_w', 'X8_v'],
      dtype='object')

___

## One hot encoding of top variables

### Advantages

- Straightforward to implement
- Does not require hours of variable exploration
- Does not massively expand the feature space (number of columns in the dataset)

### Disadvantages

- Does not add any information that may make the variables more predictive
- Does not keep the information of the ignored labels

Because it is not unusual for categorical variables to have a few dominating labels while the remaining labels add mostly noise, this is a simple and straightforward approach that may be useful on many occasions.

It is worth noting that keeping the top 10 labels is a totally arbitrary choice. You can also choose the top 5 or the top 20.

___

##### If we have categorical variables containing many labels (high cardinality), then one hot encoding will expand the feature space dramatically

Two types of categorical variables
1. Ordinal (categories are ranked) - the commonly used encoding technique is label encoding
2. Nominal (ranking does not matter) 

## Count or Frequency encoding (applied for high cardinality)


    This approach replaces each label of the categorical variable by its count, i.e. the number of times the label appears in the dataset,
    or by its frequency, i.e. the percentage of observations within that category.

In [65]:
# Reload the CSV and keep only its object-dtype columns
data = pd.read_csv('mercedes.csv')
data = data.loc[:, data.dtypes == 'O']
data.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X8
0,k,v,at,a,d,u,j,o
1,k,t,av,e,d,y,l,o
2,az,w,n,c,d,x,j,x
3,az,t,n,f,d,x,l,e
4,az,v,n,f,d,h,d,n


In [80]:
# Report the number of distinct labels per column, counted via the value_counts index
for col in data.columns:
    n_unique = len(data[col].value_counts().index)
    print('{} as {} unique labels in it'.format(col, n_unique))

X0 as 47 unique labels in it
X1 as 27 unique labels in it
X2 as 44 unique labels in it
X3 as 7 unique labels in it
X4 as 4 unique labels in it
X5 as 29 unique labels in it
X6 as 12 unique labels in it
X8 as 25 unique labels in it


In [86]:
# Same report as the previous cell, this time counting the labels with Series.unique()
for col in data.columns:
    n_unique = len(data[col].unique())
    print('{} as {} unique labels in it'.format(col, n_unique))

X0 as 47 unique labels in it
X1 as 27 unique labels in it
X2 as 44 unique labels in it
X3 as 7 unique labels in it
X4 as 4 unique labels in it
X5 as 29 unique labels in it
X6 as 12 unique labels in it
X8 as 25 unique labels in it


In [81]:
# Let's consider the two columns with the largest number of unique labels.
# .copy() makes df independent of `data`, because X2 is overwritten in df later on.
df=data[['X0','X2']].copy()
df.head()

Unnamed: 0,X0,X2
0,k,at
1,k,av
2,az,n
3,az,n
4,az,n


In [82]:
# (rows, columns) of the two-column frame
df.shape

(4209, 2)

In [84]:
# Shape of the frame obtained by one-hot encoding both columns of df
pd.get_dummies(df).shape

(4209, 91)

In [87]:
# Count the occurrences of every label in X2 and show them as a
# {label: count} dictionary that can be used to re-map the labels

df['X2'].value_counts().to_dict()

{'as': 1659,
 'ae': 496,
 'ai': 415,
 'm': 367,
 'ak': 265,
 'r': 153,
 'n': 137,
 's': 94,
 'f': 87,
 'e': 81,
 'aq': 63,
 'ay': 54,
 'a': 47,
 't': 29,
 'k': 25,
 'i': 25,
 'b': 21,
 'ao': 20,
 'z': 19,
 'ag': 19,
 'd': 18,
 'ac': 13,
 'g': 12,
 'y': 11,
 'ap': 11,
 'x': 10,
 'aw': 8,
 'at': 6,
 'h': 6,
 'q': 5,
 'al': 5,
 'an': 5,
 'p': 4,
 'ah': 4,
 'av': 4,
 'au': 3,
 'l': 1,
 'af': 1,
 'j': 1,
 'am': 1,
 'aa': 1,
 'o': 1,
 'ar': 1,
 'c': 1}

In [88]:
# Next step: replace each label in X2 by its count.
# First build the dictionary that maps every label to its count.

df_freq_map = df['X2'].value_counts().to_dict()

In [90]:
# First 10 rows of df, for comparison with the mapped version shown further down
df.head(10)

Unnamed: 0,X0,X2
0,k,at
1,k,av
2,az,n
3,az,n
4,az,n
5,t,e
6,al,e
7,o,as
8,w,as
9,j,aq


In [91]:
# Replace every X2 label in df by its count from df_freq_map.
# NOTE: this overwrites X2, so running the cell a second time would look up
# the counts themselves in df_freq_map instead of the original labels.
df['X2'] = df['X2'].map(df_freq_map)

df.head(10)

Unnamed: 0,X0,X2
0,k,6
1,k,4
2,az,137
3,az,137
4,az,137
5,t,81
6,al,81
7,o,1659
8,w,1659
9,j,63


___

### Advantages

1. It is very simple to implement
2. Does not increase the feature dimensional space

### Disadvantages

1. If some of the labels have the same count, they will be replaced by the same number and can no longer be told apart, so valuable information may be lost
2. Adds somewhat arbitrary numbers, and therefore weights, to the different labels, which may not be related to their predictive power

___

## Ordinal Encoding

In [1]:
import pandas as pd
import datetime

In [23]:
# Build a one-column frame holding the current timestamp and the 19 preceding
# days, newest first (the values depend on when the cell is run)
df_base = datetime.datetime.today()
df_date_list = [df_base - datetime.timedelta(days=offset) for offset in range(20)]
df = pd.DataFrame(df_date_list, columns=['day'])
df

Unnamed: 0,day
0,2022-05-18 15:58:07.813452
1,2022-05-17 15:58:07.813452
2,2022-05-16 15:58:07.813452
3,2022-05-15 15:58:07.813452
4,2022-05-14 15:58:07.813452
5,2022-05-13 15:58:07.813452
6,2022-05-12 15:58:07.813452
7,2022-05-11 15:58:07.813452
8,2022-05-10 15:58:07.813452
9,2022-05-09 15:58:07.813452


In [24]:
# Derive the weekday name (e.g. 'Monday') of every timestamp in the day column
df['day_of_week'] = df['day'].dt.day_name()
df.head(5)

Unnamed: 0,day,day_of_week
0,2022-05-18 15:58:07.813452,Wednesday
1,2022-05-17 15:58:07.813452,Tuesday
2,2022-05-16 15:58:07.813452,Monday
3,2022-05-15 15:58:07.813452,Sunday
4,2022-05-14 15:58:07.813452,Saturday


In [26]:
# Ordinal position of each weekday name: Monday -> 1 through Sunday -> 7
weekday_map = {
    name: position
    for position, name in enumerate(
        ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
        start=1,
    )
}

In [28]:
# Translate each weekday name into its ordinal position using weekday_map
df['day_ordinal'] = df['day_of_week'].map(weekday_map)
df

Unnamed: 0,day,day_of_week,day_ordinal
0,2022-05-18 15:58:07.813452,Wednesday,3
1,2022-05-17 15:58:07.813452,Tuesday,2
2,2022-05-16 15:58:07.813452,Monday,1
3,2022-05-15 15:58:07.813452,Sunday,7
4,2022-05-14 15:58:07.813452,Saturday,6
5,2022-05-13 15:58:07.813452,Friday,5
6,2022-05-12 15:58:07.813452,Thursday,4
7,2022-05-11 15:58:07.813452,Wednesday,3
8,2022-05-10 15:58:07.813452,Tuesday,2
9,2022-05-09 15:58:07.813452,Monday,1
