##### Handling categorical features

### One Hot Encoding

In [1]:
import pandas as pd

In [2]:
# Read just the 'Sex' column from the Titanic CSV and preview the first rows.
df = pd.read_csv(
    'titanic.csv',
    usecols=['Sex'],
)
df.head()

Unnamed: 0,Sex
0,male
1,female
2,female
3,female
4,male


In [3]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

Sex    0
dtype: int64

In [4]:
# One-hot encode the frame: one indicator column per category.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 otherwise returns True/False.
pd.get_dummies(df, dtype=int).head()

Unnamed: 0,Sex_female,Sex_male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [5]:
# drop_first=True omits the first category's column: it is implied when every
# remaining indicator is 0. dtype=int keeps the output as 0/1 (pandas >= 2.0
# defaults to bool).
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Sex_male
0,1
1,0
2,0
3,0
4,1


In [6]:
# Reload the Titanic CSV, this time keeping only the 'Embarked' column.
df = pd.read_csv(
    'titanic.csv',
    usecols=['Embarked'],
)
df.head()

Unnamed: 0,Embarked
0,S
1,C
2,S
3,S
4,S


In [7]:
# Distinct values found in the Embarked column (a missing value shows up as nan).
df.Embarked.unique()

array(['S', 'C', 'Q', nan], dtype=object)

In [8]:
# Drop rows with a missing value. Rebinding the result (instead of
# inplace=True) avoids mutating the frame in place.
df = df.dropna()

In [9]:
# Keep an indicator column for every Embarked category.
# dtype=int keeps the indicators as 0/1; pandas >= 2.0 otherwise returns True/False.
pd.get_dummies(df, drop_first=False, dtype=int).head()

Unnamed: 0,Embarked_C,Embarked_Q,Embarked_S
0,0,0,1
1,1,0,0
2,0,0,1
3,0,0,1
4,0,0,1


In [10]:
# Drop the first category's column; a row with 0 in every remaining indicator
# belongs to the dropped category. dtype=int keeps the output as 0/1
# (pandas >= 2.0 defaults to bool).
pd.get_dummies(df, drop_first=True, dtype=int).head()

Unnamed: 0,Embarked_Q,Embarked_S
0,0,1
1,0,0
2,0,1
3,0,1
4,0,1


### One-hot encoding with many categories in a feature

In [12]:
# Load the seven categorical columns X0..X6 and preview the first rows.
categorical_columns = ['X0', 'X1', 'X2', 'X3', 'X4', 'X5', 'X6']
df = pd.read_csv('Mercedies.csv', usecols=categorical_columns)
df.head()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6
0,k,v,at,a,d,u,j
1,k,t,av,e,d,y,l
2,az,w,n,c,d,x,j
3,az,t,n,f,d,x,l
4,az,v,n,f,d,h,d


In [14]:
# (number of rows, number of columns) of the loaded frame
df.shape

(4209, 7)

In [16]:
# Frequency of every category in X0, most common first.
# To inspect all columns instead, loop over df.columns and print
# df[col].value_counts() for each one.
df.X0.value_counts()

z     360
ak    349
y     324
ay    313
t     306
x     300
o     269
f     227
n     195
w     182
j     181
az    175
aj    151
s     106
ap    103
h      75
d      73
al     67
v      36
af     35
ai     34
m      34
e      32
ba     27
at     25
a      21
ax     19
am     18
aq     18
i      18
u      17
l      16
aw     16
ad     14
k      11
au     11
b      11
as     10
r      10
bc      6
ao      4
c       3
q       2
aa      2
g       1
ab      1
ac      1
Name: X0, dtype: int64

In [17]:
# Distinct category labels that occur in X0, in order of first appearance.
df.X0.unique()

array(['k', 'az', 't', 'al', 'o', 'w', 'j', 'h', 's', 'n', 'ay', 'f', 'x',
       'y', 'aj', 'ak', 'am', 'z', 'q', 'at', 'ap', 'v', 'af', 'a', 'e',
       'ai', 'd', 'aq', 'c', 'aa', 'ba', 'as', 'i', 'r', 'b', 'ax', 'bc',
       'u', 'ad', 'au', 'm', 'l', 'aw', 'ao', 'ac', 'g', 'ab'],
      dtype=object)

In [18]:
# Number of distinct categories in each column, printed one count per line.
for column in df.columns:
    distinct_values = df[column].unique()
    print(len(distinct_values))
    

47
27
44
7
4
29
12


### As we can see, these features have a large number of categories, so plain one-hot encoding would create too many columns. Instead, we keep only the most frequent categories (here, the top 10) and drop the rest

In [19]:
# Counts of the 10 most frequent categories in X1.
x1_counts = df['X1'].value_counts()
x1_counts.sort_values(ascending=False).head(10)

aa    833
s     598
b     592
l     590
v     408
r     251
i     203
a     143
c     121
o      82
Name: X1, dtype: int64

In [20]:
# Labels of the 10 most frequent X1 categories, as a plain Python list.
top_10 = (
    df['X1']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
top_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [22]:
# Labels of the 10 most frequent X1 categories, built as a list in one expression.
lst_10 = list(df['X1'].value_counts().sort_values(ascending=False).head(10).index)

In [23]:
# Display the list of top-10 category labels.
lst_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [24]:
import numpy as np
# Add a 0/1 indicator column for each of the ten most frequent X1 categories;
# rows whose X1 value is not in the list get 0 in every indicator.
for category in lst_10:
    is_match = df['X1'].eq(category)
    df[category] = np.where(is_match, 1, 0)
    

In [27]:
# Original X1 values next to their top-10 indicator columns, first 20 rows.
df.loc[:, ['X1', *top_10]].head(20)

Unnamed: 0,X1,aa,s,b,l,v,r,i,a,c,o
0,v,0,0,0,0,1,0,0,0,0,0
1,t,0,0,0,0,0,0,0,0,0,0
2,w,0,0,0,0,0,0,0,0,0,0
3,t,0,0,0,0,0,0,0,0,0,0
4,v,0,0,0,0,1,0,0,0,0,0
5,b,0,0,1,0,0,0,0,0,0,0
6,r,0,0,0,0,0,1,0,0,0,0
7,l,0,0,0,1,0,0,0,0,0,0
8,s,0,1,0,0,0,0,0,0,0,0
9,b,0,0,1,0,0,0,0,0,0,0
