<a href="https://colab.research.google.com/github/dojun43/ML-Example/blob/master/One-HotEncoding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library

In [1]:
import numpy as np
import pandas as pd 
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

# Data

In [2]:
titanic = sns.load_dataset('titanic')
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# One-Hot Encoding

- 한 컬럼에 적용

In [3]:
labels = titanic[['class']].values
labels = np.array(labels).reshape(-1,1) # 2차원으로 변환

onehot_encoder = OneHotEncoder()
onehot_encoder.fit(labels)
labels = onehot_encoder.transform(labels)

In [4]:
print('One-Hot Encoding data')
print(labels.toarray(),'\n')
print('One-Hot Encoding data categories')
print(onehot_encoder.categories_, '\n')
print('One-Hot Encoding data shape')
print(labels.shape)

One-Hot Encoding data
[[0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]
 ...
 [0. 0. 1.]
 [1. 0. 0.]
 [0. 0. 1.]] 

One-Hot Encoding data categories
[array(['First', 'Second', 'Third'], dtype=object)] 

One-Hot Encoding data shape
(891, 3)


In [5]:
labels_df = pd.DataFrame(labels.toarray(), columns=['class_' + col for col in onehot_encoder.categories_[0]])
titanic = pd.concat([titanic, labels_df], axis=1)
titanic.drop(['class'], axis=1, inplace = True)

In [6]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,who,adult_male,deck,embark_town,alive,alone,class_First,class_Second,class_Third
0,0,3,male,22.0,1,0,7.25,S,man,True,,Southampton,no,False,0.0,0.0,1.0
1,1,1,female,38.0,1,0,71.2833,C,woman,False,C,Cherbourg,yes,False,1.0,0.0,0.0
2,1,3,female,26.0,0,0,7.925,S,woman,False,,Southampton,yes,True,0.0,0.0,1.0
3,1,1,female,35.0,1,0,53.1,S,woman,False,C,Southampton,yes,False,1.0,0.0,0.0
4,0,3,male,35.0,0,0,8.05,S,man,True,,Southampton,no,True,0.0,0.0,1.0


- 여러 컬럼에 적용

In [7]:
titanic = sns.load_dataset('titanic')

In [8]:
col_list = ['class', 'who'] # one-hot encoding 적용할 column
labels = titanic[col_list].values
labels = np.array(labels).reshape(-1,1) # 2차원으로 변환

onehot_encoder = OneHotEncoder()
onehot_encoder.fit(labels)
labels = onehot_encoder.transform(labels)

In [9]:
print('One-Hot Encoding data')
print(labels.toarray(),'\n')
print('One-Hot Encoding data categories')
print(onehot_encoder.categories_, '\n')
print('One-Hot Encoding data shape')
print(labels.shape)

One-Hot Encoding data
[[0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0.]
 ...
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]] 

One-Hot Encoding data categories
[array(['First', 'Second', 'Third', 'child', 'man', 'woman'], dtype=object)] 

One-Hot Encoding data shape
(1782, 6)


In [10]:
labels_df = pd.DataFrame(labels.toarray(), columns=[col for col in onehot_encoder.categories_[0]])
titanic = pd.concat([titanic, labels_df], axis=1)
titanic.drop(col_list, axis=1, inplace = True)

In [11]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,adult_male,deck,embark_town,alive,alone,First,Second,Third,child,man,woman
0,0.0,3.0,male,22.0,1.0,0.0,7.25,S,True,,Southampton,no,False,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,1.0,female,38.0,1.0,0.0,71.2833,C,False,C,Cherbourg,yes,False,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,3.0,female,26.0,0.0,0.0,7.925,S,False,,Southampton,yes,True,1.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,female,35.0,1.0,0.0,53.1,S,False,C,Southampton,yes,False,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,3.0,male,35.0,0.0,0.0,8.05,S,True,,Southampton,no,True,0.0,0.0,1.0,0.0,0.0,0.0


# get_dummies()

In [12]:
titanic = sns.load_dataset('titanic')

In [13]:
titanic = pd.get_dummies(titanic)
titanic.head()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare,adult_male,alone,sex_female,sex_male,...,deck_C,deck_D,deck_E,deck_F,deck_G,embark_town_Cherbourg,embark_town_Queenstown,embark_town_Southampton,alive_no,alive_yes
0,0,3,22.0,1,0,7.25,True,False,0,1,...,0,0,0,0,0,0,0,1,1,0
1,1,1,38.0,1,0,71.2833,False,False,1,0,...,1,0,0,0,0,1,0,0,0,1
2,1,3,26.0,0,0,7.925,False,True,1,0,...,0,0,0,0,0,0,0,1,0,1
3,1,1,35.0,1,0,53.1,False,False,1,0,...,1,0,0,0,0,0,0,1,0,1
4,0,3,35.0,0,0,8.05,True,True,0,1,...,0,0,0,0,0,0,0,1,1,0
