In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [17]:
# Install a global "ignore" filter: every warning raised after this cell is silenced.
# NOTE(review): this also hides deprecation notices from the imported libraries;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [18]:
# Toy dataset: five people, each with a name, a place of birth and an age.
df = pd.DataFrame(
    data=dict(
        Name=['Lucy', 'John', 'Tom', 'Mary', 'Mark'],
        PlaceOfBirth=['NewYork', 'London', 'HaNoi', 'BinhDuong', 'Dubai'],
        Age=[24, 29, 21, 30, 27],
    )
)
df

Unnamed: 0,Name,PlaceOfBirth,Age
0,Lucy,NewYork,24
1,John,London,29
2,Tom,HaNoi,21
3,Mary,BinhDuong,30
4,Mark,Dubai,27


In [19]:
# List the distinct PlaceOfBirth values, in order of first appearance.
df['PlaceOfBirth'].unique()

array(['NewYork', 'London', 'HaNoi', 'BinhDuong', 'Dubai'], dtype=object)

In [20]:
# PlaceOfBirth is a categorical variable with more than 3 unique values

In [21]:
from sklearn.preprocessing import OneHotEncoder

In [22]:
# Instantiate the encoder with its default arguments
encoder = OneHotEncoder()

In [23]:
# Encode the attribute. Pass a one-column DataFrame (double brackets), not a
# Series, then convert the encoder's result to a dense NumPy array.
df_encode = (
    encoder
    .fit(df[['PlaceOfBirth']])
    .transform(df[['PlaceOfBirth']])
    .toarray()
)
df_encode

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [24]:
# Inspect the categories learned during fitting: a list holding one array per
# encoded input column; the one-hot columns follow this order.
encoder.categories_

[array(['BinhDuong', 'Dubai', 'HaNoi', 'London', 'NewYork'], dtype=object)]

In [25]:
# Wrap the encoded matrix in a DataFrame, naming each column after its category.
df_encode = pd.DataFrame(data=df_encode, columns=list(encoder.categories_[0]))
df_encode

Unnamed: 0,BinhDuong,Dubai,HaNoi,London,NewYork
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [26]:
# Append the one-hot columns to the right of the original columns (aligned on the index).
df_new = pd.concat(objs=[df, df_encode], axis='columns')
df_new

Unnamed: 0,Name,PlaceOfBirth,Age,BinhDuong,Dubai,HaNoi,London,NewYork
0,Lucy,NewYork,24,0.0,0.0,0.0,0.0,1.0
1,John,London,29,0.0,0.0,0.0,1.0,0.0
2,Tom,HaNoi,21,0.0,0.0,1.0,0.0,0.0
3,Mary,BinhDuong,30,1.0,0.0,0.0,0.0,0.0
4,Mark,Dubai,27,0.0,1.0,0.0,0.0,0.0


In [27]:
# To avoid column-name clashes -> add a prefix to the newly created columns

In [28]:
# Same encoding, this time fed a raw (n_rows, 1) NumPy array instead of a
# one-column DataFrame. Note that this refits `encoder`.
df_encode2 = encoder.fit_transform(
    df['PlaceOfBirth'].to_numpy()[:, np.newaxis]
).toarray()
df_encode2

array([[0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [29]:
# Convert to a DataFrame, prefixing every category name with 'place_'.
df_encode2 = pd.DataFrame(
    data=df_encode2,
    columns=[f'place_{category}' for category in encoder.categories_[0]],
)
df_encode2

Unnamed: 0,place_BinhDuong,place_Dubai,place_HaNoi,place_London,place_NewYork
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [30]:
# Append the prefixed one-hot columns to the right of the original columns.
df_new2 = pd.concat(objs=[df, df_encode2], axis='columns')
df_new2

Unnamed: 0,Name,PlaceOfBirth,Age,place_BinhDuong,place_Dubai,place_HaNoi,place_London,place_NewYork
0,Lucy,NewYork,24,0.0,0.0,0.0,0.0,1.0
1,John,London,29,0.0,0.0,0.0,1.0,0.0
2,Tom,HaNoi,21,0.0,0.0,1.0,0.0,0.0
3,Mary,BinhDuong,30,1.0,0.0,0.0,0.0,0.0
4,Mark,Dubai,27,0.0,1.0,0.0,0.0,0.0


In [31]:
# Drop one indicator column: the remaining four still identify every place,
# because a row of all zeros now stands for the dropped category (BinhDuong).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been removed no longer raises KeyError.
df_encode2 = df_encode2.drop(columns=['place_BinhDuong'], errors='ignore')

In [32]:
# Append the reduced set of one-hot columns to the right of the original columns.
df_new3 = pd.concat(objs=[df, df_encode2], axis='columns')
df_new3

Unnamed: 0,Name,PlaceOfBirth,Age,place_Dubai,place_HaNoi,place_London,place_NewYork
0,Lucy,NewYork,24,0.0,0.0,0.0,1.0
1,John,London,29,0.0,0.0,1.0,0.0
2,Tom,HaNoi,21,0.0,1.0,0.0,0.0
3,Mary,BinhDuong,30,0.0,0.0,0.0,0.0
4,Mark,Dubai,27,1.0,0.0,0.0,0.0


## Dropping the first column when encoding

In [33]:
# Instantiate an encoder that drops the first category, so the encoded output
# has one column fewer than there are categories
encoder2 = OneHotEncoder(drop='first')

In [34]:
# Encode with the drop-first encoder and convert the result to a dense array.
# Note: this rebinds `df_encode`, replacing the frame built earlier.
df_encode = (
    encoder2
    .fit(df[['PlaceOfBirth']])
    .transform(df[['PlaceOfBirth']])
    .toarray()
)
df_encode

array([[0., 0., 0., 1.],
       [0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.]])

In [35]:
# categories_ still lists all five categories, including the dropped first one
# (BinhDuong); only the encoded output omits its column.
encoder2.categories_

[array(['BinhDuong', 'Dubai', 'HaNoi', 'London', 'NewYork'], dtype=object)]

In [36]:
# Convert to a DataFrame. The first category is skipped ([1:]) because
# drop='first' removed its column from the encoded matrix.
df_encode = pd.DataFrame(
    data=df_encode,
    columns=[f'place_{category}' for category in encoder2.categories_[0][1:]],
)
df_encode

Unnamed: 0,place_Dubai,place_HaNoi,place_London,place_NewYork
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,1.0,0.0,0.0,0.0


In [37]:
# Append the drop-first one-hot columns to the right of the original columns.
df_new3 = pd.concat(objs=[df, df_encode], axis='columns')
df_new3

Unnamed: 0,Name,PlaceOfBirth,Age,place_Dubai,place_HaNoi,place_London,place_NewYork
0,Lucy,NewYork,24,0.0,0.0,0.0,1.0
1,John,London,29,0.0,0.0,1.0,0.0
2,Tom,HaNoi,21,0.0,1.0,0.0,0.0
3,Mary,BinhDuong,30,0.0,0.0,0.0,0.0
4,Mark,Dubai,27,1.0,0.0,0.0,0.0


In [38]:
# Remove the raw categorical column now that the indicator columns represent it.
# The result is rebound instead of using inplace=True, and errors='ignore' is
# set, so re-running this cell is a no-op rather than a KeyError.
df_new3 = df_new3.drop(columns=['PlaceOfBirth'], errors='ignore')
df_new3

Unnamed: 0,Name,Age,place_Dubai,place_HaNoi,place_London,place_NewYork
0,Lucy,24,0.0,0.0,0.0,1.0
1,John,29,0.0,0.0,1.0,0.0
2,Tom,21,0.0,1.0,0.0,0.0
3,Mary,30,0.0,0.0,0.0,0.0
4,Mark,27,1.0,0.0,0.0,0.0


In [40]:
# Decode the indicator columns back to the original labels. The columns must be
# passed in the order the encoder produced them; an all-zero row decodes to the
# dropped first category (BinhDuong).
df_new3['PlaceOfBirth_decode'] = encoder2.inverse_transform(
    df_new3.loc[:, ['place_Dubai', 'place_HaNoi', 'place_London', 'place_NewYork']]
)
df_new3

Unnamed: 0,Name,Age,place_Dubai,place_HaNoi,place_London,place_NewYork,PlaceOfBirth_decode
0,Lucy,24,0.0,0.0,0.0,1.0,NewYork
1,John,29,0.0,0.0,1.0,0.0,London
2,Tom,21,0.0,1.0,0.0,0.0,HaNoi
3,Mary,30,0.0,0.0,0.0,0.0,BinhDuong
4,Mark,27,1.0,0.0,0.0,0.0,Dubai
