## 라벨인코딩

* 영어: abc
* 한글: ㄱㄴㄷ 순으로 0,1,2,3~

In [45]:
### 01. 데이터 준비
import pandas as pd
data = { "eng": ["b", "c", "a", "d"] }
df = pd.DataFrame(data)
print(type(df))
df


<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,eng
0,b
1,c
2,a
3,d


In [46]:
#sklearn에서 원핫 인코딩 하는 방법
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [47]:
encoder_x = LabelEncoder()
df['eng_라벨인코딩'] = encoder_x.fit_transform(df['eng'])
df

Unnamed: 0,eng,eng_라벨인코딩
0,b,1
1,c,2
2,a,0
3,d,3


In [48]:
### 02. 데이터 준비
import pandas as pd
data = { "eng": ["b", "c", "a", "d"], "계절":['봄','여름','가을','겨울'] }

df1 = pd.DataFrame(data)
print(type(df))
df1

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,eng,계절
0,b,봄
1,c,여름
2,a,가을
3,d,겨울


In [49]:
encoder_x = LabelEncoder()
df1['계절_라벨인코딩'] = encoder_x.fit_transform(df['계절'])
df1

KeyError: '계절'

### 데이터셋 불러와서 실습 

In [50]:
import seaborn as sns
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [51]:
en_x = LabelEncoder()
tips['sex_lb'] = en_x.fit_transform(tips['sex'])
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,sex_lb
0,16.99,1.01,Female,No,Sun,Dinner,2,0
1,10.34,1.66,Male,No,Sun,Dinner,3,1
2,21.01,3.50,Male,No,Sun,Dinner,3,1
3,23.68,3.31,Male,No,Sun,Dinner,2,1
4,24.59,3.61,Female,No,Sun,Dinner,4,0
...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,1
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0
241,22.67,2.00,Male,Yes,Sat,Dinner,2,1
242,17.82,1.75,Male,No,Sat,Dinner,2,1


## 원핫 인코딩(OneHotEncoding) 실습

### 방법1. sklearn.preprocessing.OneHotEncoder 사용
* OneHotEncoder()
* [].fit_transform([적용할열])

In [52]:
df1

Unnamed: 0,eng,계절
0,b,봄
1,c,여름
2,a,가을
3,d,겨울


In [53]:
df.shape

(4, 2)

In [54]:
df['eng_라벨인코딩'].shape

(4,)

In [55]:
onehot = OneHotEncoder()
val = df['eng_라벨인코딩'].values.reshape(-1,1) # OneHotEncoder()를 사용을 위한 적합한 값으로 변경.
y = onehot.fit_transform( val ).toarray() # 값을 변경후, 배열로 만들어준다.
y

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])

In [56]:
onehot_val = pd.DataFrame(y, dtype=int)
onehot_val

Unnamed: 0,0,1,2,3
0,0,1,0,0
1,0,0,1,0
2,1,0,0,0
3,0,0,0,1


In [57]:
df_new = pd.concat([df, onehot_val], axis=1)
df_new

Unnamed: 0,eng,eng_라벨인코딩,0,1,2,3
0,b,1,0,1,0,0
1,c,2,0,0,1,0
2,a,0,1,0,0,0
3,d,3,0,0,0,1


In [58]:
data = { "companyName": ["MS","Apple", "Google", "Google"]}
df1 = pd.DataFrame(data)
df2 = df1.copy()
df2

Unnamed: 0,companyName
0,MS
1,Apple
2,Google
3,Google


In [59]:
df1.values


array([['MS'],
       ['Apple'],
       ['Google'],
       ['Google']], dtype=object)

In [60]:
### OneHotEncoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [64]:
### LabelEncoder
encoder_x = LabelEncoder()
df1['encoding'] = encoder_x.fit_transform(df1['companyName']) #
df1


Unnamed: 0,companyName,encoding
0,MS,2
1,Apple,0
2,Google,1
3,Google,1


In [62]:
onehot = OneHotEncoder()
val = df1['encoding'].values.reshape(-1,1) # OneHotEncoder()를 사용을 위한 적합한 값으로 변경.
y = onehot.fit_transform( val ).toarray() # 값을 변경후, 배열로 만들어준다.
y

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.]])

In [63]:
df1_new = pd.concat([df1, onehot_val], axis=1)
df1_new

Unnamed: 0,companyName,encoding,0,1,2,3
0,MS,2,0,1,0,0
1,Apple,0,0,0,1,0
2,Google,1,1,0,0,0
3,Google,1,0,0,0,1


### 방법2. One Hot Encoder with Pandas
 pandas에서는 one hot encode를 위해get_dummies() 함수를 제공한다.

In [65]:
import pandas as pd
import os


In [67]:
demo_df = pd.DataFrame({"범주형_feature":['양말', '여우', '양말', '상자']})
display(demo_df)


Unnamed: 0,범주형_feature
0,양말
1,여우
2,양말
3,상자


In [68]:
onehot = pd.get_dummies(demo_df)
onehot

Unnamed: 0,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,0,1,0
1,0,0,1
2,0,1,0
3,1,0,0


In [69]:
df_new = pd.concat([demo_df, onehot], axis=1)
df_new

Unnamed: 0,범주형_feature,범주형_feature_상자,범주형_feature_양말,범주형_feature_여우
0,양말,0,1,0
1,여우,0,0,1
2,양말,0,1,0
3,상자,1,0,0


### 실습- 나만의 취미를 추가해서 원핫 인코딩을 해보자

In [74]:
hobby ={"hobby":["골프",'코딩','음악듣기']}
df_ho =pd.DataFrame(hobby)
df_ho

Unnamed: 0,hobby
0,골프
1,코딩
2,음악듣기


In [75]:
onehot = pd.get_dummies(df_ho)
onehot

Unnamed: 0,hobby_골프,hobby_음악듣기,hobby_코딩
0,1,0,0
1,0,0,1
2,0,1,0


In [76]:
df_hobby = pd.concat([df_ho, onehot], axis=1)
df_hobby

Unnamed: 0,hobby,hobby_골프,hobby_음악듣기,hobby_코딩
0,골프,1,0,0
1,코딩,0,0,1
2,음악듣기,0,1,0


## (추가) tips or 다른 데이터 셋을 동일하게 원핫 인코딩해보기

In [77]:
import seaborn as sns
tips = sns.load_dataset('tips')
tips

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.50,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3
240,27.18,2.00,Female,Yes,Sat,Dinner,2
241,22.67,2.00,Male,Yes,Sat,Dinner,2
242,17.82,1.75,Male,No,Sat,Dinner,2


In [79]:
# category 범주 확인
tips.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.3 KB


In [87]:
onehot_sex = pd.get_dummies(tips['sex'])

tips_new = pd.concat([tips, onehot_sex], axis=1)
tips_new

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,Male,Female
0,16.99,1.01,Female,No,Sun,Dinner,2,0,1
1,10.34,1.66,Male,No,Sun,Dinner,3,1,0
2,21.01,3.50,Male,No,Sun,Dinner,3,1,0
3,23.68,3.31,Male,No,Sun,Dinner,2,1,0
4,24.59,3.61,Female,No,Sun,Dinner,4,0,1
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,1,0
240,27.18,2.00,Female,Yes,Sat,Dinner,2,0,1
241,22.67,2.00,Male,Yes,Sat,Dinner,2,1,0
242,17.82,1.75,Male,No,Sat,Dinner,2,1,0
