# OneHotEnconder와 ColumnTransformer

In [1]:
import numpy as np
import pandas as pd

In [2]:
# 이 파일은 열 이름을 나타내는 헤더가 없으므로 header=None으로 지정하고
# "names" 매개변수로 열 이름을 제공합니다
data = pd.read_csv("./adult.data", header=None, index_col=False,
            names=['age', 'workclass', 'fnlwgt', 'education',  'education-num',
           'marital-status', 'occupation', 'relationship', 'race', 'gender',
           'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
           'income'])

In [3]:
# 예제를 위해 몇개의 열만 선택합니다
data = data[['age', 'workclass', 'education', 'gender', 'hours-per-week','occupation', 'income']]

In [4]:
# IPython.display 함수는 주피터 노트북을 위해 포맷팅된 출력을 만듭니다
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [5]:
print(data.gender.value_counts())

 Male      21790
 Female    10771
Name: gender, dtype: int64


#####  Pandas의 One Hot Encoder 

In [6]:
print("원본 특성:\n", list(data.columns), "\n")

원본 특성:
 ['age', 'workclass', 'education', 'gender', 'hours-per-week', 'occupation', 'income'] 



In [7]:
data_dummies = pd.get_dummies(data)  # OneHotEncoder
print("get_dummies 후의 특성:\n", list(data_dummies.columns))

get_dummies 후의 특성:
 ['age', 'hours-per-week', 'workclass_ ?', 'workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Never-worked', 'workclass_ Private', 'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc', 'workclass_ State-gov', 'workclass_ Without-pay', 'education_ 10th', 'education_ 11th', 'education_ 12th', 'education_ 1st-4th', 'education_ 5th-6th', 'education_ 7th-8th', 'education_ 9th', 'education_ Assoc-acdm', 'education_ Assoc-voc', 'education_ Bachelors', 'education_ Doctorate', 'education_ HS-grad', 'education_ Masters', 'education_ Preschool', 'education_ Prof-school', 'education_ Some-college', 'gender_ Female', 'gender_ Male', 'occupation_ ?', 'occupation_ Adm-clerical', 'occupation_ Armed-Forces', 'occupation_ Craft-repair', 'occupation_ Exec-managerial', 'occupation_ Farming-fishing', 'occupation_ Handlers-cleaners', 'occupation_ Machine-op-inspct', 'occupation_ Other-service', 'occupation_ Priv-house-serv', 'occupation_ Prof-specialty', 'occupation_ Protec

In [8]:
display(data_dummies.head())

Unnamed: 0,age,hours-per-week,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,...,occupation_ Machine-op-inspct,occupation_ Other-service,occupation_ Priv-house-serv,occupation_ Prof-specialty,occupation_ Protective-serv,occupation_ Sales,occupation_ Tech-support,occupation_ Transport-moving,income_ <=50K,income_ >50K
0,39,40,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1,50,13,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
2,38,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,53,40,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,28,40,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0


##### sklearn의 One Hot Encoder

In [9]:
data.head(3)

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K


In [10]:
from sklearn.preprocessing import OneHotEncoder
# sparse=False로 설정하면 OneHotEncode가 희소 행렬이 아니라 넘파이 배열을 반환합니다
ohe = OneHotEncoder(sparse=False)
print(ohe.fit_transform(data)) # ndarray 이므로 열이름이 없어짐

[[0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 ...
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [0. 0. 0. ... 0. 0. 1.]]


In [11]:
print(ohe.get_feature_names())  # 없어진 열이름 확인하기

['x0_17' 'x0_18' 'x0_19' 'x0_20' 'x0_21' 'x0_22' 'x0_23' 'x0_24' 'x0_25'
 'x0_26' 'x0_27' 'x0_28' 'x0_29' 'x0_30' 'x0_31' 'x0_32' 'x0_33' 'x0_34'
 'x0_35' 'x0_36' 'x0_37' 'x0_38' 'x0_39' 'x0_40' 'x0_41' 'x0_42' 'x0_43'
 'x0_44' 'x0_45' 'x0_46' 'x0_47' 'x0_48' 'x0_49' 'x0_50' 'x0_51' 'x0_52'
 'x0_53' 'x0_54' 'x0_55' 'x0_56' 'x0_57' 'x0_58' 'x0_59' 'x0_60' 'x0_61'
 'x0_62' 'x0_63' 'x0_64' 'x0_65' 'x0_66' 'x0_67' 'x0_68' 'x0_69' 'x0_70'
 'x0_71' 'x0_72' 'x0_73' 'x0_74' 'x0_75' 'x0_76' 'x0_77' 'x0_78' 'x0_79'
 'x0_80' 'x0_81' 'x0_82' 'x0_83' 'x0_84' 'x0_85' 'x0_86' 'x0_87' 'x0_88'
 'x0_90' 'x1_ ?' 'x1_ Federal-gov' 'x1_ Local-gov' 'x1_ Never-worked'
 'x1_ Private' 'x1_ Self-emp-inc' 'x1_ Self-emp-not-inc' 'x1_ State-gov'
 'x1_ Without-pay' 'x2_ 10th' 'x2_ 11th' 'x2_ 12th' 'x2_ 1st-4th'
 'x2_ 5th-6th' 'x2_ 7th-8th' 'x2_ 9th' 'x2_ Assoc-acdm' 'x2_ Assoc-voc'
 'x2_ Bachelors' 'x2_ Doctorate' 'x2_ HS-grad' 'x2_ Masters'
 'x2_ Preschool' 'x2_ Prof-school' 'x2_ Some-college' 'x3_ Female'
 'x3_ M

##### 한꺼번에 One Hot Encoder와 Scaler 하기

In [12]:
display(data.head())

Unnamed: 0,age,workclass,education,gender,hours-per-week,occupation,income
0,39,State-gov,Bachelors,Male,40,Adm-clerical,<=50K
1,50,Self-emp-not-inc,Bachelors,Male,13,Exec-managerial,<=50K
2,38,Private,HS-grad,Male,40,Handlers-cleaners,<=50K
3,53,Private,11th,Male,40,Handlers-cleaners,<=50K
4,28,Private,Bachelors,Female,40,Prof-specialty,<=50K


In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
ct = make_column_transformer(
    (StandardScaler(), ['age', 'hours-per-week']),
    (OneHotEncoder(sparse=False), ['workclass', 'education', 'gender', 'occupation']))

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# income을 제외한 모든 열을 추출합니다
data_features = data.drop("income", axis=1)
# 데이터프레임과 income을 분할합니다
X_train, X_test, y_train, y_test = train_test_split(
    data_features, data.income, random_state=0)

In [15]:
ct.fit(X_train)
X_train_trans = ct.transform(X_train)
X_test_trans = ct.transform(X_test)

In [16]:
print(X_train_trans.shape)
print(X_test_trans.shape)

(24420, 44)
(8141, 44)


In [17]:
logreg = LogisticRegression(max_iter=10000)
logreg.fit(X_train_trans, y_train)

print("테스트 점수: {:.2f}".format(logreg.score(X_test_trans, y_test)))

테스트 점수: 0.81
