cross_val_score

In [9]:
#교차검증을 더 간편하게 해 준다.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.datasets import load_iris
from sklearn.model_selection import GridSearchCV

In [7]:
# Load the iris dataset and pull out the feature matrix and the target vector.
iris = load_iris()
data, label = iris.data, iris.target

# Decision tree with a fixed seed.
dt_clf = DecisionTreeClassifier(random_state=156)

# Score with accuracy as the metric over 3 cross-validation folds.
score = cross_val_score(
    estimator=dt_clf,
    X=data,
    y=label,
    scoring='accuracy',
    cv=3,
)
print(
    f'교차 검증별 정확도 : {np.round(score,4)}',
    f'평균 정확도 : {round(np.mean(score),4)}',
    sep='\n',
)

# For a classifier and an integer cv, cross_val_score splits with StratifiedKFold internally.

교차 검증별 정확도 : [0.9804 0.9216 0.9792]
평균 정확도 : 0.9604


GridSearchCV

교차검증을 기반으로 하이퍼 파라미터의 최적 값을 찾아준다.


순차적으로 테스트하기 때문에 오래 걸릴 수 있다.

In [8]:
# Candidate hyperparameter values, keyed by DecisionTreeClassifier parameter name.
grid = dict(
    max_depth=[1, 2, 3],
    min_samples_split=[2, 3],
)

In [10]:
from sklearn.model_selection import GridSearchCV

In [11]:
# Hold out 20% of the samples as a test set; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=121,
)

# NOTE(review): no random_state is passed here, so tie-breaking between equally
# good splits may differ from run to run — confirm whether that is intended.
dtree = DecisionTreeClassifier()

# The search space is the candidate grid defined in an earlier cell.
parameters = grid

In [13]:
# Exhaustive search over `parameters` using 3-fold cross-validation.
# refit=True retrains the best combination on the full training data afterwards.
grid_dtree = GridSearchCV(
    estimator=dtree,
    param_grid=parameters,
    cv=3,
    refit=True,
)

In [14]:
# Run the grid search on the training split.
grid_dtree.fit(X_train, y_train)

# Extract the results: cv_results_ becomes a DataFrame, and the last expression
# shows only the columns of interest.
score = pd.DataFrame(grid_dtree.cv_results_)
score.loc[:, [
    'params',
    'mean_test_score',
    'rank_test_score',
    'split0_test_score',
    'split1_test_score',
    'split2_test_score',
]]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.7,5,0.7,0.7,0.7
1,"{'max_depth': 1, 'min_samples_split': 3}",0.7,5,0.7,0.7,0.7
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,3,0.925,1.0,0.95
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,3,0.925,1.0,0.95
4,"{'max_depth': 3, 'min_samples_split': 2}",0.975,1,0.975,1.0,0.95
5,"{'max_depth': 3, 'min_samples_split': 3}",0.975,1,0.975,1.0,0.95


In [19]:
# Report the best parameter combination and its mean cross-validated score.
print(
    f'최적 파라미터: {grid_dtree.best_params_}',
    f'최고 정확도: {grid_dtree.best_score_}',
    sep='\n',
)

최적 파라미터: {'max_depth': 3, 'min_samples_split': 2}
최고 정확도: 0.975


In [23]:
# Evaluate the model found by the grid search on the held-out test set.

from sklearn.metrics import accuracy_score

# best_estimator_ is the model GridSearchCV refitted with the best parameters
# (refit=True), so it can predict right away without further training.
# (best_params_ is only the parameter dict, not a model.)
estimator = grid_dtree.best_estimator_

pred = estimator.predict(X_test)
print(f'테스트 데이터 정확도: {round(accuracy_score(y_test,pred),4)}')

테스트 데이터 정확도: 0.9667


데이터 전처리

In [24]:
# Label encoding

from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# fit() learns the distinct labels and returns the encoder itself;
# transform() then maps every item to its integer code.
encoder = LabelEncoder().fit(items)
label = encoder.transform(items)

print(f'인코딩 반환값: {label}')

인코딩 반환값: [0 1 4 5 3 3 2 2]


In [25]:
# Show the fitted classes; a class's position in this array is its integer code.
print(f'인코딩 클래스: {encoder.classes_}')

인코딩 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']


In [27]:
# One-hot encoding

from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '선풍기', '믹서', '믹서']

# OneHotEncoder accepts string categories directly, so no LabelEncoder pass is
# needed first. It does require 2-D input, hence the reshape to a single column
# of shape (n_samples, 1).
item_column = np.array(items).reshape(-1, 1)

# Apply one-hot encoding; transform() returns a sparse matrix.
oh_encoder = OneHotEncoder()
oh_encoder.fit(item_column)
oh_label = oh_encoder.transform(item_column)

print('원 핫 인코딩 데이터')
# Convert to a dense array for printing.
print(oh_label.toarray())
print('차원')
print(oh_label.shape)

원 핫 인코딩 데이터
[[1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0.]]
차원
(8, 6)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [28]:
# pandas provides get_dummies, an API that one-hot encodes in a single call.

df = pd.DataFrame(items, columns=['item'])
pd.get_dummies(df)

Unnamed: 0,item_TV,item_냉장고,item_믹서,item_선풍기,item_전자레인지,item_컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


피처 스케일링과 정규화

In [29]:
# StandardScaler

# Reload iris and wrap its feature matrix in a DataFrame whose columns carry
# the feature names.
iris = load_iris()
data = iris.data
df = pd.DataFrame(data, columns=iris.feature_names)

In [30]:
# Per-feature mean and variance of the raw (unscaled) data.
print('feature들의 평균 값', df.mean(), sep='\n')
print('\nfeature들의 분산 값', df.var(), sep='\n')

feature들의 평균 값
sepal length (cm)    5.843333
sepal width (cm)     3.057333
petal length (cm)    3.758000
petal width (cm)     1.199333
dtype: float64

feature들의 분산 값
sepal length (cm)    0.685694
sepal width (cm)     0.189979
petal length (cm)    3.116278
petal width (cm)     0.581006
dtype: float64


In [33]:
from sklearn.preprocessing import StandardScaler

# fit() learns the per-column statistics and returns the scaler itself;
# transform() then standardizes the data with them.
scaler = StandardScaler().fit(df)
scaled_df = scaler.transform(df)

In [37]:
# transform() returns an ndarray, so convert it back into a DataFrame.

df_scaled = pd.DataFrame(scaled_df, columns=iris.feature_names)
df_scaled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [38]:
# Per-feature mean and variance after standardization.
# DataFrame.var() defaults to the sample variance (ddof=1), so it reports
# n/(n-1) rather than exactly 1 for data scaled to unit population variance.
print('feature들의 평균 값', df_scaled.mean(), sep='\n')
print('\nfeature들의 분산 값', df_scaled.var(), sep='\n')

feature들의 평균 값
sepal length (cm)   -1.690315e-15
sepal width (cm)    -1.842970e-15
petal length (cm)   -1.698641e-15
petal width (cm)    -1.409243e-15
dtype: float64

feature들의 분산 값
sepal length (cm)    1.006711
sepal width (cm)     1.006711
petal length (cm)    1.006711
petal width (cm)     1.006711
dtype: float64


In [41]:
# MinMaxScaler

from sklearn.preprocessing import MinMaxScaler

# fit() learns the per-column minimum and maximum and returns the scaler itself;
# transform() then rescales the data with them.
scaler = MinMaxScaler().fit(df)
scaled_df = scaler.transform(df)

# Wrap the resulting ndarray in a DataFrame again so the columns keep their names.
df_scaled = pd.DataFrame(scaled_df, columns=iris.feature_names)

print('feature들의 최소 값', df_scaled.min(), sep='\n')
print('\nfeature들의 최대 값', df_scaled.max(), sep='\n')

feature들의 최소 값
sepal length (cm)    0.0
sepal width (cm)     0.0
petal length (cm)    0.0
petal width (cm)     0.0
dtype: float64

feature들의 최대 값
sepal length (cm)    1.0
sepal width (cm)     1.0
petal length (cm)    1.0
petal width (cm)     1.0
dtype: float64
