# **검증**

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [None]:
iris = load_iris()
features = iris.data
labels = iris.target
clf = DecisionTreeClassifier()
kfold = KFold(n_splits = 5) # 5개의 폴드 세트
acc = []

In [None]:
n = 0

for train_idx, val_idx in kfold.split(features): # 폴드마다 train, val index 반환
  X_train, valx_test = features[train_idx], features[val_idx]
  Y_train, valy_test = labels[train_idx], labels[val_idx]

  clf.fit(X_train, Y_train)
  pred = clf.predict(valx_test)
  n+=1

  accuracy = np.round(accuracy_score(pred, valy_test), 4)
  print('{0}번째 검증 정확도 : '.format(n), accuracy)
  acc.append(accuracy)

print('평균 검증 정확도 : ', np.mean(acc))

1번째 검증 정확도 :  1.0
2번째 검증 정확도 :  0.9667
3번째 검증 정확도 :  0.8333
4번째 검증 정확도 :  0.9333
5번째 검증 정확도 :  0.7667
평균 검증 정확도 :  0.94833


In [None]:
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 3) # 검증 데이터 분포를 동일하게 나눔
n = 0

for train_idx, val_idx in skf.split(features, labels): # label이 들어가야함
  X_train, valx_test = features[train_idx], features[val_idx]
  Y_train, valy_test = labels[train_idx], labels[val_idx]

  clf = DecisionTreeClassifier()
  clf.fit(X_train, Y_train)
  pred = clf.predict(valx_test)
  n+=1

  accuracy = np.round(accuracy_score(pred, valy_test), 4)
  print('{0}번째 검증 정확도 : '.format(n), accuracy)
  acc.append(accuracy)

print('평균 검증 정확도 : ', np.mean(acc))

1번째 검증 정확도 :  0.98
2번째 검증 정확도 :  0.92
3번째 검증 정확도 :  0.98
평균 검증 정확도 :  0.9525615384615385


In [None]:
from sklearn.model_selection import cross_val_score, cross_validate

data = iris.data
target = iris.target

scores = cross_val_score(clf, data, target, scoring='accuracy', cv=3) # 위의 과정을 한 번에 함
print('교차 검증 : ', scores)
print('평균 교차 검증 : ', np.mean(scores))

교차 검증 :  [0.98 0.94 1.  ]
평균 교차 검증 :  0.9733333333333333


In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

iris = load_iris()
X_train, X_test, Y_train, Y_test = train_test_split(iris.data, iris.target, test_size = 0.2)

parameters = {'max_depth' : [1,2,3], 'min_samples_split' : [2,3]} # hyperparameter 조건을 넣어줌 (이 예시에서는 Decision tree 사용)

In [None]:
grid_dtree = GridSearchCV(clf, param_grid=parameters, cv=3, refit=True, return_train_score=True) # refit : optimal hyperparameter를 모델에 적용
grid_dtree.fit(X_train, Y_train)
score = pd.DataFrame(grid_dtree.cv_results_) # .cv_results_에 결과가 저장됨
score[['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.675,5
1,"{'max_depth': 1, 'min_samples_split': 3}",0.675,5
2,"{'max_depth': 2, 'min_samples_split': 2}",0.95,1
3,"{'max_depth': 2, 'min_samples_split': 3}",0.95,1
4,"{'max_depth': 3, 'min_samples_split': 2}",0.933333,3
5,"{'max_depth': 3, 'min_samples_split': 3}",0.933333,3


In [None]:
estimator = grid_dtree.best_estimator_ # 최적의 학습된 모델 저장
pred = estimator.predict(X_test)
print('Acc : ', accuracy_score(pred, Y_test))

Acc :  0.9333333333333333


# **라벨링**

In [None]:
from sklearn.preprocessing import LabelEncoder

label = ['지금', '은', '새벽', '3시', '반', '이다']
encoder = LabelEncoder()
encoder.fit(label)
labels = encoder.transform(label)
#labels = encoder.fit_transform(label) 윗 줄과 같은 것
print(labels)

[5 3 2 0 1 4]


In [None]:
print('classes : ', encoder.classes_)

classes :  ['3시' '반' '새벽' '은' '이다' '지금']


In [None]:
print('inverse : ', encoder.inverse_transform([0, 1, 2, 3, 4, 5]))

inverse :  ['3시' '반' '새벽' '은' '이다' '지금']


In [None]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

encoder = LabelEncoder()
labels = encoder.fit_transform(label) # 먼저 해줘야 함

labels = labels.reshape(-1,1) # 2차원 데이터로 변환 해야함

onehot = OneHotEncoder()
onehot = onehot.fit_transform(labels)
print(onehot.toarray())

[[0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 1. 0. 0.]
 [0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0.]]


In [1]:
import pandas as pd

label = ['지금', '은', '새벽', '3시', '반', '이다']

df1 = pd.DataFrame(label, columns=['요로롱'])
df1

Unnamed: 0,요로롱
0,지금
1,은
2,새벽
3,3시
4,반
5,이다


In [2]:
onehot_ = pd.get_dummies(df1)

In [3]:
onehot_.values

array([[0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0, 0],
       [1, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0]], dtype=uint8)

In [4]:
onehot_

Unnamed: 0,요로롱_3시,요로롱_반,요로롱_새벽,요로롱_은,요로롱_이다,요로롱_지금
0,0,0,0,0,0,1
1,0,0,0,1,0,0
2,0,0,1,0,0,0
3,1,0,0,0,0,0
4,0,1,0,0,0,0
5,0,0,0,0,1,0


# **스케일링**

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data, columns=iris.feature_names)

scaler = StandardScaler()
#scaler = MinMaxScaler()
scaler.fit(iris_df)
iris_std = scaler.transform(iris_df)