In [43]:
import warnings
warnings.filterwarnings('ignore')

# 교차 검증과 그리드 서치
- 교차 검증은 머신러닝 모델의 정확도(일반화 성능)를 신뢰성 있게 측정하기 위해 사용하는 대표적인 방법이다.
- 딥러닝에서는 데이터의 크기가 크고 학습 시간이 길어, 교차 검증 대신 검증 세트를 한 번만 떼어 사용하는 경우가 많다.

In [44]:
import pandas as pd

# Load the wine dataset from the CSV file stored relative to the notebook,
# then preview the first five rows as the cell's displayed result.
wine = pd.read_csv(filepath_or_buffer='data/wine.csv')
wine.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [45]:
# Separate the label from the features and convert both to NumPy arrays:
#   target -> the 'class' column
#   data   -> every remaining column
target = wine['class'].to_numpy()
data = wine.drop(columns='class').to_numpy()

### 데이터를 6.4 : 1.6 : 2 비율로 나눠 각각 Training, Validation, Test Data로 사용할 예정

In [46]:
from sklearn.model_selection import train_test_split

# Step 1: hold out 20% of all samples as the final test set.
# The remaining 80% is stored under its own name (X_trainval / y_trainval)
# instead of being overwritten by the next split, so the complete training
# pool remains available to later cells.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    data, target, test_size=0.2, random_state=42
)

# Step 2: take 20% of that pool as the validation set. Relative to the
# original data this gives 64% train / 16% validation / 20% test.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)

In [47]:
# Print the shape of the training, validation and test feature arrays.
# The Korean labels are emitted verbatim; underscore-prefixed loop names
# avoid shadowing any notebook-level variable.
for _label, _split in (
    ('훈련세트: ', X_train),
    ('검증세트: ', X_val),
    ('테스트세트: ', X_test),
):
    print(_label, _split.shape)

훈련세트:  (4157, 3)
검증세트:  (1040, 3)
테스트세트:  (1300, 3)


In [48]:
# Train a decision tree on the training split, then measure its accuracy on
# both the training split and the validation split.
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# fit() returns the estimator itself, so construction and training chain.
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

train_accuracy = accuracy_score(y_train, dt.predict(X_train))
val_accuracy = accuracy_score(y_val, dt.predict(X_val))

print('훈련 세트 정확도: ', train_accuracy)
print('검증 세트 정확도: ', val_accuracy)

훈련 세트 정확도:  0.9971133028626413
검증 세트 정확도:  0.864423076923077


---
# 교차 검증

In [49]:
from sklearn.model_selection import cross_validate

# Cross-validate the decision tree on the training split with the default
# splitter. Note that X_train / y_train are the arrays left after the
# validation split was taken out, not the full 80% pool.
scores = cross_validate(dt, X_train, y_train)

# Average the per-fold validation accuracy and report it.
mean_cv_accuracy = scores['test_score'].mean()
print('정확도: ', mean_cv_accuracy)

# Final expression of the cell: render the dict of per-fold fit times,
# score times and test scores.
scores

정확도:  0.8450771776358419


{'fit_time': array([0.00496817, 0.00503707, 0.00490117, 0.00606418, 0.00541425]),
 'score_time': array([0.00056291, 0.00067496, 0.00055504, 0.0006988 , 0.00066781]),
 'test_score': array([0.84975962, 0.85456731, 0.85679904, 0.81588448, 0.84837545])}

---
# StratifiedKFold: 분할기를 사용한 교차 검증

In [50]:
from sklearn.model_selection import StratifiedKFold

# Pass an explicit splitter so the number of folds is chosen here rather
# than left to cross_validate's default: two stratified folds, with the
# remaining StratifiedKFold options at their defaults.
splitter = StratifiedKFold(n_splits=2)
scores = cross_validate(dt, X_train, y_train, cv=splitter)

# Only the last expression of a cell is rendered, so the cell ends with the
# one value of interest: the mean validation accuracy across the folds.
scores['test_score'].mean()

0.8369005606734192

In [51]:
# Repeat the cross-validation with 10 stratified folds. Shuffling with a
# fixed random_state keeps the fold assignment reproducible between runs.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_validate(dt, X_train, y_train, cv=splitter)

# Only the last expression of a cell is rendered, so the cell ends with the
# mean validation accuracy over the 10 folds.
scores['test_score'].mean()

0.8482153614457832