# Model Selection 모듈

- Train/Test셋을 나누지 않고 머신러닝 수행

In [1]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
iris = load_iris()
dt_cls = DecisionTreeClassifier()

dt_cls.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [3]:
pred = dt_cls.predict(iris.data)
accuracy_score(iris.target,pred)

1.0

- cross_validate 매쏘드

In [4]:
from sklearn.model_selection import cross_validate

In [5]:
dtc = DecisionTreeClassifier()

cross_validate(dtc, iris.data, iris.target)

{'fit_time': array([0.        , 0.0010016 , 0.00099397, 0.00096965, 0.0009985 ]),
 'score_time': array([0.00099635, 0.00099158, 0.        , 0.        , 0.        ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])}

In [6]:
cross_validate(dtc, iris.data, iris.target,return_train_score = True)

{'fit_time': array([0.00099635, 0.        , 0.00099778, 0.00103378, 0.00099826]),
 'score_time': array([0.        , 0.00099564, 0.        , 0.00095963, 0.        ]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

- Train/Test 데이터 셋을 분리

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021
)

In [9]:
import pandas as pd

pd.Series(y_test).value_counts()

0    41
1    41
2    38
dtype: int64

- stratified 분리  

랜덤이 아니라, 각각 y값의 비율을 맞추어준다. 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size=0.2, random_state=2021, stratify = iris.target
)

pd.Series(y_test).value_counts()

0    40
1    40
2    40
dtype: int64

* cross_val_score() method

In [11]:
from sklearn.model_selection import cross_val_score

In [12]:
# 성능지표는 accuracy, 교차 검증 세트는 5개

cross_val_score(dtc, iris.data, iris.target, cv=5)

array([0.96666667, 0.96666667, 0.9       , 0.96666667, 1.        ])

In [13]:
import numpy as np

scores= cross_val_score(dtc, iris.data, iris.target, cv=5)

In [14]:
np.mean(scores)

0.9600000000000002

### GridSearchCV

- 교차 검증과 최적 하이퍼 파라미터 튜닝을 한번에

In [15]:
dtc = DecisionTreeClassifier()

In [27]:
# 파라미터를 딕셔너리 형태로 설정
params = {'max_depth':[1,2,3], 'min_samples_split':[2,3]}

In [28]:
from sklearn.model_selection import GridSearchCV
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=3,refit=True)

In [29]:
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': [1, 2, 3], 'min_samples_split': [2, 3]})

In [30]:
# 최적 파라미터
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 3}

In [31]:
best_estimator = grid_dtc.best_estimator_
best_estimator.score(X_test,y_test)

0.9583333333333334