## Model Selection module

In [18]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

- Machine learning proceeding without dividing as Train and Test

In [19]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score

In [20]:
iris = load_iris()
dt_clf = DecisionTreeClassifier()
dt_clf.fit(iris.data, iris.target)

DecisionTreeClassifier()

In [21]:
pred = dt_clf.predict(iris.data)
accuracy_score(iris.target, pred)

1.0

- cross_validate method

In [22]:
from sklearn.model_selection import cross_validate

In [23]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target)

{'fit_time': array([0.00067592, 0.00034285, 0.00028491, 0.0002768 , 0.00027514]),
 'score_time': array([0.00023007, 0.00015306, 0.00014019, 0.00013399, 0.00012803]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])}

In [24]:
dtc = DecisionTreeClassifier()
cross_validate(dtc, iris.data, iris.target, return_train_score = True)

{'fit_time': array([0.000705  , 0.00036287, 0.00033712, 0.00032878, 0.00030184]),
 'score_time': array([0.00024509, 0.00015116, 0.00014377, 0.00016308, 0.00017595]),
 'test_score': array([0.96666667, 0.96666667, 0.9       , 0.93333333, 1.        ]),
 'train_score': array([1., 1., 1., 1., 1.])}

- Dividing Train and Test data set

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size = 0.2, random_state = 2021
)

pd.Series(y_test).value_counts()

0    41
1    41
2    38
dtype: int64

In [29]:
# Stratified dividing | 비율을 맞춰서 분리
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, train_size = 0.2, random_state = 2021,
    stratify = iris.target
)

pd.Series(y_test).value_counts()

0    40
1    40
2    40
dtype: int64

- cross_val_score() method

In [30]:
# 성능 지표는 정확도, 교차검증 세트 5개
from sklearn.model_selection import cross_val_score

In [31]:
cross_val_score(dtc, iris.data, iris.target, cv = 5)

array([0.96666667, 0.96666667, 0.9       , 1.        , 1.        ])

In [32]:
scores = cross_val_score(dtc, iris.data, iris.target, cv = 5)
np.mean(scores)

0.9666666666666668

### GridSearchCV
- Proceeding cross checking and hyper parameter tuning

In [33]:
dtc = DecisionTreeClassifier(random_state = 2021)

In [34]:
from sklearn.model_selection import GridSearchCV

In [35]:
# parameter to dictiaonary
params = {
    'max_depth' : [2, 3, 4, 5],
    'min_samples_split' : [2, 3]
}

In [36]:
grid_dtc = GridSearchCV(dtc, param_grid = params, cv = 3)

In [37]:
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 3, 4, 5],
                         'min_samples_split': [2, 3]})

In [42]:
grid_dtc.cv_results_

{'mean_fit_time': array([0.00113583, 0.00076373, 0.00072026, 0.00061202, 0.00052468,
        0.00051936, 0.00051936, 0.00049305]),
 'std_fit_time': array([4.12547419e-04, 8.68603818e-05, 5.92938905e-05, 6.41814076e-06,
        1.95961444e-06, 6.37469439e-06, 3.33975178e-06, 2.31752997e-05]),
 'mean_score_time': array([0.00037829, 0.00025924, 0.00024605, 0.00021251, 0.00018922,
        0.00018867, 0.00018795, 0.00016753]),
 'std_score_time': array([2.51787278e-05, 2.14888499e-05, 2.18418833e-05, 1.11867754e-05,
        1.91065713e-06, 1.12391596e-06, 2.43399824e-06, 2.40265790e-06]),
 'param_max_depth': masked_array(data=[2, 2, 3, 3, 4, 4, 5, 5],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_min_samples_split': masked_array(data=[2, 3, 2, 3, 2, 3, 2, 3],
              mask=[False, False, False, False, False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'ma

In [39]:
# 최적 파라미터
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [40]:
# 최고 정확도
grid_dtc.best_score_

0.9666666666666667

In [41]:
# 최고 정확도를 가지는 최적 파라미터로 학습한 estimator
best_estimator = grid_dtc.best_estimator_
best_estimator.score(X_test, y_test)

0.9583333333333334