In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# 우리가 했던 와인데이터 간단한버전
wine = pd.read_csv("https://bit.ly/wine-date")
wine

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.20,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0
...,...,...,...,...
6492,11.2,1.6,3.27,1.0
6493,9.6,8.0,3.15,1.0
6494,9.4,1.2,2.99,1.0
6495,12.8,1.1,3.34,1.0


In [3]:
X = wine.iloc[:, :-1]

In [4]:
y = wine.iloc[:, -1]

In [5]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5197, 3), (1300, 3), (5197,), (1300,))

In [10]:
# 기본 의사결정나무 모델 만들기
from sklearn.tree import DecisionTreeClassifier

In [17]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)

DecisionTreeClassifier()

In [18]:
model.score(X_test, y_test)

0.8746153846153846

In [19]:
model.score(X_train, y_train)

0.9971137194535309

In [20]:
# 교차검증
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

In [27]:
scores = cross_validate(model, X_train, y_train, cv=StratifiedKFold(n_splits=100))
scores

{'fit_time': array([0.00997305, 0.00997257, 0.00996256, 0.0089767 , 0.009974  ,
        0.00791168, 0.        , 0.        , 0.01884031, 0.        ,
        0.01565409, 0.        , 0.0156188 , 0.        , 0.01562667,
        0.        , 0.        , 0.01562071, 0.        , 0.01562142,
        0.        , 0.01562405, 0.        , 0.01561832, 0.        ,
        0.        , 0.        , 0.        , 0.01561832, 0.        ,
        0.01562095, 0.        , 0.01562309, 0.        , 0.01561999,
        0.        , 0.01562572, 0.        , 0.01561928, 0.        ,
        0.        , 0.01562166, 0.        , 0.01562166, 0.        ,
        0.01562119, 0.        , 0.01562095, 0.        , 0.01562238,
        0.        , 0.        , 0.01562285, 0.        , 0.01561928,
        0.        , 0.015625  , 0.        , 0.01561761, 0.        ,
        0.01562142, 0.        , 0.00880265, 0.0089767 , 0.00798225,
        0.00797987, 0.00697899, 0.00797892, 0.00708103, 0.        ,
        0.0156548 , 0.        , 0.01

In [28]:
pd.Series(scores["test_score"]).mean()

0.8660746606334839

In [29]:
# 그리드 서치를 통한 최적의 하이퍼파라미터 튜닝

In [30]:
from sklearn.model_selection import GridSearchCV

In [31]:
params = {
    "max_depth":range(1, 21, 1),
    "min_impurity_decrease":[0.0001,0.0002,0.003,0.0004,0.0005]
}

# 이 매개변수로 수행할 교차검증 횟수는 5*20=100
# 기본 5-폴드 교차검증수행 100*5=500
# 모델생성 개수 500개

In [32]:
model = DecisionTreeClassifier()

In [34]:
gs = GridSearchCV(model, params)

In [35]:
gs.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(),
             param_grid={'max_depth': range(1, 21),
                         'min_impurity_decrease': [0.0001, 0.0002, 0.003,
                                                   0.0004, 0.0005]})

In [36]:
gs.best_params_

{'max_depth': 18, 'min_impurity_decrease': 0.0002}

In [46]:
model_final = DecisionTreeClassifier(max_depth=18, min_impurity_decrease=0.0002)
model_final.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=18, min_impurity_decrease=0.0002)

In [47]:
model_final = gs.best_estimator_

In [48]:
model_final.score(X_test, y_test)

0.8761538461538462