In [1]:
import pandas as pd

# Load the wine dataset CSV from its shortened URL into a DataFrame.
wine = pd.read_csv(filepath_or_buffer='https://bit.ly/wine_csv_data')

In [2]:
# Why convert to NumPy arrays:
# 1. Numeric operations on large data are efficient.
# 2. Operations are vectorized.
# 3. Most data-science libraries use NumPy arrays as their basic input type.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()
target = wine.loc[:, 'class'].to_numpy()

In [4]:
# Check the feature matrix dimensions as (rows, columns); three columns were selected above.
data.shape

(6497, 3)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set; the fixed seed makes the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [6]:
# 80% of the 6497 samples: the expected size of the training split before rounding.
6497*0.8

5197.6

In [5]:
# Shape of the training split produced by train_test_split.
print(train_input.shape)

(5197, 3)


In [8]:
# 20% of the 5197 training rows: the expected validation-set size before rounding.
5197*0.2

1039.4

In [7]:
# Split the training set again: 80% to fit on (sub_*), 20% held out for validation (val_*).
sub_input, val_input, sub_target, val_target = train_test_split(
    train_input,
    train_target,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Shape of the validation split carved out of the training set.
val_input.shape

(1040, 3)

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Fit a decision tree on the sub-training split (fit() returns the estimator itself).
dt = DecisionTreeClassifier(random_state=42).fit(sub_input, sub_target)

# Print the score on the data the tree was fit on, then on the validation split.
print(
    dt.score(sub_input, sub_target),
    dt.score(val_input, val_target),
    sep='\n',
)

0.9971133028626413
0.864423076923077


# 교차 검증

In [11]:
from sklearn.model_selection import cross_validate

In [12]:
# Cross-validate the tree on the full training set using the default cv setting.
scores = cross_validate(estimator=dt, X=train_input, y=train_target)
print(scores)

{'fit_time': array([0.00898671, 0.01001263, 0.00699592, 0.008003  , 0.00699735]), 'score_time': array([0.0009973 , 0.        , 0.0010004 , 0.00099921, 0.        ]), 'test_score': array([0.86923077, 0.84615385, 0.87680462, 0.84889317, 0.83541867])}


In [16]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(scores['test_score'].mean())

0.855300214703487


In [18]:
from sklearn.model_selection import StratifiedKFold

# Same cross-validation, but with the stratified splitter passed explicitly.
# The recorded mean matches the default-cv run above.
scores = cross_validate(
    estimator=dt,
    X=train_input,
    y=train_target,
    cv=StratifiedKFold(),
)
print(scores['test_score'].mean())

0.855300214703487


In [19]:
# 10-fold stratified cross-validation with shuffling; the seed fixes the shuffle.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=42,
)
scores = cross_validate(estimator=dt, X=train_input, y=train_target, cv=splitter)
print(scores['test_score'].mean())

0.8574181117533719


# 하이퍼파라미터 튜닝

In [20]:
from sklearn.model_selection import GridSearchCV
# Grid search looks for the hyperparameter values that optimize model performance:
# it tries the candidate combinations and keeps the one that performs best
# on the given data.

# Candidate values for the single hyperparameter tuned in the first search.
params = dict(min_impurity_decrease=[0.0001, 0.0002, 0.0003, 0.0004, 0.0005])

In [26]:
# Set up the hyperparameter search for a decision tree over the `params` grid;
# n_jobs=-1 lets the search use all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)

In [28]:
# Run the search: evaluate every candidate in the grid on the training set.
gs.fit(train_input, train_target)

In [29]:
# Class labels known to the fitted search object.
gs.classes_

array([0., 1.])

In [23]:
# Take the best model found by the search (this rebinds `dt`) and score it on the training set.
dt = gs.best_estimator_
print(dt.score(train_input, train_target))

0.9615162593804117


In [38]:
# Wider search: three hyperparameters are tuned together.
params2 = {'min_impurity_decrease': np.arange(0.0001, 0.001, 0.0001),
           'max_depth': range(5, 20, 1),
           'min_samples_split': range(2, 100, 10)
           }

# Build and fit the second search BEFORE anything reads its results, so this
# cell also works when the notebook is run top to bottom on a fresh kernel.
gs2 = GridSearchCV(DecisionTreeClassifier(random_state=42), params2, n_jobs=-1)
gs2.fit(train_input, train_target)

# Score the best model of the second search on the training set.
# This previously read `gs.best_estimator_`, which only repeats the preceding
# cell's result; the old recorded output (0.892053107562055) did not match
# `gs` at this point in the notebook, so re-run this cell to refresh it.
dt2 = gs2.best_estimator_
print(dt2.score(train_input, train_target))

In [30]:
# Hyperparameter setting that the search selected as best.
print(gs.best_params_)

{'min_impurity_decrease': 0.0001}


In [31]:
# Mean cross-validation test score, one entry per candidate in the grid.
print(gs.cv_results_['mean_test_score'])

[0.86819297 0.86453617 0.86492226 0.86780891 0.86761605]


In [32]:
# Find the best candidate by hand: position of the highest mean test score,
# then look up the parameter setting stored at that position.
best_index = gs.cv_results_['mean_test_score'].argmax()
print(gs.cv_results_['params'][best_index])

{'min_impurity_decrease': 0.0001}


In [33]:
# Larger grid (this rebinds `params`): three hyperparameters searched together.
params = dict(
    min_impurity_decrease=np.arange(0.0001, 0.001, 0.0001),
    max_depth=range(5, 20),
    min_samples_split=range(2, 100, 10),
)

In [34]:
# Rebuild the search over the larger `params` grid (this rebinds `gs`) and fit it
# on the training set; n_jobs=-1 lets the search use all available CPU cores.
gs = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=params,
    n_jobs=-1,
)
gs.fit(train_input, train_target)