In [1]:
#2021.06.17. THUR
#Hankyeong

#00. 패키지호출
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import pandas as pd 
import warnings

#00-1. warning message ignore
warnings.filterwarnings(action='ignore')

#14. wine 데이터셋으로 분류 분석하기. 
#(1) wine 데이터셋 불러오기.
wine = load_wine()
df_wine = pd.DataFrame(wine.data, columns= wine.feature_names)
df_wine['target'] = wine.target
df_wine

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.80,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.20,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.40,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.80,3.24,0.30,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.50,16.8,113.0,3.85,3.49,0.24,2.18,7.80,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.80,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,13.71,5.65,2.45,20.5,95.0,1.68,0.61,0.52,1.06,7.70,0.64,1.74,740.0,2
174,13.40,3.91,2.48,23.0,102.0,1.80,0.75,0.43,1.41,7.30,0.70,1.56,750.0,2
175,13.27,4.28,2.26,20.0,120.0,1.59,0.69,0.43,1.35,10.20,0.59,1.56,835.0,2
176,13.17,2.59,2.37,20.0,120.0,1.65,0.68,0.53,1.46,9.30,0.60,1.62,840.0,2


In [2]:
#(2) target 변수의 분포 확인하기. 
df_wine['target'].value_counts()

1    71
0    59
2    48
Name: target, dtype: int64

In [24]:
#(3) wine 데이터셋을 train, test 데이터셋으로 분리하기. 
X_train, X_test, y_train, y_test = train_test_split(
    wine.data, wine.target, test_size = 0.2, random_state=2021, stratify= wine.target
)

In [25]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((142, 13), (142,), (36, 13), (36,))

In [27]:
#or 
X_train, X_test, y_train, y_test = train_test_split(
    df_wine.iloc[:,0:13], df_wine.iloc[:,13:14], test_size = 0.2, random_state=2021, stratify=df_wine.iloc[:,13:14]
)

In [33]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((142, 13), (142, 1), (36, 13), (36, 1))

In [31]:
#(4) Decision Tree 모델을 형성하고 학습하기.
dtc = DecisionTreeClassifier(random_state=2021)
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

In [None]:
#MEMO, 단 Hyper parameter tuning과 Cross-Validation을 하지 않음. 즉, default값으로 학습되었음.  

In [36]:
#(5) 학습된 모델로 test 데이터셋을 가지고 예측하기. 
dtc_pred = dtc.predict(X_test)
dtc_pred

array([1, 2, 1, 0, 1, 2, 0, 0, 1, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 1, 0, 1,
       2, 1, 1, 0, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2])

In [39]:
#(6) 예측값과 실제값을 비교해 모델 성능 평가하기. 
accuracy_score(y_test,dtc_pred)

0.9722222222222222

In [41]:
#(7) 하이퍼파라미터 튜닝하기. 
params = {
    'max_depth'         : range(1,10+1),
    'min_samples_split' : range(1,10+1)
}

In [None]:
#MEMO, 하이퍼파라미터 튜닝은 한 번의 결과로 만족하는 것이 아니라, 계속 진행하여 최적의 값을 확인해야함. 

In [51]:
#(8) GridSearchCV를 통해 모형 설정하기. 
dtc = DecisionTreeClassifier(random_state=2021)
grid_dtc = GridSearchCV(dtc, param_grid=params, cv=3)
grid_dtc.fit(X_train, y_train)

GridSearchCV(cv=3, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': range(1, 11),
                         'min_samples_split': range(1, 11)})

In [48]:
#(9) 최적 파라미터 확인하기. 
grid_dtc.best_params_

{'max_depth': 4, 'min_samples_split': 2}

In [47]:
#(10) 최적 파라미터에 의한 Accuracy 점수 파악하기. 
grid_dtc.best_score_

0.9159278959810875

In [49]:
#(11) test 데이터셋으로 값 예측하고 정확도 파악하기. 
dtc_pred_2 = grid_dtc.best_estimator_.predict(X_test)
accuracy_score(y_test,dtc_pred_2)

0.9722222222222222