In [None]:
# Install the packages required for this exercise.
# NOTE(review): the first command sets conda's `ssl_verify` option to false, i.e. it turns off
# SSL certificate verification for conda — presumably to get through an in-house proxy.
# Confirm that this is acceptable in your environment before running it.
!conda config --set ssl_verify false
!conda install python-graphviz scikit-learn -y

In [None]:
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt


# May not work unless the path is configured
# (presumably the Graphviz executables have to be on PATH — TODO confirm).
from sklearn.tree import export_graphviz
import graphviz

# Imports for hyperparameter optimisation (HPO).
# train_test_split is already imported above; importing the name again is harmless.
from sklearn.model_selection import train_test_split, GridSearchCV

# Imports for explainability (XAI).
from sklearn.inspection import PartialDependenceDisplay

## 사외 PC(Google Colab)에서 코드 실행 시 Data load

In [None]:
# Load the wine dataset through scikit-learn.
wine = load_wine()

# Build the feature table and append the class labels as the 'target' column.
df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(target=wine.target)

## 사내 PC에서 코드 실행 시 Data load

In [None]:
# Load the data (in-house PC variant).
# load_wine from sklearn.datasets reportedly does not load on in-house PCs,
# so the dataset is provided as a CSV file instead.
try:
    df = pd.read_csv('wine.csv')
except FileNotFoundError:
    # No CSV in the working directory. If an earlier cell already built `df`
    # (e.g. via load_wine), keep it so that "Restart & Run All" still completes;
    # otherwise there is no data at all and the error has to surface.
    if 'df' not in globals():
        raise
    print("wine.csv not found; keeping the DataFrame that is already loaded.")

## 공통 코드 부분

In [None]:
# Model training, step 1: separate the target column from the feature columns.
y = df['target']
X = df.drop(columns=['target'])

In [None]:
# Split into training and test sets (80% train, 20% test) with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Decision tree whose hyperparameters are chosen by hand (no search).
# The `*_mannual` spelling of the variable names is kept unchanged.
clf_mannual = DecisionTreeClassifier(
    criterion='gini',
    splitter='random',
    max_depth=1,
    min_samples_split=5,
    min_samples_leaf=3,
    random_state=4,
).fit(X_train, y_train)

# Score the hand-tuned tree on the held-out test split.
y_pred_mannual = clf_mannual.predict(X_test)
accuracy_mannual = accuracy_score(y_true=y_test, y_pred=y_pred_mannual)
print("accuracy_mannual: ", accuracy_mannual)

In [None]:
# Hyperparameter tuning.
# Restrict the hyperparameter ranges that the grid search is allowed to explore.

param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=list(range(2, 6)),  # 2, 3, 4, 5
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

In [None]:
# Hyperparameter optimisation (HPO) and fitting.

# Base estimator handed to the search; its seed is fixed.
clf_grid = DecisionTreeClassifier(random_state=42)

# Core step: search over param_grid with 5-fold cross-validation.
# fit() both finds the hyperparameters and performs the fitting with them.
grid_search = GridSearchCV(
    estimator=clf_grid,
    param_grid=param_grid,
    cv=5,
).fit(X_train, y_train)

print("Best Hyper-parameter", grid_search.best_params_)
print("Best Score", grid_search.best_score_)


In [None]:
# Accuracy of the model produced by the hyperparameter search.
best_model = grid_search.best_estimator_

# Predict on the held-out test split and compare with the true labels.
y_pred_grid = best_model.predict(X_test)
accuracy_grid = accuracy_score(y_true=y_test, y_pred=y_pred_grid)
print('Accuracy Grid :', accuracy_grid)

In [None]:
# Read the feature importances from the best model found by the grid search.
importances = best_model.feature_importances_

In [None]:
# Visualise the feature importances of the best model as a bar chart.
fig, ax = plt.subplots(figsize=(20, 6))

# One bar per feature, positioned at 0..n-1.
positions = range(len(importances))
ax.bar(positions, importances, width=0.3)

ax.set_xlabel('Feature')
ax.set_ylabel('importances')
ax.set_title('Feature Importance')

# Label every bar with its column name, rotated so long names stay readable.
ax.set_xticks(positions)
ax.set_xticklabels(X.columns, rotation=45)
plt.show()

In [None]:
# Partial Dependence Plot (PDP) for target class 0.
# Features to visualise.
feature = ['flavanoids', 'color_intensity', 'proline']
fig, ax = plt.subplots(figsize=(20, 6))
# Title the figure with the class being explained, so this plot can be told
# apart from the PDPs drawn for the other classes.
fig.suptitle('Partial dependence (target class 0)')
# Bind the result to `pdp_display`, not `display`: assigning to `display` would
# shadow IPython's built-in display() helper for the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    best_model, X_train, feature, target=0, ax=ax
)

In [None]:
# Display the training feature table (X_train) for inspection.
X_train

In [None]:
# Partial Dependence Plot (PDP) for target class 1.
feature = ['flavanoids', 'color_intensity', 'proline']
fig, ax = plt.subplots(figsize=(20, 6))
# Title the figure with the class being explained, so this plot can be told
# apart from the PDPs drawn for the other classes.
fig.suptitle('Partial dependence (target class 1)')
# Bind the result to `pdp_display`, not `display`: assigning to `display` would
# shadow IPython's built-in display() helper for the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    best_model, X_train, feature, target=1, ax=ax
)

In [None]:
# Partial Dependence Plot (PDP) for target class 2.
feature = ['flavanoids', 'color_intensity', 'proline']
fig, ax = plt.subplots(figsize=(20, 6))
# Title the figure with the class being explained, so this plot can be told
# apart from the PDPs drawn for the other classes.
fig.suptitle('Partial dependence (target class 2)')
# Bind the result to `pdp_display`, not `display`: assigning to `display` would
# shadow IPython's built-in display() helper for the rest of the session.
pdp_display = PartialDependenceDisplay.from_estimator(
    best_model, X_train, feature, target=2, ax=ax
)