In [None]:
# 실습에 필요한 패키지 설치
!pip --trusted-host pypi.python.org --trusted-host files.pythonhosted.org --trusted-host pypi.org install xgboost ucimlrepo shap

In [None]:
# Install the package needed for this exercise (graphviz) -> run this cell only if it is not installed yet.
# NOTE(review): the first command turns off SSL verification in the conda
# configuration, and nothing in this notebook turns it back on.
!conda config --set ssl_verify false
!conda install python-graphviz -y

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pprint

# 학습데이터 구성 및 전처리
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier

# Xgboost
from xgboost import XGBClassifier
from xgboost import plot_importance, plot_tree

# XAI
import shap

# 학습 데이터원천
from ucimlrepo import fetch_ucirepo

# 평가지표
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score


## 사외 PC(Google Colab)에서 코드 실행 시 Data load

In [None]:
# Load the data: fetch the dataset registered under id 186 via ucimlrepo
uci_dataset_id = 186
wine_quality = fetch_ucirepo(id=uci_dataset_id)

In [None]:
# Feature table and target column(s) supplied by the fetched dataset object
X = wine_quality.data.features
y = wine_quality.data.targets

# The CSV loading path further down yields `y` as a 1-D Series. Do the same here
# when the targets arrive as a single-column DataFrame, so the later steps
# (e.g. LabelEncoder) receive a 1-D label vector on both paths.
if isinstance(y, pd.DataFrame) and y.shape[1] == 1:
    y = y.iloc[:, 0]

# metadata
# Information about the dataset
print(wine_quality.metadata)
# Information about the variables
print(wine_quality.variables)

## 사내 PC에서 코드 실행 시 Data load

In [None]:
# Load the data from a CSV file given by a relative path
csv_path = 'wine_quality.csv'
wine_quality = pd.read_csv(csv_path)

In [None]:
# Separate the label column from the remaining (feature) columns
target = 'quality'
y = wine_quality[target]
X = wine_quality.drop(columns=target)

## 공통 코드 부분

In [None]:
# EDA (exploratory data analysis): size of the feature table and label frequencies
print("Data shape", X.shape)

target_counts = y.value_counts()
print("Target Distribution: \n", target_counts)

In [None]:
# Heatmap visualization of the correlations between the features and the target.
# The original cell referenced `df`, which is not defined by any cell above,
# so the table is assembled here from X and y.
corr_input = pd.concat([X, y], axis=1)
# Correlation is only computed over numeric columns; any other columns are skipped.
corr_matrix = corr_input.select_dtypes(include="number").corr()

plt.figure(figsize=(12,10))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
plt.title("Correlation Heatmap")
plt.show()

In [None]:
# Visualize the distribution of every feature as a histogram on a subplot grid
grid_cols = 3
# Keep the 4-row layout, but add rows when X has more than 12 columns;
# a fixed 4x3 grid would raise on the 13th subplot.
grid_rows = max(4, int(np.ceil(len(X.columns) / grid_cols)))

plt.figure(figsize=(16, 20))

for i, feature in enumerate(X.columns):
    # Subplot numbering is 1-based
    plt.subplot(grid_rows, grid_cols, i+1)
    plt.hist(X[feature], bins = 20, edgecolor='black' )
    plt.title(f"Distribution of {feature}")
    plt.xlabel(feature)
    plt.ylabel("Frequency")

plt.tight_layout()
plt.show()

In [None]:
# Box plot of every feature on a subplot grid
grid_cols = 3
# Keep the 4-row layout, but add rows when X has more than 12 columns;
# a fixed 4x3 grid would raise on the 13th subplot.
grid_rows = max(4, int(np.ceil(len(X.columns) / grid_cols)))

plt.figure(figsize=(16, 20))

for i, feature in enumerate(X.columns):
    # Subplot numbering is 1-based
    plt.subplot(grid_rows, grid_cols, i+1)
    sns.boxplot(x=X[feature])
    plt.title(f"Box Plot of {feature}")

# The original line lacked the call parentheses, so the layout was never adjusted
plt.tight_layout()
plt.show()

In [None]:
# Data preprocessing: fit a StandardScaler on X, then transform X with it.
# NOTE(review): X_scaled is not referenced by the later cells shown here — the
# train/test split is taken from the unscaled X. Confirm whether that is intended.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Data split: hold out 20% of the rows as the test set, with a fixed seed
split_options = dict(test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Check the class labels: the sorted distinct values found in y
unique_labels = np.unique(y)
print(unique_labels)

In [None]:
# Label encoding

# The encoder is fitted on the training target only ...
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
# ... and the test target is converted with that already-fitted mapping
y_test_encoded = label_encoder.transform(y_test)


In [None]:
# Create and train the Random Forest model (random_state fixed at 42) on the encoded training labels

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train_encoded)

In [None]:
# Random Forest prediction on the test set, decoded back to the original labels
rf_y_pred_encoded = rf_model.predict(X_test)
rf_y_pred = label_encoder.inverse_transform(rf_y_pred_encoded)

# Per-class report; zero_division=0.0 is the value passed for zero-division cases
rf_report = classification_report(y_test, rf_y_pred, zero_division=0.0)
print(rf_report)

In [None]:
# Create and train the XGBoost model (random_state fixed at 42) on the encoded training labels

xgb_model = XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train_encoded)

In [None]:
# Predict with the XGBoost model and decode the labels.
# The encoded predictions get their own name so that the Random Forest
# predictions held in rf_y_pred_encoded are not overwritten by XGBoost output.

xgb_y_pred_encoded = xgb_model.predict(X_test)
y_pred = label_encoder.inverse_transform(xgb_y_pred_encoded)

In [None]:
# Per-class report comparing y_test with the current y_pred
print(classification_report(y_test, y_pred, zero_division=0.0))

In [None]:
# Show every hyperparameter of xgb_model, pretty-printed one entry per line.
# The heading's spelling ("Hyperparemeters") is corrected and the
# commented-out plain print has been removed.
print("XGboost Hyperparameters:")
pprint.pprint(xgb_model.get_params())

In [None]:
# Hyperparameter ranges: candidate values for each tuned setting
params = dict(
    max_depth=[3, 5, 7, 9, 15],
    learning_rate=[0.1, 0.01, 0.001],
    n_estimators=[50, 100, 200, 300],
)

In [None]:
# Hyperparameter optimization: grid search over `params`, scored by accuracy
# with cv=4, run with n_jobs=-1
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=params,
    scoring='accuracy',
    cv=4,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train_encoded)

print("Best parameters:", grid_search.best_params_)
print("Best accuracy:" , grid_search.best_score_)


In [None]:
## Model with the best hyperparameters: the estimator selected by the grid search
best_model = grid_search.best_estimator_

# Predict on the test data and decode back to the original labels
y_pred_encoded = best_model.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

best_report = classification_report(y_test, y_pred, zero_division=0.0)
print(best_report)


In [None]:
# XAI
# Feature importance visualization for best_model
importances = best_model.feature_importances_
bar_positions = range(len(importances))

plt.figure(figsize= (20,12))
# Draw the bar chart, one bar per importance value
plt.bar(bar_positions, importances, width= 0.3)
plt.title('Feature Importance')
plt.xlabel('Feature')
plt.ylabel('importance')
# One tick per bar, labelled with the column names of X
plt.xticks(bar_positions, X.columns, rotation =45)
plt.show()

In [None]:
# XAI
# SHAP (Shapley Additive Explanations)
# NOTE(review): this explains xgb_model, while the feature-importance cell
# uses best_model from the grid search — confirm which model is meant here.
explainer = shap.TreeExplainer(xgb_model)
shap_values = explainer.shap_values(X_test)

# Bar-type summary plot of the SHAP values, labelled with the column names of X
shap.summary_plot(shap_values, X_test, plot_type="bar", feature_names=X.columns)

In [None]:
# XGBoost tree visualization

fig, ax = plt.subplots(figsize = (20, 20))
# Draw the tree selected by num_trees=150 from xgb_model onto `ax`;
# rankdir='LR' is passed for a left-to-right layout
plot_tree(xgb_model, num_trees=150, rankdir='LR', ax=ax)

plt.title("XGBoost Tree Visualization")
plt.show()