# 概述

[Heart Disease Predictor XM](https://www.kaggle.com/competitions/heart-disease-predictor-xm/rules)

目標: 構建 ML/DL 模型，可以準確預測患者是否存在心臟病。

In [1]:
import os
# Set the environment variable
os.environ['LOKY_MAX_CPU_COUNT'] = '4'

# 載入套件

In [2]:
import numpy as np
import pandas as pd
import datetime

import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

import warnings
warnings.filterwarnings('ignore')

# 載入資料集

In [3]:
# 讀取訓練資料集和測試資料集
train_data = pd.read_csv('./data/heart_disease_train.csv')
test_data = pd.read_csv('./data/heart_disease_test.csv')

print('訓練資料集和測試資料集成功!\n')
print(f'訓練資料集: {train_data.shape[0]} rows, {train_data.shape[1]} columns')
print(f'測試資料集: {test_data.shape[0]} rows, {test_data.shape[1]} columns')

訓練資料集和測試資料集成功!

訓練資料集: 952 rows, 13 columns
測試資料集: 238 rows, 12 columns


In [4]:
# 保存測試集中的ID欄位
test_ids = test_data['ID'].copy()
# 測試集添加一個target欄位，設置為NaN
test_data['target'] = np.nan

all_data = pd.concat([train_data, test_data]).reset_index(drop=True)

print(f'合併資料集: {all_data.shape[0]} rows, {all_data.shape[1]} columns')
print(f"Target NaN 數量(測試資料集數量): {all_data['target'].isnull().sum()}")

合併資料集: 1190 rows, 13 columns
Target NaN 數量(測試資料集數量): 238


# 資料處理

## 處理缺失值

In [5]:
# 檢查缺失值數量
missing_values = all_data.isnull().sum()
missing_features = missing_values[missing_values > 0]

print(f'有 {len(missing_features)} 個特徵有缺失值')
# 列出所有缺失值的特徵
missing_features

有 1 個特徵有缺失值


target    238
dtype: int64

## 處理異常值

In [6]:
def handle_outliers(df, columns, factor=1.5):
    for col in columns:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - factor * IQR
        upper_bound = Q3 + factor * IQR
        # 限制異常值到合理範圍內
        df[col] = df[col].clip(lower_bound, upper_bound)
    return df

# 排除'target'和'ID'列不進行異常值處理
numeric_columns = all_data.select_dtypes(include=[np.number]).columns.drop(['target', 'ID'])
all_data = handle_outliers(all_data, numeric_columns)

## 特徵工程

In [7]:
def create_features(df):
    # 年齡分組
    df['age_group'] = pd.cut(df['age'], bins=[0, 40, 60, 100], labels=['Young', 'Middle', 'Old'])

    # 心率儲備
    df['heart_rate_reserve'] = 220 - df['age'] - df['max.heart.rate']

    # 血壓分類
    df['bp_category'] = pd.cut(df['resting.bp.s'], bins=[0, 120, 140, 200], labels=['Normal', 'Prehypertension', 'Hypertension'])

    # 膽固醇水平分類
    df['cholesterol_category'] = pd.cut(df['cholesterol'], bins=[0, 200, 240, 1000], labels=['Normal', 'Borderline', 'High'])

    # 綜合風險評分（簡化版）
    df['risk_score'] = (
        (df['age'] > 50).astype(int) +
        (df['resting.bp.s'] > 140).astype(int) +
        (df['cholesterol'] > 200).astype(int) +
        (df['fasting.blood.sugar'] == 1).astype(int) +
        (df['max.heart.rate'] > 150).astype(int)
    )

    # 特徵交互
    df['age_chest_pain'] = df['age'] * df['chest.pain.type']
    df['sex_max_heart_rate'] = df['sex'] * df['max.heart.rate']
    df['bp_cholesterol'] = df['resting.bp.s'] * df['cholesterol']
    df['exercise_st_slope'] = df['exercise.angina'] * df['ST.slope']

    return df
# 創建新的特徵
all_data = create_features(all_data)

## OneHot Encoding

In [8]:
# 手動指定特徵種類
numeric = [
    'age', 'resting.bp.s', 'cholesterol', 'max.heart.rate', 'oldpeak', 
    'heart_rate_reserve', 'risk_score', 'age_chest_pain', 
    'sex_max_heart_rate', 'bp_cholesterol', 'exercise_st_slope'
]

categorical = [
    'sex', 'chest.pain.type', 'fasting.blood.sugar', 'resting.ecg', 
    'exercise.angina', 'ST.slope', 'age_group', 'bp_category', 
    'cholesterol_category'
]

In [9]:
all_data_before = all_data.copy()
# categorical = all_data.select_dtypes(include=['object']).columns
all_data = pd.get_dummies(all_data, columns=categorical, drop_first=True)

print('OneHot encoding completed!\n')
print('Shape before: ', all_data_before.shape)
print('Shape after: ', all_data.shape)

OneHot encoding completed!

Shape before:  (1190, 22)
Shape after:  (1190, 27)


## 資料拆分

合併資料集拆分回原訓練集和原測試集

根據target是否為空來拆分數據：
- 訓練集(train_df)包含有已知target的房屋樣本。
- 測試集(test_df)包含target缺失的房屋樣本，用已完成訓練模型來預測。

In [10]:
# 將數據重新拆分回訓練集和測試集
idx = all_data['target'].notnull().sum()

# Train Set
train_df = all_data[:idx]
X_train_data = train_df.drop(columns=['target'])
y_train_data = train_df['target']

# Test Set
test_df = all_data[idx:]
X_test_data = test_df.drop('target', axis=1)

## Standardization

In [11]:
# 資料標準化
scaler = StandardScaler()
X_train_data = scaler.fit_transform(X_train_data)
X_test_data = scaler.transform(X_test_data)

## 資料拆分
訓練集拆分訓練集和測試集以進行模型訓練

In [12]:
# 將訓練集拆分為訓練集和驗證集
X_train, X_test, y_train, y_test = train_test_split(X_train_data, y_train_data, test_size=0.2, random_state=42)

# 輸出拆分後的數據集大小
print(f'X_train: {X_train.shape}')
print(f'y_train: {y_train.shape}')
print('='*33)
print(f'X_test: {X_test.shape}')
print(f'y_test: {X_test.shape}')

X_train: (761, 26)
y_train: (761,)
X_test: (191, 26)
y_test: (191, 26)


# 訓練模型

In [13]:
auto_cv_tuned = {}
def Auto_CrossValidation(model_name, model):
    print(f'\n{model_name} Cross-Validation Results')
    print('=' * 33)
    
    scores_list = None
    # 檢查是否有交叉驗證結果
    if hasattr(model, 'cv_results_'):
        scores_list = model.cv_results_['mean_test_score']
        
        # 計算平均 Validation Accuracy
        val_acc_mean = np.mean(scores_list)
        print('Validation Accuracy (Mean): ', np.round(val_acc_mean, 3))
    else:
        print('Warning: No cross-validation results found.')
    
    # 儲存交叉驗證結果
    auto_cv_tuned[model_name] = {
        'Validation Accuracy (Mean)': val_acc_mean if scores_list is not None else None,
        'CV Scores': scores_list
    }

`GridSearchCV`具體過程:
- `X_train`和`y_train`會根據交叉驗證參數`cv`被自動分割成多個子集。
- `cv=5`，那麼`GridSearchCV`會將`X_train`和`y_train`分成 5 個等大小的子集。
- 在每次訓練中，會選擇其中 4 個子集作為訓練集，第 5 個子集作為驗證集。
- 這個過程會重複 5 次，每次使用不同的子集作為驗證集，從而確保每個數據點都參與驗證。

In [14]:
# 1. 決策樹模型 GridSearchCV
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search_dt = GridSearchCV(estimator=dt, param_grid=param_grid_dt, 
                              cv=5, n_jobs=1, verbose=0, scoring='accuracy')
grid_search_dt.fit(X_train, y_train)
Auto_CrossValidation('Decision Tree', grid_search_dt)

# 2. 隨機森林模型 GridSearchCV
rf = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': [None, 'sqrt'],
    'bootstrap': [True, False]
}
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, 
                              cv=5, n_jobs=1, verbose=0)
grid_search_rf.fit(X_train, y_train)
Auto_CrossValidation('Random Forest', grid_search_rf)

# 3. XGBoost 模型 GridSearchCV
xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')
xgb_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5],
    'learning_rate': [0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'gamma': [0, 0.1],
    'lambda': [1, 2]
}
grid_search_xgb = GridSearchCV(estimator=xgb, param_grid=xgb_param_grid, 
                               cv=5, n_jobs=1, verbose=0, scoring='accuracy')
grid_search_xgb.fit(X_train, y_train)
Auto_CrossValidation('XGBoost', grid_search_xgb)

# 4. LightGBM 模型 GridSearchCV
lgbm = LGBMClassifier(random_state=42)
lgbm_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 15],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
grid_search_lgbm = GridSearchCV(estimator=lgbm, param_grid=lgbm_param_grid, 
                                cv=5, n_jobs=1, verbose=0, scoring='accuracy')
grid_search_lgbm.fit(X_train, y_train)
Auto_CrossValidation('LightGBM', grid_search_lgbm)

# 5. CatBoost 模型 GridSearchCV
catboost = CatBoostClassifier(random_state=42, silent=True)
catboost_param_grid = {
    'iterations': [250, 300, 350],
    'depth': [6, 7, 8],
    'learning_rate': [0.04, 0.05, 0.06],
    'subsample': [0.7, 0.8, 0.9]
}
grid_search_catboost = GridSearchCV(estimator=catboost, param_grid=catboost_param_grid, 
                                    cv=5, n_jobs=1, verbose=0, scoring='accuracy')
grid_search_catboost.fit(X_train, y_train)
Auto_CrossValidation('CatBoost', grid_search_catboost)

# 6. Gradient Boosting 模型 GridSearchCV
gboost = GradientBoostingClassifier(random_state=42)
gboost_param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}
grid_search_gboost = GridSearchCV(estimator=gboost, param_grid=gboost_param_grid, 
                                  cv=5, n_jobs=1, verbose=0, scoring='accuracy')
grid_search_gboost.fit(X_train, y_train)
Auto_CrossValidation('Gradient Boosting', grid_search_gboost)

# 7. 堆疊分類器
stacking_clf = StackingClassifier(
    estimators=[
        ('dt', grid_search_dt.best_estimator_),
        ('xgb', grid_search_xgb.best_estimator_),
        ('rf', grid_search_rf.best_estimator_),
        ('lgbm', grid_search_lgbm.best_estimator_),
        ('catboost', grid_search_catboost.best_estimator_),
        ('gboost', grid_search_gboost.best_estimator_)
    ],
    final_estimator=GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42),
    cv=5,
    n_jobs=1
)
stacking_clf.fit(X_train, y_train)
Auto_CrossValidation('Stacking Classifier', stacking_clf)

# 儲存自動交叉驗證結果
auto_cv_results = pd.DataFrame(auto_cv_tuned).T
auto_cv_results


Decision Tree Cross-Validation Results
Validation Accuracy (Mean):  0.813

Random Forest Cross-Validation Results
Validation Accuracy (Mean):  0.847

XGBoost Cross-Validation Results
Validation Accuracy (Mean):  0.853
[LightGBM] [Info] Number of positive: 320, number of negative: 288
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000212 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1915
[LightGBM] [Info] Number of data points in the train set: 608, number of used features: 26
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.526316 -> initscore=0.105361
[LightGBM] [Info] Start training from score 0.105361
[LightGBM] [Info] Number of positive: 320, number of negative: 289
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000235 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1916
[LightGBM] [Info] Number of data

Unnamed: 0,Validation Accuracy (Mean),CV Scores
Decision Tree,0.81265,"[0.8291881664946681, 0.8305039559683521, 0.834..."
Random Forest,0.847101,"[0.8567595459236326, 0.8593997248022015, 0.860..."
XGBoost,0.853156,"[0.8528551771585828, 0.8541537667698658, 0.851..."
LightGBM,0.864684,"[0.8567939456484348, 0.8567939456484348, 0.866..."
CatBoost,0.868136,"[0.8607241142070864, 0.8594255245958031, 0.862..."
Gradient Boosting,0.858507,"[0.8554609563123495, 0.851530787753698, 0.8607..."
Stacking Classifier,,


# 評估模型

評估的衡量標準是準確度分數。

準確率分數的定義是:
$$\text{Accuracy} = \frac{TP + TN}{TP + FP + TN + FN}$$
- 準確率分數簡單定義為正確預測的數量除以預測的總數
- 真陽性 = TP (正確預測的心臟病)
- 假陽性 = FP (模型預測心臟病，但實際上患者沒有心臟病)
- 真陰性 = TN (正確預測的無心臟病)
- 假陰性 = FN (模型預測沒有心臟病，但實際上患者有心臟病)

In [15]:
test_tuned = {}
def Test_Evaluation(model_name, model, X_test, y_test):
    print(f'\n{model_name} Test Evaluation')
    print('=' * 33)
    
    # 預測測試集
    y_pred = model.predict(X_test)
    
    # 計算測試集的 Accuracy
    test_acc = accuracy_score(y_test, y_pred)
    print('Test Accuracy: ', np.round(test_acc, 3))
    
    # 檢查是否是 GridSearchCV 或 RandomizedSearchCV
    if hasattr(model, 'best_params_'):
        print('Best Parameters: ', model.best_params_)
        best_params = model.best_params_
    else:
        best_params = None
    
    # 儲存結果
    test_tuned[model_name] = {
        'Test Accuracy': test_acc,
        'Best Params': best_params
    }

In [16]:
# 評估模型在測試集上的表現
Test_Evaluation('Decision Tree', grid_search_dt, X_test, y_test)
Test_Evaluation('Random Forest', grid_search_rf, X_test, y_test)
Test_Evaluation('XGBoost', grid_search_xgb, X_test, y_test)
Test_Evaluation('LightGBM', grid_search_lgbm, X_test, y_test)
Test_Evaluation('CatBoost', grid_search_catboost, X_test, y_test)
Test_Evaluation('Gradient Boosting', grid_search_gboost, X_test, y_test)
Test_Evaluation('Stacking Classifier', stacking_clf, X_test, y_test)

# 儲存測試集結果
test_results = pd.DataFrame(test_tuned).T
test_results


Decision Tree Test Evaluation
Test Accuracy:  0.838
Best Parameters:  {'max_depth': 5, 'min_samples_leaf': 2, 'min_samples_split': 2}

Random Forest Test Evaluation
Test Accuracy:  0.885
Best Parameters:  {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

XGBoost Test Evaluation
Test Accuracy:  0.916
Best Parameters:  {'colsample_bytree': 0.8, 'gamma': 0.1, 'lambda': 1, 'learning_rate': 0.2, 'max_depth': 5, 'n_estimators': 200, 'subsample': 0.8}

LightGBM Test Evaluation
Test Accuracy:  0.906
Best Parameters:  {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 200, 'subsample': 0.8}

CatBoost Test Evaluation
Test Accuracy:  0.911
Best Parameters:  {'depth': 7, 'iterations': 350, 'learning_rate': 0.04, 'subsample': 0.9}

Gradient Boosting Test Evaluation
Test Accuracy:  0.911
Best Parameters:  {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 0.8}

Stacki

Unnamed: 0,Test Accuracy,Best Params
Decision Tree,0.837696,"{'max_depth': 5, 'min_samples_leaf': 2, 'min_s..."
Random Forest,0.884817,"{'bootstrap': False, 'max_depth': 20, 'max_fea..."
XGBoost,0.91623,"{'colsample_bytree': 0.8, 'gamma': 0.1, 'lambd..."
LightGBM,0.905759,"{'colsample_bytree': 0.8, 'learning_rate': 0.1..."
CatBoost,0.910995,"{'depth': 7, 'iterations': 350, 'learning_rate..."
Gradient Boosting,0.910995,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti..."
Stacking Classifier,0.884817,


# 重新訓練

In [17]:
# 選出 Test Accuracy 最高的模型
best_model_name = max(test_tuned, key=lambda k: test_tuned[k]['Test Accuracy'])
print(f"\nBest Model: {best_model_name} with Test Accuracy: {test_tuned[best_model_name]['Test Accuracy']}")

# 根據模型名稱取出對應的最佳模型
if best_model_name == 'Random Forest':
    final_model = RandomForestClassifier(**test_tuned['Random Forest']['Best Params'], random_state=42)
elif best_model_name == 'XGBoost':
    final_model = XGBClassifier(**test_tuned['XGBoost']['Best Params'], random_state=42, use_label_encoder=False, eval_metric='logloss')
elif best_model_name == 'LightGBM':
    final_model = LGBMClassifier(**test_tuned['LightGBM']['Best Params'], random_state=42)
elif best_model_name == 'CatBoost':
    final_model = CatBoostClassifier(**test_tuned['CatBoost']['Best Params'], random_state=42, silent=True)
elif best_model_name == 'Gradient Boosting':
    final_model = GradientBoostingClassifier(**test_tuned['Gradient Boosting']['Best Params'], random_state=42)
elif best_model_name == 'Stacking Classifier':
    final_model = stacking_clf

# 用 X_train_data 和 y_train_data 重新訓練模型
print(f"Training the best model {best_model_name} with full training data...")
final_model.fit(X_train_data, y_train_data)


Best Model: XGBoost with Test Accuracy: 0.9162303664921466
Training the best model XGBoost with full training data...


# 產生提交檔

![Kaggle-Submission-XGBoost-20240820](./images/Kaggle-Submission-XGBoost-20240820.png)

In [18]:
# 使用重新訓練的模型進行測試集預測
test_predictions = final_model.predict(X_test_data)

# 創建提交數據
submission = pd.DataFrame({
    'ID': test_data['ID'],
    'target': test_predictions
})

date = datetime.datetime.today().strftime('%Y-%m-%d')
submission_path = f'./data/Submission-{best_model_name}-{date}.csv'
submission.to_csv(submission_path, index=False)
print(f"Submission saved successfully as: {submission_path}")

Submission saved successfully as: ./data/Submission-XGBoost-2024-08-20.csv
