In [9]:
# ===============================================================
# 決策樹：手刻 ID3 / C4.5 + CART (sklearn)
# - Numeric features => 二分閾值
# - Categorical features => 等值多路切
# - 節點眾數 fallback
# - Graphviz 選用，無則用 Matplotlib
# ===============================================================

import pandas as pd
import numpy as np
import math
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

# ---------------------------
# 參數
# ---------------------------
TRAIN_PATH = "adult_data_no_duplicates.csv"
TEST_PATH  = "adult_test_no_duplicates.csv"
TRAIN_ID3_PATH = "adult_data_inputation.csv"

MAX_DEPTH = 10
MIN_SAMPLES_SPLIT = 20
MIN_GAIN = 1e-4          # 最小資訊增益/增益率/基尼減少
MAX_LEAF_NODES = 64

# ---------------------------
# 工具：欄名標準化
# ---------------------------
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = (
        df.columns.astype(str)
        .str.strip().str.lower().str.replace(r"\s+", "_", regex=True)
    )
    alias = {"income": "class", "class.": "class", "class ": "class"}
    df.rename(columns={k: v for k, v in alias.items() if k in df.columns}, inplace=True)
    return df

# ---------------------------
# 工具：清理目標欄（只保留 <=50K / >50K）
# ---------------------------
def clean_target_col(df: pd.DataFrame) -> pd.DataFrame:
    s = df["class"].astype(str).str.strip()
    s = s.str.replace(r"\.$", "", regex=True).str.replace(" ", "", regex=False).str.upper()

    s = s.replace({
        "<=50K.": "<=50K", ">50K.": ">50K",
        "<=50K?": "<=50K", ">50K?": ">50K",
        "<=50K,": "<=50K", ">50K,": ">50K",
        "<=50K ": "<=50K", ">50K ": ">50K"
    }, regex=False)

    s.loc[~s.isin({"<=50K", ">50K"}) & s.str.contains("<=", na=False)] = "<=50K"
    s.loc[~s.isin({"<=50K", ">50K"}) & s.str.contains(">",  na=False)] = ">50K"

    out = df.copy()
    out["class"] = s
    out = out[out["class"].isin({"<=50K", ">50K"})].copy()
    return out

# ---------------------------
# 讀檔 + 標準化 + 目標清理
# ---------------------------
train_df = pd.read_csv(TRAIN_PATH, skipinitialspace=True)
test_df  = pd.read_csv(TEST_PATH,  skipinitialspace=True)
train_id3_df = pd.read_csv(TRAIN_ID3_PATH, skipinitialspace=True)
train_df = standardize_columns(train_df)
test_df  = standardize_columns(test_df)
train_id3_df = standardize_columns(train_id3_df)

# 若測試檔沒有 header，改用訓練集欄名補上
if "class" not in test_df.columns:
    test_df = pd.read_csv(TEST_PATH, header=None, names=train_df.columns, skipinitialspace=True)
    test_df = standardize_columns(test_df)

train_df = clean_target_col(train_df)
test_df  = clean_target_col(test_df)
train_id3_df = clean_target_col(train_id3_df)

# ---------------------------
# X / y 切分與一致編碼
# ---------------------------
X_train = train_df.drop(columns=["class"])
y_train = train_df["class"]
X_test  = test_df.drop(columns=["class"])
y_test  = test_df["class"]
X_train_id3 = train_id3_df.drop(columns=["class"])
y_train_id3 = train_id3_df["class"]

# 記住「類別欄位」索引（手刻樹要用）
cat_cols = X_train.select_dtypes(include="object").columns.tolist()
cat_idx = [X_train.columns.get_loc(c) for c in cat_cols]

# 類別欄位用同一個 OrdinalEncoder
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train_enc = X_train.copy()
X_test_enc  = X_test.copy()
X_train_id3_enc  = X_train_id3.copy()
if cat_cols:
    X_train_enc[cat_cols] = enc.fit_transform(X_train[cat_cols].astype(str))
    X_test_enc[cat_cols]  = enc.transform(X_test[cat_cols].astype(str))
    X_train_id3_enc[cat_cols] = enc.fit_transform(X_train_id3_enc[cat_cols].astype(str))

# 轉成 numpy
X_train_np = X_train_enc.to_numpy(dtype=float)
X_test_np  = X_test_enc.to_numpy(dtype=float)
X_train_id3_np = X_train_id3_enc.to_numpy(dtype=float)

# 目標欄位用同一個 LabelEncoder
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)
y_test_enc  = le.transform(y_test)
y_train_id3_enc = le.fit_transform(y_train_id3)
CLASS_NAMES = list(le.classes_)         # ['<=50K', '>50K']
feature_names = list(X_train_enc.columns)

# ===============================================================
# Impurity & Gains
# ===============================================================
def entropy(y):
    if len(y) == 0:
        return 0.0
    counts = np.bincount(y)
    p = counts[counts > 0] / len(y)
    return -np.sum(p * np.log2(p))

def gini_impurity(y):
    if len(y) == 0:
        return 0.0
    counts = np.bincount(y)
    p = counts[counts > 0] / len(y)
    return 1.0 - np.sum(p**2)

def info_gain(parent_y, parts):
    """parts: list of child y arrays"""
    parent_H = entropy(parent_y)
    n = len(parent_y)
    weighted = sum((len(pi)/n) * entropy(pi) for pi in parts)
    return parent_H - weighted

def split_info(parts, n_total):
    """for C4.5 gain ratio"""
    result = 0.0
    for p in parts:
        w = len(p) / n_total
        if w > 0:
            result -= w * math.log2(w)
    return result

def gini_gain(parent_y, parts):
    parent_G = gini_impurity(parent_y)
    n = len(parent_y)
    weighted = sum((len(pi)/n) * gini_impurity(pi) for pi in parts)
    return parent_G - weighted

# ===============================================================
# 尋找最佳切分（支援數值/類別）
# ===============================================================
def best_split(X, y, cat_idx_set, criterion='id3'):
    n_samples, n_features = X.shape
    best = {
        "feature": None,
        "threshold": None,   # 只有數值特徵用
        "gain": -1.0,
        "is_categorical": False
    }

    for f in range(n_features):
        col = X[:, f]
        if f in cat_idx_set:
            # 類別：等值多路切
            values = np.unique(col)
            parts = [y[col == v] for v in values]
            if criterion == 'id3':
                gain = info_gain(y, parts)
            elif criterion in ('c45'):
                ig = info_gain(y, parts)
                si = split_info(parts, len(y))
                gain = (ig / si) if si > 0 else 0.0
            else:
                # 其他度量可再擴充
                gain = gini_gain(y, parts)

            if gain > best["gain"]:
                best.update({
                    "feature": f,
                    "threshold": None,
                    "gain": float(gain),
                    "is_categorical": True
                })
        else:
            # 數值：嘗試所有候選閾值（相鄰唯一值中點）
            uniq = np.unique(col)
            if len(uniq) <= 1:
                continue
            thresholds = (uniq[:-1] + uniq[1:]) / 2.0
            # 向量化效率可再優化；這裡求穩定易讀
            for thr in thresholds:
                left_mask = col <= thr
                right_mask = ~left_mask
                y_left, y_right = y[left_mask], y[right_mask]
                parts = [y_left, y_right]
                if criterion == 'id3':
                    gain = info_gain(y, parts)
                elif criterion in ('c45'):
                    ig = info_gain(y, parts)
                    si = split_info(parts, len(y))
                    gain = (ig / si) if si > 0 else 0.0
                   
                else:
                    gain = gini_gain(y, parts)

                if gain > best["gain"]:
                    best.update({
                        "feature": f,
                        "threshold": float(thr),
                        "gain": float(gain),
                        "is_categorical": False
                    })
    return best

# ===============================================================
# 建樹（遞迴）
# ===============================================================
def build_tree(X, y, feature_names, cat_idx_set,
               criterion='id3', depth=0,
               max_depth=MAX_DEPTH,
               min_samples_split=MIN_SAMPLES_SPLIT,
               min_gain=MIN_GAIN):
    node = {
        "leaf": False,
        "majority_class": int(Counter(y).most_common(1)[0][0])
    }

    # 停分條件
    if len(np.unique(y)) == 1:
        node.update({"leaf": True, "class": int(y[0])})
        return node
    if depth >= max_depth or len(y) < min_samples_split:
        node.update({"leaf": True, "class": node["majority_class"]})
        return node

    # 尋找最佳切分
    split = best_split(X, y, cat_idx_set, criterion)
    if split["feature"] is None or split["gain"] < min_gain:
        node.update({"leaf": True, "class": node["majority_class"]})
        return node

    f = split["feature"]
    node.update({
        "feature": feature_names[f],
        "feature_index": f,
        "is_categorical": split["is_categorical"],
        "threshold": split["threshold"],
        "children": {}
    })

    if split["is_categorical"]:
        col = X[:, f]
        for v in np.unique(col):
            mask = (col == v)
            child = build_tree(
                X[mask], y[mask],
                feature_names, cat_idx_set,
                criterion, depth+1, max_depth, min_samples_split, min_gain
            )
            node["children"][v] = child
    else:
        col = X[:, f]
        thr = split["threshold"]
        left_mask = col <= thr
        right_mask = ~left_mask
        node["children"]["le"] = build_tree(
            X[left_mask], y[left_mask],
            feature_names, cat_idx_set,
            criterion, depth+1, max_depth, min_samples_split, min_gain
        )
        node["children"]["gt"] = build_tree(
            X[right_mask], y[right_mask],
            feature_names, cat_idx_set,
            criterion, depth+1, max_depth, min_samples_split, min_gain
        )
    return node

# ===============================================================
# 預測
# ===============================================================
def predict_one(tree, x_row):
    if tree.get("leaf", False):
        return tree.get("class", tree["majority_class"])
    fidx = tree["feature_index"]
    if tree["is_categorical"]:
        v = x_row[fidx]
        child = tree["children"].get(v)
        if child is None:
            return tree["majority_class"]
        return predict_one(child, x_row)
    else:
        thr = tree["threshold"]
        branch = "le" if x_row[fidx] <= thr else "gt"
        child = tree["children"].get(branch)
        if child is None:
            return tree["majority_class"]
        return predict_one(child, x_row)

def predict_tree(tree, X):
    return np.array([predict_one(tree, row) for row in X])

# ===============================================================
# 圖像輸出（Graphviz 如可用，否則 Matplotlib）
# ===============================================================
def save_tree_png(clf, name, feature_names, class_names, max_depth=None):
    try:
        import graphviz
        from sklearn.tree import export_graphviz
        dot = export_graphviz(
            clf, out_file=None, feature_names=feature_names,
            class_names=class_names, filled=False, rounded=True,
            max_depth=max_depth
        )
        graphviz.Source(dot).render(name, format='png', cleanup=True)
        print(f"已輸出 {name}.png（Graphviz）")
    except Exception as e:
        from sklearn.tree import plot_tree
        plt.figure(figsize=(18, 10))
        plot_tree(clf, feature_names=feature_names, class_names=class_names,
                  filled=False, rounded=True, max_depth=max_depth)
        plt.tight_layout()
        plt.savefig(f"{name}.png", dpi=300)
        plt.close()
        print(f"已輸出 {name}.png（Matplotlib；{e}）")

# ===============================================================
# 訓練與評估：ID3
# ===============================================================
def train_and_report_id3(criterion_name):
    print(f"\n=== Building {criterion_name.upper()} Decision Tree (custom) ===")
    tree = build_tree(X_train_id3_np, y_train_id3_enc, feature_names,
                      set(cat_idx),
                      criterion=criterion_name,
                      max_depth=MAX_DEPTH,
                      min_samples_split=MIN_SAMPLES_SPLIT,
                      min_gain=MIN_GAIN)
    y_tr_pred = predict_tree(tree, X_train_id3_np)
    y_te_pred = predict_tree(tree, X_test_np)

    print("Train Accuracy:", accuracy_score(y_train_enc, y_tr_pred))
    print("Test  Accuracy:",  accuracy_score(y_test_enc,  y_te_pred))
    print(classification_report(y_test_enc, y_te_pred, target_names=CLASS_NAMES, digits=2))
    return tree, y_te_pred

tree_id3, y_test_pred_id3 = train_and_report_id3('id3')

# ===============================================================
# 訓練與評估：C4.5
# ===============================================================
def train_and_report(criterion_name):
    print(f"\n=== Building {criterion_name.upper()} Decision Tree (custom) ===")
    tree = build_tree(X_train_np, y_train_enc, feature_names,
                      set(cat_idx),
                      criterion=criterion_name,
                      max_depth=MAX_DEPTH,
                      min_samples_split=MIN_SAMPLES_SPLIT,
                      min_gain=MIN_GAIN)
    y_tr_pred = predict_tree(tree, X_train_np)
    y_te_pred = predict_tree(tree, X_test_np)

    print("Train Accuracy:", accuracy_score(y_train_enc, y_tr_pred))
    print("Test  Accuracy:",  accuracy_score(y_test_enc,  y_te_pred))
    print(classification_report(y_test_enc, y_te_pred, target_names=CLASS_NAMES, digits=2))
    return tree, y_te_pred

tree_c45, y_test_pred_c45 = train_and_report('c45')

# ===============================================================
# sklearn 基準樹：CART(gini) + Entropy（近似可視化）
# ===============================================================
clf_cart = DecisionTreeClassifier(criterion='gini', max_depth=MAX_DEPTH,
                                  max_leaf_nodes=MAX_LEAF_NODES,
                                  min_samples_split=MIN_SAMPLES_SPLIT,
                                  random_state=42)
clf_cart.fit(X_train_np, y_train_enc)
y_train_pred_cart = clf_cart.predict(X_train_np)
y_test_pred_cart  = clf_cart.predict(X_test_np)
print("\n=== CART (sklearn, gini) ===")
print("Train Accuracy:", accuracy_score(y_train_enc, y_train_pred_cart))
print("Test  Accuracy:",  accuracy_score(y_test_enc,  y_test_pred_cart))
print(classification_report(y_test_enc, y_test_pred_cart, target_names=CLASS_NAMES, digits=2))
save_tree_png(clf_cart, "Tree_CART", feature_names, CLASS_NAMES, max_depth=MAX_DEPTH)

# 只為了提供近似視覺化（與手刻樹不同步）
clf_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=MAX_DEPTH,
                                     max_leaf_nodes=MAX_LEAF_NODES,
                                     min_samples_split=MIN_SAMPLES_SPLIT,
                                     random_state=42)
clf_entropy.fit(X_train_np, y_train_enc)
save_tree_png(clf_entropy, "Tree_Entropy", feature_names, CLASS_NAMES, max_depth=MAX_DEPTH)

# ===============================================================
# 匯出整合結果
# ===============================================================
output_all = pd.DataFrame({
    "Actual": y_test_enc,
    "ID3_Pred":  y_test_pred_id3,
    "C45_Pred":  y_test_pred_c45,
    "CART_Pred": y_test_pred_cart
})
try:
    output_all.to_excel("DecisionTree_AllModels_Predictions.xlsx", index=False)
    print("\n已輸出：DecisionTree_AllModels_Predictions.xlsx")
except Exception as e:
    output_all.to_csv("DecisionTree_AllModels_Predictions.csv", index=False)
    print(f"\nExcel 匯出失敗（{e}），已改存 CSV：DecisionTree_AllModels_Predictions.csv")

# 額外：輸出混淆矩陣（看類別 1 不再全 0）
def print_cm(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    print(f"\n{title} Confusion Matrix (rows=true, cols=pred):\n{cm}")

print_cm(y_test_enc, y_test_pred_id3, "ID3")
print_cm(y_test_enc, y_test_pred_c45, "C4.5")
print_cm(y_test_enc, y_test_pred_cart, "CART")

print("\n完成。樹圖已輸出：Tree_CART.png、Tree_Entropy.png")


=== Building ID3 Decision Tree (custom) ===
Train Accuracy: 0.8944893505854873
Test  Accuracy: 0.8065249447038585
              precision    recall  f1-score   support

       <=50K       0.84      0.92      0.88     12430
        >50K       0.63      0.45      0.52      3846

    accuracy                           0.81     16276
   macro avg       0.73      0.68      0.70     16276
weighted avg       0.79      0.81      0.79     16276


=== Building C45 Decision Tree (custom) ===
Train Accuracy: 0.833082337031687
Test  Accuracy: 0.8309166871467191
              precision    recall  f1-score   support

       <=50K       0.82      0.99      0.90     12430
        >50K       0.91      0.31      0.47      3846

    accuracy                           0.83     16276
   macro avg       0.87      0.65      0.68     16276
weighted avg       0.85      0.83      0.80     16276


=== CART (sklearn, gini) ===
Train Accuracy: 0.8628023480960137
Test  Accuracy: 0.8551855492750061
              pre

In [2]:
# ===============================================================
# 統一決策樹視覺化（ID3 / C4.5 / CART）
# 若未安裝 graphviz → 自動改用 matplotlib
# 統一深度：3
# 節點數 ≤ 64
# 黑白風格 (filled=False)
# ===============================================================

import os
import matplotlib.pyplot as plt
from sklearn.tree import export_graphviz, plot_tree
from sklearn.tree import DecisionTreeClassifier

MAX_DEPTH_VIS = 3
MAX_LEAF_VIS  = 64

# 嘗試載入 graphviz，如果沒有則 fallback
try:
    import graphviz
    HAS_GRAPHVIZ = True
except ImportError:
    HAS_GRAPHVIZ = False
    print("未偵測到 Graphviz，將自動改用 Matplotlib 輸出 PNG。")

def export_tree_safely(X, y, feature_names, class_names, criterion, filename):
    """
    自動選擇 Graphviz 或 Matplotlib 方式繪製決策樹
    """
    clf = DecisionTreeClassifier(
        criterion='entropy' if criterion in ('id3', 'c45') else 'gini',
        max_depth=MAX_DEPTH_VIS,
        max_leaf_nodes=MAX_LEAF_VIS,
        min_samples_split=MIN_SAMPLES_SPLIT,
        random_state=42
    )
    clf.fit(X, y)

    if HAS_GRAPHVIZ:
        try:
            dot = export_graphviz(
                clf,
                out_file=None,
                feature_names=feature_names,
                class_names=class_names,
                filled=False,      # 黑白風格
                rounded=True,
                max_depth=MAX_DEPTH_VIS
            )
            graphviz.Source(dot).render(filename, format='png', cleanup=True)
            print(f"已使用 Graphviz 輸出 {filename}.png（深度={MAX_DEPTH_VIS}, 節點≤{MAX_LEAF_VIS}）")
            return
        except Exception as e:
            print(f"Graphviz 繪圖失敗 ({e})，改用 Matplotlib。")

    # --- Fallback: Matplotlib ---
    plt.figure(figsize=(18, 10))
    plot_tree(
        clf,
        feature_names=feature_names,
        class_names=class_names,
        filled=False,   # 黑白
        rounded=True,
        max_depth=MAX_DEPTH_VIS
    )
    plt.title(f"{filename} (depth={MAX_DEPTH_VIS}, leaf≤{MAX_LEAF_VIS})", fontsize=14)
    plt.savefig(f"{filename}.png", dpi=300, bbox_inches='tight')
    plt.close()
    print(f"已使用 Matplotlib 輸出 {filename}.png（深度={MAX_DEPTH_VIS}, 節點≤{MAX_LEAF_VIS}）")


# ===============================================================
# ID3、C4.5、CART模型輸出
# ===============================================================
export_tree_safely(X_train_id3_np, y_train_id3_enc, feature_names, CLASS_NAMES, 'id3', "Tree_ID3")
export_tree_safely(X_train_np, y_train_enc, feature_names, CLASS_NAMES, 'c45', "Tree_C45")
export_tree_safely(X_train_np, y_train_enc, feature_names, CLASS_NAMES, 'cart', "Tree_CART")


Graphviz 繪圖失敗 (Length of feature_names, 14 does not match number of features, 17)，改用 Matplotlib。
已使用 Matplotlib 輸出 Tree_ID3.png（深度=3, 節點≤64）
已使用 Graphviz 輸出 Tree_C45.png（深度=3, 節點≤64）
已使用 Graphviz 輸出 Tree_CART.png（深度=3, 節點≤64）
