# 03 · 合成注入訓練管線

## 工作流程
1. **資料生成**：合成注入 200 正類 + 200 負類
2. **特徵萃取**：BLS/TLS 指標 + 幾何統計
3. **模型訓練**：LogisticRegression / RandomForest / XGBoost + Isotonic 機率校準
4. **評估指標**：PR-AUC, Precision@K, ECE, Brier Score
5. **持久化**：儲存模型與特徵架構

---

## 1. 環境設定與依賴安裝

In [None]:
# 環境設定與依賴安裝（Colab）
import sys, subprocess, pkgutil
import warnings
warnings.filterwarnings('ignore')

def pipi(*pkgs):
    """Quietly pip-install the given packages into the running interpreter.

    Runs ``<sys.executable> -m pip install -q <pkgs...>``.
    ``subprocess.check_call`` raises ``CalledProcessError`` when pip exits
    with a non-zero status.
    """
    pip_command = [sys.executable, "-m", "pip", "install", "-q"]
    pip_command.extend(pkgs)
    subprocess.check_call(pip_command)

# 安裝必要套件（避免 numpy 2.0 相容性問題）
print("🚀 正在安裝依賴套件...")
try:
    import numpy as np
    import lightkurve as lk
    import sklearn
    import xgboost
    print("✅ 基礎套件已安裝")
except Exception:
    pipi("numpy<2", "lightkurve", "astroquery", "scikit-learn", 
         "matplotlib", "seaborn", "xgboost", "joblib", "pandas", "pyarrow")
    print("✅ 依賴套件安裝完成")

# Detect whether the notebook is running inside Google Colab.
import os

IN_COLAB = 'google.colab' in sys.modules

if IN_COLAB:
    print("📍 在 Google Colab 環境執行")
    repo_dir = '/content/exoplanet-starter'
    # Clone the repository only when it is not already checked out.
    # subprocess is used instead of the IPython-only `!git clone` so the cell
    # is plain Python and a failed clone raises instead of passing silently.
    if not os.path.exists(repo_dir):
        subprocess.check_call([
            "git", "clone",
            "https://github.com/exoplanet-spaceapps/exoplanet-starter.git",
            repo_dir,
        ])
    # Enter the repository on every run, not only right after cloning, so
    # relative paths such as "data/..." also resolve when the checkout
    # already exists.
    os.chdir(repo_dir)
    if repo_dir not in sys.path:
        sys.path.append(repo_dir)
else:
    print("💻 在本地環境執行")
    try:
        # Script execution: the project root is the parent of this file's directory.
        project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    except NameError:
        # Jupyter kernels do not define __file__. Walk up from the working
        # directory to the first folder containing the `app` package that the
        # later cells import; stay in place if none is found.
        project_root = os.getcwd()
        candidate = project_root
        while True:
            if os.path.isdir(os.path.join(candidate, "app")):
                project_root = candidate
                break
            parent = os.path.dirname(candidate)
            if parent == candidate:
                break
            candidate = parent
    os.chdir(project_root)
    if os.getcwd() not in sys.path:
        sys.path.append(os.getcwd())

print("\n環境設定完成！")

## 2. 導入套件與模組

In [None]:
# 標準函式庫
import json
import time
from pathlib import Path
from typing import Dict, List, Tuple, Optional

# 數據處理
import numpy as np
import pandas as pd

# 天文資料
import lightkurve as lk

# 機器學習
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV, calibration_curve
from sklearn.metrics import (
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    brier_score_loss,
    classification_report,
    confusion_matrix
)
import xgboost as xgb
import joblib

# 視覺化
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.gridspec import GridSpec

# 設定視覺化風格
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')

# 導入自定義模組
from app.injection import (
    inject_box_transit,
    generate_synthetic_dataset,
    save_synthetic_dataset,
    generate_transit_parameters
)

from app.bls_features import (
    run_bls,
    extract_features,
    extract_features_batch,
    compute_feature_importance,
    create_feature_schema
)

# The setup cell binds `sklearn` only when every import in its try-block
# succeeds. On a fresh environment the block fails earlier (e.g. lightkurve is
# missing), so import the package here to keep the version report from raising
# NameError.
import sklearn

print("📚 套件導入完成")
print(f"   NumPy 版本: {np.__version__}")
print(f"   Pandas 版本: {pd.__version__}")
print(f"   Scikit-learn 版本: {sklearn.__version__}")
print(f"   XGBoost 版本: {xgb.__version__}")

## 3. 資料生成：合成凌日注入

### 3.1 下載基礎光曲線

In [None]:
# Obtain the baseline light curve into which synthetic transits are injected.
print("📡 下載基礎光曲線...")

try:
    # TIC 25155310 (TOI-431) is used as the baseline target.
    target = "TIC 25155310"
    search_result = lk.search_lightcurve(target, mission="TESS", author="SPOC")
    lc = search_result[0].download()

    # Drop NaN cadences, then detrend.
    lc_clean = lc.remove_nans()
    lc_flat = lc_clean.flatten(window_length=401)

    base_time = lc_flat.time.value
    base_flux = lc_flat.flux.value

    print(f"✅ 成功下載 {target}")

except Exception as e:
    # Deliberate best-effort: any failure (offline, empty search, ...) falls
    # back to a simulated baseline so the rest of the pipeline can still run.
    print(f"⚠️ 無法下載真實光曲線: {e}")
    print("   使用模擬光曲線...")

    # Simulated 27-day light curve: unit flux plus Gaussian noise.
    # A seeded generator keeps the fallback baseline reproducible, matching
    # the fixed seed passed to the dataset generator later on.
    n_points = 20000
    rng = np.random.default_rng(42)
    base_time = np.linspace(0, 27, n_points)
    base_flux = np.ones(n_points) + rng.normal(0, 0.0001, n_points)

    print(f"✅ 生成模擬光曲線")

# Summary is identical for both the real and the simulated baseline.
print(f"   資料點數: {len(base_time)}")
print(f"   時間跨度: {base_time[-1] - base_time[0]:.1f} 天")

### 3.2 生成合成資料集

In [None]:
# 生成合成資料集
print("\n🔨 生成合成資料集...")
print("   參數範圍：")
print("   • 週期: 0.6 - 10.0 天")
print("   • 深度: 0.0005 - 0.02 (500 - 20000 ppm)")
print("   • 持續時間: 週期的 2% - 10%")

samples_df, labels_df = generate_synthetic_dataset(
    base_time=base_time,
    base_flux=base_flux,
    n_positive=200,
    n_negative=200,
    period_range=(0.6, 10.0),
    depth_range=(0.0005, 0.02),
    duration_fraction_range=(0.02, 0.1),
    noise_level=0.0001,
    seed=42
)

print(f"\n✅ 生成 {len(samples_df)} 個樣本")
print(f"   正樣本（有凌日）: {len(samples_df[samples_df['label'] == 1])}")
print(f"   負樣本（無凌日）: {len(samples_df[samples_df['label'] == 0])}")

# 儲存資料集
dataset_paths = save_synthetic_dataset(
    samples_df,
    labels_df,
    output_dir="data/synthetic",
    format="parquet"
)

print(f"\n💾 資料集已儲存至:")
for key, path in dataset_paths.items():
    print(f"   {key}: {path}")

### 3.3 參數分布視覺化

In [None]:
# 視覺化參數分布
positive_labels = labels_df[labels_df['label'] == 1]

fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# 週期分布
axes[0, 0].hist(positive_labels['period'], bins=30, edgecolor='black', alpha=0.7)
axes[0, 0].set_xlabel('週期 (天)')
axes[0, 0].set_ylabel('數量')
axes[0, 0].set_title('週期分布')
axes[0, 0].grid(True, alpha=0.3)

# 深度分布
axes[0, 1].hist(positive_labels['depth'] * 1e6, bins=30, edgecolor='black', alpha=0.7, color='orange')
axes[0, 1].set_xlabel('深度 (ppm)')
axes[0, 1].set_ylabel('數量')
axes[0, 1].set_title('凌日深度分布')
axes[0, 1].grid(True, alpha=0.3)

# 持續時間分布
axes[1, 0].hist(positive_labels['duration'] * 24, bins=30, edgecolor='black', alpha=0.7, color='green')
axes[1, 0].set_xlabel('持續時間 (小時)')
axes[1, 0].set_ylabel('數量')
axes[1, 0].set_title('凌日持續時間分布')
axes[1, 0].grid(True, alpha=0.3)

# SNR 分布
axes[1, 1].hist(positive_labels['snr_estimate'], bins=30, edgecolor='black', alpha=0.7, color='red')
axes[1, 1].set_xlabel('SNR 估計')
axes[1, 1].set_ylabel('數量')
axes[1, 1].set_title('信噪比分布')
axes[1, 1].grid(True, alpha=0.3)

plt.suptitle('合成凌日參數分布', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

# 統計摘要
print("\n📊 參數統計摘要：")
print(positive_labels[['period', 'depth', 'duration', 'snr_estimate']].describe())

## 4. 特徵萃取

### 4.1 批次提取 BLS 特徵

In [None]:
# 提取特徵
print("🔍 開始批次特徵提取...")
print("   這可能需要幾分鐘時間...\n")

start_time = time.time()

# 批次提取特徵
features_df = extract_features_batch(
    samples_df,
    compute_advanced=True,
    verbose=True
)

elapsed_time = time.time() - start_time

print(f"\n✅ 特徵提取完成")
print(f"   耗時: {elapsed_time:.1f} 秒")
print(f"   平均每個樣本: {elapsed_time/len(samples_df):.2f} 秒")
print(f"   提取特徵數: {len(features_df.columns) - 2}")  # 扣除 sample_id 和 label

# 顯示特徵列表
feature_cols = [col for col in features_df.columns if col not in ['sample_id', 'label']]
print(f"\n📋 特徵列表:")
for i, feat in enumerate(feature_cols, 1):
    print(f"   {i:2d}. {feat}")

### 4.2 特徵重要性分析

In [None]:
# 計算特徵重要性
print("🎯 計算特徵重要性...")

importance_df = compute_feature_importance(
    features_df,
    features_df['label'].values,
    method="random_forest"
)

# 視覺化特徵重要性
fig, ax = plt.subplots(figsize=(10, 6))

top_features = importance_df.head(10)
bars = ax.barh(range(len(top_features)), top_features['importance'].values)
ax.set_yticks(range(len(top_features)))
ax.set_yticklabels(top_features['feature'].values)
ax.set_xlabel('重要性分數')
ax.set_title('特徵重要性排名 (Top 10)', fontsize=14, fontweight='bold')
ax.grid(True, alpha=0.3, axis='x')

# 添加數值標籤
for i, (bar, val) in enumerate(zip(bars, top_features['importance'].values)):
    ax.text(val, bar.get_y() + bar.get_height()/2, f'{val:.3f}', 
            ha='left', va='center', fontsize=9)

plt.tight_layout()
plt.show()

print("\n🏆 Top 5 最重要特徵:")
# Number the rows by position. The DataFrame index label is not guaranteed to
# be 0..n-1 in rank order (e.g. if the frame was sorted without resetting its
# index), so it must not be used as the rank.
for rank, (_, row) in enumerate(importance_df.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['feature']}: {row['importance']:.4f}")

### 4.3 建立特徵架構

In [None]:
# 建立並儲存特徵架構
feature_schema = create_feature_schema(
    feature_cols,
    output_path="data/feature_schema.json"
)

print("📝 特徵架構已建立")
print(f"   特徵數量: {feature_schema['n_features']}")
print(f"   版本: {feature_schema['version']}")
print(f"   儲存位置: data/feature_schema.json")

## 5. 模型訓練與校準

### 5.1 資料準備

In [None]:
# 準備訓練資料
X = features_df[feature_cols].values
y = features_df['label'].values

# 處理無效值
X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)

# 分割訓練集和測試集
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# 標準化特徵
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("📊 資料集統計:")
print(f"   訓練集: {len(X_train)} 樣本")
print(f"   測試集: {len(X_test)} 樣本")
print(f"   正樣本比例 (訓練): {y_train.mean():.2%}")
print(f"   正樣本比例 (測試): {y_test.mean():.2%}")

### 5.2 訓練多個模型

In [None]:
# 訓練多個模型
models = {}
print("🚀 開始訓練模型...\n")

# 1. Logistic Regression
print("1️⃣ 訓練 Logistic Regression...")
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)
models['LogisticRegression'] = lr_model
print(f"   訓練分數: {lr_model.score(X_train_scaled, y_train):.3f}")
print(f"   測試分數: {lr_model.score(X_test_scaled, y_test):.3f}")

# 2. Random Forest
print("\n2️⃣ 訓練 Random Forest...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)  # Random Forest 不需要標準化
models['RandomForest'] = rf_model
print(f"   訓練分數: {rf_model.score(X_train, y_train):.3f}")
print(f"   測試分數: {rf_model.score(X_test, y_test):.3f}")

# 3. XGBoost
print("\n3️⃣ 訓練 XGBoost...")
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_model.fit(X_train, y_train)
models['XGBoost'] = xgb_model
print(f"   訓練分數: {xgb_model.score(X_train, y_train):.3f}")
print(f"   測試分數: {xgb_model.score(X_test, y_test):.3f}")

### 5.3 機率校準

In [None]:
# 選擇最佳模型進行校準
print("\n🎯 進行機率校準...")

# 選擇 XGBoost 作為基礎模型
base_model = models['XGBoost']

# Isotonic 校準
print("   使用 Isotonic Regression 校準...")
calibrated_model = CalibratedClassifierCV(
    base_model,
    method='isotonic',
    cv=3
)
calibrated_model.fit(X_train, y_train)

# 獲取預測機率
prob_uncalibrated = base_model.predict_proba(X_test)[:, 1]
prob_calibrated = calibrated_model.predict_proba(X_test)[:, 1]

print("✅ 校準完成")

## 6. 模型評估

### 6.1 計算評估指標

In [None]:
def calculate_metrics(y_true, y_prob, model_name="Model", n_bins=10, k_values=(10, 20, 50)):
    """Compute ranking and calibration metrics for binary probability predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth binary labels (0 or 1), one per sample.
    y_prob : array-like
        Predicted probability of the positive class, one per sample.
    model_name : str
        Label stored under the ``'Model'`` key of the result.
    n_bins : int
        Number of equal-width probability bins used for the Expected
        Calibration Error (ECE).
    k_values : iterable of int
        Cut-offs for Precision@K. Any K larger than the number of samples
        is skipped, so its key is absent from the result.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'PR-AUC'``, ``'ROC-AUC'``, ``'Brier Score'``,
        ``'ECE'`` plus one ``'P@K'`` entry per usable K.
    """
    # Coerce once so boolean-mask and positional indexing below behave the
    # same for lists, pandas Series and ndarrays.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    pr_auc = average_precision_score(y_true, y_prob)
    roc_auc = roc_auc_score(y_true, y_prob)
    brier = brier_score_loss(y_true, y_prob)

    # ECE: digitising against the interior edges only yields bin indices in
    # [0, n_bins - 1], so a probability of exactly 1.0 lands in the last bin
    # instead of falling outside every bin and being dropped from the sum.
    interior_edges = np.linspace(0, 1, n_bins + 1)[1:-1]
    bin_indices = np.digitize(y_prob, interior_edges)

    ece = 0.0
    n_samples = len(y_true)
    for i in range(n_bins):
        mask = bin_indices == i
        n_in_bin = np.sum(mask)
        if n_in_bin > 0:
            bin_acc = np.mean(y_true[mask])
            bin_conf = np.mean(y_prob[mask])
            ece += (n_in_bin / n_samples) * np.abs(bin_acc - bin_conf)

    # Precision@K: fraction of true positives among the K highest-scored samples.
    precision_at_k = {}
    sorted_indices = np.argsort(y_prob)[::-1]
    for k in k_values:
        if k <= n_samples:
            top_k_true = y_true[sorted_indices[:k]]
            precision_at_k[f'P@{k}'] = np.mean(top_k_true)

    return {
        'Model': model_name,
        'PR-AUC': pr_auc,
        'ROC-AUC': roc_auc,
        'Brier Score': brier,
        'ECE': ece,
        **precision_at_k
    }

# 計算所有指標
metrics_uncalibrated = calculate_metrics(y_test, prob_uncalibrated, "XGBoost (未校準)")
metrics_calibrated = calculate_metrics(y_test, prob_calibrated, "XGBoost (已校準)")

# 顯示結果
metrics_df = pd.DataFrame([metrics_uncalibrated, metrics_calibrated])
print("\n📊 模型評估指標:")
print(metrics_df.to_string(index=False))

# 改善比較
print("\n📈 校準改善:")
print(f"   ECE 改善: {(metrics_uncalibrated['ECE'] - metrics_calibrated['ECE'])/metrics_uncalibrated['ECE']*100:.1f}%")
print(f"   Brier Score 改善: {(metrics_uncalibrated['Brier Score'] - metrics_calibrated['Brier Score'])/metrics_uncalibrated['Brier Score']*100:.1f}%")

### 6.2 可靠度曲線視覺化

In [None]:
# 繪製可靠度曲線
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# 未校準模型
fraction_pos_uncal, mean_pred_uncal = calibration_curve(
    y_test, prob_uncalibrated, n_bins=10
)

axes[0].plot(mean_pred_uncal, fraction_pos_uncal, 'o-', label='未校準', color='red')
axes[0].plot([0, 1], [0, 1], 'k--', label='完美校準')
axes[0].set_xlabel('平均預測機率')
axes[0].set_ylabel('實際正樣本比例')
axes[0].set_title('未校準模型可靠度曲線', fontsize=12, fontweight='bold')
axes[0].legend(loc='best')
axes[0].grid(True, alpha=0.3)

# 已校準模型
fraction_pos_cal, mean_pred_cal = calibration_curve(
    y_test, prob_calibrated, n_bins=10
)

axes[1].plot(mean_pred_cal, fraction_pos_cal, 'o-', label='已校準', color='green')
axes[1].plot([0, 1], [0, 1], 'k--', label='完美校準')
axes[1].set_xlabel('平均預測機率')
axes[1].set_ylabel('實際正樣本比例')
axes[1].set_title('已校準模型可靠度曲線', fontsize=12, fontweight='bold')
axes[1].legend(loc='best')
axes[1].grid(True, alpha=0.3)

plt.suptitle('機率校準效果比較', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.show()

print("\n💡 說明:")
print("   • 理想的可靠度曲線應該接近對角線")
print("   • 曲線在對角線上方表示模型過度保守")
print("   • 曲線在對角線下方表示模型過度自信")
print("   • Isotonic 校準有效改善了模型的機率預測")

### 6.3 PR 曲線與 Precision@K

In [None]:
# 繪製 PR 曲線
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# PR 曲線
precision_uncal, recall_uncal, _ = precision_recall_curve(y_test, prob_uncalibrated)
precision_cal, recall_cal, _ = precision_recall_curve(y_test, prob_calibrated)

axes[0].plot(recall_uncal, precision_uncal, label=f'未校準 (AP={metrics_uncalibrated["PR-AUC"]:.3f})', color='red')
axes[0].plot(recall_cal, precision_cal, label=f'已校準 (AP={metrics_calibrated["PR-AUC"]:.3f})', color='green')
axes[0].set_xlabel('Recall')
axes[0].set_ylabel('Precision')
axes[0].set_title('Precision-Recall 曲線', fontsize=12, fontweight='bold')
axes[0].legend(loc='best')
axes[0].grid(True, alpha=0.3)

# Precision@K 柱狀圖
k_values = [10, 20, 50]
precision_at_k_uncal = []
precision_at_k_cal = []

for k in k_values:
    if f'P@{k}' in metrics_uncalibrated:
        precision_at_k_uncal.append(metrics_uncalibrated[f'P@{k}'])
        precision_at_k_cal.append(metrics_calibrated[f'P@{k}'])

x = np.arange(len(k_values))
width = 0.35

bars1 = axes[1].bar(x - width/2, precision_at_k_uncal, width, label='未校準', color='red', alpha=0.7)
bars2 = axes[1].bar(x + width/2, precision_at_k_cal, width, label='已校準', color='green', alpha=0.7)

axes[1].set_xlabel('K')
axes[1].set_ylabel('Precision@K')
axes[1].set_title('Precision@K 比較', fontsize=12, fontweight='bold')
axes[1].set_xticks(x)
axes[1].set_xticklabels([f'Top {k}' for k in k_values])
axes[1].legend()
axes[1].grid(True, alpha=0.3, axis='y')

# 添加數值標籤
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        axes[1].text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.2f}',
                    ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.show()

## 7. 模型持久化

In [None]:
# 建立輸出目錄
output_dir = Path("model")
output_dir.mkdir(parents=True, exist_ok=True)

# 儲存模型
print("💾 儲存模型與相關檔案...\n")

# 1. 儲存校準模型
model_path = output_dir / "ranker.joblib"
joblib.dump(calibrated_model, model_path)
print(f"✅ 模型已儲存: {model_path}")

# 2. 儲存特徵標準化器
scaler_path = output_dir / "scaler.joblib"
joblib.dump(scaler, scaler_path)
print(f"✅ 標準化器已儲存: {scaler_path}")

# 3. 儲存特徵架構
import shutil
shutil.copy("data/feature_schema.json", output_dir / "feature_schema.json")
print(f"✅ 特徵架構已複製: {output_dir / 'feature_schema.json'}")

# 4. 儲存模型元資料
metadata = {
    "model_type": "XGBoost with Isotonic Calibration",
    "training_date": time.strftime("%Y-%m-%d %H:%M:%S"),
    "n_features": len(feature_cols),
    "feature_names": feature_cols,
    "training_samples": len(X_train),
    "test_samples": len(X_test),
    "metrics": metrics_calibrated,
    "parameters": {
        "period_range": [0.6, 10.0],
        "depth_range": [0.0005, 0.02],
        "duration_fraction_range": [0.02, 0.1]
    }
}

metadata_path = output_dir / "model_metadata.json"
with open(metadata_path, 'w') as f:
    json.dump(metadata, f, indent=2, default=str)
print(f"✅ 元資料已儲存: {metadata_path}")

print("\n📦 所有檔案已成功儲存至 'model/' 目錄")

## 8. 總結報告

In [None]:
print("="*60)
print("📊 訓練管線執行總結")
print("="*60)

print(f"""
🎯 資料集:
   • 總樣本數: {len(samples_df)}
   • 正樣本: {len(samples_df[samples_df['label'] == 1])}
   • 負樣本: {len(samples_df[samples_df['label'] == 0])}
   
🔍 特徵工程:
   • 特徵數量: {len(feature_cols)}
   • Top 3 重要特徵:
""")

for idx, row in importance_df.head(3).iterrows():
    print(f"     - {row['feature']}: {row['importance']:.4f}")

print(f"""
🤖 模型效能:
   • PR-AUC: {metrics_calibrated['PR-AUC']:.3f}
   • ROC-AUC: {metrics_calibrated['ROC-AUC']:.3f}
   • Brier Score: {metrics_calibrated['Brier Score']:.3f}
   • ECE: {metrics_calibrated['ECE']:.3f}
   • Precision@10: {metrics_calibrated.get('P@10', 'N/A')}
   
💡 關鍵發現:
   1. Isotonic 校準顯著改善了機率預測的可靠性
   2. BLS 特徵（週期、SNR、深度）是最重要的預測因子
   3. 模型在高置信度預測上表現優異（高 Precision@K）
   
📦 輸出檔案:
   • 模型: model/ranker.joblib
   • 標準化器: model/scaler.joblib
   • 特徵架構: model/feature_schema.json
   • 元資料: model/model_metadata.json
   • 合成資料: data/synthetic/
""")

print("="*60)
print("✅ 訓練管線完成！")
print("="*60)

---

## 9. 監督式學習管線（真實 TOI + Kepler EB 資料）

> 💡 **新增功能**：使用真實的 TOI（正類）和 Kepler EB（負類）資料進行監督式訓練，與合成注入方法比較。

### 9.1 載入真實資料集

In [None]:
# Load real datasets
print("📂 載入真實資料集...")

# 載入 TOI 資料（正樣本）
toi_path = Path("data/toi.csv")
if toi_path.exists():
    toi_df = pd.read_csv(toi_path)
    # 篩選已確認的行星候選（PC/CP/KP）
    toi_positive = toi_df[toi_df['tfopwg_disp'].isin(['PC', 'CP', 'KP'])].copy()
    toi_positive['label'] = 1
    toi_positive['source'] = 'TOI'
    print(f"✅ TOI 正樣本: {len(toi_positive)} 個")
else:
    print("⚠️ 找不到 TOI 資料，請先執行 01_tap_download.ipynb")
    toi_positive = pd.DataFrame()

# 載入 Kepler EB 資料（負樣本）
eb_path = Path("data/kepler_eb.csv")
if eb_path.exists():
    eb_df = pd.read_csv(eb_path)
    # 使用 EB 作為負樣本
    eb_negative = eb_df.copy()
    eb_negative['label'] = 0
    eb_negative['source'] = 'Kepler_EB'
    print(f"✅ Kepler EB 負樣本: {len(eb_negative)} 個")
else:
    print("⚠️ 找不到 Kepler EB 資料，請先執行 01_tap_download.ipynb")
    eb_negative = pd.DataFrame()

# 載入 TOI 假陽性（額外負樣本）
if 'toi_df' in locals() and not toi_df.empty:
    toi_fp = toi_df[toi_df['tfopwg_disp'] == 'FP'].copy()
    toi_fp['label'] = 0
    toi_fp['source'] = 'TOI_FP'
    print(f"✅ TOI 假陽性負樣本: {len(toi_fp)} 個")
else:
    toi_fp = pd.DataFrame()

print(f"\n📊 總計:")
print(f"   正樣本: {len(toi_positive)}")
print(f"   負樣本: {len(eb_negative) + len(toi_fp)}")

### 9.2 下載並處理真實光曲線

In [None]:
def download_and_process_lightcurve(target_id, mission="TESS"):
    """Download one target's light curve and return it cleaned and flattened.

    Parameters
    ----------
    target_id : str
        Target identifier handed to the lightkurve search (TIC or KIC).
    mission : str
        Mission name. "TESS" searches with author "SPOC"; any other value
        searches with author "Kepler".

    Returns
    -------
    tuple
        ``(time, flux)`` value arrays of the flattened light curve, or
        ``(None, None)`` when the search is empty, fewer than 100 points
        remain after NaN removal, or any step raises.
    """
    author = "SPOC" if mission == "TESS" else "Kepler"
    try:
        search_result = lk.search_lightcurve(target_id, mission=mission, author=author)
        if len(search_result) == 0:
            return None, None

        # Only the first search hit is downloaded.
        lc = search_result[0].download()

        # Drop NaN cadences and reject light curves with too few points.
        lc_clean = lc.remove_nans()
        if len(lc_clean) < 100:
            return None, None

        lc_flat = lc_clean.flatten(window_length=401)
        return lc_flat.time.value, lc_flat.flux.value

    except Exception as e:
        # Best-effort: one bad target must not abort the whole batch, but the
        # reason is reported so failures can be diagnosed instead of vanishing.
        print(f"   ⚠️ {target_id}: {type(e).__name__}: {e}")
        return None, None

# 示範：處理部分真實樣本
print("🔬 處理真實光曲線樣本...")
print("   （為節省時間，僅處理前 50 個樣本）")

supervised_samples = []
supervised_labels = []

# 處理 TOI 正樣本（最多 25 個）
n_toi_samples = min(25, len(toi_positive)) if 'toi_positive' in locals() else 0
if n_toi_samples > 0:
    print(f"\n處理 {n_toi_samples} 個 TOI 正樣本...")
    for idx, row in toi_positive.head(n_toi_samples).iterrows():
        tic_id = f"TIC {int(row['tid'])}"
        time_data, flux_data = download_and_process_lightcurve(tic_id, "TESS")

        if time_data is not None:
            supervised_samples.append({
                'sample_id': f"toi_{row['tid']}",
                'time': time_data,
                'flux': flux_data,
                'label': 1,
                'period': row.get('pl_orbper', np.nan),
                'source': 'TOI'
            })
            print(f"   ✓ {tic_id}")
        else:
            print(f"   ✗ {tic_id} (無法下載)")

# 處理 Kepler EB 負樣本（最多 25 個）
n_eb_samples = min(25, len(eb_negative)) if 'eb_negative' in locals() else 0
if n_eb_samples > 0:
    print(f"\n處理 {n_eb_samples} 個 Kepler EB 負樣本...")
    for idx, row in eb_negative.head(n_eb_samples).iterrows():
        kic_id = f"KIC {int(row['KIC'])}"
        time_data, flux_data = download_and_process_lightcurve(kic_id, "Kepler")

        if time_data is not None:
            supervised_samples.append({
                'sample_id': f"eb_{row['KIC']}",
                'time': time_data,
                'flux': flux_data,
                'label': 0,
                'period': row.get('period', np.nan),
                'source': 'Kepler_EB'
            })
            print(f"   ✓ {kic_id}")
        else:
            print(f"   ✗ {kic_id} (無法下載)")

supervised_samples_df = pd.DataFrame(supervised_samples) if supervised_samples else pd.DataFrame()
print(f"\n✅ 成功處理 {len(supervised_samples_df)} 個真實樣本")
if len(supervised_samples_df) > 0:
    print(f"   正樣本: {len(supervised_samples_df[supervised_samples_df['label']==1])}")
    print(f"   負樣本: {len(supervised_samples_df[supervised_samples_df['label']==0])}")

### 9.3 提取真實資料特徵

In [None]:
# 從真實光曲線提取特徵
if 'supervised_samples_df' in locals() and len(supervised_samples_df) > 0:
    print("🔍 提取真實資料特徵...")

    supervised_features = []

    for idx, row in supervised_samples_df.iterrows():
        # 執行 BLS
        bls_result = run_bls(row['time'], row['flux'])

        # 提取特徵
        features = extract_features(row['time'], row['flux'], bls_result, compute_advanced=True)
        features['sample_id'] = row['sample_id']
        features['label'] = row['label']
        features['source'] = row['source']
        features['true_period'] = row['period']

        supervised_features.append(features)

        if (idx + 1) % 10 == 0:
            print(f"   處理進度: {idx+1}/{len(supervised_samples_df)}")

    supervised_features_df = pd.DataFrame(supervised_features)
    print(f"\n✅ 特徵提取完成: {len(supervised_features_df)} 個樣本")

    # 顯示特徵統計
    print("\n📊 真實資料特徵統計:")
    feature_cols_real = [col for col in supervised_features_df.columns
                         if col not in ['sample_id', 'label', 'source', 'true_period']]
    print(supervised_features_df[feature_cols_real].describe())
else:
    print("⚠️ 無真實樣本可用於監督式學習")
    supervised_features_df = pd.DataFrame()

### 9.4 訓練監督式模型

In [None]:
# A stratified 70/30 split followed by 3-fold calibration needs several
# samples of *each* class. A total row count alone does not guarantee that
# (e.g. when every download for one class failed). 5 is a conservative floor.
MIN_SAMPLES_PER_CLASS = 5

can_train_supervised = False
if 'supervised_features_df' in locals() and len(supervised_features_df) > 10:
    class_counts = supervised_features_df['label'].value_counts()
    can_train_supervised = bool(
        len(class_counts) == 2 and class_counts.min() >= MIN_SAMPLES_PER_CLASS
    )

if can_train_supervised:
    print("🚀 訓練監督式模型...\n")

    # Build the design matrix with the same feature columns as the synthetic pipeline.
    X_supervised = supervised_features_df[feature_cols].values
    y_supervised = supervised_features_df['label'].values

    # Replace NaN / infinite feature values with 0.
    X_supervised = np.nan_to_num(X_supervised, nan=0.0, posinf=0.0, neginf=0.0)

    # Stratified train/test split.
    X_train_sup, X_test_sup, y_train_sup, y_test_sup = train_test_split(
        X_supervised, y_supervised, test_size=0.3, random_state=42, stratify=y_supervised
    )

    # Fit a scaler on the training split. Note that the XGBoost model below is
    # trained on the *unscaled* features; the scaler is kept for persistence.
    scaler_supervised = StandardScaler()
    X_train_sup_scaled = scaler_supervised.fit_transform(X_train_sup)
    X_test_sup_scaled = scaler_supervised.transform(X_test_sup)

    print(f"📊 監督式資料集:")
    print(f"   訓練集: {len(X_train_sup)} 樣本")
    print(f"   測試集: {len(X_test_sup)} 樣本")
    print(f"   正樣本比例: {y_supervised.mean():.2%}\n")

    # Train XGBoost on the real (supervised) data.
    xgb_supervised = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb_supervised.fit(X_train_sup, y_train_sup)

    print("✅ XGBoost（監督式）訓練完成")
    print(f"   訓練分數: {xgb_supervised.score(X_train_sup, y_train_sup):.3f}")
    print(f"   測試分數: {xgb_supervised.score(X_test_sup, y_test_sup):.3f}")

    # Probability calibration (isotonic, 3-fold).
    print("\n🎯 進行機率校準...")
    calibrated_supervised = CalibratedClassifierCV(
        xgb_supervised,
        method='isotonic',
        cv=3
    )
    calibrated_supervised.fit(X_train_sup, y_train_sup)

    # Calibrated positive-class probabilities on the held-out split.
    prob_supervised = calibrated_supervised.predict_proba(X_test_sup)[:, 1]

    # Evaluate with the same metric suite as the synthetic model.
    metrics_supervised = calculate_metrics(y_test_sup, prob_supervised, "XGBoost (監督式)")

    print("\n📊 監督式模型評估指標:")
    print(pd.DataFrame([metrics_supervised]).to_string(index=False))
else:
    print("⚠️ 樣本數不足，跳過監督式訓練")
    metrics_supervised = None

### 9.5 方法比較：合成注入 vs 監督式學習

In [None]:
# 比較兩種方法的效能
print("🔬 方法比較：合成注入 vs 監督式學習")
print("="*60)

if 'metrics_supervised' in locals() and metrics_supervised is not None:
    # 建立比較表
    comparison_df = pd.DataFrame([
        metrics_calibrated,  # 合成注入方法
        metrics_supervised   # 監督式方法
    ])
    comparison_df['Model'] = ['合成注入', '監督式']

    print("\n📊 效能指標對比:")
    print(comparison_df.to_string(index=False))

    # 視覺化比較
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    # 指標列表
    metrics_to_compare = ['PR-AUC', 'ROC-AUC', 'Brier Score', 'ECE', 'P@10', 'P@20']

    for idx, metric in enumerate(metrics_to_compare):
        row = idx // 3
        col = idx % 3
        ax = axes[row, col]

        if metric in comparison_df.columns:
            values = [
                metrics_calibrated.get(metric, 0),
                metrics_supervised.get(metric, 0)
            ]
            colors = ['blue', 'orange']
            bars = ax.bar(['合成注入', '監督式'], values, color=colors, alpha=0.7)

            # 添加數值標籤
            for bar, val in zip(bars, values):
                if val is not None and not pd.isna(val):
                    ax.text(bar.get_x() + bar.get_width()/2., val,
                           f'{val:.3f}',
                           ha='center', va='bottom', fontsize=10)

            ax.set_title(metric, fontsize=12, fontweight='bold')
            ax.set_ylabel('分數')
            ax.grid(True, alpha=0.3, axis='y')

            # 根據指標類型設置 y 軸範圍
            if metric in ['PR-AUC', 'ROC-AUC', 'P@10', 'P@20']:
                ax.set_ylim([0, 1.1])
            elif metric == 'ECE':
                ax.set_ylim([0, 0.2])

    plt.suptitle('合成注入 vs 監督式學習 效能比較', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # 優劣勢分析
    print("\n💡 分析總結:")
    print("="*60)

    # 計算相對改善
    pr_auc_diff = (metrics_supervised['PR-AUC'] - metrics_calibrated['PR-AUC']) / metrics_calibrated['PR-AUC'] * 100
    ece_diff = (metrics_calibrated['ECE'] - metrics_supervised['ECE']) / metrics_calibrated['ECE'] * 100

    print("📈 **合成注入方法**的優勢:")
    print("   • 不需要大量標註資料")
    print("   • 可以控制訓練樣本的參數分布")
    print("   • 適合快速原型開發和測試")
    print(f"   • 在本實驗中 ECE: {metrics_calibrated['ECE']:.3f}")

    print("\n📊 **監督式學習**的優勢:")
    print("   • 使用真實天文資料，更接近實際應用")
    print("   • 能學習到真實資料中的複雜模式")
    print("   • 對真實噪音和系統誤差有更好的魯棒性")
    print(f"   • 在本實驗中 PR-AUC: {metrics_supervised['PR-AUC']:.3f}")

    if pr_auc_diff > 0:
        print(f"\n🏆 監督式方法在 PR-AUC 上提升了 {pr_auc_diff:.1f}%")
    else:
        print(f"\n🏆 合成注入方法在 PR-AUC 上領先 {-pr_auc_diff:.1f}%")

else:
    print("\n⚠️ 無法進行比較（監督式模型未訓練）")
    print("   原因：真實資料樣本不足或無法下載")
    print("   建議：")
    print("   1. 確保已執行 01_tap_download.ipynb")
    print("   2. 檢查網路連線")
    print("   3. 增加處理的樣本數量")
    comparison_df = None

# Blank line followed by one 60-character rule. The previous `"\n=" * 60`
# repeated the newline as well and printed 60 separate "=" lines.
print("\n" + "="*60)

### 9.6 儲存監督式模型

In [None]:
# Persist the supervised model, its scaler and metadata (only if it was trained).
if 'metrics_supervised' in locals() and metrics_supervised is not None:
    print("💾 儲存監督式模型...")

    # Create the output directory.
    supervised_dir = Path("model/supervised")
    supervised_dir.mkdir(parents=True, exist_ok=True)

    # Save the calibrated model and the fitted scaler.
    joblib.dump(calibrated_supervised, supervised_dir / "ranker_supervised.joblib")
    joblib.dump(scaler_supervised, supervised_dir / "scaler_supervised.joblib")

    # Save metadata. Only Kepler EB light curves are downloaded as negatives
    # in this notebook (TOI false positives are counted but never processed),
    # so the recorded negative source names Kepler EB alone.
    supervised_metadata = {
        "model_type": "XGBoost with Isotonic Calibration (Supervised)",
        "training_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "data_sources": {
            "positive": "TOI (PC/CP/KP)",
            "negative": "Kepler EB"
        },
        "n_features": len(feature_cols),
        "feature_names": feature_cols,
        "training_samples": len(X_train_sup),
        "test_samples": len(X_test_sup),
        "metrics": metrics_supervised
    }

    with open(supervised_dir / "model_metadata.json", 'w') as f:
        json.dump(supervised_metadata, f, indent=2, default=str)

    print(f"✅ 監督式模型已儲存至: {supervised_dir}")

    # Save the method-comparison table when the comparison cell produced one.
    if 'comparison_df' in locals() and comparison_df is not None:
        comparison_df.to_csv("model/method_comparison.csv", index=False)
        print("✅ 方法比較結果已儲存至: model/method_comparison.csv")
else:
    print("⚠️ 無監督式模型可儲存")

## 10. 完整總結報告

In [None]:
print("="*70)
print("📊 完整訓練管線執行總結")
print("="*70)

print(f"""
🎯 資料集統計:

   【合成注入資料】
   • 總樣本數: {len(samples_df)}
   • 正樣本: {len(samples_df[samples_df['label'] == 1])}
   • 負樣本: {len(samples_df[samples_df['label'] == 0])}

   【真實監督資料】
   • 總樣本數: {len(supervised_features_df) if 'supervised_features_df' in locals() else 0}
   • TOI 正樣本: {len(supervised_features_df[supervised_features_df['source']=='TOI']) if 'supervised_features_df' in locals() and len(supervised_features_df) > 0 else 0}
   • Kepler EB 負樣本: {len(supervised_features_df[supervised_features_df['source']=='Kepler_EB']) if 'supervised_features_df' in locals() and len(supervised_features_df) > 0 else 0}

🔍 特徵工程:
   • 特徵數量: {len(feature_cols)}
   • Top 3 重要特徵:
""")

for idx, row in importance_df.head(3).iterrows():
    print(f"     - {row['feature']}: {row['importance']:.4f}")

print(f"""
🤖 模型效能比較:

   【合成注入方法】
   • PR-AUC: {metrics_calibrated['PR-AUC']:.3f}
   • ROC-AUC: {metrics_calibrated['ROC-AUC']:.3f}
   • Brier Score: {metrics_calibrated['Brier Score']:.3f}
   • ECE: {metrics_calibrated['ECE']:.3f}
""")

if 'metrics_supervised' in locals() and metrics_supervised is not None:
    print(f"""
   【監督式學習】
   • PR-AUC: {metrics_supervised['PR-AUC']:.3f}
   • ROC-AUC: {metrics_supervised['ROC-AUC']:.3f}
   • Brier Score: {metrics_supervised['Brier Score']:.3f}
   • ECE: {metrics_supervised['ECE']:.3f}
    """)

print(f"""
💡 關鍵發現與建議:
   1. Isotonic 校準顯著改善了機率預測的可靠性
   2. BLS 特徵（週期、SNR、深度）是最重要的預測因子
   3. 合成注入適合快速開發，監督式學習更接近實際應用
   4. 建議在實際部署時結合兩種方法的優勢

📦 輸出檔案:
   • 合成模型: model/ranker.joblib
   • 監督模型: model/supervised/ranker_supervised.joblib
   • 特徵架構: model/feature_schema.json
   • 比較結果: model/method_comparison.csv
   • 資料集: data/synthetic/ 和 data/

🚀 下一步:
   1. 使用 04_newdata_inference.ipynb 對新資料進行推論
   2. 在更大的真實資料集上訓練監督式模型
   3. 探索深度學習方法（CNN/Transformer）
   4. 部署為 Web 應用或 API 服務
""")

print("="*70)
print("✅ 訓練管線（含監督式分支）完成！")
print("="*70)

## 9. 監督式學習管線（真實 TOI + Kepler EB 資料）

> 💡 **新增功能**：使用真實的 TOI（正類）和 Kepler EB（負類）資料進行監督式訓練，與合成注入方法比較。

### 9.1 載入真實資料集

In [None]:
# Load real datasets
print("📂 載入真實資料集...")

# 載入 TOI 資料（正樣本）
toi_path = Path("data/toi.csv")
if toi_path.exists():
    toi_df = pd.read_csv(toi_path)
    # 篩選已確認的行星候選（PC/CP/KP）
    toi_positive = toi_df[toi_df['tfopwg_disp'].isin(['PC', 'CP', 'KP'])].copy()
    toi_positive['label'] = 1
    toi_positive['source'] = 'TOI'
    print(f"✅ TOI 正樣本: {len(toi_positive)} 個")
else:
    print("⚠️ 找不到 TOI 資料，請先執行 01_tap_download.ipynb")
    toi_positive = pd.DataFrame()

# 載入 Kepler EB 資料（負樣本）
eb_path = Path("data/kepler_eb.csv")
if eb_path.exists():
    eb_df = pd.read_csv(eb_path)
    # 使用 EB 作為負樣本
    eb_negative = eb_df.copy()
    eb_negative['label'] = 0
    eb_negative['source'] = 'Kepler_EB'
    print(f"✅ Kepler EB 負樣本: {len(eb_negative)} 個")
else:
    print("⚠️ 找不到 Kepler EB 資料，請先執行 01_tap_download.ipynb")
    eb_negative = pd.DataFrame()

# 載入 TOI 假陽性（額外負樣本）
if 'toi_df' in locals() and not toi_df.empty:
    toi_fp = toi_df[toi_df['tfopwg_disp'] == 'FP'].copy()
    toi_fp['label'] = 0
    toi_fp['source'] = 'TOI_FP'
    print(f"✅ TOI 假陽性負樣本: {len(toi_fp)} 個")
else:
    toi_fp = pd.DataFrame()

print(f"\n📊 總計:")
print(f"   正樣本: {len(toi_positive)}")
print(f"   負樣本: {len(eb_negative) + len(toi_fp)}")

### 9.2 下載並處理真實光曲線

In [None]:
def download_and_process_lightcurve(target_id, mission="TESS", window_length=401, min_points=100):
    """
    Download one target's light curve and return it cleaned and flattened.

    The first search hit is downloaded, NaN cadences are removed and the
    result is detrended with ``flatten(window_length=window_length)``.
    The function is best-effort: any failure yields ``(None, None)`` so a
    batch loop can skip the target; the failure reason is printed rather
    than silently discarded.

    Parameters:
    -----------
    target_id : str
        Target identifier handed to the search (TIC or KIC).
    mission : str
        Mission name. "TESS" searches author "SPOC"; any other value
        searches author "Kepler".
    window_length : int
        Window length forwarded to ``flatten``. Defaults to 401.
    min_points : int
        Minimum number of points required after NaN removal; shorter light
        curves are rejected. Defaults to 100.

    Returns:
    --------
    tuple : (time, flux) on success, or (None, None) when nothing was found,
        too few points remain, or any step raised.
    """
    author = "SPOC" if mission == "TESS" else "Kepler"
    try:
        # Search for light curves of this target
        search_result = lk.search_lightcurve(target_id, mission=mission, author=author)
        if len(search_result) == 0:
            return None, None

        # Only the first search hit is downloaded
        lc = search_result[0].download()

        # Drop NaN cadences, then reject curves that are too short
        lc_clean = lc.remove_nans()
        if len(lc_clean) < min_points:
            return None, None

        lc_flat = lc_clean.flatten(window_length=window_length)

        return lc_flat.time.value, lc_flat.flux.value

    except Exception as exc:
        # Deliberately broad: one bad target must not abort a batch run.
        # Report the reason so failures stay diagnosable.
        print(f"   ⚠️ {target_id}: {type(exc).__name__}: {exc}")
        return None, None

# Demo run: only a subset of the real catalogs is downloaded and processed.
print("🔬 處理真實光曲線樣本...")
print("   （為節省時間，僅處理前 50 個樣本）")

supervised_samples = []
supervised_labels = []

# TOI positives (at most 25); skipped entirely when the catalog was never loaded.
n_toi_samples = min(25, len(toi_positive)) if 'toi_positive' in locals() else 0
if n_toi_samples > 0:
    print(f"\n處理 {n_toi_samples} 個 TOI 正樣本...")
    for _, toi_row in toi_positive.head(n_toi_samples).iterrows():
        tic_id = f"TIC {int(toi_row['tid'])}"
        time_data, flux_data = download_and_process_lightcurve(tic_id, "TESS")

        # Guard clause: report the failure and move on to the next target.
        if time_data is None:
            print(f"   ✗ {tic_id} (無法下載)")
            continue

        supervised_samples.append(dict(
            sample_id=f"toi_{toi_row['tid']}",
            time=time_data,
            flux=flux_data,
            label=1,
            period=toi_row.get('pl_orbper', np.nan),
            source='TOI',
        ))
        print(f"   ✓ {tic_id}")

# Kepler EB negatives (at most 25), same flow as above.
n_eb_samples = min(25, len(eb_negative)) if 'eb_negative' in locals() else 0
if n_eb_samples > 0:
    print(f"\n處理 {n_eb_samples} 個 Kepler EB 負樣本...")
    for _, eb_row in eb_negative.head(n_eb_samples).iterrows():
        kic_id = f"KIC {int(eb_row['KIC'])}"
        time_data, flux_data = download_and_process_lightcurve(kic_id, "Kepler")

        if time_data is None:
            print(f"   ✗ {kic_id} (無法下載)")
            continue

        supervised_samples.append(dict(
            sample_id=f"eb_{eb_row['KIC']}",
            time=time_data,
            flux=flux_data,
            label=0,
            period=eb_row.get('period', np.nan),
            source='Kepler_EB',
        ))
        print(f"   ✓ {kic_id}")

# Collect everything that downloaded successfully into one frame.
if supervised_samples:
    supervised_samples_df = pd.DataFrame(supervised_samples)
else:
    supervised_samples_df = pd.DataFrame()

n_processed = len(supervised_samples_df)
print(f"\n✅ 成功處理 {n_processed} 個真實樣本")
if n_processed > 0:
    sample_labels = supervised_samples_df['label']
    print(f"   正樣本: {int((sample_labels == 1).sum())}")
    print(f"   負樣本: {int((sample_labels == 0).sum())}")

### 9.3 提取真實資料特徵

In [None]:
# Extract BLS-based features from the real light curves collected earlier.
if 'supervised_samples_df' not in locals() or len(supervised_samples_df) == 0:
    print("⚠️ 無真實樣本可用於監督式學習")
    supervised_features_df = pd.DataFrame()
else:
    print("🔍 提取真實資料特徵...")

    supervised_features = []
    n_real_samples = len(supervised_samples_df)

    # (output key in the feature record, column it is copied from)
    metadata_mapping = (
        ('sample_id', 'sample_id'),
        ('label', 'label'),
        ('source', 'source'),
        ('true_period', 'period'),
    )

    for idx, sample in supervised_samples_df.iterrows():
        # Run BLS, then derive the feature record from its result
        bls_result = run_bls(sample['time'], sample['flux'])
        features = extract_features(sample['time'], sample['flux'], bls_result, compute_advanced=True)

        # Attach bookkeeping fields next to the computed features
        for out_key, in_key in metadata_mapping:
            features[out_key] = sample[in_key]

        supervised_features.append(features)

        # Progress message every 10 rows
        n_done = idx + 1
        if n_done % 10 == 0:
            print(f"   處理進度: {n_done}/{n_real_samples}")

    supervised_features_df = pd.DataFrame(supervised_features)
    print(f"\n✅ 特徵提取完成: {len(supervised_features_df)} 個樣本")

    # Summary statistics over the feature columns only (bookkeeping excluded)
    print("\n📊 真實資料特徵統計:")
    bookkeeping_cols = ('sample_id', 'label', 'source', 'true_period')
    feature_cols_real = [
        col for col in supervised_features_df.columns if col not in bookkeeping_cols
    ]
    print(supervised_features_df[feature_cols_real].describe())

### 9.4 訓練監督式模型

In [None]:
def has_enough_class_samples(features_df, min_total=10, min_per_class=5):
    """
    Tell whether *features_df* can support the supervised training below.

    Besides the total row count, both classes must be present with at least
    ``min_per_class`` rows each. Otherwise the stratified 70/30 split, the
    3-fold calibration or the ranking metrics fail (or are meaningless) when
    the downloads of one class mostly failed. The default of 5 is chosen so
    that roughly 3 rows per class remain in the training split.

    Parameters:
    -----------
    features_df : pandas.DataFrame or None
        Feature table with a 'label' column.
    min_total : int
        The table must contain MORE than this many rows.
    min_per_class : int
        Minimum number of rows required for the rarest label.

    Returns:
    --------
    bool : True when training can proceed.
    """
    if features_df is None or len(features_df) <= min_total:
        return False
    if 'label' not in features_df.columns:
        return False
    class_counts = features_df['label'].value_counts()
    return len(class_counts) >= 2 and int(class_counts.min()) >= min_per_class


if has_enough_class_samples(locals().get('supervised_features_df')):
    print("🚀 訓練監督式模型...\n")

    # Assemble the design matrix with the same feature columns as the synthetic branch
    X_supervised = supervised_features_df[feature_cols].values
    y_supervised = supervised_features_df['label'].values

    # Replace NaN / +-inf by 0 so the estimators receive finite input
    X_supervised = np.nan_to_num(X_supervised, nan=0.0, posinf=0.0, neginf=0.0)

    # Stratified 70/30 split
    X_train_sup, X_test_sup, y_train_sup, y_test_sup = train_test_split(
        X_supervised, y_supervised, test_size=0.3, random_state=42, stratify=y_supervised
    )

    # Standardization.
    # NOTE(review): the scaled matrices are computed but the XGBoost model and
    # the calibrator below are fit on the UNSCALED features. The scaler must
    # therefore not be applied before this model at inference time - confirm
    # against downstream usage.
    scaler_supervised = StandardScaler()
    X_train_sup_scaled = scaler_supervised.fit_transform(X_train_sup)
    X_test_sup_scaled = scaler_supervised.transform(X_test_sup)

    print(f"📊 監督式資料集:")
    print(f"   訓練集: {len(X_train_sup)} 樣本")
    print(f"   測試集: {len(X_test_sup)} 樣本")
    print(f"   正樣本比例: {y_supervised.mean():.2%}\n")

    # Train XGBoost on the real (supervised) data
    xgb_supervised = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=5,
        learning_rate=0.1,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    xgb_supervised.fit(X_train_sup, y_train_sup)

    print("✅ XGBoost（監督式）訓練完成")
    print(f"   訓練分數: {xgb_supervised.score(X_train_sup, y_train_sup):.3f}")
    print(f"   測試分數: {xgb_supervised.score(X_test_sup, y_test_sup):.3f}")

    # Probability calibration (isotonic, 3-fold) on the training split
    print("\n🎯 進行機率校準...")
    calibrated_supervised = CalibratedClassifierCV(
        xgb_supervised,
        method='isotonic',
        cv=3
    )
    calibrated_supervised.fit(X_train_sup, y_train_sup)

    # Calibrated positive-class probabilities on the held-out split
    prob_supervised = calibrated_supervised.predict_proba(X_test_sup)[:, 1]

    # Evaluation metrics
    metrics_supervised = calculate_metrics(y_test_sup, prob_supervised, "XGBoost (監督式)")

    print("\n📊 監督式模型評估指標:")
    print(pd.DataFrame([metrics_supervised]).to_string(index=False))
else:
    print("⚠️ 樣本數不足，跳過監督式訓練")
    metrics_supervised = None

### 9.5 方法比較：合成注入 vs 監督式學習

In [None]:
def relative_change_pct(new_value, baseline):
    """
    Return the change of *new_value* relative to *baseline*, in percent.

    Returns NaN instead of raising or producing inf when either value is
    missing or the baseline is 0, so the report below can degrade gracefully.
    """
    if new_value is None or baseline is None:
        return float('nan')
    if pd.isna(new_value) or pd.isna(baseline) or baseline == 0:
        return float('nan')
    return (new_value - baseline) / baseline * 100


# Compare the two approaches: synthetic injection vs supervised learning
print("🔬 方法比較：合成注入 vs 監督式學習")
print("="*60)

if 'metrics_supervised' in locals() and metrics_supervised is not None:
    # Side-by-side table: first row synthetic injection, second row supervised
    comparison_df = pd.DataFrame([
        metrics_calibrated,
        metrics_supervised
    ])
    comparison_df['Model'] = ['合成注入', '監督式']

    print("\n📊 效能指標對比:")
    print(comparison_df.to_string(index=False))

    # One bar chart per metric on a 2x3 grid
    fig, axes = plt.subplots(2, 3, figsize=(15, 10))

    metrics_to_compare = ['PR-AUC', 'ROC-AUC', 'Brier Score', 'ECE', 'P@10', 'P@20']

    for idx, metric in enumerate(metrics_to_compare):
        row = idx // 3
        col = idx % 3
        ax = axes[row, col]

        # Metrics absent from the table leave their axes empty
        if metric in comparison_df.columns:
            values = [
                metrics_calibrated.get(metric, 0),
                metrics_supervised.get(metric, 0)
            ]
            colors = ['blue', 'orange']
            bars = ax.bar(['合成注入', '監督式'], values, color=colors, alpha=0.7)

            # Numeric label on top of each bar
            for bar, val in zip(bars, values):
                if val is not None and not pd.isna(val):
                    ax.text(bar.get_x() + bar.get_width()/2., val,
                           f'{val:.3f}',
                           ha='center', va='bottom', fontsize=10)

            ax.set_title(metric, fontsize=12, fontweight='bold')
            ax.set_ylabel('分數')
            ax.grid(True, alpha=0.3, axis='y')

            # Y range depends on the metric type
            if metric in ['PR-AUC', 'ROC-AUC', 'P@10', 'P@20']:
                ax.set_ylim([0, 1.1])
            elif metric == 'ECE':
                # Keep the 0-0.2 window unless a bar (and its label) would be clipped
                finite_values = [v for v in values if v is not None and not pd.isna(v)]
                ax.set_ylim([0, max([0.2] + [v * 1.1 for v in finite_values])])

    plt.suptitle('合成注入 vs 監督式學習 效能比較', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.show()

    # Strengths / weaknesses summary
    print("\n💡 分析總結:")
    print("="*60)

    # Relative PR-AUC change of the supervised model over the synthetic one;
    # NaN when the synthetic PR-AUC is 0 or missing.
    pr_auc_diff = relative_change_pct(metrics_supervised['PR-AUC'], metrics_calibrated['PR-AUC'])

    print("📈 **合成注入方法**的優勢:")
    print("   • 不需要大量標註資料")
    print("   • 可以控制訓練樣本的參數分布")
    print("   • 適合快速原型開發和測試")
    print(f"   • 在本實驗中 ECE: {metrics_calibrated['ECE']:.3f}")

    print("\n📊 **監督式學習**的優勢:")
    print("   • 使用真實天文資料，更接近實際應用")
    print("   • 能學習到真實資料中的複雜模式")
    print("   • 對真實噪音和系統誤差有更好的魯棒性")
    print(f"   • 在本實驗中 PR-AUC: {metrics_supervised['PR-AUC']:.3f}")

    if pd.isna(pr_auc_diff):
        print("\n⚠️ 合成注入方法的 PR-AUC 為 0 或缺失，無法計算相對差異")
    elif pr_auc_diff > 0:
        print(f"\n🏆 監督式方法在 PR-AUC 上提升了 {pr_auc_diff:.1f}%")
    else:
        print(f"\n🏆 合成注入方法在 PR-AUC 上領先 {-pr_auc_diff:.1f}%")

else:
    print("\n⚠️ 無法進行比較（監督式模型未訓練）")
    print("   原因：真實資料樣本不足或無法下載")
    print("   建議：")
    print("   1. 確保已執行 01_tap_download.ipynb")
    print("   2. 檢查網路連線")
    print("   3. 增加處理的樣本數量")
    comparison_df = None

# One blank line followed by a single 60-character rule
# (the previous "\n=" * 60 printed sixty separate "=" lines).
print("\n" + "="*60)

### 9.6 儲存監督式模型

In [None]:
# Persist the supervised model, its scaler, metadata and the comparison table.
if 'metrics_supervised' in locals() and metrics_supervised is not None:
    print("💾 儲存監督式模型...")

    # Output directory (created together with its parent "model/")
    supervised_dir = Path("model/supervised")
    supervised_dir.mkdir(parents=True, exist_ok=True)

    # Calibrated classifier and the fitted scaler.
    # NOTE(review): the classifier appears to have been fit on unscaled
    # features; confirm whether consumers should apply this scaler at all.
    joblib.dump(calibrated_supervised, supervised_dir / "ranker_supervised.joblib")
    joblib.dump(scaler_supervised, supervised_dir / "scaler_supervised.joblib")

    # Metadata describing how the model was produced.
    # Only TOI positives and Kepler EB negatives are downloaded into the
    # training samples; TOI false positives are loaded but never used, so
    # they are not listed as a data source.
    supervised_metadata = {
        "model_type": "XGBoost with Isotonic Calibration (Supervised)",
        "training_date": time.strftime("%Y-%m-%d %H:%M:%S"),
        "data_sources": {
            "positive": "TOI (PC/CP/KP)",
            "negative": "Kepler EB"
        },
        "n_features": len(feature_cols),
        "feature_names": feature_cols,
        "training_samples": len(X_train_sup),
        "test_samples": len(X_test_sup),
        "metrics": metrics_supervised
    }

    # Explicit encoding keeps the file identical across platforms;
    # default=str stringifies any value json cannot serialize natively.
    with open(supervised_dir / "model_metadata.json", 'w', encoding="utf-8") as f:
        json.dump(supervised_metadata, f, indent=2, default=str)

    print(f"✅ 監督式模型已儲存至: {supervised_dir}")

    # Comparison table, only when the comparison cell produced one
    if 'comparison_df' in locals() and comparison_df is not None:
        comparison_df.to_csv("model/method_comparison.csv", index=False)
        print("✅ 方法比較結果已儲存至: model/method_comparison.csv")
else:
    print("⚠️ 無監督式模型可儲存")

## 10. 完整總結報告

In [None]:
# Final report: dataset sizes, top features, metrics of both approaches,
# followed by static findings / outputs / next steps.
banner = "="*70
print(banner)
print("📊 完整訓練管線執行總結")
print(banner)

# Synthetic-injection dataset counts
n_synth_total = len(samples_df)
n_synth_pos = int((samples_df['label'] == 1).sum())
n_synth_neg = int((samples_df['label'] == 0).sum())

# Real (supervised) dataset counts; all zero when that branch produced nothing
has_real_table = 'supervised_features_df' in locals()
n_real_total = len(supervised_features_df) if has_real_table else 0
if has_real_table and len(supervised_features_df) > 0:
    real_sources = supervised_features_df['source']
    n_real_toi = int((real_sources == 'TOI').sum())
    n_real_eb = int((real_sources == 'Kepler_EB').sum())
else:
    n_real_toi = 0
    n_real_eb = 0

print(f"""
🎯 資料集統計:

   【合成注入資料】
   • 總樣本數: {n_synth_total}
   • 正樣本: {n_synth_pos}
   • 負樣本: {n_synth_neg}

   【真實監督資料】
   • 總樣本數: {n_real_total}
   • TOI 正樣本: {n_real_toi}
   • Kepler EB 負樣本: {n_real_eb}

🔍 特徵工程:
   • 特徵數量: {len(feature_cols)}
   • Top 3 重要特徵:
""")

# First three rows of the importance table
top_features = importance_df.head(3)
for feature_name, feature_score in zip(top_features['feature'], top_features['importance']):
    print(f"     - {feature_name}: {feature_score:.4f}")

print(f"""
🤖 模型效能比較:

   【合成注入方法】
   • PR-AUC: {metrics_calibrated['PR-AUC']:.3f}
   • ROC-AUC: {metrics_calibrated['ROC-AUC']:.3f}
   • Brier Score: {metrics_calibrated['Brier Score']:.3f}
   • ECE: {metrics_calibrated['ECE']:.3f}
""")

# Supervised metrics are shown only when that model was actually trained
supervised_trained = 'metrics_supervised' in locals() and metrics_supervised is not None
if supervised_trained:
    print(f"""
   【監督式學習】
   • PR-AUC: {metrics_supervised['PR-AUC']:.3f}
   • ROC-AUC: {metrics_supervised['ROC-AUC']:.3f}
   • Brier Score: {metrics_supervised['Brier Score']:.3f}
   • ECE: {metrics_supervised['ECE']:.3f}
    """)

# Static closing text (no placeholders, hence a plain string literal)
print("""
💡 關鍵發現與建議:
   1. Isotonic 校準顯著改善了機率預測的可靠性
   2. BLS 特徵（週期、SNR、深度）是最重要的預測因子
   3. 合成注入適合快速開發，監督式學習更接近實際應用
   4. 建議在實際部署時結合兩種方法的優勢

📦 輸出檔案:
   • 合成模型: model/ranker.joblib
   • 監督模型: model/supervised/ranker_supervised.joblib
   • 特徵架構: model/feature_schema.json
   • 比較結果: model/method_comparison.csv
   • 資料集: data/synthetic/ 和 data/

🚀 下一步:
   1. 使用 04_newdata_inference.ipynb 對新資料進行推論
   2. 在更大的真實資料集上訓練監督式模型
   3. 探索深度學習方法（CNN/Transformer）
   4. 部署為 Web 應用或 API 服務
""")

print(banner)
print("✅ 訓練管線（含監督式分支）完成！")
print(banner)