# 🚀 Kepler Exoplanet Detection - Complete Model Training & Comparison

**訓練 Genesis CNN、XGBoost、Random Forest 並生成完整比較報告**

---

## 📋 步驟

1. ✅ 檢查 GPU 可用性
2. 📦 安裝依賴
3. 📥 下載 Kaggle 數據集
4. 🔄 數據預處理（SMOTE 平衡）
5. 🧠 訓練 3 個模型（Genesis CNN、XGBoost、Random Forest）
6. 📊 生成比較圖表和 PDF 報告
7. 💾 下載結果

---

**預計訓練時間**：
- A100 GPU: ~3-5 分鐘
- L4 GPU: ~5-8 分鐘
- T4 GPU: ~8-12 分鐘

## 1️⃣ 檢查 GPU 並安裝依賴

In [None]:
# 檢查 GPU
!nvidia-smi

import tensorflow as tf
print("\n" + "="*60)
print("TensorFlow version:", tf.__version__)
print("GPU available:", len(tf.config.list_physical_devices('GPU')) > 0)
print("GPU devices:", tf.config.list_physical_devices('GPU'))
print("="*60)

In [None]:
# Install the required extra packages used later in this notebook:
# imbalanced-learn (SMOTE), xgboost, reportlab (PDF report) and seaborn (heatmaps).
# -q keeps pip's output quiet.
!pip install -q imbalanced-learn xgboost reportlab seaborn

## 2️⃣ 下載 Kaggle 數據集

In [None]:
# 設置 Kaggle API（需要上傳 kaggle.json）
# 請先到 https://www.kaggle.com/settings -> Create New API Token
# 下載 kaggle.json，然後執行下面的 file upload

from google.colab import files
print("請上傳 kaggle.json 檔案（從 https://www.kaggle.com/settings 下載）")
uploaded = files.upload()

# 配置 Kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# 下載數據集
!kaggle datasets download -d keplersmachines/kepler-labelled-time-series-data
!unzip -q kepler-labelled-time-series-data.zip -d data/
!ls -lh data/

## 3️⃣ 訓練所有模型

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import json
from pathlib import Path

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D, Dropout, Flatten, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical

# ML libraries
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
import xgboost as xgb
from imblearn.over_sampling import SMOTE

# PDF generation
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Table, TableStyle, Paragraph, Spacer, Image
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

print("✓ All imports successful!")

In [None]:
# Configuration: a single seed shared by NumPy and TensorFlow.
RANDOM_STATE = 42
for seed_everything in (np.random.seed, tf.random.set_seed):
    seed_everything(RANDOM_STATE)

# Create the output directories (the figures folder is nested inside the
# reports folder, so creating it with parents=True creates both).
REPORTS_DIR = Path('reports/kaggle_comparison')
FIGURES_DIR = REPORTS_DIR / 'figures'
FIGURES_DIR.mkdir(parents=True, exist_ok=True)

banner_rule = "="*80
print(banner_rule)
print("COMPLETE MODEL TRAINING & COMPARISON - KAGGLE DATASET (Google Colab)")
print(banner_rule)
print()

In [None]:
# ==================== STEP 1: Load data ====================
print("[STEP 1/6] Loading Kaggle Kepler dataset...")

train_df = pd.read_csv('data/exoTrain.csv')
test_df = pd.read_csv('data/exoTest.csv')

# Column 0 is the label; every remaining column is a feature value.
# Labels are binarised while being extracted: 2 (planet) -> 1, 1 (non-planet) -> 0.
X_train_raw = train_df.iloc[:, 1:].values
X_test = test_df.iloc[:, 1:].values
y_train_raw = (train_df.iloc[:, 0].values == 2).astype(int)
y_test = (test_df.iloc[:, 0].values == 2).astype(int)

# One summary line per split, then the overall positive rate of the training set.
for split_name, features, labels in (("Train", X_train_raw, y_train_raw),
                                     ("Test", X_test, y_test)):
    planet_count = labels.sum()
    print(f"  {split_name}: {features.shape}, Planets: {planet_count}, Non-planets: {len(labels) - planet_count}")
print(f"  Class imbalance: {100*y_train_raw.mean():.2f}% planets")
print()

In [None]:
# ==================== STEP 2: SMOTE balancing ====================
print("[STEP 2/6] Handling class imbalance with SMOTE...")

# Oversample the training split only; the sampler is seeded with RANDOM_STATE.
smote = SMOTE(random_state=RANDOM_STATE)
resampled_features, resampled_labels = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = resampled_features, resampled_labels

planets_after = y_train.sum()
print(f"  After SMOTE - Train: {X_train.shape}")
print(f"  Planets: {planets_after}, Non-planets: {len(y_train) - planets_after}")
print(f"  Balance: {100*y_train.mean():.1f}% planets")
print()

In [None]:
# ==================== STEP 3: Genesis CNN ====================
# Progress banner for the CNN stage (third of the six printed stages).
print("[STEP 3/6] Training Genesis CNN model...")

def build_genesis_adapted(input_length=None):
    """Build and compile the 1-D convolutional "Genesis" classifier.

    Architecture: two Conv1D(64, kernel 50) layers, max pooling (16/16), two
    Conv1D(64, kernel 12) layers, average pooling (8), dropout 0.25, then two
    Dense(256) layers and a 2-unit softmax output.

    Args:
        input_length: Number of time steps per input sample. When omitted it
            falls back to ``X_train.shape[1]`` from the notebook namespace,
            which is what the original zero-argument version always used.

    Returns:
        A ``Sequential`` model expecting inputs of shape ``(input_length, 1)``,
        compiled with the Adam optimizer, categorical cross-entropy loss and
        the accuracy metric.
    """
    if input_length is None:
        # Backward-compatible default: size the network from the global
        # training matrix, as before.
        input_length = X_train.shape[1]

    model = Sequential([
        Conv1D(64, 50, padding='same', activation='relu', input_shape=(input_length, 1)),
        Conv1D(64, 50, padding='same', activation='relu'),
        MaxPooling1D(pool_size=16, strides=16),
        Conv1D(64, 12, padding='same', activation='relu'),
        Conv1D(64, 12, padding='same', activation='relu'),
        AveragePooling1D(pool_size=8),
        Dropout(0.25),
        Flatten(),
        Dense(256, activation='relu'),
        Dense(256, activation='relu'),
        # Two softmax outputs to match the one-hot (2-class) targets.
        Dense(2, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Prepare the CNN inputs: add a trailing channel axis and one-hot encode labels.
X_train_cnn = X_train.reshape(-1, X_train.shape[1], 1)
X_test_cnn = X_test.reshape(-1, X_test.shape[1], 1)
y_train_cat = to_categorical(y_train, 2)
y_test_cat = to_categorical(y_test, 2)

# Hold out a stratified slice of the *training* data for early stopping.
# Monitoring val_loss on the test set (and restoring the best weights from it)
# would select the model on the very data it is later scored on, which
# inflates every reported test metric.
# NOTE(review): SMOTE ran before this split, so the validation slice contains
# synthetic samples; it is only used to pick the stopping epoch, not reported.
from sklearn.model_selection import train_test_split

X_fit_cnn, X_val_cnn, y_fit_cat, y_val_cat = train_test_split(
    X_train_cnn, y_train_cat,
    test_size=0.1,
    stratify=y_train,
    random_state=RANDOM_STATE,
)

# Train Genesis
genesis_start = time.time()
genesis_model = build_genesis_adapted()
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

print("  Training Genesis CNN (10 epochs with GPU)...")
history = genesis_model.fit(
    X_fit_cnn, y_fit_cat,
    validation_data=(X_val_cnn, y_val_cat),
    epochs=10,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)
# Stop the clock right after fit() so the figure covers model construction and
# training only, the same span the XGBoost and Random Forest cells measure.
genesis_time = time.time() - genesis_start

# The test set is touched only here, for the final evaluation and predictions.
genesis_loss, genesis_acc = genesis_model.evaluate(X_test_cnn, y_test_cat, verbose=0)

y_pred_genesis_proba = genesis_model.predict(X_test_cnn, verbose=0)
y_pred_genesis = np.argmax(y_pred_genesis_proba, axis=1)

print(f"  ✓ Genesis trained in {genesis_time:.1f}s")
print(f"  Accuracy: {genesis_acc:.4f}")
print()

In [None]:
# ==================== STEP 4: XGBoost ====================
print("[STEP 4/6] Training XGBoost model...")

xgb_start = time.time()
# Ratio of negatives to positives in the training labels. After SMOTE the
# classes are expected to be balanced, so this should come out at about 1.0.
scale_pos_weight = (len(y_train) - y_train.sum()) / y_train.sum()

# `tree_method='gpu_hist'` and `gpu_id` are deprecated since XGBoost 2.0; the
# supported spelling is tree_method='hist' plus a `device`. Choosing the device
# from TensorFlow's GPU list also lets the cell run on a CPU-only runtime
# instead of failing.
xgb_device = 'cuda' if tf.config.list_physical_devices('GPU') else 'cpu'

xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    scale_pos_weight=scale_pos_weight,
    random_state=RANDOM_STATE,
    tree_method='hist',
    device=xgb_device,
)

xgb_model.fit(X_train, y_train, verbose=False)
xgb_time = time.time() - xgb_start

# Hard predictions for accuracy/confusion matrix; column 1 of predict_proba is
# the positive-class (planet) probability used for ROC-AUC.
y_pred_xgb = xgb_model.predict(X_test)
y_pred_xgb_proba = xgb_model.predict_proba(X_test)[:, 1]
xgb_acc = accuracy_score(y_test, y_pred_xgb)

print(f"  ✓ XGBoost trained in {xgb_time:.1f}s")
print(f"  Accuracy: {xgb_acc:.4f}")
print()

In [None]:
# ==================== STEP 5: Random Forest ====================
print("[STEP 5/6] Training Random Forest model...")

rf_start = time.time()
# Hyper-parameters gathered in one place, then unpacked into the estimator.
rf_settings = dict(
    n_estimators=100,
    max_depth=10,
    class_weight='balanced',
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
rf_model = RandomForestClassifier(**rf_settings)

rf_model.fit(X_train, y_train)
rf_time = time.time() - rf_start

# Class predictions first, then the positive-class probability column.
y_pred_rf = rf_model.predict(X_test)
rf_class_probabilities = rf_model.predict_proba(X_test)
y_pred_rf_proba = rf_class_probabilities[:, 1]
rf_acc = accuracy_score(y_test, y_pred_rf)

print(f"  ✓ Random Forest trained in {rf_time:.1f}s")
print(f"  Accuracy: {rf_acc:.4f}")
print()

## 4️⃣ 計算指標並生成圖表

In [None]:
# ==================== STEP 6: Compute metrics ====================
# Progress banner for the last of the six printed stages.
print("[STEP 6/6] Computing metrics and generating visualizations...")

def compute_metrics(y_true, y_pred, y_proba):
    """Collect five binary-classification scores into a dict.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted class labels, scored for accuracy, precision,
            recall and F1.
        y_proba: Positive-class scores, used for ROC-AUC only.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``roc_auc`` (in that order). Precision, recall and F1 use
        ``zero_division=0``; ``roc_auc`` is ``0.0`` when ``y_true`` contains
        fewer than two distinct labels.
    """
    scores = {'accuracy': accuracy_score(y_true, y_pred)}

    label_based_scorers = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for score_name, scorer in label_based_scorers:
        scores[score_name] = scorer(y_true, y_pred, zero_division=0)

    # ROC-AUC is only computed when both classes are present in y_true.
    if len(np.unique(y_true)) > 1:
        scores['roc_auc'] = roc_auc_score(y_true, y_proba)
    else:
        scores['roc_auc'] = 0.0
    return scores

genesis_metrics = compute_metrics(y_test, y_pred_genesis, y_pred_genesis_proba[:, 1])
xgb_metrics = compute_metrics(y_test, y_pred_xgb, y_pred_xgb_proba)
rf_metrics = compute_metrics(y_test, y_pred_rf, y_pred_rf_proba)

# Save the results: one entry per model (metrics plus training time), followed
# by a summary of the dataset sizes.
per_model_results = (
    ('genesis_cnn', genesis_metrics, genesis_time),
    ('xgboost', xgb_metrics, xgb_time),
    ('random_forest', rf_metrics, rf_time),
)
results = {
    model_key: {'metrics': model_metrics, 'training_time_seconds': elapsed}
    for model_key, model_metrics, elapsed in per_model_results
}
results['dataset_info'] = {
    'train_samples': len(X_train),
    'test_samples': len(X_test),
    'features': X_train.shape[1],
    'planets_train': int(y_train.sum()),
    'planets_test': int(y_test.sum())
}

results_path = REPORTS_DIR / 'kaggle_comparison_results.json'
with open(results_path, 'w') as f:
    json.dump(results, f, indent=2)

print("  ✓ Metrics computed")

In [None]:
# Figure 1: performance comparison, one panel per metric in a 2x2 grid.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Model Performance Comparison - Kaggle Kepler Dataset', fontsize=16, fontweight='bold')

metrics_names = ['Accuracy', 'Precision', 'Recall', 'F1-Score']
metrics_keys = ['accuracy', 'precision', 'recall', 'f1']
models = ['Genesis CNN', 'XGBoost', 'Random Forest']
colors_bar = ['#1f77b4', '#ff7f0e', '#2ca02c']
metrics_by_model = (genesis_metrics, xgb_metrics, rf_metrics)

# axes.flat walks the grid row by row, matching the order of the metric lists.
for ax, metric_name, metric_key in zip(axes.flat, metrics_names, metrics_keys):
    values = [model_metrics[metric_key] for model_metrics in metrics_by_model]
    bars = ax.bar(models, values, color=colors_bar, alpha=0.8, edgecolor='black')
    ax.set_ylabel(metric_name, fontsize=12, fontweight='bold')
    ax.set_ylim(0, 1.0)
    ax.grid(axis='y', alpha=0.3)
    # Write each value just above the centre of its bar.
    for bar, val in zip(bars, values):
        bar_centre = bar.get_x() + bar.get_width()/2.
        ax.text(bar_centre, bar.get_height() + 0.02, f'{val:.3f}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'performance_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("  ✓ Performance comparison chart saved")

In [None]:
# Figure 2: ROC-AUC (left) and training time (right).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('ROC-AUC Score & Training Time Comparison', fontsize=16, fontweight='bold')

bar_palette = ['#1f77b4', '#ff7f0e', '#2ca02c']

# ROC-AUC
roc_values = [genesis_metrics['roc_auc'], xgb_metrics['roc_auc'], rf_metrics['roc_auc']]
bars1 = ax1.bar(models, roc_values, color=bar_palette, alpha=0.8, edgecolor='black')
ax1.set_ylabel('ROC-AUC Score', fontsize=12, fontweight='bold')
ax1.set_ylim(0, 1.0)
ax1.grid(axis='y', alpha=0.3)
for bar, val in zip(bars1, roc_values):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02, f'{val:.3f}',
             ha='center', va='bottom', fontsize=10, fontweight='bold')

# Training Time
times = [genesis_time, xgb_time, rf_time]
bars2 = ax2.bar(models, times, color=bar_palette, alpha=0.8, edgecolor='black')
ax2.set_ylabel('Training Time (seconds)', fontsize=12, fontweight='bold')
ax2.grid(axis='y', alpha=0.3)
# Unlike the 0-1 metric axes, this axis has no fixed range, so the label offset
# is expressed relative to the tallest bar. A fixed "+5 seconds" would put the
# labels far above the bars whenever training takes only a few seconds.
tallest_bar = max(times) or 1.0
label_gap = 0.02 * tallest_bar
for bar, val in zip(bars2, times):
    height = bar.get_height()
    ax2.text(bar.get_x() + bar.get_width()/2., height + label_gap, f'{val:.1f}s',
             ha='center', va='bottom', fontsize=10, fontweight='bold')
# Leave headroom so the label of the tallest bar stays inside the axes.
ax2.set_ylim(0, tallest_bar * 1.15)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'roc_time_comparison.png', dpi=150, bbox_inches='tight')
plt.show()
print("  ✓ ROC-AUC and training time chart saved")

In [None]:
# Figure 3: confusion matrices, one heatmap per model.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Confusion Matrices', fontsize=16, fontweight='bold')

class_names = ['Non-Planet', 'Planet']
predictions = [y_pred_genesis, y_pred_xgb, y_pred_rf]
predictions_by_model = dict(zip(models, predictions))

for ax, (model_name, y_pred) in zip(axes, predictions_by_model.items()):
    cm = confusion_matrix(y_test, y_pred)
    # Rows are true labels and columns are predicted labels.
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=ax,
                xticklabels=class_names,
                yticklabels=class_names)
    ax.set_title(model_name, fontsize=14, fontweight='bold')
    ax.set_xlabel('Predicted Label', fontsize=12)
    ax.set_ylabel('True Label', fontsize=12)

plt.tight_layout()
plt.savefig(FIGURES_DIR / 'confusion_matrices.png', dpi=150, bbox_inches='tight')
plt.show()
print("  ✓ Confusion matrices saved")

## 5️⃣ 生成 PDF 報告

In [None]:
# Generate the PDF report: title, run metadata, metrics table, then the figures.
from datetime import datetime

pdf_path = REPORTS_DIR / 'KAGGLE_MODEL_COMPARISON_REPORT.pdf'
doc = SimpleDocTemplate(str(pdf_path), pagesize=letter)
story = []
styles = getSampleStyleSheet()

# Title
title = Paragraph("<b>Kaggle Kepler Dataset - Model Comparison Report</b>", styles['Title'])
story.append(title)
story.append(Spacer(1, 12))

# Metadata
gpu_info = tf.config.list_physical_devices('GPU')
# Read the device name from the attribute rather than splitting the repr
# string on quotes, which depends on the repr format.
gpu_name = gpu_info[0].name if gpu_info else "CPU"

metadata = f"""
<b>Generated:</b> {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}<br/>
<b>Platform:</b> Google Colab<br/>
<b>GPU:</b> {gpu_name}<br/>
<b>TensorFlow:</b> {tf.__version__}<br/>
<b>Dataset:</b> Kaggle Kepler Labelled Time Series<br/>
<b>Training Samples:</b> {len(X_train):,} (after SMOTE)<br/>
<b>Test Samples:</b> {len(X_test):,}
"""
story.append(Paragraph(metadata, styles['Normal']))
story.append(Spacer(1, 20))

# Performance table: header row plus one row per model, built from a single
# column order so the three rows cannot drift apart.
story.append(Paragraph("<b>Model Performance Metrics</b>", styles['Heading2']))
data = [['Model', 'Accuracy', 'Precision', 'Recall', 'F1-Score', 'ROC-AUC', 'Time (s)']]
table_metric_keys = ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']
table_rows = [
    ('Genesis CNN', genesis_metrics, genesis_time),
    ('XGBoost', xgb_metrics, xgb_time),
    ('Random Forest', rf_metrics, rf_time),
]
for row_label, row_metrics, row_time in table_rows:
    data.append(
        [row_label]
        + [f"{row_metrics[key]:.4f}" for key in table_metric_keys]
        + [f"{row_time:.1f}"]
    )

table = Table(data)
table.setStyle(TableStyle([
    ('BACKGROUND', (0, 0), (-1, 0), colors.grey),
    ('TEXTCOLOR', (0, 0), (-1, 0), colors.whitesmoke),
    ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
    ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
    ('FONTSIZE', (0, 0), (-1, 0), 12),
    ('BOTTOMPADDING', (0, 0), (-1, 0), 12),
    ('BACKGROUND', (0, 1), (-1, -1), colors.beige),
    ('GRID', (0, 0), (-1, -1), 1, colors.black)
]))
story.append(table)

# Add the figures. kind='proportional' scales each image to fit inside the
# 450x300 pt box while keeping its aspect ratio; forcing every figure to
# 500x300 stretched the 14x10, 14x5 and 18x5 inch plots to one shape.
for img_name in ['performance_comparison.png', 'roc_time_comparison.png', 'confusion_matrices.png']:
    img_path = FIGURES_DIR / img_name
    # Figures that were never generated are skipped rather than failing the build.
    if img_path.exists():
        story.append(Spacer(1, 20))
        story.append(Image(str(img_path), width=450, height=300, kind='proportional'))

doc.build(story)
print(f"\n✓ PDF report generated: {pdf_path}")
print()

## 6️⃣ 顯示結果摘要

In [None]:
# Results summary: a fixed-width table with one row per model, then the output paths.
wide_rule = "="*80
print(wide_rule)
print("TRAINING COMPLETE - RESULTS SUMMARY")
print(wide_rule)
print()
print(f"{'Model':<20} {'Accuracy':<12} {'F1-Score':<12} {'ROC-AUC':<12} {'Time (s)':<10}")
print("-"*80)

summary_rows = (
    ('Genesis CNN', genesis_metrics, genesis_time),
    ('XGBoost', xgb_metrics, xgb_time),
    ('Random Forest', rf_metrics, rf_time),
)
for row_label, row_metrics, row_seconds in summary_rows:
    print(f"{row_label:<20} {row_metrics['accuracy']:<12.4f} {row_metrics['f1']:<12.4f} {row_metrics['roc_auc']:<12.4f} {row_seconds:<10.1f}")

print(wide_rule)
print()
print("Output files:")
print(f"  - JSON results: {REPORTS_DIR / 'kaggle_comparison_results.json'}")
print(f"  - Figures: {FIGURES_DIR}/")
print(f"  - PDF report: {pdf_path}")
print()
print("Training completed successfully on Google Colab!")
print(wide_rule)

## 7️⃣ 下載結果檔案

In [None]:
# Zip everything under reports/ and download the archive.
!zip -r kaggle_comparison_results.zip reports/

# Hand the archive to the Colab file-download helper.
from google.colab import files
files.download('kaggle_comparison_results.zip')

print("\n✓ Results package downloaded!")
print("\nPackage contents:")
# List the archive contents, keeping only the first 20 lines of unzip's output.
!unzip -l kaggle_comparison_results.zip | head -20