# Kepler 系外行星三分类检测系统 (2025)

## 项目概述
本 Notebook 实现了基于 NASA Kepler 任务数据的系外行星三分类检测系统：
- **CONFIRMED**: 已确认的系外行星
- **CANDIDATE**: 候选系外行星
- **FALSE POSITIVE**: 假阳性

## 模型架构
1. Genesis CNN (深度学习)
2. XGBoost (梯度提升)
3. Random Forest (随机森林)
4. Voting Ensemble (集成学习)

## 运行环境
- Google Colab (2025年10月兼容)
- TensorFlow 2.15.0
- Python 3.10–3.11（TensorFlow 2.15.0 官方支持 Python 3.9–3.11，不支持 3.12 及以上版本）

---

## 1. 环境设置与依赖安装

安装所有必需的库，确保与 2025 年 10 月环境兼容。

In [None]:
# Install the dependencies (versions pinned for the October 2025 environment)
# NOTE(review): TensorFlow 2.15 is documented as supporting Python 3.9-3.11;
# confirm the Colab runtime's Python version before relying on these pins.
!pip install -q tensorflow==2.15.0 xgboost==2.0.3 scikit-learn==1.3.2 \
    pandas==2.1.4 numpy==1.24.3 matplotlib==3.8.2 seaborn==0.13.0 \
    imbalanced-learn==0.11.0 joblib==1.3.2

# Printed unconditionally: this line does not check whether pip succeeded.
print("✅ 所有依赖库安装完成！")

In [None]:
# 导入必要的库
import os
import json
import zipfile
import warnings
from datetime import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import (
    classification_report, confusion_matrix, roc_curve, auc,
    accuracy_score, precision_recall_fscore_support
)
from imblearn.over_sampling import SMOTE

import xgboost as xgb
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Model
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

import joblib
from google.colab import files

# Suppress library warnings so the cell output stays readable
warnings.filterwarnings('ignore')

# Seed NumPy and TensorFlow with the same value for reproducibility
RANDOM_SEED = 42
for seed_fn in (np.random.seed, tf.random.set_seed):
    seed_fn(RANDOM_SEED)

# Plot styling: seaborn grid first, then matplotlib defaults
# (the font list is what lets the Chinese plot labels render)
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 8),
    'font.sans-serif': ['Arial Unicode MS', 'SimHei'],
    'axes.unicode_minus': False,
})

# Report the versions of the two main modelling libraries
for lib_name, lib in (("TensorFlow", tf), ("XGBoost", xgb)):
    print(f"{lib_name} 版本: {lib.__version__}")
print("✅ 所有库导入成功！")

## 2. 数据上传与加载

上传 Kepler 数据集 CSV 文件并进行初步检查。

In [None]:
# Ask the user to upload the dataset via google.colab's files.upload()
print("📤 请上传 Kepler 数据集 CSV 文件...")
uploaded = files.upload()

# The first key of the upload result is used as the dataset file name
uploaded_names = list(uploaded)
csv_filename = uploaded_names[0]
print(f"✅ 文件已上传: {csv_filename}")

In [None]:
# Read the uploaded CSV into a DataFrame
df = pd.read_csv(csv_filename)

# Quick look at the shape, the column names and the first rows
print(f"数据集形状: {df.shape}")
print(f"\n列名: {df.columns.tolist()}")
print("\n前 5 行数据:")
display(df.head())

# Label distribution: absolute counts first, then proportions
if 'koi_disposition' in df.columns:
    for heading, as_fraction in (("标签分布", False), ("标签分布比例", True)):
        print(f"\n{heading}:")
        print(df['koi_disposition'].value_counts(normalize=as_fraction))

## 3. 数据预处理

清洗数据、处理缺失值、标签编码、特征标准化和数据平衡（SMOTE）。

In [None]:
def preprocess_kepler_data(df, test_size=0.2, balance_data=True):
    """
    Preprocess the Kepler dataset into scaled train/test arrays.

    Steps: drop rows without a label, drop identifier columns, keep numeric
    feature columns only, encode the labels, make a stratified train/test
    split, impute missing values with training-set medians, standardize,
    and optionally balance the training set with SMOTE.

    Args:
        df: Raw data frame; must contain a 'koi_disposition' label column.
        test_size: Fraction of rows held out as the test set.
        balance_data: Whether to oversample the training set with SMOTE.

    Returns:
        X_train, X_test, y_train, y_test, label_encoder, scaler

    Raises:
        ValueError: If the label column is missing, or if no numeric feature
            column with at least one observed training value remains.
    """
    print("🔄 开始数据预处理...")

    # 1. Validate the label column
    label_col = 'koi_disposition'
    if label_col not in df.columns:
        raise ValueError(f"数据集中未找到标签列 '{label_col}'")

    # 2. Work on a new frame; rows without a label cannot be trained on or scored
    data = df.dropna(subset=[label_col])

    # 3. Drop identifier-like columns that are present
    id_cols = ['kepid', 'kepoi_name', 'kepler_name', 'koi_tce_delivname']
    data = data.drop(columns=[col for col in id_cols if col in data.columns])

    # 4. Separate features and labels
    X = data.drop(columns=[label_col])
    y = data[label_col]

    # 5. Keep numeric features only: text columns can be neither median-imputed
    #    nor standardized (DataFrame.median raises on them in pandas 2.x)
    numeric_X = X.select_dtypes(include=[np.number])
    skipped_cols = [col for col in X.columns if col not in numeric_X.columns]
    if skipped_cols:
        print(f"已忽略非数值列: {skipped_cols}")
    X = numeric_X
    print(f"缺失值数量: {X.isnull().sum().sum()}")

    # 6. Encode the labels
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    print(f"\n标签映射: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")

    # 7. Stratified train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=test_size, random_state=RANDOM_SEED, stratify=y_encoded
    )

    # 8. Impute with medians learned from the training split only, so no
    #    test-set statistics leak into preprocessing. A column with no observed
    #    training value has no median; it is dropped because the NaNs it would
    #    keep cannot be handled by SMOTE.
    train_medians = X_train.median()
    usable_cols = train_medians.dropna().index
    if len(usable_cols) == 0:
        raise ValueError("数据集中没有可用的数值特征列")
    X_train = X_train[usable_cols].fillna(train_medians)
    X_test = X_test[usable_cols].fillna(train_medians)
    print(f"\n训练集大小: {X_train.shape}")
    print(f"测试集大小: {X_test.shape}")

    # 9. Standardize (fit on the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # 10. Optional class balancing with SMOTE (training split only)
    if balance_data:
        print("\n🔄 使用 SMOTE 进行数据平衡...")
        smote = SMOTE(random_state=RANDOM_SEED)
        X_train_balanced, y_train_balanced = smote.fit_resample(X_train_scaled, y_train)
        print(f"平衡后训练集大小: {X_train_balanced.shape}")
        print(f"平衡后标签分布: {np.bincount(y_train_balanced)}")
        return X_train_balanced, X_test_scaled, y_train_balanced, y_test, label_encoder, scaler

    return X_train_scaled, X_test_scaled, y_train, y_test, label_encoder, scaler

# Run the preprocessing pipeline on the loaded data frame
(X_train, X_test, y_train, y_test,
 label_encoder, scaler) = preprocess_kepler_data(df)

# Summarize what came out of preprocessing
print("\n✅ 数据预处理完成！")
print(f"特征数量: {X_train.shape[1]}")
print(f"类别数量: {np.unique(y_train).size}")

## 4. 模型定义

定义所有分类模型架构。

In [None]:
def build_genesis_cnn(input_dim, num_classes=3):
    """
    Build and compile the Genesis CNN classifier.

    Layer stack:
        - input of shape (input_dim, 1)
        - three Conv1D blocks (Conv1D -> BatchNorm -> ReLU -> Dropout)
          with 64 / 128 / 256 filters and dropout 0.3 / 0.3 / 0.4
        - global average pooling
        - Dense(128, relu) followed by dropout 0.5
        - softmax output with num_classes units

    The model is compiled with Adam (learning rate 0.001), sparse
    categorical cross-entropy loss and the accuracy metric.
    """
    inputs = layers.Input(shape=(input_dim, 1), name='input')

    # (filters, dropout rate) for convolution blocks 1..3
    conv_blocks = [(64, 0.3), (128, 0.3), (256, 0.4)]

    x = inputs
    for block_no, (n_filters, drop_rate) in enumerate(conv_blocks, start=1):
        x = layers.Conv1D(n_filters, 3, padding='same', name=f'conv{block_no}')(x)
        x = layers.BatchNormalization(name=f'bn{block_no}')(x)
        x = layers.Activation('relu', name=f'relu{block_no}')(x)
        x = layers.Dropout(drop_rate, name=f'dropout{block_no}')(x)

    # Collapse the feature axis, then classify with a small dense head
    x = layers.GlobalAveragePooling1D(name='global_pool')(x)
    x = layers.Dense(128, activation='relu', name='dense1')(x)
    x = layers.Dropout(0.5, name='dropout4')(x)
    outputs = layers.Dense(num_classes, activation='softmax', name='output')(x)

    model = Model(inputs=inputs, outputs=outputs, name='Genesis_CNN')

    # Sparse loss: the labels are integer class ids, not one-hot vectors
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    model.compile(
        optimizer=optimizer,
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    return model

# Build the model, sizing it from the preprocessed training data
num_features = X_train.shape[1]
num_classes = len(np.unique(y_train))

genesis_cnn = build_genesis_cnn(num_features, num_classes)
# Model.summary() prints the table itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
genesis_cnn.summary()

## 5. 模型训练

训练所有模型并保存最佳版本。

In [None]:
# Create the directory that receives all saved artifacts
os.makedirs('models', exist_ok=True)

# The CNN expects 3-D input: (samples, features, 1)
X_train_cnn = X_train.reshape(-1, num_features, 1)
X_test_cnn = X_test.reshape(-1, num_features, 1)

# Hold out part of the training data for validation. Early stopping, the LR
# schedule and checkpoint selection all read validation metrics, so validating
# on the test set would make the later test-set evaluation optimistic.
# NOTE: X_train comes from preprocess_kepler_data with SMOTE enabled by default,
# so this validation split can contain synthetic samples.
X_fit_cnn, X_val_cnn, y_fit_cnn, y_val_cnn = train_test_split(
    X_train, y_train,
    test_size=0.1, random_state=RANDOM_SEED, stratify=y_train
)
X_fit_cnn = X_fit_cnn.reshape(-1, num_features, 1)
X_val_cnn = X_val_cnn.reshape(-1, num_features, 1)

# Callbacks: stop early, halve the learning rate on plateaus, keep the best checkpoint
callbacks = [
    EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-7, verbose=1),
    ModelCheckpoint('models/genesis_cnn_best.keras', save_best_only=True, monitor='val_accuracy', verbose=1)
]

# Train Genesis CNN
print("🚀 开始训练 Genesis CNN...")
history_cnn = genesis_cnn.fit(
    X_fit_cnn, y_fit_cnn,
    validation_data=(X_val_cnn, y_val_cnn),
    epochs=100,
    batch_size=32,
    callbacks=callbacks,
    verbose=1
)

# Save the final model (with restore_best_weights=True, early stopping puts
# the best weights back before this point)
genesis_cnn.save('models/genesis_cnn_final.keras')
print("✅ Genesis CNN 训练完成并已保存！")

In [None]:
# Train XGBoost
print("\n🚀 开始训练 XGBoost...")

xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=num_classes,
    random_state=RANDOM_SEED,
    eval_metric='mlogloss',
    early_stopping_rounds=15
)

# Early stopping picks the number of boosting rounds from the eval_set score.
# A validation split carved out of the training data is used for that, so the
# test set stays untouched until the final evaluation.
X_fit_xgb, X_val_xgb, y_fit_xgb, y_val_xgb = train_test_split(
    X_train, y_train,
    test_size=0.1, random_state=RANDOM_SEED, stratify=y_train
)

xgb_model.fit(
    X_fit_xgb, y_fit_xgb,
    eval_set=[(X_val_xgb, y_val_xgb)],
    verbose=True
)

# Save the model
joblib.dump(xgb_model, 'models/xgboost_model.pkl')
print("✅ XGBoost 训练完成并已保存！")

In [None]:
# Train Random Forest
print("\n🚀 开始训练 Random Forest...")

# Hyper-parameters kept in one place, then unpacked into the estimator
rf_params = dict(
    n_estimators=200,
    max_depth=15,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=RANDOM_SEED,
    n_jobs=-1,   # use every available core
    verbose=1,
)
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

# Persist the fitted forest
joblib.dump(rf_model, 'models/random_forest_model.pkl')
print("✅ Random Forest 训练完成并已保存！")

In [None]:
# Build the Voting Ensemble
print("\n🚀 开始训练 Voting Ensemble...")


class CNNWrapper:
    """Adapter that gives the Keras CNN a predict / predict_proba interface on 2-D input."""

    def __init__(self, model):
        self.model = model

    def predict_proba(self, X):
        """Return class probabilities for a 2-D (samples, features) array."""
        # The CNN was built for input of shape (samples, features, 1)
        X_reshaped = X.reshape(-1, X.shape[1], 1)
        return self.model.predict(X_reshaped, verbose=0)

    def predict(self, X):
        """Return the index of the most probable class for each sample."""
        return np.argmax(self.predict_proba(X), axis=1)


class PrefitSoftVotingEnsemble:
    """
    Soft-voting ensemble over already-fitted models.

    sklearn's VotingClassifier cannot be used here: it only accepts
    scikit-learn classifiers (CNNWrapper is not one), and its fit() clones and
    refits every member, which would also refit XGBoost without the eval_set
    its early stopping requires. This class averages the predict_proba
    outputs of the fitted members instead.

    Assumes every member's probability columns follow the encoded label order
    0..n_classes-1; all three members are trained on the same
    LabelEncoder-encoded targets.
    """

    def __init__(self, estimators, weights=None):
        # estimators: list of (name, fitted_model) pairs
        # weights: optional per-model weights for the average (None = equal)
        self.estimators = list(estimators)
        self.weights = weights

    def predict_proba(self, X):
        """Return the (weighted) mean of the members' class probabilities."""
        member_probas = np.stack([est.predict_proba(X) for _, est in self.estimators])
        return np.average(member_probas, axis=0, weights=self.weights)

    def predict(self, X):
        """Return the encoded class with the highest averaged probability."""
        return np.argmax(self.predict_proba(X), axis=1)


cnn_wrapper = CNNWrapper(genesis_cnn)

# Combine the three fitted models; no further fitting is needed
voting_clf = PrefitSoftVotingEnsemble(
    estimators=[
        ('genesis_cnn', cnn_wrapper),
        ('xgboost', xgb_model),
        ('random_forest', rf_model)
    ]
)

# Save the ensemble
# NOTE(review): this pickles the wrapped Keras model as well, and unpickling
# needs the two classes above to be defined first; confirm this suits the
# deployment path.
joblib.dump(voting_clf, 'models/voting_ensemble_model.pkl')
print("✅ Voting Ensemble 训练完成并已保存！")

## 6. 模型评估与可视化

评估所有模型性能并进行可视化比较。

In [None]:
def evaluate_model(model, X_test, y_test, model_name, is_cnn=False):
    """
    Evaluate one model on the test set and print a report.

    Args:
        model: Fitted model. With is_cnn=True its predict() must return
            class probabilities for 3-D input; otherwise predict() returns
            labels and predict_proba() is used when available.
        X_test: 2-D test feature array.
        y_test: Encoded true labels.
        model_name: Display name used in the printed report.
        is_cnn: Whether to reshape the input to (samples, features, 1).

    Returns:
        Dict with 'model_name', weighted 'accuracy' / 'precision' /
        'recall' / 'f1', 'confusion_matrix', 'y_pred' and 'y_pred_proba'
        (None when the model has no probability output).

    Note: the class names in the report come from the module-level
    `label_encoder`.
    """
    separator = '=' * 60
    print(f"\n{separator}")
    print(f"评估模型: {model_name}")
    print(separator)

    # Predictions (and probabilities where the model can provide them)
    y_pred_proba = None
    if is_cnn:
        cnn_input = X_test.reshape(-1, X_test.shape[1], 1)
        y_pred_proba = model.predict(cnn_input, verbose=0)
        y_pred = np.argmax(y_pred_proba, axis=1)
    else:
        y_pred = model.predict(X_test)
        if hasattr(model, 'predict_proba'):
            y_pred_proba = model.predict_proba(X_test)

    # Headline metrics (precision / recall / F1 are weighted by class support)
    accuracy = accuracy_score(y_test, y_pred)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average='weighted'
    )

    print("\n📊 整体性能:")
    headline = (('准确率', accuracy), ('精确率', precision), ('召回率', recall), ('F1分数', f1))
    for label, value in headline:
        print(f"  {label}: {value:.4f}")

    # Per-class report
    print("\n📋 详细分类报告:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    return {
        'model_name': model_name,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'confusion_matrix': confusion_matrix(y_test, y_pred),
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba,
    }

def plot_confusion_matrix(cm, model_name, class_names):
    """
    Draw a confusion matrix as an annotated heatmap, save it and show it.

    The figure is written to 'models/<model_name>_confusion_matrix.png'.
    Rows are true labels and columns are predicted labels.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_names, yticklabels=class_names,
        ax=ax,
    )
    ax.set_title(f'混淆矩阵 - {model_name}')
    ax.set_ylabel('真实标签')
    ax.set_xlabel('预测标签')
    fig.tight_layout()
    fig.savefig(f'models/{model_name}_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

def plot_roc_curves(models_metrics, X_test, y_test, class_names):
    """
    Plot one-vs-rest ROC curves per model on a 2x2 grid and save the figure.

    Models whose 'y_pred_proba' entry is None are skipped and leave their
    grid cell empty. The figure is written to
    'models/roc_curves_comparison.png' and then shown.

    Args:
        models_metrics: List of metric dicts as returned by evaluate_model.
        X_test: Not used by this function.
        y_test: Encoded true labels.
        class_names: Class names; position i labels probability column i.
    """
    from itertools import cycle
    from sklearn.preprocessing import label_binarize

    # One indicator column per class for the one-vs-rest curves
    n_classes = len(class_names)
    y_test_bin = label_binarize(y_test, classes=range(n_classes))

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    axes = axes.flatten()

    # A single colour cycle is shared by all panels
    colors = cycle(['blue', 'red', 'green'])

    for idx, metrics in enumerate(models_metrics):
        proba = metrics['y_pred_proba']
        if proba is None:
            continue

        ax = axes[idx]
        for class_idx, color, class_name in zip(range(n_classes), colors, class_names):
            fpr, tpr, _ = roc_curve(y_test_bin[:, class_idx], proba[:, class_idx])
            roc_auc = auc(fpr, tpr)
            ax.plot(fpr, tpr, color=color, lw=2,
                    label=f'{class_name} (AUC = {roc_auc:.2f})')

        # Chance diagonal for reference
        ax.plot([0, 1], [0, 1], 'k--', lw=2)
        ax.set_xlim([0.0, 1.0])
        ax.set_ylim([0.0, 1.05])
        ax.set_xlabel('假阳性率')
        ax.set_ylabel('真阳性率')
        ax.set_title(f'ROC 曲线 - {metrics["model_name"]}')
        ax.legend(loc='lower right')
        ax.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.savefig('models/roc_curves_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Evaluate every model in a fixed order.
# Each entry: (model, display name, file-name stem for the plot, is_cnn flag)
evaluation_plan = [
    (genesis_cnn, 'Genesis CNN', 'Genesis_CNN', True),
    (xgb_model, 'XGBoost', 'XGBoost', False),
    (rf_model, 'Random Forest', 'Random_Forest', False),
    (voting_clf, 'Voting Ensemble', 'Voting_Ensemble', False),
]

all_metrics = []
for clf, display_name, file_stem, cnn_flag in evaluation_plan:
    clf_metrics = evaluate_model(clf, X_test, y_test, display_name, is_cnn=cnn_flag)
    all_metrics.append(clf_metrics)
    plot_confusion_matrix(clf_metrics['confusion_matrix'], file_stem, label_encoder.classes_)

# Keep the per-model names that later cells refer to
metrics_cnn, metrics_xgb, metrics_rf, metrics_ensemble = all_metrics

# ROC curve comparison across all models
plot_roc_curves(all_metrics, X_test, y_test, label_encoder.classes_)

In [None]:
# Model performance comparison
# Metric key -> display label; the order fixes both table columns and plot panels
metric_labels = {
    'accuracy': '准确率',
    'precision': '精确率',
    'recall': '召回率',
    'f1': 'F1分数',
}
metrics_to_plot = list(metric_labels)
metric_names = list(metric_labels.values())

# One table row per model, metrics formatted to four decimals
comparison_rows = []
for m in all_metrics:
    row = {'模型': m['model_name']}
    for key, label in metric_labels.items():
        row[label] = f"{m[key]:.4f}"
    comparison_rows.append(row)
comparison_df = pd.DataFrame(comparison_rows)

rule = "=" * 80
print("\n" + rule)
print("📊 模型性能对比")
print(rule)
display(comparison_df)

# One bar chart per metric on a 2x2 grid (filled row by row)
fig, axes = plt.subplots(2, 2, figsize=(16, 10))
bar_colors = ['#3498db', '#e74c3c', '#2ecc71', '#f39c12']
model_names = [m['model_name'] for m in all_metrics]

for ax, (metric, name) in zip(axes.flat, metric_labels.items()):
    values = [m[metric] for m in all_metrics]

    bars = ax.bar(model_names, values, color=bar_colors)
    ax.set_ylabel(name)
    ax.set_title(f'{name}对比')
    ax.set_ylim([0, 1.1])
    ax.grid(axis='y', alpha=0.3)

    # Write each bar's value just above it
    for bar in bars:
        height = bar.get_height()
        x_center = bar.get_x() + bar.get_width()/2.
        ax.text(x_center, height, f'{height:.3f}', ha='center', va='bottom')

plt.tight_layout()
plt.savefig('models/models_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

## 7. 自动选择最佳模型

基于综合评分自动选择性能最好的模型。

In [None]:
def select_best_model(metrics_list, weights=None):
    """
    Pick the best model by a weighted sum of its evaluation metrics.

    Args:
        metrics_list: Non-empty list of per-model metric dicts, each with
            'model_name', 'accuracy', 'precision', 'recall' and 'f1'.
        weights: Mapping with a weight for each of the four metrics.
            Defaults to accuracy 0.3, precision 0.2, recall 0.2, f1 0.3.

    Returns:
        best_model_name, best_metrics

    Raises:
        ValueError: If metrics_list is empty.
    """
    # Build the default per call instead of sharing one mutable dict default
    if weights is None:
        weights = {'accuracy': 0.3, 'precision': 0.2, 'recall': 0.2, 'f1': 0.3}
    if len(metrics_list) == 0:
        raise ValueError("metrics_list 为空，无法选择最佳模型")

    metric_keys = ('accuracy', 'precision', 'recall', 'f1')
    scores = [
        sum(weights[key] * metrics[key] for key in metric_keys)
        for metrics in metrics_list
    ]

    # argmax returns the first maximum, so ties go to the earlier model
    best_idx = int(np.argmax(scores))
    best_metrics = metrics_list[best_idx]

    print("\n" + "="*80)
    print("🏆 最佳模型选择")
    print("="*80)

    for idx, (metrics, score) in enumerate(zip(metrics_list, scores)):
        marker = "👑" if idx == best_idx else "  "
        print(f"{marker} {metrics['model_name']}: 综合得分 = {score:.4f}")

    print(f"\n✨ 最佳模型: {best_metrics['model_name']}")
    print(f"   准确率: {best_metrics['accuracy']:.4f}")
    print(f"   精确率: {best_metrics['precision']:.4f}")
    print(f"   召回率: {best_metrics['recall']:.4f}")
    print(f"   F1分数: {best_metrics['f1']:.4f}")

    return best_metrics['model_name'], best_metrics

# Pick the best model from the collected evaluation results
best_model_name, best_metrics = select_best_model(all_metrics)

# Assemble a JSON-serializable summary (metrics cast to plain floats)
best_model_info = {
    'model_name': best_model_name,
    'metrics': {
        key: float(best_metrics[key])
        for key in ('accuracy', 'precision', 'recall', 'f1')
    },
    'timestamp': datetime.now().isoformat(),
    'num_classes': num_classes,
    'class_names': label_encoder.classes_.tolist(),
}

with open('models/best_model_info.json', 'w') as f:
    json.dump(best_model_info, f, indent=2)

print("\n✅ 最佳模型信息已保存！")

## 8. 训练历史可视化（CNN）

可视化 Genesis CNN 的训练过程。

In [None]:
# Plot the CNN training history: loss on the left, accuracy on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 5))

# Each panel: (train key, validation key, train label, validation label, y-axis label, title)
panels = [
    ('loss', 'val_loss', '训练损失', '验证损失', '损失', 'Genesis CNN 训练损失曲线'),
    ('accuracy', 'val_accuracy', '训练准确率', '验证准确率', '准确率', 'Genesis CNN 训练准确率曲线'),
]

for ax, (train_key, val_key, train_label, val_label, y_label, title) in zip(axes, panels):
    ax.plot(history_cnn.history[train_key], label=train_label, linewidth=2)
    ax.plot(history_cnn.history[val_key], label=val_label, linewidth=2)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('models/genesis_cnn_training_history.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. 保存模型元数据

保存所有必要的预处理器和模型配置。

In [None]:
# Persist the fitted preprocessors
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoder, 'models/label_encoder.pkl')


def _metric_summary(result):
    """Return the four headline metrics of one evaluation result as plain floats."""
    return {key: float(result[key]) for key in ('accuracy', 'precision', 'recall', 'f1')}


# Each entry: (metadata key, model type, framework, artifact file, evaluation result)
model_specs = [
    ('genesis_cnn', 'deep_learning', 'tensorflow', 'genesis_cnn_final.keras', metrics_cnn),
    ('xgboost', 'gradient_boosting', 'xgboost', 'xgboost_model.pkl', metrics_xgb),
    ('random_forest', 'ensemble', 'sklearn', 'random_forest_model.pkl', metrics_rf),
    ('voting_ensemble', 'ensemble', 'sklearn', 'voting_ensemble_model.pkl', metrics_ensemble),
]

# Metadata describing the data, every model and the preprocessing
metadata = {
    'project': 'Kepler Exoplanet 3-Class Detection',
    'version': '2.0',
    'date': datetime.now().isoformat(),
    'num_features': num_features,
    'num_classes': num_classes,
    'class_names': label_encoder.classes_.tolist(),
    'train_size': len(X_train),
    'test_size': len(X_test),
    'models': {
        key: {
            'type': model_type,
            'framework': framework,
            'file': artifact,
            'metrics': _metric_summary(result),
        }
        for key, model_type, framework, artifact, result in model_specs
    },
    'best_model': best_model_name,
    'preprocessing': {
        'scaler': 'StandardScaler',
        'label_encoder': 'LabelEncoder',
        'balance_method': 'SMOTE',
    },
}

with open('models/metadata.json', 'w') as f:
    json.dump(metadata, f, indent=2)

print("✅ 元数据已保存！")
print("\n📦 模型文件列表:")
for artifact_name in os.listdir('models'):
    print(f"  - {artifact_name}")

## 10. 打包并下载模型

将所有模型文件打包成 ZIP 文件供下载。

In [None]:
# Pack everything under models/ into a single ZIP archive
zip_filename = 'kepler_exoplanet_models_2025.zip'

with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:
    # The loop names deliberately avoid `files`: that name is the google.colab
    # module imported at the top of the notebook, and rebinding it to a list
    # here would break any later files.download(...) call.
    for root, _dirnames, filenames in os.walk('models'):
        for filename in filenames:
            file_path = os.path.join(root, filename)
            # Store paths relative to the working directory (e.g. models/<name>)
            arcname = os.path.relpath(file_path, '.')
            zipf.write(file_path, arcname)
            print(f"✅ 已添加: {arcname}")

print(f"\n📦 所有模型已打包到: {zip_filename}")
print(f"文件大小: {os.path.getsize(zip_filename) / (1024*1024):.2f} MB")

In [None]:
# Download the ZIP archive
# Re-import under a private alias: the module-level name `files` may have been
# rebound by another cell (for example as an os.walk loop variable), in which
# case `files.download` would fail with an AttributeError.
from google.colab import files as colab_files

print("📥 开始下载模型文件...")
colab_files.download(zip_filename)
print("✅ 下载完成！")

## 11. 使用示例

演示如何加载模型并进行预测。

In [None]:
# Example: load the saved artifacts and predict a few test samples
banner = "=" * 80
print("\n" + banner)
print("🔮 预测示例")
print(banner)

# Load the metadata written earlier
with open('models/metadata.json', 'r') as f:
    loaded_metadata = json.load(f)

print(f"\n最佳模型: {loaded_metadata['best_model']}")

# Load the preprocessors (the scaler is loaded but not applied here, because
# X_test is already the scaled output of preprocessing)
loaded_scaler = joblib.load('models/scaler.pkl')
loaded_label_encoder = joblib.load('models/label_encoder.pkl')

# Draw five distinct test samples at random
sample_indices = np.random.choice(len(X_test), 5, replace=False)
X_sample = X_test[sample_indices]
y_sample_true = y_test[sample_indices]

# Predict with whichever model was selected as best
if best_model_name == 'Genesis CNN':
    # The CNN needs (samples, features, 1) input and returns probabilities
    X_sample_input = X_sample.reshape(-1, num_features, 1)
    y_sample_pred_proba = genesis_cnn.predict(X_sample_input, verbose=0)
    y_sample_pred = np.argmax(y_sample_pred_proba, axis=1)
else:
    # Any name other than the two listed falls back to the voting ensemble
    classical_models = {'XGBoost': xgb_model, 'Random Forest': rf_model}
    chosen_model = classical_models.get(best_model_name, voting_clf)
    y_sample_pred = chosen_model.predict(X_sample)
    y_sample_pred_proba = chosen_model.predict_proba(X_sample)

# Show the result for each sample
class_labels = loaded_label_encoder.classes_
print("\n预测结果:")
for sample_no, (true_idx, pred_idx, probs) in enumerate(
        zip(y_sample_true, y_sample_pred, y_sample_pred_proba), start=1):
    confidence = probs[pred_idx] * 100
    match = "✓" if true_idx == pred_idx else "✗"
    distribution = dict(zip(class_labels, [f'{p*100:.2f}%' for p in probs]))

    print(f"\n样本 {sample_no}: {match}")
    print(f"  真实标签: {class_labels[true_idx]}")
    print(f"  预测标签: {class_labels[pred_idx]}")
    print(f"  置信度: {confidence:.2f}%")
    print(f"  概率分布: {distribution}")

## 总结

### 📊 项目成果

本 Notebook 成功实现了 Kepler 系外行星三分类检测系统，包括：

1. **数据处理**
   - 自动数据清洗与缺失值填充（中位数）
   - SMOTE 数据平衡
   - 标准化预处理

2. **模型训练**
   - Genesis CNN（深度学习）
   - XGBoost（梯度提升）
   - Random Forest（随机森林）
   - Voting Ensemble（集成学习）

3. **性能评估**
   - 混淆矩阵可视化
   - ROC 曲线分析
   - 综合指标对比
   - 自动最佳模型选择

4. **可部署资源**
   - 所有训练好的模型
   - 预处理器和编码器
   - 完整的元数据
   - 使用示例代码

### 🚀 下一步

- 在生产环境中部署最佳模型
- 实时预测系外行星候选
- 持续模型优化和更新

---

**开发日期**: 2025年10月  
**环境**: Google Colab  
**框架**: TensorFlow 2.15.0 + XGBoost 2.0.3  
