# ETF收益率预测系统演示

本笔记本将完整运行ETF预测系统，包含以下步骤：
1. 环境设置与依赖安装
2. 数据加载与预处理
3. 模型训练与参数调优
4. 回测评估
5. 可视化结果
6. 选基策略执行

## 1. 环境设置

In [None]:
import os
import sys
import shutil
from pathlib import Path

# The project root is the directory the notebook is running from
project_root = Path.cwd()
print(f"项目根目录: {project_root}")

# Make sure the input and output folders exist before later cells use them
for folder in ('data', 'outputs/results', 'outputs/models'):
    os.makedirs(folder, exist_ok=True)

# Put the project root on sys.path (added only once, even if the cell is re-run)
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

## 2. 安装依赖

运行前请确保已将 `requirements.txt` 上传到项目根目录（即当前工作目录）。

In [None]:
!pip install -r requirements.txt

## 3. 数据检查

请确保已上传以下文件到`data/`目录：
- factor_daily.csv
- return_monthly.csv

In [None]:
from utils.data_loader import DataLoader

# Initialise the data loader with the factor and return CSV paths
data_loader = DataLoader(
    factor_path='data/factor_daily.csv',
    return_path='data/return_monthly.csv'
)

# Load the data and split it into the training and prediction parts.
# Only the two loading calls are inside the try body, so an error raised while
# printing the preview is not misreported as a data-loading failure.
try:
    full_data = data_loader.load_and_merge_data()
    train_data, predict_data = data_loader.split_train_predict_data(full_data)
except Exception as e:
    print(f"数据加载失败: {str(e)}")
    print("请检查data/目录下是否有正确的CSV文件")
    # Re-raise after the hint: the later cells need train_data / predict_data,
    # so swallowing the error here would only surface as a NameError further down.
    raise
else:
    print("数据加载成功！")
    print(f"训练数据形状: {train_data.shape}")
    print(f"预测数据形状: {predict_data.shape}")
    print("\n训练数据预览:")
    display(train_data.head())

## 4. 系统配置

从 `config.py` 导入默认配置（参数网格 `PARAM_GRIDS` 与训练窗口 `TRAIN_WINDOWS`）。

In [None]:
from config import PARAM_GRIDS, TRAIN_WINDOWS

# By default every model that has a parameter grid is selected;
# trim this list to run only a subset.
MODELS_TO_RUN = list(PARAM_GRIDS)

# Report the available models and the configured training windows
print("可用模型:", list(MODELS_TO_RUN))
print("训练窗口设置:", TRAIN_WINDOWS)

## 5. 模型训练与评估

In [None]:
from core.model import (
    XGBoostModel, RandomForestModel, 
    LassoModel, LSTMModel
)
from core.trainer import ModelTrainer
from core.evaluator import ModelEvaluator
from core.selector import ETFSelector
from utils.visualize import ResultVisualizer
import pandas as pd
import numpy as np

# Number of ETFs the selector keeps; reused in the preview heading below
TOP_N = 10

# Initialise the shared components
evaluator = ModelEvaluator()
selector = ETFSelector(top_n=TOP_N)
visualizer = ResultVisualizer()

# Maps each model name to the class that implements it
MODEL_CLASSES = {
    'xgboost': XGBoostModel,
    'random_forest': RandomForestModel,
    'lasso': LassoModel,
    'lstm': LSTMModel
}

# One metrics dict per (model, window) run
all_results = []

# Feature columns are every training column except 'datetime', 'sec_code'
# and 'target'. They depend on neither the model nor the window, so they are
# computed once instead of on every loop iteration.
features = [col for col in train_data.columns
            if col not in ['datetime', 'sec_code', 'target']]

for model_name in MODELS_TO_RUN:
    # MODELS_TO_RUN is built from the PARAM_GRIDS keys, which may name a model
    # that has no class registered in MODEL_CLASSES; skip it with a message
    # instead of aborting the whole run with a bare KeyError.
    model_class = MODEL_CLASSES.get(model_name)
    if model_class is None:
        print(f"跳过未知模型: {model_name}")
        continue

    print(f"\n===== 开始处理 {model_name} 模型 =====")

    # Instantiate the model
    model = model_class()

    # Hyper-parameter tuning on train_data with this model's parameter grid
    trainer = ModelTrainer(model, PARAM_GRIDS[model_name])
    best_model, best_params = trainer.tune_hyperparameters(
        train_data[features], train_data['target']
    )
    print(f"最佳参数: {best_params}")

    # Try every configured training window
    for window in TRAIN_WINDOWS:
        print(f"\n-- 测试 {window} 个月训练窗口 --")

        # Rolling-window training; yields the final model and the test results
        final_model, test_results = trainer.train_with_rolling_window(data_loader, window)

        # Evaluation
        metrics = evaluator.calculate_metrics(
            test_results['target'],
            test_results['pred'],
            test_results
        )
        print(evaluator.generate_report(metrics, model_name, window))

        # Record the metrics of this run
        all_results.append({
            'model': model_name,
            'window': window,
            **metrics
        })

        # Charts are written to outputs/results/ as PNG files
        visualizer.plot_pred_vs_real(
            test_results, model_name, window,
            f"outputs/results/pred_vs_real_{model_name}_{window}.png"
        )
        visualizer.plot_cs_ic_series(
            test_results, model_name, window,
            f"outputs/results/cs_ic_{model_name}_{window}.png"
        )

        # Final prediction on the prediction set
        X_predict = predict_data[features]
        X_processed = model.preprocess_data(X_predict)

        final_predictions = final_model.predict(X_processed)
        # Flatten multi-dimensional output so it fits into a single column
        if isinstance(final_predictions, np.ndarray) and final_predictions.ndim > 1:
            final_predictions = final_predictions.flatten()

        # Attach the predictions to a copy so the shared predict_data frame is
        # not mutated (and does not carry a stale 'pred' column) between runs.
        predictions = predict_data.assign(pred=final_predictions)
        top_etfs = selector.select_top_etfs(predictions)
        selector.save_selection(
            top_etfs,
            f"outputs/results/top_etfs_{model_name}_{window}.csv"
        )

        print(f"\nTop {TOP_N} ETF预览:")
        display(top_etfs.head())

## 6. 结果汇总

In [None]:
# Save the metrics of every (model, window) run as CSV
results_df = pd.DataFrame(all_results)
results_df.to_csv("outputs/results/all_results.csv", index=False)

print("\n===== 所有模型评估结果汇总 =====")
display(results_df)

# Pick the best configuration of each model
best_configs = []
# With no results the frame has no 'model' column, so selection is skipped
if not results_df.empty:
    for model_name in MODELS_TO_RUN:
        model_results = results_df[results_df['model'] == model_name]
        if model_results.empty:
            continue

        # Rank by CS-IC first; fall back to R2 only when no CS-IC is available.
        # The check is done on the column itself: idxmax() already skips NaN,
        # and testing the looked-up value with np.isinf would miss the all-NaN
        # case because that value is NaN, not infinite.
        cs_ic = model_results['CS-IC']
        if cs_ic.notna().any():
            best_idx = cs_ic.idxmax()
        else:
            best_idx = model_results['R2'].idxmax()
        best_configs.append(model_results.loc[best_idx])

if best_configs:
    print("\n===== 最佳配置推荐 =====")
    best_df = pd.DataFrame(best_configs)
    display(best_df)
else:
    print("没有找到有效的配置结果")

## 7. 结果可视化

In [None]:
import matplotlib.pyplot as plt
from IPython.display import Image, display

def show_results_for_model(model_name, window):
    """Display the saved result charts for one model / training-window pair.

    Looks for the prediction-vs-actual image and the CS-IC image under
    ``outputs/results/``. Each image that exists is announced with a heading
    and displayed; an image that does not exist is skipped silently.
    """
    charts = (
        (
            f"outputs/results/pred_vs_real_{model_name}_{window}.png",
            f"\n{model_name} ({window}个月窗口) - 预测vs真实值",
        ),
        (
            f"outputs/results/cs_ic_{model_name}_{window}.png",
            f"\n{model_name} ({window}个月窗口) - IC时间序列",
        ),
    )
    for image_path, heading in charts:
        if not os.path.exists(image_path):
            continue
        print(heading)
        display(Image(filename=image_path))

# As an example, show the charts of the first evaluated (model, window) pair
if all_results:
    first_result = all_results[0]
    show_results_for_model(
        model_name=first_result['model'],
        window=first_result['window'],
    )

## 8. 输出文件说明

运行完成后，将在`outputs/`目录生成以下文件：

```
outputs/
├── results/
│   ├── all_results.csv       # 所有模型评估结果汇总
│   ├── pred_vs_real_*.png    # 预测与真实值对比图
│   ├── cs_ic_*.png           # IC时间序列图
│   └── top_etfs_*.csv        # 选中的ETF列表
└── models/                   # 保存的模型文件（可选）
```