# 滚动训练验证与最新一周打分
本Notebook实现：
1. 从2024年12月起，采用n周训练、1周验证的滑动窗口，输出所有验证集上的平均表现。
2. 用前n周训练，预测当周的分数并保存。

In [102]:
import os
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from utils import get_data_dict, get_ret_mat, compute_ic, compute_ir
from sklearn.neural_network import MLPRegressor
from datetime import timedelta

## 配置参数

In [103]:
FACTOR_DIR = 'data/factor'
START_DATE = '2025-02-01'
END_DATE = '2025-08-09'

TRAIN_WEEKS = 6  # 训练集周数
IR_THRESHOLD = 0.15
PCA_VARIANCE_THRESHOLD = 0.99

MLP_HIDDEN_LAYER_SIZES = (64, 32, 16, 8)
MLP_ALPHA = 1e-5
MLP_LEARNING_RATE_INIT = 1e-2
EARLY_STOPPING = True
MLP_MAX_ITER = 500
BATCH_SIZE = 'auto'
MLP_ACTIVATION = 'relu'

## 获取数据

In [104]:
# 收益率矩阵
data_dict = get_data_dict(start=START_DATE)
ret_mat = get_ret_mat(data_dict)

In [105]:
# 合并所有因子 & 提取因子IC，只选入无缺失值的因子，并输出有缺失值的因子名称
stacked_list = []
factor_ic_dict = {}
factors_with_nan = []
for fname in os.listdir(FACTOR_DIR):
    df = pd.read_csv(os.path.join(FACTOR_DIR, fname), index_col=0, parse_dates=True)
    factor_name = fname[:-4]
    df.columns = df.columns.astype(str)
    s = df.iloc[:, :-1].stack()
    s.name = factor_name
    # 检查是否有缺失值
    if s.isnull().any():
        factors_with_nan.append(factor_name)
        continue
    stacked_list.append(s)
    factor_ic_dict[factor_name] = df.iloc[:, -1]  # 只存最后一列IC
if factors_with_nan:
    print('以下因子存在缺失值，未被选入:')
    for f in factors_with_nan:
        print(f)
all_factors_concat = pd.concat(stacked_list, axis=1).sort_index()

## 滚动训练验证

In [106]:
results = []
dim_input = []
date_range = pd.date_range(START_DATE, END_DATE, freq='W-MON')
# 用字典统计每个factor被用到的次数
factor_use_count = {factor: 0 for factor in factor_ic_dict.keys()}
for i in range(len(date_range) - TRAIN_WEEKS):
    train_start = date_range[i]
    train_end = date_range[i + TRAIN_WEEKS - 1] + timedelta(days=4)  
    valid_start = date_range[i + TRAIN_WEEKS]
    valid_end = valid_start + timedelta(days=4)  # 验证周周五

    # 针对每个窗口，动态筛选因子
    selected_factors = []
    for factor_name, ic_series in factor_ic_dict.items():
        ic_window = ic_series.loc[train_start:train_end]
        ir = compute_ir(ic_window)
        if abs(ir) > IR_THRESHOLD:
            selected_factors.append(factor_name)

    # 统计本次被用到的factor
    for f in selected_factors:
        factor_use_count[f] += 1

    # 根据selected_factors动态筛选合并后的因子列
    factors_concat = all_factors_concat[selected_factors]

    # 训练和验证
    train_data = factors_concat.loc[train_start:train_end]
    # 输出含缺失值的行index
    if train_data.isnull().any(axis=1).any():
        print("train_data 含缺失值的行index:")
        print(train_data[train_data.isnull().any(axis=1)].index.tolist())
    valid_data = factors_concat.loc[valid_start:valid_end]
    scaler = StandardScaler()
    train_data_scaled = scaler.fit_transform(train_data)
    valid_data_scaled = scaler.transform(valid_data)
    pre_pca = PCA()
    pre_pca.fit(train_data_scaled)
    cum_var = np.cumsum(pre_pca.explained_variance_ratio_)
    n_components = np.searchsorted(cum_var, PCA_VARIANCE_THRESHOLD) + 1
    dim_input.append(n_components)
    pca = PCA(n_components=n_components)
    X_train = pca.fit_transform(train_data_scaled)
    X_valid = pca.transform(valid_data_scaled)
    rescaler = StandardScaler()
    X_train = rescaler.fit_transform(X_train)
    X_valid = rescaler.transform(X_valid)
    y_train = ret_mat.stack().reindex(train_data.index).values * 1e4
    mlp = MLPRegressor(
        hidden_layer_sizes=MLP_HIDDEN_LAYER_SIZES,
        activation=MLP_ACTIVATION,
        alpha=MLP_ALPHA,
        learning_rate_init=MLP_LEARNING_RATE_INIT,
        max_iter=MLP_MAX_ITER,
        batch_size=BATCH_SIZE,
        early_stopping=EARLY_STOPPING,
        random_state=42
    )
    mlp.fit(X_train, y_train)
    valid_pred = mlp.predict(X_valid)
    valid_pred_mat = pd.Series(valid_pred, index=valid_data.index).unstack(level=1)
    results.append(valid_pred_mat)

# 输出所有factor被用到的次数，按次数倒序
sorted_factors = sorted(factor_use_count.items(), key=lambda x: x[1], reverse=True)
print('各factor被用到的次数: ')
for factor, count in sorted_factors:
    print(f'{factor}: {count}')

print(f'平均输入维数: {sum(dim_input) / len(dim_input)}')

各factor被用到的次数: 
kurtosis_120: 21
amp_vol_15: 21
amp_vol_10: 20
skewness_60: 20
amp_vol_3: 20
amp_vol_20: 20
amp_vol_5: 20
obv_40: 20
obv_amount_60: 20
amp_turn_120: 19
obv_amount_40: 19
amp_vol_40: 19
turnover_rel_60: 19
turnover_40: 19
skewness_120: 19
volume_ratio_60: 19
turnover_20: 18
close_open_n_1: 18
volatility_60: 18
turnover_120: 18
ma_spread_20_10: 18
momentum_40: 18
obv_amount_10: 18
close_ma_120: 18
turnover_5: 18
amp_vol_1: 18
volatility_40: 18
skewness_40: 18
turnover_3: 18
turnover_60: 18
turnover_15: 18
turnover_10: 18
volatility_20: 17
kurtosis_60: 17
ma_spread_60_10: 17
ma_spread_120_30: 17
amp_turn_60: 17
volume_ratio_120: 17
boll_40: 17
atr_42: 17
amp_vol_60: 17
skewness_5: 17
turnover_rel_120: 17
kurtosis_40: 17
obv_10: 17
momentum_15: 17
obv_amount_120: 17
ma_spread_80_20: 17
amplitude_15: 16
amp_turn_40: 16
max_drawdown_40: 16
close_ma_3: 16
momentum_120: 16
kurtosis_5: 16
amplitude_20: 16
boll_15: 16
obv_60: 16
amplitude_3: 16
close_ma_60: 16
momentum_20: 16
amp

## 验证集表现

In [107]:
score_mat = pd.concat(results).sort_index()
score_mat.to_csv('data/score.csv')

In [108]:
# 计算整个验证集、近三个月、近一个月的IC均值和IR
all_ic = compute_ic(score_mat, ret_mat)
overall_ir = compute_ir(all_ic)
print('【全部验证集】IC均值:', all_ic.mean())
print('【全部验证集】IR:', overall_ir)

# 近三个月
last_3month_idx = all_ic.index >= (all_ic.index[-1] - pd.Timedelta(days=90))
ic_last_3month = all_ic.loc[last_3month_idx]
ir_last_3month = compute_ir(ic_last_3month)
print('【近三个月】IC均值:', ic_last_3month.mean())
print('【近三个月】IR:', ir_last_3month)

# 近一个月
last_month_idx = all_ic.index >= (all_ic.index[-1] - pd.Timedelta(days=30))
ic_last_month = all_ic.loc[last_month_idx]
ir_last_month = compute_ir(ic_last_month)
print('【近一个月】IC均值:', ic_last_month.mean())
print('【近一个月】IR:', ir_last_month)

【全部验证集】IC均值: 0.17117132617228048
【全部验证集】IR: 0.7003907962100671
【近三个月】IC均值: 0.2010572601918627
【近三个月】IR: 0.772770188662098
【近一个月】IC均值: 0.29679149158764867
【近一个月】IR: 2.3210824841284663
