In [27]:
import pandas as pd
import numpy as np
import tushare as ts
import talib
import os
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, f1_score # 导入f1_score以便手动跟踪最佳分数

# 自定义特征工程类 (接收原始价格数据，计算特征，输出包含NaN的特征)
# 这个类现在主要在prepare_full_dataset之后、Pipeline外部使用，用于特定timeperiod的特征计算
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """
    接收包含原始价格数据的DataFrame，根据timeperiod计算技术指标和滞后特征。
    计算结果会包含NaN。
    """
    def __init__(self, timeperiod=14):
        self.timeperiod = timeperiod

    def fit(self, X, y=None):
        return self

    def transform(self, df):
        df_features = df.copy()

        # 技术指标计算 - 使用输入df中的原始列
        df_features['MA20'] = talib.SMA(df_features['close'], timeperiod=20)
        df_features['ATR'] = talib.ATR(df_features['high'], df_features['low'], df_features['close'], timeperiod=14)
        df_features['RSI'] = talib.RSI(df_features['close'], timeperiod=self.timeperiod) # 使用可调参数timeperiod
        df_features['CCI'] = talib.CCI(df_features['high'], df_features['low'], df_features['close'], timeperiod=20)

        # 特征工程增强
        df_features['PriceChange'] = df_features['close'].pct_change()
        df_features['VolChange'] = df_features['vol'].pct_change()

        # 滞后特征
        for lag in [1, 2, 3]:
            df_features[f'RSI_lag{lag}'] = df_features['RSI'].shift(lag)
            df_features[f'VolChange_lag{lag}'] = df_features['VolChange'].shift(lag)

        # 返回所有生成的特征列，丢弃原始列。
        # 这些列会包含由talib和shift产生的NaN
        calculated_feature_cols = [
            'MA20', 'ATR', 'RSI', 'CCI', 'PriceChange', 'VolChange',
            'RSI_lag1', 'RSI_lag2', 'RSI_lag3',
            'VolChange_lag1', 'VolChange_lag2', 'VolChange_lag3'
        ]

        existing_calculated_feature_cols = [col for col in calculated_feature_cols if col in df_features.columns]

        return df_features[existing_calculated_feature_cols]


# 数据准备函数 (获取原始数据，生成标签，并清理原始NaN及标签NaN)
# 这个函数返回的X_raw只包含清洗后对齐的原始数据，y是对应的标签
def prepare_full_dataset(code, start, end, max_feature_lookback=20, max_label_lookahead=3, max_lag=3):
    """
    下载原始股票数据，生成预测标签，并清理包含NaN的行。
    返回的X_raw包含清洗后的原始价格数据（用于外部的FeatureEngineer），
    返回的y是对应的标签。

    Args:
        code (str): 股票代码。
        start (str): 开始日期 (YYYYMMDD)。
        end (str): 结束日期 (YYYYMMDD)。
        max_feature_lookback (int): 估计的最大技术指标回顾周期 (用于确定NaN清理的初始行数)。
        max_label_lookahead (int): 估计的最大标签前瞻周期 (用于NaN清理)。
        max_lag (int): 估计的最大滞后特征周期 (用于确定NaN清理的初始行数)。

    Returns:
        tuple: (X_raw, y) Pandas DataFrame 原始数据 (已清理行) 和 Pandas Series 标签集 (已清理行)，
               如果数据获取失败或为空则返回 (None, None)。
    """
    try:
        TUSHARE_API_KEY = os.environ.get('TUSHARE_API_KEY')
        if not TUSHARE_API_KEY:
             print("Error: TUSHARE_API_KEY environment variable not set.")
             return None, None
        ts_pro_instance = ts.pro_api(TUSHARE_API_KEY)
    except Exception as e:
        print(f"Error initializing Tushare API: {e}")
        return None, None

    print(f"Fetching data for {code} from {start} to {end}...")
    try:
        df = ts_pro_instance.daily(
            ts_code=code,
            start_date=start,
            end_date=end,
            fields='ts_code,trade_date,open,high,low,close,vol'
        )
        if df.empty:
            print(f"Error: No data retrieved for {code} from {start} to {end}")
            return None, None

        df.sort_values(by='trade_date', ascending=True, inplace=True)
        df['trade_date'] = pd.to_datetime(df['trade_date'])
        df.set_index('trade_date', inplace=True)

    except Exception as e:
        print(f"Error fetching data from Tushare: {e}")
        return None, None

    print(f"Successfully fetched {len(df)} rows of raw data.")

    # --- 1. 标签生成 ---
    df['future_return'] = df['close'].pct_change(max_label_lookahead).shift(-max_label_lookahead)
    df['label'] = np.where(df['future_return'] > 0, 1, 0)

    # --- 2. 确定需要删除的行范围 (基于最大的 lookback/lookahead) ---
    # 确保数据长度足够进行特征计算和标签生成
    # 最大的 lookback = max_feature_lookback + max_lag (e.g., 20 + 3 = 23)
    # 最大的 lookahead = max_label_lookahead (e.g., 3)
    # 总共需要的数据长度 = 最大的 lookback + 最大的 lookahead
    estimated_min_length = max_feature_lookback + max_lag + max_label_lookahead
    if len(df) < estimated_min_length:
        print(f"Error: Data length ({len(df)}) is too short for the specified lookbacks and lookaheads ({estimated_min_length} required).")
        return None, None

    # 确定因为未来标签导致的 NaN 范围 (末尾几行)
    label_na_end_index = df['label'].dropna().index[-1] if not df['label'].dropna().empty else df.index[-1]
    # 确定因为技术指标 lookback 导致的 NaN 范围 (开头几行)
    # 我们可以通过计算一个使用最大 lookback 的临时指标来确定第一个非 NaN 行
    temp_check_series = talib.SMA(df['close'], timeperiod=max_feature_lookback + max_lag)
    first_valid_index_from_features = temp_check_series.first_valid_index()

    if first_valid_index_from_features is None:
         print("Error: Could not calculate temporary features to determine valid start index.")
         return None, None

    # --- 3. 数据清理 ---
    # 删除因为技术指标 lookback 导致的开头 NaN 行
    df_cleaned_start = df.loc[first_valid_index_from_features:].copy()

    # 删除因为未来标签导致的末尾 NaN 行 (在已经清理了开头的DataFrame上操作)
    df_cleaned_full = df_cleaned_start.dropna(subset=['label']).copy()


    print(f"Original shape: {df.shape}")
    print(f"Shape after dropping NaNs: {df_cleaned_full.shape}")

    # --- 4. 分割特征 (X_raw) 和 标签 (y) ---
    # y 是标签列
    y = df_cleaned_full['label']

    # X_raw 包含清洗后且与y对齐的原始价格数据，丢弃标签和未来收益列
    columns_to_drop_from_X_raw = ['label', 'future_return', 'ts_code'] # ts_code typically not used as feature
    X_raw = df_cleaned_full.drop(columns=columns_to_drop_from_X_raw)

    print(f"Prepared X_raw shape: {X_raw.shape} (cleaned raw data)")
    print(f"Prepared y shape: {y.shape}")

    return X_raw, y


# 主程序 (手动循环 Grid Search)
if __name__ == "__main__":
    # 1. 数据准备：获取清洗后的原始数据 (X_raw) 和对齐的标签 (y_train)
    X_train_raw, y_train = prepare_full_dataset(
        code='600025.SH',
        start='20200101',
        end='20241231',
        max_feature_lookback=20, # 用于NaN清理，确保覆盖MA/CCI的20周期和RSI的最大20周期
        max_label_lookahead=3,   # 用于NaN清理，匹配标签计算周期
        max_lag=3                # 用于NaN清理，匹配滞后特征数量
    )

    if X_train_raw is not None and y_train is not None:
        print("\n--- Starting Manual Grid Search across Feature Timeperiods ---")

        best_overall_score = -np.inf
        best_overall_params = {}
        all_timeperiods = [10, 14, 20] # Feature timeperiods to iterate over

        # Smaller pipeline for scaling and SVC
        pipeline_subset = Pipeline([
            ('scaler', StandardScaler()),
            ('svm', SVC(class_weight='balanced', random_state=42))
        ])

        # Parameter grid for the subset pipeline (only SVC params)
        param_grid_subset = {
            'svm__C': [0.1, 1, 10],
            'svm__kernel': ['linear', 'rbf']
        }

        # Time Series Cross-Validation (使用在特定timeperiod清理后的数据上)
        tscv = TimeSeriesSplit(n_splits=5)

        for tp in all_timeperiods:
            print(f"\nProcessing timeperiod: {tp}")

            # 2. Feature Calculation and NaN Dropping for current timeperiod
            # 在X_train_raw上计算Features，会产生新的NaN
            feature_engineer_tp = FeatureEngineer(timeperiod=tp)
            X_features_tp = feature_engineer_tp.transform(X_train_raw) # 计算Features (含NaN)

            # 将计算出的特征与原始对齐的y_train合并，以便同时清理X和y的对应行
            df_combined_tp = pd.concat([X_features_tp, y_train], axis=1) # 按索引 (trade_date) 对齐

            print(f"Shape before dropping NaNs (TP={tp}): {df_combined_tp.shape}")
            # 删除NaN行，同时清理了X_features_tp和y_train的对应行
            df_cleaned_tp = df_combined_tp.dropna()
            print(f"Shape after dropping NaNs (TP={tp}): {df_cleaned_tp.shape}")

            # 分离出为这个timeperiod清理后的 X 和 y
            X_cleaned_tp = df_cleaned_tp.drop(columns=['label'])
            y_cleaned_tp = df_cleaned_tp['label']

            if X_cleaned_tp.empty:
                print(f"Warning: No samples remaining after dropping NaNs for timeperiod {tp}. Skipping.")
                continue

            print(f"Cleaned X shape (TP={tp}): {X_cleaned_tp.shape}, y shape: {y_cleaned_tp.shape}")

            # 3. 使用清理后的数据，运行 GridSearch 优化 SVC 参数
            print(f"Running GridSearchCV for C and kernel (TP={tp})...")
            grid_search_subset = GridSearchCV(
                estimator=pipeline_subset,
                param_grid=param_grid_subset,
                cv=tscv, # 使用 TimeSeriesSplit 在清理后的数据上
                scoring='f1',
                n_jobs=-1,
                verbose=1 # 内层循环使用较少verbose
            )

            try:
                # 在为当前timeperiod清理后的数据上进行拟合
                grid_search_subset.fit(X_cleaned_tp, y_cleaned_tp)

                # 4. 评估当前timeperiod的结果并更新最佳整体结果
                print(f"Best params for TP={tp}: {grid_search_subset.best_params_}")
                print(f"Best F1 score for TP={tp}: {grid_search_subset.best_score_:.4f}")

                if grid_search_subset.best_score_ > best_overall_score:
                    best_overall_score = grid_search_subset.best_score_
                    # 复制当前SVC的最佳参数，并添加timeperiod参数
                    best_overall_params = grid_search_subset.best_params_.copy()
                    best_overall_params['features__timeperiod'] = tp # 将 timeperiod 加入最佳参数

            except Exception as e:
                 print(f"An error occurred during subset GridSearchCV fit for TP={tp}: {e}")
                 # 可以打印 X_cleaned_tp, y_cleaned_tp 的 shapes 或 head/tail 进行调试


        # 5. 输出整体最佳结果
        print("\n--- Overall Best Grid Search Results ---")
        if best_overall_score > -np.inf:
            print(f"最佳参数组合 (Best overall parameters): {best_overall_params}")
            print(f"最佳F1分数 (Best overall F1 score): {best_overall_score:.4f}")

            # 6. 使用整体最佳参数，在对应清理后的数据上进行最终评估
            print("\n--- Final Evaluation on Full Prepared Dataset (using best params) ---")

            # 重新计算最佳timeperiod下的特征，并进行清理，获取最终用于评估的数据
            best_tp = best_overall_params['features__timeperiod']
            final_feature_engineer = FeatureEngineer(timeperiod=best_tp)
            X_features_final = final_feature_engineer.transform(X_train_raw) # 在原始清洗数据上计算Features
            df_combined_final = pd.concat([X_features_final, y_train], axis=1)
            df_cleaned_final = df_combined_final.dropna() # 清理NaN，同时清理X和y
            X_cleaned_final = df_cleaned_final.drop(columns=['label'])
            y_cleaned_final = df_cleaned_final['label']

            # 创建使用整体最佳SVC参数的最终Pipeline (只包含Scaler和SVC)
            best_svm_params = {k.replace('svm__', ''): v for k, v in best_overall_params.items() if k.startswith('svm__')}
            final_pipeline_subset = Pipeline([
                ('scaler', StandardScaler()),
                ('svm', SVC(class_weight='balanced', random_state=42, **best_svm_params))
            ])

            # 在最终清理好的数据上拟合最终模型 (不需要Cross-Validation)
            final_pipeline_subset.fit(X_cleaned_final, y_cleaned_final)

            # 使用拟合好的模型进行预测
            predictions_final = final_pipeline_subset.predict(X_cleaned_final)

            # 生成分类报告
            print(classification_report(y_cleaned_final, predictions_final))

        else:
            print("Grid search failed for all timeperiods.")


    else:
        print("Data preparation failed. Exiting program.")

Fetching data for 600025.SH from 20200101 to 20241231...
Successfully fetched 1212 rows of raw data.
Original shape: (1212, 8)
Shape after dropping NaNs: (1190, 8)
Prepared X_raw shape: (1190, 5) (cleaned raw data)
Prepared y shape: (1190,)

--- Starting Manual Grid Search across Feature Timeperiods ---

Processing timeperiod: 10
Shape before dropping NaNs (TP=10): (1190, 13)
Shape after dropping NaNs (TP=10): (1171, 13)
Cleaned X shape (TP=10): (1171, 12), y shape: (1171,)
Running GridSearchCV for C and kernel (TP=10)...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params for TP=10: {'svm__C': 1, 'svm__kernel': 'linear'}
Best F1 score for TP=10: 0.5226

Processing timeperiod: 14
Shape before dropping NaNs (TP=14): (1190, 13)
Shape after dropping NaNs (TP=14): (1171, 13)
Cleaned X shape (TP=14): (1171, 12), y shape: (1171,)
Running GridSearchCV for C and kernel (TP=14)...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best params for TP=14: {'svm__C': 0