In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# 步骤1：加载数据
def load_data(file_path):
    """加载parquet格式的股票数据"""
    try:
        df = pd.read_parquet(file_path)
        print(f"数据加载成功，共 {df.shape[0]} 行，{df.shape[1]} 列")
        return df
    except Exception as e:
        print(f"数据加载失败: {e}")
        return None

# 步骤2：查看数据基本信息
def inspect_data(df):
    """查看数据基本信息"""
    if df is None:
        return
    
    print("\n数据前5行:")
    print(df.head())
    
    print("\n数据字段信息:")
    print(df.info())
    
    print("\n基本统计信息:")
    print(df[['open', 'close', 'high', 'low', 'volume', 'money']].describe())
    
    # 检查日期范围
    if 'date' in df.columns:
        df['date'] = pd.to_datetime(df['date'])
        print(f"\n日期范围: {df['date'].min()} 至 {df['date'].max()}")
    
    # 检查股票数量
    if 'stock_code' in df.columns:
        print(f"股票数量: {df['stock_code'].nunique()}")

# 步骤3：数据预处理
def preprocess_data(df):
    """数据预处理：处理停牌数据、排序、检查重复值等"""
    if df is None:
        return None
    
    # 复制数据避免修改原数据
    processed_df = df.copy()
    
    # 确保日期格式正确
    if 'date' in processed_df.columns:
        processed_df['date'] = pd.to_datetime(processed_df['date'])
    
    # 按股票代码和日期排序
    if 'stock_code' in processed_df.columns and 'date' in processed_df.columns:
        processed_df = processed_df.sort_values(by=['stock_code', 'date'])
    
    # 处理停牌数据 - 过滤掉停牌的股票记录
    if 'paused' in processed_df.columns:
        print(f"停牌记录数量: {processed_df[processed_df['paused'] == 1].shape[0]}")
        processed_df = processed_df[processed_df['paused'] == 0]
        print(f"过滤后剩余记录数量: {processed_df.shape[0]}")
    
    # 检查并处理重复值
    if 'stock_code' in processed_df.columns and 'date' in processed_df.columns:
        duplicates = processed_df.duplicated(subset=['stock_code', 'date'], keep=False)
        if duplicates.any():
            print(f"重复记录数量: {duplicates.sum()}")
            processed_df = processed_df.drop_duplicates(subset=['stock_code', 'date'], keep='first')
    
    # 检查必要字段是否存在
    required_fields = ['date', 'stock_code', 'open', 'close', 'high', 'low', 'volume', 'money']
    missing_fields = [field for field in required_fields if field not in processed_df.columns]
    if missing_fields:
        print(f"警告: 缺少必要字段 {missing_fields}")
    
    return processed_df

# 主函数
if __name__ == "__main__":
    # 数据文件路径
    file_path = r"d:\workspace\xiaoyao\data\wide_table.parquet"
    
    # 加载数据
    stock_df = load_data(file_path)
    
    # 查看数据基本信息
    inspect_data(stock_df)
    
    # 预处理数据
    processed_df = preprocess_data(stock_df)
    
    # 保存预处理后的数据，供后续步骤使用
    if processed_df is not None:
        processed_df.to_parquet(r"d:\workspace\xiaoyao\data\wide_table_clean.parquet")
        print("\n预处理完成，数据已保存为 processed_stock_data.parquet")


数据加载成功，共 908780 行，62 列

数据前5行:
         date   stock_code     open    close      low     high     volume  \
0  2025-01-03  000001.XSHE  1589.82  1581.48  1578.70  1603.72   830884.0   
1  2025-01-03  000002.XSHE  1210.84  1182.13  1175.38  1212.53   666119.0   
2  2025-01-03  000004.XSHE   105.73    95.31    95.01   107.22  1664380.0   
3  2025-01-03  000006.XSHE   376.97   356.83   354.25   384.71   759893.0   
4  2025-01-03  000007.XSHE    84.26    79.56    79.56    84.63   658258.0   

          money      factor  high_limit  ...  ind_jq_l2_industry_name  \
0  1.320521e+09  138.970157     1746.85  ...                    综合性银行   
1  7.951548e+08  168.875961     1320.61  ...                    房地产开发   
2  1.649274e+08    7.446000      116.16  ...                     通用软件   
3  2.755300e+08   51.639522      412.08  ...                    房地产开发   
4  5.362218e+07   12.055000       93.06  ...                    房地产管理   

   ind_sw_l1_industry_code  ind_sw_l1_industry_name  ind_sw_l2_indu

In [2]:
import pandas as pd
import numpy as np
import talib
import warnings
warnings.filterwarnings('ignore')

def load_and_preprocess_data(file_path):
    """加载并预处理数据，确保数据质量"""
    try:
        # 加载Parquet数据
        df = pd.read_parquet(file_path)
        print(f"数据加载成功：{df.shape[0]} 行 × {df.shape[1]} 列")
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        return None
    
    # 数据预处理
    df['date'] = pd.to_datetime(df['date'])  # 确保日期格式正确
    df = df.sort_values(by=['stock_code', 'date'])  # 按股票和日期排序
    
    # 处理重复记录
    duplicates = df.duplicated(subset=['stock_code', 'date'], keep=False)
    if duplicates.any():
        print(f"处理重复记录: {duplicates.sum()} 条")
        df = df.drop_duplicates(subset=['stock_code', 'date'], keep='first')
    
    # 过滤停牌数据（如果存在停牌标记）
    if 'paused' in df.columns:
        paused_count = df[df['paused'] == 1].shape[0]
        if paused_count > 0:
            print(f"过滤停牌记录: {paused_count} 条")
            df = df[df['paused'] == 0]
    
    return df

def calculate_indicators(df):
    """计算所有核心指标：技术指标、量能指标、趋势指标等"""
    df_indicators = df.copy()
    
    def compute_group_indicators(group):
        """对单只股票计算指标"""
        group = group.sort_values('date').reset_index(drop=True)
        close = group['close'].values
        high = group['high'].values
        low = group['low'].values
        open_ = group['open'].values
        volume = group['volume'].values
        auc_volume = group['auc_volume'].values
        auc_money = group['auc_money'].values
        money = group['money'].values
        
        # 1. 技术指标
        # MACD金叉且接近零轴
        macd_line, macd_signal, _ = talib.MACD(close, 12, 26, 9)
        group['macd_ok'] = (macd_line > macd_signal) & (macd_line > -0.5)
        
        # 价格接近布林带上轨
        upper_band, _, _ = talib.BBANDS(close, 20, 2, 2, matype=talib.MA_Type.SMA)
        group['bb_ok'] = close > (upper_band * 0.95)
        
        # ATR波动控制
        atr = talib.ATR(high, low, close, 14)
        group['atr_ok'] = atr / close < 0.07
        
        # 2. 量能与竞价指标
        # 5日平均成交量
        group['volume_ma5'] = group['volume'].rolling(window=5, min_periods=1).mean()
        
        # 当日量比（1.2-15倍）
        group['volume_ratio'] = volume / group['volume_ma5']
        group['volume_ratio_ok'] = (group['volume_ratio'] > 1.2) & (group['volume_ratio'] < 15)
        
        # 竞价量比（>3倍）
        group['auc_volume_ratio'] = auc_volume / group['volume_ma5']
        group['auc_volume_ratio_ok'] = group['auc_volume_ratio'] > 3
        
        # 竞价成交额占比（>15%）
        group['auc_money_ratio_ok'] = auc_money / money > 0.15
        
        # 买盘力度比（>1.1倍）
        group['total_bid'] = group[['auc_b1_v', 'auc_b2_v', 'auc_b3_v', 'auc_b4_v', 'auc_b5_v']].sum(axis=1)
        group['total_ask'] = group[['auc_a1_v', 'auc_a2_v', 'auc_a3_v', 'auc_a4_v', 'auc_a5_v']].sum(axis=1)
        group['bid_ask_ratio'] = np.where(group['total_ask'] > 0, group['total_bid'] / group['total_ask'], 0)
        group['bid_ask_ratio_ok'] = group['bid_ask_ratio'] > 1.1
        
        # 3. 趋势强度指标
        # 20日均线之上
        group['ma20'] = group['close'].rolling(window=20, min_periods=1).mean()
        group['ma_ok'] = group['close'] > group['ma20']
        
        # 4. 估值安全指标（行业PE分位）
        def calc_pe_quantile(subgroup):
            subgroup['pe_quantile'] = subgroup['val_pe_ratio'].rank(pct=True)
            return subgroup
        group = group.groupby('ind_sw_l1_industry_name', group_keys=False).apply(calc_pe_quantile)
        group['pe_quantile_ok'] = group['pe_quantile'] < 0.8
        
        # 5. 流动性指标（单位修正：流通市值=亿元，换手率=%）
        group['circ_cap_ok'] = (group['val_circulating_market_cap'] > 50) & (group['val_circulating_market_cap'] < 500)
        group['turnover_ok'] = (group['val_turnover_ratio'] > 1) & (group['val_turnover_ratio'] < 15)
        
        return group
    
    # 按股票代码分组计算指标
    df_indicators = df_indicators.groupby('stock_code', group_keys=False).apply(compute_group_indicators)
    
    # 过滤关键指标缺失的记录
    valid_columns = ['macd_ok', 'bb_ok', 'atr_ok', 'volume_ratio_ok', 'ma_ok', 'circ_cap_ok']
    df_valid = df_indicators.dropna(subset=valid_columns).copy()
    
    # 定义信号层级
    df_valid['basic_signal'] = (
        df_valid['macd_ok'] & df_valid['bb_ok'] & 
        df_valid['atr_ok'] & df_valid['volume_ratio_ok']
    )
    
    df_valid['enhanced_signal'] = (
        df_valid['basic_signal'] & df_valid['ma_ok'] & 
        df_valid['circ_cap_ok'] & df_valid['turnover_ok']
    )
    
    return df_valid

def filter_top_industries(df):
    """筛选每日热门行业并生成最终选股信号"""
    df_final = df.copy()
    
    # 1. 筛选竞价高热度标的（竞价量比>3）
    high_bid_stocks = df_final[df_final['auc_volume_ratio'] > 3].copy()
    print(f"竞价高热度标的总数: {len(high_bid_stocks)} 条")
    
    # 2. 按日期统计各行业热度（标的数量）
    industry_daily_counts = high_bid_stocks.groupby(
        ['date', 'ind_sw_l1_industry_name']
    ).size().reset_index(name='stock_count')
    
    # 3. 生成每日前5大热门行业字典
    def get_top5_industries(daily_data):
        """对单日数据返回前5行业列表"""
        return daily_data.nlargest(5, 'stock_count')['ind_sw_l1_industry_name'].tolist()
    
    # 构建 {日期: [前5行业列表]} 字典
    top5_industries_dict = industry_daily_counts.groupby('date').apply(
        get_top5_industries
    ).to_dict()
    
    # 4. 判断个股是否属于当日热门行业
    def is_in_top5_industry(row):
        """检查个股所属行业是否在当日前5热门行业中"""
        # 获取当日热门行业列表（默认空列表）
        daily_top5 = top5_industries_dict.get(row['date'], [])
        # 判断行业是否在列表中
        return row['ind_sw_l1_industry_name'] in daily_top5
    
    # 应用判断逻辑
    df_final['is_top5_industry'] = df_final.apply(is_in_top5_industry, axis=1)
    
    # 5. 生成最终选股信号
    df_final['final_signal'] = df_final['enhanced_signal'] & df_final['is_top5_industry']
    
    # 6. 信号统计
    print("\n=== 最终选股信号统计 ===")
    total_signals = df_final['final_signal'].sum()
    print(f"最终选股信号总数: {total_signals} 条")
    
    # 按年份统计
    df_final['year'] = df_final['date'].dt.year
    yearly_signals = df_final.groupby('year')['final_signal'].sum()
    print("\n各年份最终信号数量:")
    print(yearly_signals)
    
    # 验证年度交易机会是否达标
    min_yearly = yearly_signals.min() if not yearly_signals.empty else 0
    print(f"\n年度最小交易机会: {min_yearly} 次")
    if min_yearly >= 400:
        print("✅ 满足年度交易机会目标（≥400次）")
    else:
        print("❌ 未满足目标，建议调整：")
        print("1. 竞价量比阈值放宽至2.5")
        print("2. 热门行业数量增加至前8")
    
    # 保留核心字段
    core_fields = [
        'date', 'stock_code', 'ind_sw_l1_industry_name', 'open', 'close',
        'macd_ok', 'bb_ok', 'volume_ratio', 'auc_volume_ratio',
        'ma20', 'val_circulating_market_cap', 'val_turnover_ratio',
        'enhanced_signal', 'is_top5_industry', 'final_signal'
    ]
    return df_final[core_fields].copy()

def main():
    """主函数：串联全流程"""
    # 文件路径配置
    input_path = r"d:\workspace\xiaoyao\data\wide_table.parquet"
    output_path = r"d:\workspace\xiaoyao\data\final_strategy_signals.parquet"
    
    # 步骤1：加载和预处理数据
    df_raw = load_and_preprocess_data(input_path)
    if df_raw is None:
        return
    
    # 步骤2：计算核心指标
    df_with_indicators = calculate_indicators(df_raw)
    print(f"\n指标计算完成：{df_with_indicators.shape[0]} 行 × {df_with_indicators.shape[1]} 列")
    
    # 步骤3：筛选热门行业并生成最终信号
    df_strategy = filter_top_industries(df_with_indicators)
    
    # 步骤4：保存结果
    df_strategy.to_parquet(output_path)
    print(f"\n最终策略数据已保存至：{output_path}")
    print(f"最终数据规模：{df_strategy.shape[0]} 行 × {df_strategy.shape[1]} 列")

if __name__ == "__main__":
    main()


数据加载成功：908780 行 × 62 列

指标计算完成：908780 行 × 83 列
竞价高热度标的总数: 1621 条

=== 最终选股信号统计 ===
最终选股信号总数: 7539 条

各年份最终信号数量:
year
2025    7539
Name: final_signal, dtype: int64

年度最小交易机会: 7539 次
✅ 满足年度交易机会目标（≥400次）

最终策略数据已保存至：d:\workspace\xiaoyao\data\final_strategy_signals.parquet
最终数据规模：908780 行 × 15 列


In [3]:
import pandas as pd
import numpy as np
import talib
import warnings
warnings.filterwarnings('ignore')

# --------------------------
# 1. 数据加载与预处理
# --------------------------
def load_and_preprocess_data(file_path):
    """加载并预处理数据，确保数据质量"""
    try:
        # 加载Parquet数据
        df = pd.read_parquet(file_path)
        print(f"数据加载成功：{df.shape[0]} 行 × {df.shape[1]} 列")
    except Exception as e:
        print(f"数据加载失败: {str(e)}")
        return None
    
    # 数据预处理
    df['date'] = pd.to_datetime(df['date'])  # 确保日期格式正确
    df = df.sort_values(by=['stock_code', 'date'])  # 按股票和日期排序
    
    # 处理重复记录
    duplicates = df.duplicated(subset=['stock_code', 'date'], keep=False)
    if duplicates.any():
        print(f"处理重复记录: {duplicates.sum()} 条")
        df = df.drop_duplicates(subset=['stock_code', 'date'], keep='first')
    
    # 过滤停牌数据
    if 'paused' in df.columns:
        paused_count = df[df['paused'] == 1].shape[0]
        if paused_count > 0:
            print(f"过滤停牌记录: {paused_count} 条")
            df = df[df['paused'] == 0]
    
    # 保存预处理结果
    preprocessed_path = r"d:\workspace\xiaoyao\data\processed_stock_data.parquet"
    df.to_parquet(preprocessed_path)
    print(f"预处理完成，数据已保存至: {preprocessed_path}")
    
    return df

# --------------------------
# 2. 核心指标计算
# --------------------------
def calculate_indicators(df):
    """计算所有核心指标：技术指标、量能指标、趋势指标等"""
    df_indicators = df.copy()
    
    def compute_group_indicators(group):
        """对单只股票计算指标"""
        group = group.sort_values('date').reset_index(drop=True)
        close = group['close'].values
        high = group['high'].values
        low = group['low'].values
        open_ = group['open'].values
        volume = group['volume'].values
        auc_volume = group['auc_volume'].values
        auc_money = group['auc_money'].values
        money = group['money'].values
        
        # 1. 技术指标
        # MACD金叉且接近零轴
        macd_line, macd_signal, _ = talib.MACD(close, 12, 26, 9)
        group['macd_ok'] = (macd_line > macd_signal) & (macd_line > -0.5)
        
        # 价格接近布林带上轨
        upper_band, _, _ = talib.BBANDS(close, 20, 2, 2, matype=talib.MA_Type.SMA)
        group['bb_ok'] = close > (upper_band * 0.95)
        
        # ATR波动控制
        atr = talib.ATR(high, low, close, 14)
        group['atr_ok'] = atr / close < 0.07
        
        # 2. 量能与竞价指标
        # 5日平均成交量
        group['volume_ma5'] = group['volume'].rolling(window=5, min_periods=1).mean()
        
        # 当日量比（1.2-15倍）
        group['volume_ratio'] = volume / group['volume_ma5']
        group['volume_ratio_ok'] = (group['volume_ratio'] > 1.2) & (group['volume_ratio'] < 15)
        
        # 竞价量比（>3倍）
        group['auc_volume_ratio'] = auc_volume / group['volume_ma5']
        group['auc_volume_ratio_ok'] = group['auc_volume_ratio'] > 3
        
        # 竞价成交额占比（>15%）
        group['auc_money_ratio_ok'] = auc_money / money > 0.15
        
        # 买盘力度比（>1.1倍）
        group['total_bid'] = group[['auc_b1_v', 'auc_b2_v', 'auc_b3_v', 'auc_b4_v', 'auc_b5_v']].sum(axis=1)
        group['total_ask'] = group[['auc_a1_v', 'auc_a2_v', 'auc_a3_v', 'auc_a4_v', 'auc_a5_v']].sum(axis=1)
        group['bid_ask_ratio'] = np.where(group['total_ask'] > 0, group['total_bid'] / group['total_ask'], 0)
        group['bid_ask_ratio_ok'] = group['bid_ask_ratio'] > 1.1
        
        # 3. 趋势强度指标
        # 20日均线之上
        group['ma20'] = group['close'].rolling(window=20, min_periods=1).mean()
        group['ma_ok'] = group['close'] > group['ma20']
        
        # 4. 估值安全指标（行业PE分位）
        def calc_pe_quantile(subgroup):
            subgroup['pe_quantile'] = subgroup['val_pe_ratio'].rank(pct=True)
            return subgroup
        group = group.groupby('ind_sw_l1_industry_name', group_keys=False).apply(calc_pe_quantile)
        group['pe_quantile_ok'] = group['pe_quantile'] < 0.8
        
        # 5. 流动性指标（单位修正：流通市值=亿元，换手率=%）
        group['circ_cap_ok'] = (group['val_circulating_market_cap'] > 50) & (group['val_circulating_market_cap'] < 500)
        group['turnover_ok'] = (group['val_turnover_ratio'] > 1) & (group['val_turnover_ratio'] < 15)
        
        return group
    
    # 按股票代码分组计算指标
    df_indicators = df_indicators.groupby('stock_code', group_keys=False).apply(compute_group_indicators)
    
    # 过滤关键指标缺失的记录
    valid_columns = ['macd_ok', 'bb_ok', 'atr_ok', 'volume_ratio_ok', 'ma_ok', 'circ_cap_ok']
    df_valid = df_indicators.dropna(subset=valid_columns).copy()
    
    # 定义信号层级
    df_valid['basic_signal'] = (
        df_valid['macd_ok'] & df_valid['bb_ok'] & 
        df_valid['atr_ok'] & df_valid['volume_ratio_ok']
    )
    
    df_valid['enhanced_signal'] = (
        df_valid['basic_signal'] & df_valid['ma_ok'] & 
        df_valid['circ_cap_ok'] & df_valid['turnover_ok']
    )
    
    return df_valid

# --------------------------
# 3. 行业筛选与最终信号生成
# --------------------------
def filter_top_industries(df):
    """筛选每日热门行业并生成最终选股信号"""
    df_final = df.copy()
    
    # 1. 筛选竞价高热度标的（竞价量比>3）
    high_bid_stocks = df_final[df_final['auc_volume_ratio'] > 3].copy()
    print(f"竞价高热度标的总数: {len(high_bid_stocks)} 条")
    
    # 2. 按日期统计各行业热度（标的数量）
    industry_daily_counts = high_bid_stocks.groupby(
        ['date', 'ind_sw_l1_industry_name']
    ).size().reset_index(name='stock_count')
    
    # 3. 生成每日前5大热门行业字典
    def get_top5_industries(daily_data):
        """对单日数据返回前5行业列表"""
        return daily_data.nlargest(5, 'stock_count')['ind_sw_l1_industry_name'].tolist()
    
    # 构建 {日期: [前5行业列表]} 字典
    top5_industries_dict = industry_daily_counts.groupby('date').apply(
        get_top5_industries
    ).to_dict()
    
    # 4. 判断个股是否属于当日热门行业
    def is_in_top5_industry(row):
        """检查个股所属行业是否在当日前5热门行业中"""
        # 获取当日热门行业列表（默认空列表）
        daily_top5 = top5_industries_dict.get(row['date'], [])
        # 判断行业是否在列表中
        return row['ind_sw_l1_industry_name'] in daily_top5
    
    # 应用判断逻辑
    df_final['is_top5_industry'] = df_final.apply(is_in_top5_industry, axis=1)
    
    # 5. 生成最终选股信号
    df_final['final_signal'] = df_final['enhanced_signal'] & df_final['is_top5_industry']
    
    # 6. 信号统计
    print("\n=== 最终选股信号统计 ===")
    total_signals = df_final['final_signal'].sum()
    print(f"最终选股信号总数: {total_signals} 条")
    
    # 按年份统计
    df_final['year'] = df_final['date'].dt.year
    yearly_signals = df_final.groupby('year')['final_signal'].sum()
    print("\n各年份最终信号数量:")
    print(yearly_signals)
    
    # 验证年度交易机会是否达标
    min_yearly = yearly_signals.min() if not yearly_signals.empty else 0
    print(f"\n年度最小交易机会: {min_yearly} 次")
    if min_yearly >= 400:
        print("✅ 满足年度交易机会目标（≥400次）")
    else:
        print("❌ 未满足目标，建议调整：")
        print("1. 竞价量比阈值放宽至2.5")
        print("2. 热门行业数量增加至前8")
    
    # 保留核心字段
    core_fields = [
        'date', 'stock_code', 'ind_sw_l1_industry_name', 'open', 'close',
        'macd_ok', 'bb_ok', 'volume_ratio', 'auc_volume_ratio',
        'ma20', 'val_circulating_market_cap', 'val_turnover_ratio',
        'enhanced_signal', 'is_top5_industry', 'final_signal'
    ]
    return df_final[core_fields].copy()

# --------------------------
# 4. 策略回测
# --------------------------
def backtest_strategy(signal_df):
    """
    策略回测核心逻辑：
    1. 提取所有交易信号（final_signal=True）
    2. 匹配后续N日收盘价（计算止盈止损）
    3. 统计胜率、盈亏比等关键指标
    """
    print("\n" + "="*50)
    print("开始策略回测...")
    print("="*50)
    
    # 步骤1：提取交易信号（只保留有买入信号的记录）
    trade_signals = signal_df[signal_df['final_signal'] == True].copy()
    print(f"总交易信号数量: {len(trade_signals)} 条")
    
    # 若信号过少，直接返回
    if len(trade_signals) < 10:
        print("❌ 交易信号过少，无法进行有效回测")
        return None
    
    # 步骤2：为每个交易信号匹配后续5日收盘价（用于计算出场价格）
    def get_future_prices(group):
        """对单只股票，计算每个信号日期后续1-5日的收盘价"""
        # 按日期排序
        group = group.sort_values('date').reset_index(drop=True)
        # 获取该股票的完整日期-收盘价映射
        stock_full_data = signal_df[signal_df['stock_code'] == group['stock_code'].iloc[0]].sort_values('date')
        price_dict = dict(zip(stock_full_data['date'], stock_full_data['close']))
        date_list = sorted(price_dict.keys())
        
        # 对每个信号日期，找后续5日的收盘价
        def get_exit_price(row):
            signal_date = row['date']
            # 找到信号日期在日期列表中的索引
            try:
                signal_idx = date_list.index(signal_date)
            except ValueError:
                return pd.Series([np.nan]*5)  # 日期未找到，返回NaN
            
            # 计算后续1-5日的日期（若存在）
            exit_prices = []
            for day_offset in [1,2,3,4,5]:
                if signal_idx + day_offset < len(date_list):
                    exit_date = date_list[signal_idx + day_offset]
                    exit_prices.append(price_dict[exit_date])
                else:
                    exit_prices.append(np.nan)  # 超出数据范围，返回NaN
            
            return pd.Series(exit_prices, index=['exit_price_1d', 'exit_price_2d', 'exit_price_3d', 'exit_price_4d', 'exit_price_5d'])
        
        # 应用函数获取出场价格
        exit_prices = group.apply(get_exit_price, axis=1)
        group = pd.concat([group, exit_prices], axis=1)
        return group
    
    # 按股票代码分组，计算出场价格
    trade_signals_with_exit = trade_signals.groupby('stock_code', group_keys=False).apply(get_future_prices)
    
    # 过滤掉没有出场价格的无效信号
    valid_trades = trade_signals_with_exit.dropna(subset=['exit_price_1d']).copy()
    print(f"有效交易信号数量: {len(valid_trades)} 条（过滤无效信号 {len(trade_signals)-len(valid_trades)} 条）")
    
    if len(valid_trades) < 10:
        print("❌ 有效交易信号过少，无法进行有效回测")
        return None
    
    # 步骤3：计算每笔交易的盈亏（按止盈止损规则）
    def calculate_trade_return(row):
        """
        计算单笔交易的收益：
        - 入场价：信号当日开盘价（open）
        - 止盈条件：收益≥6%，触发后立即出场
        - 止损条件：收益≤-3%，触发后立即出场
        - 最大持仓：5天，未触发止盈止损则第5天出场
        """
        entry_price = row['close']
        exit_prices = [row['exit_price_1d'], row['exit_price_2d'], row['exit_price_3d'], row['exit_price_4d'], row['exit_price_5d']]
        
        trade_return = 0.0
        hold_days = 0
        exit_reason = ""
        
        for i, exit_price in enumerate(exit_prices):
            daily_return = (exit_price - entry_price) / entry_price
            hold_days = i + 1
            
            # 检查止盈止损条件
            if daily_return >= 0.06:
                trade_return = daily_return
                exit_reason = "止盈（≥6%）"
                break
            elif daily_return <= -0.03:
                trade_return = daily_return
                exit_reason = "止损（≤-3%）"
                break
            # 最后一天（第5天）强制出场
            elif i == 4:
                trade_return = daily_return
                exit_reason = "到期出场（5天）"
        
        return pd.Series([trade_return, hold_days, exit_reason], index=['trade_return', 'hold_days', 'exit_reason'])
    
    # 应用函数计算每笔交易收益
    trade_results = valid_trades.apply(calculate_trade_return, axis=1)
    valid_trades = pd.concat([valid_trades, trade_results], axis=1)
    
    # 步骤4：统计回测指标
    print("\n" + "="*50)
    print("=== 策略回测结果 ===")
    print("="*50)
    
    # 核心指标计算
    total_trades = len(valid_trades)
    profitable_trades = valid_trades[valid_trades['trade_return'] > 0]
    losing_trades = valid_trades[valid_trades['trade_return'] < 0]
    
    # 1. 胜率
    win_rate = len(profitable_trades) / total_trades if total_trades > 0 else 0
    
    # 2. 盈亏比（总盈利金额 / 总亏损金额，亏损取绝对值）
    total_profit = valid_trades[valid_trades['trade_return'] > 0]['trade_return'].sum()
    total_loss = abs(valid_trades[valid_trades['trade_return'] < 0]['trade_return'].sum())
    profit_loss_ratio = total_profit / total_loss if total_loss > 0 else np.inf
    
    # 3. 平均收益
    avg_return = valid_trades['trade_return'].mean()
    avg_profit = profitable_trades['trade_return'].mean() if len(profitable_trades) > 0 else 0
    avg_loss = losing_trades['trade_return'].mean() if len(losing_trades) > 0 else 0
    
    # 4. 持仓天数统计
    avg_hold_days = valid_trades['hold_days'].mean()
    
    # 5. 出场原因分布
    exit_reason_dist = valid_trades['exit_reason'].value_counts()
    
    # 打印结果
    print(f"1. 总交易次数: {total_trades} 次")
    print(f"2. 盈利交易次数: {len(profitable_trades)} 次")
    print(f"3. 亏损交易次数: {len(losing_trades)} 次")
    print(f"4. 胜率: {win_rate:.2%} {'✅' if win_rate > 0.5 else '❌'}（目标：＞50%）")
    print(f"5. 盈亏比: {profit_loss_ratio:.2f} {'✅' if profit_loss_ratio >= 1.9 else '❌'}（目标：≥1.9）")
    print(f"6. 平均每笔收益: {avg_return:.2%}")
    print(f"7. 平均盈利收益: {avg_profit:.2%}")
    print(f"8. 平均亏损收益: {avg_loss:.2%}")
    print(f"9. 平均持仓天数: {avg_hold_days:.1f} 天")
    print(f"\n10. 出场原因分布:")
    for reason, count in exit_reason_dist.items():
        print(f"    - {reason}: {count} 次（{count/total_trades:.2%}）")
    
    # 步骤5：保存回测结果
    backtest_result_path = r"d:\workspace\xiaoyao\data\strategy_backtest_results.parquet"
    valid_trades.to_parquet(backtest_result_path)
    print(f"\n回测详情已保存至: {backtest_result_path}")
    
    # 返回关键指标，便于后续优化
    return {
        'win_rate': win_rate,
        'profit_loss_ratio': profit_loss_ratio,
        'avg_return': avg_return,
        'total_trades': total_trades
    }

# --------------------------
# 主函数：串联全流程
# --------------------------
def main():
    """股票策略全流程主函数"""
    # 文件路径配置
    input_path = r"d:\workspace\xiaoyao\data\wide_table.parquet"
    signal_output_path = r"d:\workspace\xiaoyao\data\final_strategy_signals.parquet"
    
    # 步骤1：加载和预处理数据
    df_raw = load_and_preprocess_data(input_path)
    if df_raw is None:
        return
    
    # 步骤2：计算核心指标
    df_with_indicators = calculate_indicators(df_raw)
    print(f"\n指标计算完成：{df_with_indicators.shape[0]} 行 × {df_with_indicators.shape[1]} 列")
    
    # 步骤3：筛选热门行业并生成最终信号
    df_strategy = filter_top_industries(df_with_indicators)
    
    # 步骤4：保存最终信号
    df_strategy.to_parquet(signal_output_path)
    print(f"\n最终策略信号已保存至：{signal_output_path}")
    print(f"最终信号数据规模：{df_strategy.shape[0]} 行 × {df_strategy.shape[1]} 列")
    
    # 步骤5：执行回测
    backtest_metrics = backtest_strategy(df_strategy)
    
    # 步骤6：输出优化建议
    if backtest_metrics:
        print("\n" + "="*50)
        print("=== 后续优化建议 ===")
        print("="*50)
        if backtest_metrics['win_rate'] <= 0.5:
            print("1. 胜率未达标：建议收紧技术指标条件（如MACD仅保留零轴上金叉）")
        if backtest_metrics['profit_loss_ratio'] < 1.9:
            print("2. 盈亏比未达标：建议调整止盈止损比例（如止盈7%/止损3%）")
        if backtest_metrics['avg_return'] < 0:
            print("3. 平均收益为负：建议增加估值筛选（如PE分位<60%）")
        print("4. 若需增加交易机会：可放宽竞价量比至2.5或热门行业增至前8")

if __name__ == "__main__":
    main()


数据加载成功：908780 行 × 62 列
预处理完成，数据已保存至: d:\workspace\xiaoyao\data\processed_stock_data.parquet

指标计算完成：908780 行 × 83 列
竞价高热度标的总数: 1621 条

=== 最终选股信号统计 ===
最终选股信号总数: 7539 条

各年份最终信号数量:
year
2025    7539
Name: final_signal, dtype: int64

年度最小交易机会: 7539 次
✅ 满足年度交易机会目标（≥400次）

最终策略信号已保存至：d:\workspace\xiaoyao\data\final_strategy_signals.parquet
最终信号数据规模：908780 行 × 15 列

开始策略回测...
总交易信号数量: 7539 条
有效交易信号数量: 7489 条（过滤无效信号 50 条）

=== 策略回测结果 ===
1. 总交易次数: 7489 次
2. 盈利交易次数: 3219 次
3. 亏损交易次数: 4098 次
4. 胜率: 42.98% ❌（目标：＞50%）
5. 盈亏比: 1.20 ❌（目标：≥1.9）
6. 平均每笔收益: 0.39%
7. 平均盈利收益: 5.42%
8. 平均亏损收益: -3.56%
9. 平均持仓天数: 3.6 天

10. 出场原因分布:
    - 到期出场（5天）: 3173 次（42.37%）
    - 止损（≤-3%）: 2847 次（38.02%）
    - 止盈（≥6%）: 1469 次（19.62%）

回测详情已保存至: d:\workspace\xiaoyao\data\strategy_backtest_results.parquet

=== 后续优化建议 ===
1. 胜率未达标：建议收紧技术指标条件（如MACD仅保留零轴上金叉）
2. 盈亏比未达标：建议调整止盈止损比例（如止盈7%/止损3%）
4. 若需增加交易机会：可放宽竞价量比至2.5或热门行业增至前8
