In [None]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# ======================== 预处理配置 ========================
PREPROCESS_CONFIG = {
    "raw_data_path": r'D:\workspace\xiaoyao\data\factortable.parquet',  # 原始数据路径
    "preprocess_save_path": r'D:\workspace\xiaoyao\data\preprocessed_features.parquet',  # 预处理结果保存路径
    "max_history_days": 60,  # 最大历史窗口
    "force_recompute": True  # 强制重新计算（必须设为True，覆盖旧数据）
}

# ======================== 预处理核心函数（包含所有必要特征） ========================
def preprocess_data():
    # 强制重新计算，确保覆盖旧数据
    print(f"强制重新计算预处理特征，生成包含所有必要字段的数据...")
    
    # 1. 加载原始数据
    print(f"加载原始数据：{PREPROCESS_CONFIG['raw_data_path']}")
    df = pd.read_parquet(PREPROCESS_CONFIG["raw_data_path"])
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['stock_code', 'date']).reset_index(drop=True)
    
    # 2. 基础字段预处理
    df['buy_sell_ratio'] = df.apply(
        lambda row: row['buy_total'] / row['sell_total'] if row['sell_total'] > 0 else 0, axis=1
    )
    fill_cols = ['vol_rsi14', 'price_volume_divergence', 'main_force_net_flow', 'buy_sell_ratio']
    df[fill_cols] = df[fill_cols].fillna(0)
    df['circulating_market_cap'] = df['circulating_market_cap'].fillna(method='ffill')
    
    # 3. 计算选股必需的核心特征（包含rise_ratio_30d等）
    # 3.1 阶段高点特征
    def compute_high_features(group):
        group['high_30d'] = group['close'].rolling(window=30, min_periods=1).max()
        group['high_date_30d'] = group['date'].iloc[
            group['close'].rolling(window=30, min_periods=1).apply(np.argmax, raw=True).astype(int)
        ].values
        return group
    df = df.groupby('stock_code', group_keys=False).apply(compute_high_features)
    
    # 3.2 量能相关特征
    df['volume_ratio_vs_5d_avg'] = df.groupby('stock_code')['volume'].transform(
        lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
    )
    df['money_ratio_vs_5d_avg'] = df.groupby('stock_code')['money'].transform(
        lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
    )
    
    # 3.3 价格趋势特征
    df['price_up_3d'] = df.groupby('stock_code')['close'].transform(
        lambda x: (x > x.shift(1)) & (x.shift(1) > x.shift(2))
    )
    
    # 3.4 支撑位特征
    df['touch_bollinger_lower_3d'] = df.groupby('stock_code').apply(
        lambda g: (g['close'] <= g['bollinger_lower_calc']) | 
                 (g['close'].shift(1) <= g['bollinger_lower_calc'].shift(1)) | 
                 (g['close'].shift(2) <= g['bollinger_lower_calc'].shift(2))
    ).reset_index(level=0, drop=True)
    
    # 3.5 回调阶段标记
    df['is_pullback'] = (df['close'] < df['high_30d']) & (
        (df['date'] - df['high_date_30d']).dt.days.between(-60, 0)
    )
    
    # -------------------------- 选股必需的新增特征（解决'rise_ratio_30d'缺失） --------------------------
    # 3.6 30日涨幅（rise_ratio_30d）
    df['rise_ratio_30d'] = df.groupby('stock_code').apply(
        lambda g: (g['high_30d'] - g['close'].shift(30)) / g['close'].shift(30).replace(0, 0.0001)
    ).reset_index(level=0, drop=True)
    
    # 3.7 上涨阶段ROC10均值（roc10_mean_rise）
    df['roc10_mean_rise'] = df.groupby('stock_code')['roc10'].transform(
        lambda x: x.rolling(window=10, min_periods=5).mean().shift(1)
    )
    
    # 3.8 回调比例（pullback_ratio）和天数（pullback_days）
    df['pullback_ratio'] = (df['high_30d'] - df['close']) / df['high_30d'].replace(0, 0.0001)
    df['pullback_days'] = (df['date'] - df['high_date_30d']).dt.days
    
    # 3.9 最大回撤（rolling_max_drawdown_20d）
    df['rolling_max_drawdown_20d'] = df.groupby('stock_code').apply(
        lambda g: (g['close'] - g['close'].rolling(window=20, min_periods=1).max()) / g['close'].rolling(window=20, min_periods=1).max().replace(0, 0.0001)
    ).reset_index(level=0, drop=True)
    
    # 3.10 RSI14近3日最小值（rsi14_near3d_min）
    df['rsi14_near3d_min'] = df.groupby('stock_code')['rsi14'].transform(
        lambda x: x.rolling(window=3, min_periods=1).min()
    )
    
    # 3.11 OBV与高点OBV比值（obv_high_ratio）
    def calc_obv_high_ratio(group):
        high_indices = group['close'].rolling(window=30, min_periods=1).apply(np.argmax, raw=True).astype(int)
        high_obv = group['obv'].iloc[high_indices].values
        group['obv_high_ratio'] = group['obv'] / high_obv.replace(0, 0.0001)
        return group
    df = df.groupby('stock_code', group_keys=False).apply(calc_obv_high_ratio)
    
    # 3.12 MACD金叉标记（macd_golden_cross）
    df['macd_golden_cross'] = df.groupby('stock_code').apply(
        lambda g: (g['macd_diff_calc'] > g['macd_dea_calc']) & (g['macd_diff_calc'].shift(1) <= g['macd_dea_calc'].shift(1))
    ).reset_index(level=0, drop=True)
    
    # 4. 保存最新预处理结果（覆盖旧数据）
    df.to_parquet(PREPROCESS_CONFIG["preprocess_save_path"], index=False)
    print(f"预处理完成！已生成所有必要特征，包含'rise_ratio_30d'等字段")
    print(f"保存路径：{PREPROCESS_CONFIG['preprocess_save_path']}")
    print(f"特征列表（部分）：{['rise_ratio_30d', 'roc10_mean_rise', 'pullback_ratio', 'obv_high_ratio']}...")
    return df

# ======================== 执行预处理 ========================
if __name__ == "__main__":
    preprocessed_df = preprocess_data()

发现本地预处理结果，直接加载：D:\workspace\xiaoyao\data\preprocessed_features.parquet


In [5]:
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
import os

# ======================== 核心配置（优化历史数据判断） ========================
SELECT_CONFIG = {
    "preprocess_data_path": r'D:\workspace\xiaoyao\data\preprocessed_features.parquet',
    "result_save_path": r'D:\workspace\xiaoyao\data\single_day_selection_result.csv',
    "target_date": "2025-10-10",  # 目标验证日期
    "min_history_days": 30,  # 最小所需历史数据（从60改为30，足够计算核心特征）
    "cpu_cores": 1  # 单进程
}

# ======================== 选股参数（不变） ========================
SELECT_PARAMS = {
    "rise_stage": {"enable": True, "min_rise_ratio_30d": 0.2, "min_roc10_mean_rise": 3, "min_vol_rsi14_rise": 50, "max_price_volume_div_rise": -5},
    "rise_volume_support": {"enable": True, "min_volume_ratio_rise": 1.2, "min_money_ratio_rise": 1.2, "min_obv_growth_rise": 0.15, "min_buy_sell_ratio_rise": 1.1},
    "pullback_stage": {"enable": True, "min_pullback_ratio": 0.1, "max_pullback_ratio": 0.2, "min_pullback_days": 10, "max_pullback_days": 20, "min_max_drawdown": -0.2},
    "support_stage": {"enable": True, "ma20_support": True, "ma20_ma60_trend": True, "bollinger_support": True, "min_rsi14_bottom": 30},
    "pullback_volume_health": {"enable": True, "max_volume_ratio_pullback": 0.8, "max_money_ratio_pullback": 0.8, "min_obv_retention": 0.8},
    "start_signal": {"enable": True, "price_up_trend_3d": True, "min_volume_ratio_start": 1.2, "macd_golden_cross": True, "min_macd_hist_start": 0, "rsi14_range": [30, 50]}
}

# ======================== 单只股票单日期选股逻辑（核心修复） ========================
def select_stock_single_day(stock_data):
    target_date = pd.to_datetime(SELECT_CONFIG["target_date"])
    stock_code = stock_data['stock_code'].iloc[0]
    stock_data = stock_data.copy()
    stock_data['date'] = pd.to_datetime(stock_data['date'])

    # 1. 优先确认是否有目标日期数据（无则直接跳过）
    if not (stock_data['date'] == target_date).any():
        # 仅每100只股票打印一次“无目标日期数据”，减少日志干扰
        if int(stock_code.split('.')[0]) % 100 == 0:
            print(f"股票 {stock_code} 无 {SELECT_CONFIG['target_date']} 数据，跳过")
        return pd.DataFrame()

    # 2. 筛选目标日期及之前的历史数据（无需严格60天，有多少用多少）
    valid_data = stock_data[stock_data['date'] <= target_date].sort_values('date')
    history_days_actual = len(valid_data) - 1  # 实际历史数据条数（排除目标日）
    
    # 3. 宽松判断历史数据：只要≥最小需求（30天），或虽不足但能计算核心特征（如30日涨幅）
    core_feature_ready = False
    if history_days_actual >= SELECT_CONFIG["min_history_days"]:
        core_feature_ready = True
    else:
        # 若历史数据不足30天，但有30日涨幅所需数据（如目标日是2025-05-10，有2025-04-10前的数据）
        earliest_date = valid_data['date'].min()
        if (target_date - earliest_date).days >= 30:
            core_feature_ready = True
    
    if not core_feature_ready:
        # 仅每200只股票打印一次“数据不足”，避免日志刷屏
        if int(stock_code.split('.')[0]) % 200 == 0:
            print(f"股票 {stock_code} 有效历史数据仅 {history_days_actual} 天，跳过")
        return pd.DataFrame()

    # 4. 提取目标日期数据，开始筛选
    target_data = valid_data[valid_data['date'] == target_date].copy()
    target_data['is_qualified'] = True

    # 5. 逐条件验证（逻辑不变，仅适配宽松数据）
    # 5.1 前期上涨铺垫
    if SELECT_PARAMS['rise_stage']['enable']:
        # 若rise_ratio_30d为NaN（数据不足导致），直接判定不满足
        if pd.isna(target_data['rise_ratio_30d'].iloc[0]):
            return pd.DataFrame()
        target_data['is_qualified'] &= (target_data['rise_ratio_30d'] >= SELECT_PARAMS['rise_stage']['min_rise_ratio_30d'])
        target_data['is_qualified'] &= (target_data['roc10_mean_rise'] >= SELECT_PARAMS['rise_stage']['min_roc10_mean_rise'])
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 5.2 前期上涨量能支撑
    if SELECT_PARAMS['rise_volume_support']['enable']:
        target_data['is_qualified'] &= (target_data['volume_ratio_vs_5d_avg'] >= SELECT_PARAMS['rise_volume_support']['min_volume_ratio_rise'])
        target_data['is_qualified'] &= (target_data['money_ratio_vs_5d_avg'] >= SELECT_PARAMS['rise_volume_support']['min_money_ratio_rise'])
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 5.3 回调充分且健康
    if SELECT_PARAMS['pullback_stage']['enable']:
        if pd.isna(target_data['pullback_ratio'].iloc[0]) or pd.isna(target_data['pullback_days'].iloc[0]):
            return pd.DataFrame()
        target_data['is_qualified'] &= target_data['pullback_ratio'].between(
            SELECT_PARAMS['pullback_stage']['min_pullback_ratio'],
            SELECT_PARAMS['pullback_stage']['max_pullback_ratio']
        )
        target_data['is_qualified'] &= target_data['pullback_days'].between(
            SELECT_PARAMS['pullback_stage']['min_pullback_days'],
            SELECT_PARAMS['pullback_stage']['max_pullback_days']
        )
        target_data['is_qualified'] &= (target_data['rolling_max_drawdown_20d'] >= SELECT_PARAMS['pullback_stage']['min_max_drawdown'])
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 5.4 支撑位企稳
    if SELECT_PARAMS['support_stage']['enable']:
        target_data['is_qualified'] &= (target_data['close'] >= target_data['ma20'])
        target_data['is_qualified'] &= (target_data['ma20'] >= target_data['ma60'])
        target_data['is_qualified'] &= (target_data['touch_bollinger_lower_3d'])
        target_data['is_qualified'] &= (target_data['rsi14_near3d_min'] >= SELECT_PARAMS['support_stage']['min_rsi14_bottom'])
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 5.5 回调量价健康
    if SELECT_PARAMS['pullback_volume_health']['enable']:
        target_data['is_qualified'] &= (target_data['volume_ratio_vs_5d_avg'] <= SELECT_PARAMS['pullback_volume_health']['max_volume_ratio_pullback'])
        target_data['is_qualified'] &= (target_data['money_ratio_vs_5d_avg'] <= SELECT_PARAMS['pullback_volume_health']['max_money_ratio_pullback'])
        target_data['is_qualified'] &= (target_data['obv_high_ratio'] >= SELECT_PARAMS['pullback_volume_health']['min_obv_retention'])
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 5.6 启动信号确认
    if SELECT_PARAMS['start_signal']['enable']:
        target_data['is_qualified'] &= (target_data['price_up_3d'])
        target_data['is_qualified'] &= (target_data['volume_ratio_vs_5d_avg'] >= SELECT_PARAMS['start_signal']['min_volume_ratio_start'])
        target_data['is_qualified'] &= (target_data['macd_golden_cross'])
        target_data['is_qualified'] &= (target_data['macd_hist_calc'] >= SELECT_PARAMS['start_signal']['min_macd_hist_start'])
        target_data['is_qualified'] &= target_data['rsi14'].between(
            SELECT_PARAMS['start_signal']['rsi14_range'][0],
            SELECT_PARAMS['start_signal']['rsi14_range'][1]
        )
        if not target_data['is_qualified'].iloc[0]:
            return pd.DataFrame()

    # 6. 符合条件，打印结果（仅符合条件的股票打印，避免日志干扰）
    print(f"✅ 股票 {stock_code} 在 {SELECT_CONFIG['target_date']} 符合所有条件！")
    result_cols = ['stock_code', 'date', 'close', 'ma20', 'obv', 'macd_hist_calc', 'volume_ratio_vs_5d_avg']
    return target_data[result_cols]

# ======================== 单日期批量选股（优化日志输出） ========================
def batch_select_single_day():
    if not os.path.exists(SELECT_CONFIG["preprocess_data_path"]):
        raise FileNotFoundError(f"未找到预处理数据：{SELECT_CONFIG['preprocess_data_path']}")

    # 加载数据（不提前筛选时间范围，避免丢失有效数据）
    print(f"加载预处理数据，验证 {SELECT_CONFIG['target_date']} 选股结果...")
    preprocessed_df = pd.read_parquet(SELECT_CONFIG["preprocess_data_path"])
    preprocessed_df['date'] = pd.to_datetime(preprocessed_df['date'])
    target_date = pd.to_datetime(SELECT_CONFIG["target_date"])

    # 按股票分组（全量分组，后续在函数内筛选）
    stock_groups = [group for _, group in preprocessed_df.groupby('stock_code')]
    total_stocks = len(stock_groups)
    print(f"数据加载完成：共 {total_stocks} 只股票，开始筛选...")

    # 单进程筛选（日志清晰，避免输出截断）
    valid_results = []
    for i, group in enumerate(stock_groups, 1):
        result = select_stock_single_day(group)
        if not result.empty:
            valid_results.append(result)
        # 每处理100只股票显示一次进度（不刷屏）
        if i % 100 == 0 or i == total_stocks:
            print(f"进度：已处理 {i}/{total_stocks} 只股票 | 累计符合条件 {len(valid_results)} 只")

    # 保存结果
    if valid_results:
        final_result = pd.concat(valid_results, ignore_index=True)
    else:
        final_result = pd.DataFrame(columns=['stock_code', 'date', 'close', 'ma20', 'obv', 'macd_hist_calc'])

    final_result.to_csv(SELECT_CONFIG["result_save_path"], index=False, encoding='utf-8-sig')
    print(f"\n📊 {SELECT_CONFIG['target_date']} 选股结果汇总：")
    print(f"- 总处理股票数：{total_stocks} 只")
    print(f"- 符合条件股票数：{len(final_result)} 只")
    print(f"- 结果保存路径：{SELECT_CONFIG['result_save_path']}")
    
    # 显示符合条件的股票详情（无截断）
    if not final_result.empty:
        print(f"\n🎯 符合条件的股票列表：")
        pd.set_option('display.max_rows', None)  # 显示所有符合条件的股票
        print(final_result[['stock_code', 'close', 'volume_ratio_vs_5d_avg', 'macd_hist_calc']].to_string(index=False))
    else:
        print(f"\n⚠️  无符合条件的股票（可调整SELECT_PARAMS阈值重试）")

    return final_result

# ======================== 执行入口 ========================
if __name__ == "__main__":
    # 解决输出截断问题：设置最大输出行数
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', None)
    pd.set_option('display.max_colwidth', None)
    
    try:
        selection_result = batch_select_single_day()
    except Exception as e:
        print(f"❌ 选股异常：{str(e)}")

加载预处理数据，验证 2025-10-10 选股结果...
数据加载完成：共 5185 只股票，开始筛选...
❌ 选股异常：'rise_ratio_30d'
