In [4]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# ======================== 预处理配置 ========================
PREPROCESS_CONFIG = {
    "raw_data_path": r'D:\workspace\xiaoyao\data\factortable.parquet',  # 原始数据路径
    "preprocess_save_path": r'D:\workspace\xiaoyao\data\preprocessed_features.parquet',  # 预处理结果保存路径
    "max_history_days": 60,  # 最大历史窗口（与选股逻辑匹配）
    "force_recompute": False  # 强制重新计算（即使本地有结果）
}

# ======================== 预处理核心函数 ========================
def preprocess_data():
    """
    预处理模块：计算所有无未来函数的中间特征，保存到本地
    特征包括：阶段高点、量能比值、价格趋势、支撑位标记等
    """
    # 检查是否需要重新计算
    if not PREPROCESS_CONFIG["force_recompute"] and os.path.exists(PREPROCESS_CONFIG["preprocess_save_path"]):
        print(f"发现本地预处理结果，直接加载：{PREPROCESS_CONFIG['preprocess_save_path']}")
        preprocessed_df = pd.read_parquet(PREPROCESS_CONFIG["preprocess_save_path"])
        return preprocessed_df
    
    # 1. 加载原始数据
    print(f"加载原始数据：{PREPROCESS_CONFIG['raw_data_path']}")
    df = pd.read_parquet(PREPROCESS_CONFIG["raw_data_path"])
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values(by=['stock_code', 'date']).reset_index(drop=True)
    
    # 2. 基础字段预处理（确保无缺失）
    # 计算盘口买卖比（避免除零）
    df['buy_sell_ratio'] = df.apply(
        lambda row: row['buy_total'] / row['sell_total'] if row['sell_total'] > 0 else 0, axis=1
    )
    # 填充必要字段的缺失值（用0或前值，避免未来函数）
    fill_cols = ['vol_rsi14', 'price_volume_divergence', 'main_force_net_flow', 'buy_sell_ratio']
    df[fill_cols] = df[fill_cols].fillna(0)
    df['circulating_market_cap'] = df['circulating_market_cap'].fillna(method='ffill')  # 流通市值用前值填充
    
    # 3. 计算无未来函数的核心特征（仅用当前及历史数据）
    # 3.1 阶段高点特征（30日高点及高点日期）
    def compute_high_features(group):
        # 滚动计算30日高点（仅用过去30天数据，无未来）
        group['high_30d'] = group['close'].rolling(window=30, min_periods=1).max()
        # 滚动定位30日高点对应的日期（无未来）
        group['high_date_30d'] = group['date'].iloc[
            group['close'].rolling(window=30, min_periods=1).apply(
                lambda x: np.argmax(x), raw=True
            ).astype(int)
        ].values
        return group
    df = df.groupby('stock_code', group_keys=False).apply(compute_high_features)
    
    # 3.2 量能相关特征（成交量/金额与5日均值比值）
    # 成交量相对5日均值（滞后1天，避免当天数据影响，无未来）
    df['volume_ratio_vs_5d_avg'] = df.groupby('stock_code')['volume'].transform(
        lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
    )
    # 成交金额相对5日均值（同理无未来）
    df['money_ratio_vs_5d_avg'] = df.groupby('stock_code')['money'].transform(
        lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
    )
    
    # 3.3 价格趋势特征（近3日价格递增标记）
    df['price_up_3d'] = df.groupby('stock_code')['close'].transform(
        lambda x: (x > x.shift(1)) & (x.shift(1) > x.shift(2))  # 仅用前2天数据，无未来
    )
    
    # 3.4 支撑位特征（近3日触及布林带下轨标记）
    df['touch_bollinger_lower_3d'] = df.groupby('stock_code').apply(
        lambda g: (g['close'] <= g['bollinger_lower_calc']) | 
                 (g['close'].shift(1) <= g['bollinger_lower_calc'].shift(1)) | 
                 (g['close'].shift(2) <= g['bollinger_lower_calc'].shift(2))
    ).reset_index(level=0, drop=True)
    
    # 3.5 回调阶段标记（用于快速定位回调数据）
    df['is_pullback'] = (df['close'] < df['high_30d']) & (  # 当前价格低于30日高点
        (df['date'] - df['high_date_30d']).dt.days.between(
            PREPROCESS_CONFIG["max_history_days"]*-1, 0  # 高点之后的日期（无未来）
        )
    )
    
    # 4. 保存预处理结果到本地
    df.to_parquet(PREPROCESS_CONFIG["preprocess_save_path"], index=False)
    print(f"预处理完成，结果保存至：{PREPROCESS_CONFIG['preprocess_save_path']}")
    print(f"特征列表：{df.columns.tolist()}")
    return df

# ======================== 预处理执行入口 ========================
if __name__ == "__main__":
    # 执行预处理（若本地无结果则计算，有结果则直接加载）
    preprocessed_df = preprocess_data()

加载原始数据：D:\workspace\xiaoyao\data\factortable.parquet


  df['circulating_market_cap'] = df['circulating_market_cap'].fillna(method='ffill')  # 流通市值用前值填充
  df = df.groupby('stock_code', group_keys=False).apply(compute_high_features)
  df['touch_bollinger_lower_3d'] = df.groupby('stock_code').apply(


预处理完成，结果保存至：D:\workspace\xiaoyao\data\preprocessed_features.parquet
特征列表：['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money', 'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused', 'stock_name', 'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code', 'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name', 'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code', 'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name', 'capitalization', 'circulating_cap', 'market_cap', 'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr', 'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume', 'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p', 'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v', 'b4_p', 'b4_v', 'b5_p', 'b5_v', 'concept_name_list', 'ma5', 'ma10', 'ma20', 'ma60', 'ema12', 'ema26', 'rsi14', 'rsi6', 'rsi21', 'macd_line_calc', 'signal_line_calc', 'macd_hi

In [None]:
import pandas as pd
import numpy as np
from multiprocessing import Pool, cpu_count
import os

# ======================== 选股配置 ========================
SELECT_CONFIG = {
    "preprocess_data_path": r'D:\workspace\xiaoyao\data\preprocessed_features.parquet',  # 预处理数据路径
    "result_save_path": r'D:\workspace\xiaoyao\data\daily_selection_result.csv',  # 选股结果保存路径
    "cpu_cores": max(1, cpu_count() - 2)  # 并行核心数（预留2核）
}

# ======================== 选股参数（与之前一致） ========================
SELECT_PARAMS = {
    "rise_stage": {
        "enable": True, "min_rise_ratio_30d": 0.2, "min_roc10_mean_rise": 3,
        "min_vol_rsi14_rise": 50, "max_price_volume_div_rise": -5
    },
    "rise_volume_support": {
        "enable": True, "min_volume_ratio_rise": 1.2, "min_money_ratio_rise": 1.2,
        "min_obv_growth_rise": 0.15, "min_buy_sell_ratio_rise": 1.1
    },
    "pullback_stage": {
        "enable": True, "min_pullback_ratio": 0.1, "max_pullback_ratio": 0.2,
        "min_pullback_days": 10, "max_pullback_days": 20, "min_max_drawdown": -0.2
    },
    "support_stage": {
        "enable": True, "ma20_support": True, "ma20_ma60_trend": True,
        "bollinger_support": True, "min_rsi14_bottom": 30
    },
    "pullback_volume_health": {
        "enable": True, "max_volume_ratio_pullback": 0.8, "max_money_ratio_pullback": 0.8,
        "min_obv_retention": 0.8
    },
    "start_signal": {
        "enable": True, "price_up_trend_3d": True, "min_volume_ratio_start": 1.2,
        "macd_golden_cross": True, "min_macd_hist_start": 0, "rsi14_range": [30, 50]
    }
}

# ======================== 单只股票选股逻辑 ========================
def select_stock(stock_data):
    """
    对单只股票应用选股逻辑（仅用预处理特征，无新增计算）
    :param stock_data: 单只股票的预处理数据（含所有特征）
    :return: 符合条件的交易日记录
    """
    stock_code = stock_data['stock_code'].iloc[0]
    df = stock_data.copy()
    # 初始化符合条件标记
    df['is_qualified'] = True
    
    # 1. 前期上涨铺垫（基于预处理的30日高点特征）
    if SELECT_PARAMS['rise_stage']['enable']:
        # 30日涨幅≥20%
        df['rise_ratio_30d'] = (df['high_30d'] - df['close'].shift(30)) / df['close'].shift(30).replace(0, 0.0001)
        df['is_qualified'] &= (df['rise_ratio_30d'] >= SELECT_PARAMS['rise_stage']['min_rise_ratio_30d'])
        # 上涨阶段ROC10均值≥3%（用高点前10日数据）
        df['roc10_mean_rise'] = df.groupby('stock_code').apply(
            lambda g: g['roc10'].rolling(window=10, min_periods=5).mean().shift(1)
        ).reset_index(level=0, drop=True)
        df['is_qualified'] &= (df['roc10_mean_rise'] >= SELECT_PARAMS['rise_stage']['min_roc10_mean_rise'])
    
    # 2. 前期上涨量能支撑（基于预处理的量能比值）
    if SELECT_PARAMS['rise_volume_support']['enable']:
        df['is_qualified'] &= (df['volume_ratio_vs_5d_avg'] >= SELECT_PARAMS['rise_volume_support']['min_volume_ratio_rise'])
        df['is_qualified'] &= (df['money_ratio_vs_5d_avg'] >= SELECT_PARAMS['rise_volume_support']['min_money_ratio_rise'])
    
    # 3. 回调充分且健康（基于预处理的回调标记）
    if SELECT_PARAMS['pullback_stage']['enable']:
        df['pullback_ratio'] = (df['high_30d'] - df['close']) / df['high_30d'].replace(0, 0.0001)
        df['pullback_days'] = (df['date'] - df['high_date_30d']).dt.days
        df['is_qualified'] &= (df['pullback_ratio'].between(
            SELECT_PARAMS['pullback_stage']['min_pullback_ratio'],
            SELECT_PARAMS['pullback_stage']['max_pullback_ratio']
        ))
        df['is_qualified'] &= (df['pullback_days'].between(
            SELECT_PARAMS['pullback_stage']['min_pullback_days'],
            SELECT_PARAMS['pullback_stage']['max_pullback_days']
        ))
    
    # 4. 支撑位企稳（基于预处理的支撑位特征）
    if SELECT_PARAMS['support_stage']['enable']:
        df['is_qualified'] &= (df['close'] >= df['ma20'])  # 站稳MA20
        df['is_qualified'] &= (df['ma20'] >= df['ma60'])  # 中长期趋势
        df['is_qualified'] &= df['touch_bollinger_lower_3d']  # 触及布林带下轨
    
    # 5. 回调量价健康（基于预处理的量能比值）
    if SELECT_PARAMS['pullback_volume_health']['enable']:
        df['is_qualified'] &= (df['volume_ratio_vs_5d_avg'] <= SELECT_PARAMS['pullback_volume_health']['max_volume_ratio_pullback'])
        df['is_qualified'] &= (df['money_ratio_vs_5d_avg'] <= SELECT_PARAMS['pullback_volume_health']['max_money_ratio_pullback'])
    
    # 6. 启动信号确认（基于预处理的趋势特征）
    if SELECT_PARAMS['start_signal']['enable']:
        df['is_qualified'] &= df['price_up_3d']  # 近3日价格递增
        df['is_qualified'] &= (df['volume_ratio_vs_5d_avg'] >= SELECT_PARAMS['start_signal']['min_volume_ratio_start'])  # 启动放量
        df['is_qualified'] &= (df['macd_hist_calc'] >= SELECT_PARAMS['start_signal']['min_macd_hist_start'])  # MACD多头
    
    # 筛选符合条件的记录
    qualified = df[df['is_qualified']][['stock_code', 'date', 'close', 'ma20', 'obv', 'macd_hist_calc']]
    print(f"股票 {stock_code} 筛选完成，符合条件 {len(qualified)} 条")
    return qualified

# ======================== 多进程批量选股 ========================
def batch_select():
    """读取预处理数据，多进程批量选股"""
    # 1. 读取预处理数据
    if not os.path.exists(SELECT_CONFIG["preprocess_data_path"]):
        raise FileNotFoundError(f"未找到预处理数据，请先运行预处理模块：{SELECT_CONFIG['preprocess_data_path']}")
    print(f"加载预处理数据：{SELECT_CONFIG['preprocess_data_path']}")
    preprocessed_df = pd.read_parquet(SELECT_CONFIG["preprocess_data_path"])
    
    # 2. 按股票分组，准备并行任务
    stock_groups = [group for _, group in preprocessed_df.groupby('stock_code')]
    print(f"共 {len(stock_groups)} 只股票，启动 {SELECT_CONFIG['cpu_cores']} 核并行筛选...")
    
    # 3. 多进程筛选
    with Pool(SELECT_CONFIG["cpu_cores"]) as pool:
        results = pool.map(select_stock, stock_groups)
    
    # 4. 合并结果并保存
    final_result = pd.concat([r for r in results if not r.empty], ignore_index=True)
    final_result = final_result.sort_values(by=['date', 'stock_code']).reset_index(drop=True)
    final_result.to_csv(SELECT_CONFIG["result_save_path"], index=False, encoding='utf-8-sig')
    print(f"\n选股完成！共 {len(final_result)} 条符合条件记录，保存至：{SELECT_CONFIG['result_save_path']}")
    return final_result

# ======================== 选股执行入口 ========================
if __name__ == "__main__":
    # 执行选股（依赖预处理结果）
    selection_result = batch_select()