### 将竞价信息与量价信息合并

In [None]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

def merge_parquet_core_fields(auction_input, daily_input, output_path):
    """
    合并Parquet文件（确保字段顺序严格一致），输出轻量合并文件
    """
    # 1. 读取数据
    print(f"读取集合竞价数据：{auction_input}")
    auction_cols = ["stock_code", "date", "current", "volume"]
    auction_df = pd.read_parquet(auction_input, columns=auction_cols)
    
    print(f"读取每日量价数据：{daily_input}")
    daily_cols = ["stock_code", "date", "pre_close", "close", "open", "high", "low", 
                 "high_limit", "low_limit", "factor"]
    daily_df = pd.read_parquet(daily_input, columns=daily_cols)
    
    # 2. 基础预处理
    for df in [auction_df, daily_df]:
        df["date"] = pd.to_datetime(df["date"])
        df["trade_date"] = df["date"].dt.date
        df["stock_code"] = df["stock_code"].astype(str)
    
    auction_df.rename(columns={"volume": "auction_volume"}, inplace=True)
    
    # 3. 合并数据
    print("合并关键字段数据...")
    merged_df = pd.merge(
        left=auction_df,
        right=daily_df,
        on=["stock_code", "trade_date"],
        how="inner",
        suffixes=("", "_daily")
    )
    
    # 4. 价格复权处理
    price_fields = ["open", "close", "high", "low", "pre_close", "high_limit", "low_limit"]
    for field in price_fields:
        if field in merged_df.columns and "factor" in merged_df.columns:
            merged_df[field] = merged_df[field] / merged_df["factor"].where(merged_df["factor"] != 0, 1)
            # 仅保留小数点后2位
            merged_df[field] = merged_df[field].round(2)
            print(f"已对 {field} 进行复权处理")
    
    # 5. 数据清理
    merged_df = merged_df.sort_values(["stock_code", "date"]).reset_index(drop=True)
    merged_df = merged_df.drop_duplicates(subset=["stock_code", "trade_date"], keep="first")
    
    # 6. 严格按照预期顺序排列字段（关键修复：确保列名唯一）
    expected_columns = [
        "stock_code", "trade_date", "date", 
        "current", "auction_volume", "high_limit", 
        "low_limit", "open", "close", "high", 
        "low", "pre_close", "factor"
    ]
    # 确保只包含预期字段，且顺序完全一致
    merged_final = merged_df.reindex(columns=expected_columns)

    print(f"保存合并文件：{output_path}")
    merged_final.to_parquet(output_path, index=False)
    
    # 7. 验证字段顺序
    print("\n=== 合并结果验证 ===")
    print(f"实际字段顺序：{merged_final.columns.tolist()}")
    print(f"预期字段顺序：{expected_columns}")
    print(f"字段顺序匹配：{merged_final.columns.tolist() == expected_columns}")
    print(f"合并后数据条数：{len(merged_final)}")
    print(f"覆盖股票数量：{merged_final['stock_code'].nunique()} 只")
    print(f"\n合并完成！文件已保存至：{output_path}")

if __name__ == "__main__":
    AUCTION_PATH = "D:\\workspace\\xiaoyao\\data\\stock_daily_auction.parquet"
    DAILY_PATH = "D:\\workspace\\xiaoyao\\data\\stock_daily_price.parquet"
    OUTPUT_PATH = "D:\\workspace\\xiaoyao\\data\\stock_daily_auction_clean.parquet"
    
    merge_parquet_core_fields(AUCTION_PATH, DAILY_PATH, OUTPUT_PATH)

读取集合竞价数据：D:\workspace\xiaoyao\data\stock_daily_auction.parquet
读取每日量价数据：D:\workspace\xiaoyao\data\stock_daily_price.parquet
合并关键字段数据...
已对 open 进行复权处理
已对 close 进行复权处理
已对 high 进行复权处理
已对 low 进行复权处理
已对 pre_close 进行复权处理
已对 high_limit 进行复权处理
已对 low_limit 进行复权处理
保存合并文件：D:\workspace\xiaoyao\data\stock_daily_auction_clean.parquet

=== 合并结果验证 ===
实际字段顺序：['stock_code', 'trade_date', 'date', 'current', 'auction_volume', 'high_limit', 'low_limit', 'open', 'close', 'high', 'low', 'pre_close', 'factor']
预期字段顺序：['stock_code', 'trade_date', 'date', 'current', 'auction_volume', 'high_limit', 'low_limit', 'open', 'close', 'high', 'low', 'pre_close', 'factor']
字段顺序匹配：True
合并后数据条数：5555679
覆盖股票数量：5337 只

合并完成！文件已保存至：D:\workspace\xiaoyao\data\stock_daily_auction_clean.parquet


### 计算量比

In [18]:
import pandas as pd
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

def calculate_stock_metrics(input_path, output_path):
    # 1. 读取原始数据
    df = pd.read_parquet(input_path)
    
    # 2. 基础信息记录
    original_cols = df.columns.tolist()
    original_col_count = len(original_cols)
    print(f"原始列数: {original_col_count}, 列名: {original_cols[:5]}...")
    
    # 3. 预处理
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    df = df.sort_values(['stock_code', 'trade_date']).reset_index(drop=True)
    df['stock_code'] = df['stock_code'].astype(str)
    
    # 4. 定义新增字段（包含BOLL指标）
    new_columns = [
        'auction_prev_ratio',
        'auction_volume_ratio_3d', 'auction_volume_ratio_5d',
        'auction_volume_ratio_8d', 'auction_volume_ratio_10d', 'auction_volume_ratio_20d', 'auction_volume_ratio_60d','rp_60'
        'daily_increase',
        'boll_mid', 'boll_up', 'boll_low'  # 新增BOLL相关字段
    ]
    
    # 5. 清理已存在的目标字段
    for col in new_columns:
        if col in df.columns:
            df = df.drop(columns=[col])
            print(f"移除已存在的字段: {col}")
    
    # 6. 计算原有指标
    # 6.1 竞价昨比
    df['auction_prev_ratio'] = df.groupby('stock_code')['auction_volume'].transform(
        lambda x: x / x.shift(1)
    )
    
    # 6.2 竞价量比
    window_sizes = [3, 5, 8, 10, 20, 60]
    for window in window_sizes:
        col_name = f'auction_volume_ratio_{window}d'
        df[col_name] = df.groupby('stock_code')['auction_volume'].transform(
            lambda x: x / x.shift(1).rolling(
                window=window, min_periods=1, closed='left'
            ).mean()
        )
    
    # 6.3 当日涨幅
    df['daily_increase'] = np.where(
        df['open'] != 0,
        (df['close'] - df['open']) / df['open'] * 100,
        np.nan
    )
    
    # 7. 新增BOLL指标计算（包含当日open）
    # BOLL计算逻辑：基于开盘价，周期6天，2倍标准差
    df['boll_mid'] = df.groupby('stock_code')['open'].transform(
        lambda x: x.rolling(window=15, min_periods=1).mean()  # 中轨：6日开盘价均值（含当日）
    )
    df['boll_std'] = df.groupby('stock_code')['open'].transform(
        lambda x: x.rolling(window=10, min_periods=1).std()   # 标准差（含当日）
    )
    df['boll_up'] = df['boll_mid'] + 1.5 * df['boll_std']    # 上轨：中轨 + 2倍标准差
    df['boll_low'] = df['boll_mid'] - 1.5 * df['boll_std']   # 下轨：中轨 - 2倍标准差
    
    # 增加一列 计算rp值，即相对价格位置，取60日的最高价、最低价、当前价的位置比例
    df['rp_60'] = df.groupby('stock_code')['open'].transform(
        lambda x: (x - x.rolling(window=60, min_periods=1).min()) / (x.rolling(window=60, min_periods=1).max() - x.rolling(window=60, min_periods=1).min())
    )

    # 移除临时标准差列
    if 'boll_std' in df.columns:
        df = df.drop(columns=['boll_std'])
    
    # 8. 列数控制（原始列数 + 8个新列：原5个+新增3个BOLL字段）
    final_col_count = original_col_count + 8 + 2 + 1
    current_col_count = len(df.columns)
    
    if current_col_count != final_col_count:
        diff = current_col_count - final_col_count
        print(f"警告：列数不匹配（差异: {diff}），自动修正...")
        
        added_cols = [col for col in df.columns if col not in original_cols and col not in new_columns]
        if added_cols:
            df = df.drop(columns=added_cols)
            print(f"已删除意外新增的列: {added_cols}")
    
    # 9. 数据类型标准化
    for col in new_columns:
        if col in df.columns:
            df[col] = df[col].astype('float64')
    
    # 10. 写入并验证
    table = pa.Table.from_pandas(df)
    pq.write_table(table, output_path)
    
    validation_df = pd.read_parquet(output_path)
    print(f"\n验证结果：")
    print(f"输出文件列数: {len(validation_df.columns)}")
    print(f"BOLL指标样例:\n{validation_df[['trade_date', 'open', 'boll_mid', 'boll_up', 'boll_low']].head(10)}")
    print(f"文件已保存至: {output_path}")


if __name__ == "__main__":
    input_file = r"D:\workspace\xiaoyao\data\stock_daily_auction_clean.parquet"
    output_file = r"D:\workspace\xiaoyao\data\stock_daily_auction_with_metrics.parquet"
    calculate_stock_metrics(input_file, output_file)


原始列数: 13, 列名: ['stock_code', 'trade_date', 'date', 'current', 'auction_volume']...
警告：列数不匹配（差异: 1），自动修正...
已删除意外新增的列: ['daily_increase', 'rp_60']

验证结果：
输出文件列数: 23
BOLL指标样例:
  trade_date   open   boll_mid    boll_up   boll_low
0 2021-01-04  19.10  19.100000        NaN        NaN
1 2021-01-05  18.40  18.750000  19.492462  18.007538
2 2021-01-06  18.08  18.526667  19.309163  17.744171
3 2021-01-07  19.52  18.775000  19.756440  17.793560
4 2021-01-08  19.90  19.000000  20.136640  17.863360
5 2021-01-11  20.00  19.166667  20.353494  17.979839
6 2021-01-12  20.39  19.341429  20.627831  18.055026
7 2021-01-13  21.00  19.548750  21.029326  18.068174
8 2021-01-14  20.68  19.674444  21.170447  18.178441
9 2021-01-15  21.00  19.807000  21.351248  18.262752
文件已保存至: D:\workspace\xiaoyao\data\stock_daily_auction_with_metrics.parquet


In [None]:
import pandas as pd

def analyze_daily_top10_increase(input_path, output_path):
    # 读取数据
    df = pd.read_parquet(input_path)
    df['trade_date'] = pd.to_datetime(df['trade_date'])
    valid_df = df
    # 1. 筛选高开2个点以上的股票
    # 确保前收盘价有效
    # valid_df = df[df['trade_date'] > '2021-01-01'].copy()

    # valid_df = df[
    #     (df['trade_date'] >= '2021-02-01') & 
    #     (df['trade_date'] <= '2024-08-31')
    # ].copy()
    
    valid_df = df[
    (df['trade_date'] >= '2024-09-01') & 
    (df['trade_date'] <= '2025-09-30')
    ].copy()

    # stock_code 不是688开头的
    valid_df = valid_df[~valid_df['stock_code'].str.startswith('688')].copy()

    valid_df = valid_df[valid_df['auction_volume_ratio_20d'] > 10].copy()

    valid_df = valid_df[valid_df['auction_volume_ratio_20d'] > 10].copy()

    # 计算开盘涨幅
    valid_df['open_pct'] = (valid_df['open'] - valid_df['pre_close']) / valid_df['pre_close'] * 100

    high_open_df = valid_df
    # high_open_df = high_open_df[high_open_df['open_pct'] < 4].copy()
    # high_open_df = high_open_df[high_open_df['open_pct'] >-4].copy()

    high_open_df = high_open_df[high_open_df['open'] < high_open_df['high_limit']].copy()
    # high_open_df = high_open_df[high_open_df['current'] < high_open_df['boll_low']].copy()

    
    # 2. 确保量比数据有效
    valid_high_open_df = high_open_df.dropna(subset=['auction_volume_ratio_20d']).copy()
    
    # 3. 按日筛选5日竞价量比最大的前10只股票
    # 按日期分组，每组内按量比降序排序并取前10
    def get_top10(group):
        # 组内按量比降序排序
        sorted_group = group.sort_values('auction_volume_ratio_20d', ascending=False)
        # 取前10并保留所有字段
        return sorted_group.head(10)
    
    # 应用分组函数
    daily_top10 = valid_high_open_df.groupby('trade_date', group_keys=False).apply(get_top10)
    
    # 4. 提取每日前10股票的当日涨幅(daily_increase)
    # 保留股票代码、日期、量比和涨幅等关键信息
    top10_with_increase = daily_top10[
        ['trade_date', 'stock_code', 'auction_volume_ratio_20d', 'daily_increase']
    ].copy()
    
    # 5. 按日统计涨幅情况
    daily_increase_stats = top10_with_increase.groupby('trade_date')['daily_increase'].agg(
        平均涨幅='mean',
        中位数涨幅='median',
        最大涨幅='max',
        最小涨幅='min',
        上涨数量=lambda x: (x > 0).sum(),
        涨停数量=lambda x: (x >= 9.8).sum(),  # 近似涨停标准
        总数量='count'
    ).reset_index()
    
    # 6. 保存结果
    # 每日前10股票明细（含当日涨幅）
    top10_with_increase.to_parquet(f"{output_path}_top10_with_increase.parquet", index=False)
    # 每日涨幅统计
    daily_increase_stats.to_parquet(f"{output_path}_increase_stats.parquet", index=False)
    
    print(f"每日前10股票及其当日涨幅已保存至：{output_path}_top10_with_increase.parquet")
    print(f"每日涨幅统计结果已保存至：{output_path}_increase_stats.parquet")
    
    # 7. 打印关键统计信息
    print("\n===== 关键统计结果 =====")
    print(f"有效交易日总数：{daily_increase_stats.shape[0]}天")
    print(f"所有交易日平均涨幅：{daily_increase_stats['平均涨幅'].mean():.2f}%")
    print(f"上涨概率：{daily_increase_stats['上涨数量'].sum() / daily_increase_stats['总数量'].sum():.2%}")
    print(f"平均每日涨停数量：{daily_increase_stats['涨停数量'].mean():.2f}只")


if __name__ == "__main__":
    input_file = r"D:\workspace\xiaoyao\data\stock_daily_auction_with_metrics.parquet"
    output_file = r"D:\workspace\xiaoyao\data\daily_top10_increase_analysis"
    analyze_daily_top10_increase(input_file, output_file)
    

每日前10股票及其当日涨幅已保存至：D:\workspace\xiaoyao\data\daily_top10_increase_analysis_top10_with_increase.parquet
每日涨幅统计结果已保存至：D:\workspace\xiaoyao\data\daily_top10_increase_analysis_increase_stats.parquet

===== 关键统计结果 =====
有效交易日总数：257天
所有交易日平均涨幅：-0.22%
上涨概率：48.02%
平均每日涨停数量：0.49只


  daily_top10 = valid_high_open_df.groupby('trade_date', group_keys=False).apply(get_top10)
