In [53]:
import pandas as pd

# Load the wide table from widetable.parquet into `df` and show its column index.
# NOTE(review): the path is an absolute, machine-specific Windows path, so this
# cell only runs on a machine that has this exact directory layout.
file_path = r'D:\workspace\xiaoyao\data\widetable.parquet'
df = pd.read_parquet(file_path)
df.columns

Index(['date', 'stock_code', 'open', 'close', 'low', 'high', 'volume', 'money',
       'factor', 'high_limit', 'low_limit', 'avg', 'pre_close', 'paused',
       'zjw_industry_code', 'zjw_industry_name', 'jq_l1_industry_code',
       'jq_l1_industry_name', 'jq_l2_industry_code', 'jq_l2_industry_name',
       'sw_l1_industry_code', 'sw_l1_industry_name', 'sw_l2_industry_code',
       'sw_l2_industry_name', 'sw_l3_industry_code', 'sw_l3_industry_name',
       'capitalization', 'circulating_cap', 'market_cap',
       'circulating_market_cap', 'turnover_ratio', 'pe_ratio', 'pe_ratio_lyr',
       'pb_ratio', 'ps_ratio', 'pcf_ratio', 'current', 'auc_volume',
       'auc_money', 'a1_p', 'a1_v', 'a2_p', 'a2_v', 'a3_p', 'a3_v', 'a4_p',
       'a4_v', 'a5_p', 'a5_v', 'b1_p', 'b1_v', 'b2_p', 'b2_v', 'b3_p', 'b3_v',
       'b4_p', 'b4_v', 'b5_p', 'b5_v', 'return_0d', 'return_1d', 'return_2d',
       'return_3d', 'return_4d', 'return_5d'],
      dtype='object')

In [54]:
import pandas as pd
import numpy as np

# Order rows by stock code, then by date, and renumber the index 0..n-1 so that
# every per-stock rolling/shift computation below sees rows in time order.
df = df.sort_values(by=['stock_code', 'date'], ignore_index=True)

# 1. Moving averages (MA) of the close price, computed per stock.
# min_periods=1 means the first rows of each stock average over however many
# observations are available instead of being NaN.
for ma_window in (5, 10, 20, 60):
    df[f'ma{ma_window}'] = df.groupby('stock_code')['close'].transform(
        lambda x: x.rolling(window=ma_window, min_periods=1).mean()
    )

# 2. Relative Strength Index (RSI)
def calculate_rsi(series, window=14):
    """Compute a simple-moving-average RSI for one price series.

    Gains and losses are the positive and negative parts of the row-to-row
    difference; each is averaged with a rolling mean over ``window`` rows
    (``min_periods=1``, so early rows use the observations available).

    Args:
        series: pandas Series of prices, already in chronological order.
        window: number of rows in the rolling average.

    Returns:
        pandas Series aligned with ``series``. The value is
        ``100 * avg_gain / (avg_gain + avg_loss)``, which is algebraically
        ``100 - 100 / (1 + avg_gain / avg_loss)`` but needs no epsilon for
        ``avg_loss == 0``: a window with gains and no losses is exactly 100.
        A window with no movement at all (including the first row) is 0.
    """
    delta = series.diff()
    # A NaN delta (the first row, or gaps in the data) counts as "no move":
    # `where` substitutes 0 wherever the comparison is False, NaN included.
    gain = delta.where(delta > 0, 0)
    loss = -delta.where(delta < 0, 0)

    avg_gain = gain.rolling(window=window, min_periods=1).mean()
    avg_loss = loss.rolling(window=window, min_periods=1).mean()

    total_move = avg_gain + avg_loss
    rsi = 100 * avg_gain / total_move
    # 0/0 (flat window) would be NaN; report 0 for it. NaN totals are left as NaN.
    return rsi.mask(total_move == 0, 0.0)

# 14-row RSI of the close price, evaluated independently for each stock.
close_by_stock = df.groupby('stock_code')['close']
df['rsi14'] = close_by_stock.transform(
    lambda prices: calculate_rsi(prices, window=14)
)

# 3. MACD indicator
def calculate_macd(series, fast_period=12, slow_period=26, signal_period=9):
    """Compute MACD line, signal line and histogram for one price series.

    All smoothing uses ``ewm(span=..., adjust=False).mean()``.

    Args:
        series: pandas Series of prices in chronological order.
        fast_period: span of the fast EMA.
        slow_period: span of the slow EMA.
        signal_period: span of the EMA applied to the MACD line.

    Returns:
        DataFrame indexed like ``series`` with columns ``macd_line``
        (fast EMA minus slow EMA), ``signal_line`` (EMA of ``macd_line``)
        and ``macd_hist`` (``macd_line`` minus ``signal_line``).
    """
    def _ema(values, span):
        # Recursive (adjust=False) exponential moving average.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast_period) - _ema(series, slow_period)
    signal_line = _ema(macd_line, signal_period)
    return pd.DataFrame({
        'macd_line': macd_line,
        'signal_line': signal_line,
        'macd_hist': macd_line - signal_line,
    })

# Run the MACD calculation per stock and attach the three result columns.
# The per-stock frames come back stacked under an extra outer `stock_code`
# index level; dropping that level restores the original row index so the
# join lines up row by row.
macd_results = df.groupby('stock_code')['close'].apply(calculate_macd)
df = df.join(macd_results.droplevel(0))

# 4. Bollinger Bands
def calculate_bollinger_bands(series, window=20, num_std=2):
    """Compute Bollinger Bands for one price series.

    Args:
        series: pandas Series of prices in chronological order.
        window: rolling window length (``min_periods=1``).
        num_std: number of rolling standard deviations between the middle
            band and each outer band.

    Returns:
        DataFrame indexed like ``series`` with columns ``bollinger_mid``
        (rolling mean), ``bollinger_upper`` and ``bollinger_lower``.
        A rolling standard deviation of exactly 0 is replaced by 0.0001
        before the bands are built; a NaN standard deviation (e.g. a single
        observation) leaves the outer bands NaN.
    """
    roller = series.rolling(window=window, min_periods=1)
    mid_band = roller.mean()
    band_width = roller.std().replace(0, 0.0001) * num_std

    return pd.DataFrame({
        'bollinger_mid': mid_band,
        'bollinger_upper': mid_band + band_width,
        'bollinger_lower': mid_band - band_width,
    })

# Run the Bollinger Band calculation per stock and attach the three bands.
# As with MACD, the outer `stock_code` index level added by the grouped apply
# is dropped so the join aligns on the original row index.
bollinger_results = df.groupby('stock_code')['close'].apply(calculate_bollinger_bands)
df = df.join(bollinger_results.droplevel(0))

# 5. Volume-weighted average price (VWAP)
def calculate_vwap(group):
    """Compute the cumulative volume-weighted average price for one stock.

    The value at each row is (cumulative ``money``) / (cumulative ``volume``)
    over all rows up to and including that row, so days with more volume
    weigh more. The earlier implementation averaged the daily
    ``money / volume`` ratios with equal weight, which is not volume
    weighted, and let zero-volume days pull the average toward 0.

    Args:
        group: DataFrame for a single stock, in chronological order, with
            ``money`` and ``volume`` columns.

    Returns:
        pandas Series indexed like ``group``. Rows before any volume has
        accumulated are NaN; zero-volume rows carry the previous value.
    """
    cum_money = group['money'].cumsum()
    cum_volume = group['volume'].cumsum()
    # Mask a zero (or negative) running volume so the division yields NaN
    # rather than inf or an arbitrary epsilon-scaled number.
    return cum_money / cum_volume.where(cum_volume > 0)

# Per-stock VWAP. Only the two columns the function reads are handed to
# `apply`, so the call no longer operates on the grouping column itself
# (pandas emits a DeprecationWarning for that usage).
df['vwap'] = df.groupby('stock_code', group_keys=False)[['money', 'volume']].apply(
    lambda x: calculate_vwap(x)
)

# 6. Momentum
def calculate_momentum(series, period=14):
    """Return how much ``series`` changed relative to ``period`` rows earlier.

    Args:
        series: pandas Series in chronological order.
        period: how many rows back the reference value lies.

    Returns:
        pandas Series of ``series`` minus its ``period``-row lag; rows that
        have no lagged value are NaN.
    """
    lagged = series.shift(period)
    return series - lagged

# 14-row momentum of the close price, evaluated independently for each stock.
close_by_stock = df.groupby('stock_code')['close']
df['momentum14'] = close_by_stock.transform(
    lambda prices: calculate_momentum(prices, period=14)
)

# 7. Volume ratios (per stock)
# A zero denominator means there is no meaningful reference volume, so the
# ratio is left as NaN. Substituting a tiny epsilon instead would produce an
# arbitrarily huge ratio that passes any "> 1" screen applied later.

# 7.1 Today's volume relative to the previous row's volume
df['volume_ratio_vs_yesterday'] = df.groupby('stock_code')['volume'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)

# 7.2 Today's volume relative to the mean volume of the preceding rows
#     (up to 5 of them; the shift(1) excludes today from the average)
df['volume_ratio_vs_5d_avg'] = df.groupby('stock_code')['volume'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# 8. Auction volume ratios (per stock)
# As with the regular volume ratios, a zero denominator yields NaN rather
# than an epsilon-inflated number that would pass any "> 1" screen.

# 8.1 Today's auction volume relative to the previous row's auction volume
df['auc_volume_ratio_vs_yesterday'] = df.groupby('stock_code')['auc_volume'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)

# 8.2 Today's auction volume relative to the mean auction volume of the
#     preceding rows (up to 5 of them; the shift(1) excludes today)
df['auc_volume_ratio_vs_5d_avg'] = df.groupby('stock_code')['auc_volume'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# 9. Volatility
def calculate_volatility(series, window=20):
    """Compute the rolling mean of the daily (high - low) / open range.

    Args:
        series: despite the name, a DataFrame for a single stock in
            chronological order with ``open``, ``high`` and ``low`` columns.
        window: rolling window length (``min_periods=1``).

    Returns:
        pandas Series indexed like the input. An ``open`` of exactly 0 is
        replaced by 0.0001 before dividing.
    """
    safe_open = series['open'].replace(0, 0.0001)
    intraday_spread = series['high'] - series['low']
    relative_range = intraday_spread / safe_open
    return relative_range.rolling(window=window, min_periods=1).mean()

# Per-stock volatility. Only the three price columns the function reads are
# handed to `apply`, so the call no longer operates on the grouping column
# itself (pandas emits a DeprecationWarning for that usage).
df['volatility'] = df.groupby('stock_code', group_keys=False)[['open', 'high', 'low']].apply(
    lambda x: calculate_volatility(x, window=20)
)

# Total quoted volume across the five bid levels and the five ask levels.
df['buy_total'] = df['b1_v'] + df['b2_v'] + df['b3_v'] + df['b4_v'] + df['b5_v']
df['sell_total'] = df['a1_v'] + df['a2_v'] + df['a3_v'] + df['a4_v'] + df['a5_v']

# Same-day order-book volume ratio (bid total / ask total).
# `where` blanks out a zero ask total so the division yields NaN for it; this
# is the vectorized equivalent of testing each row individually and avoids a
# slow row-by-row `DataFrame.apply(axis=1)`.
df['order_book_volume_ratio'] = df['buy_total'] / df['sell_total'].where(df['sell_total'] != 0)

# Order-book ratio relative to the previous row's ratio (today / yesterday),
# computed per stock.
df['obv_ratio_vs_yesterday'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda ratio: ratio.div(ratio.shift(1))
)

# Order-book ratio relative to the mean ratio of the preceding rows (up to 5;
# the shift(1) keeps today's value out of the average), computed per stock.
df['obv_ratio_vs_5d_avg'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda ratio: ratio.div(ratio.rolling(window=5, min_periods=1).mean().shift(1))
)

# Clean up division artefacts: positive and negative infinity become NaN.
df = df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

  df['vwap'] = df.groupby('stock_code', group_keys=False).apply(
  df['volatility'] = df.groupby('stock_code', group_keys=False).apply(


In [55]:
# Preview the first three rows of the frame, including the indicator columns added above.
df.head(3)

Unnamed: 0,date,stock_code,open,close,low,high,volume,money,factor,high_limit,...,volume_ratio_vs_yesterday,volume_ratio_vs_5d_avg,auc_volume_ratio_vs_yesterday,auc_volume_ratio_vs_5d_avg,volatility,buy_total,sell_total,order_book_volume_ratio,obv_ratio_vs_yesterday,obv_ratio_vs_5d_avg
0,2025-01-02,000001.XSHE,1630.12,1588.43,1582.87,1635.68,1309344.0,2102923000.0,138.970157,1788.55,...,,,,,0.032396,1177400.0,1063700.0,1.106891,,
1,2025-01-03,000001.XSHE,1589.82,1581.48,1578.7,1603.72,830884.0,1320521000.0,138.970157,1746.85,...,0.63458,0.63458,0.947729,0.947729,0.024067,920500.0,385400.0,2.388428,2.15778,2.15778
2,2025-01-06,000001.XSHE,1581.48,1589.82,1559.25,1595.38,781129.0,1234306000.0,138.970157,1739.91,...,0.940118,0.729949,1.204702,1.172371,0.02366,801300.0,120695.0,6.639049,2.779673,3.798823


In [56]:
# Chinese descriptions of the original (source) columns, keyed by column name.
original_fields = {
    'date': '日期',
    'stock_code': '股票代码',
    'open': '开盘价',
    'close': '收盘价',
    'low': '最低价',
    'high': '最高价',
    'volume': '成交量',
    'money': '成交额',
    'factor': '复权因子',
    'high_limit': '涨停价',
    'low_limit': '跌停价',
    'avg': '平均价',
    'pre_close': '前收盘价',
    'paused': '是否停牌(1=停牌,0=正常)',
    'zjw_industry_code': '证监会行业代码',
    'zjw_industry_name': '证监会行业名称',
    'jq_l1_industry_code': '聚宽一级行业代码',
    'jq_l1_industry_name': '聚宽一级行业名称',
    'jq_l2_industry_code': '聚宽二级行业代码',
    'jq_l2_industry_name': '聚宽二级行业名称',
    'sw_l1_industry_code': '申万一级行业代码',
    'sw_l1_industry_name': '申万一级行业名称',
    'sw_l2_industry_code': '申万二级行业代码',
    'sw_l2_industry_name': '申万二级行业名称',
    'sw_l3_industry_code': '申万三级行业代码',
    'sw_l3_industry_name': '申万三级行业名称',
    'capitalization': '总股本(股)',
    'circulating_cap': '流通股本(股)',
    'market_cap': '总市值',
    'circulating_market_cap': '流通市值',
    'turnover_ratio': '换手率(%)',
    'pe_ratio': '动态市盈率',
    'pe_ratio_lyr': '静态市盈率',
    'pb_ratio': '市净率',
    'ps_ratio': '市销率',
    'pcf_ratio': '市现率',
    'current': '最新价(当前价)',
    'auc_volume': '竞价成交量',
    'auc_money': '竞价成交额',
    'a1_p': '卖一价',
    'a1_v': '卖一量',
    'a2_p': '卖二价',
    'a2_v': '卖二量',
    'a3_p': '卖三价',
    'a3_v': '卖三量',
    'a4_p': '卖四价',
    'a4_v': '卖四量',
    'a5_p': '卖五价',
    'a5_v': '卖五量',
    'b1_p': '买一价',
    'b1_v': '买一量',
    'b2_p': '买二价',
    'b2_v': '买二量',
    'b3_p': '买三价',
    'b3_v': '买三量',
    'b4_p': '买四价',
    'b4_v': '买四量',
    'b5_p': '买五价',
    'b5_v': '买五量',
    'return_0d': '当日收益率',
    'return_1d': '1日收益率',
    'return_2d': '2日收益率',
    'return_3d': '3日收益率',
    'return_4d': '4日收益率',
    'return_5d': '5日收益率'
}

# Chinese descriptions of the derived technical-indicator columns, keyed by column name.
technical_indicators = {
    # Moving averages
    'ma5': '5日移动平均线',
    'ma10': '10日移动平均线',
    'ma20': '20日移动平均线',
    'ma60': '60日移动平均线',
    
    # Relative Strength Index
    'rsi14': '14日相对强弱指数',
    
    # MACD
    'macd_line': 'MACD线',
    'signal_line': '信号线',
    'macd_hist': 'MACD柱状图',
    
    # Bollinger Bands
    'bollinger_mid': '布林带中轨',
    'bollinger_upper': '布林带上轨',
    'bollinger_lower': '布林带下轨',
    
    # Volume-weighted average price
    'vwap': '成交量加权平均价',
    
    # Momentum
    'momentum14': '14日动量指标',
    
    # Volume ratios
    'volume_ratio_vs_yesterday': '成交量与昨日比',
    'volume_ratio_vs_5d_avg': '成交量与5日均量比',
    
    # Auction volume ratios
    'auc_volume_ratio_vs_yesterday': '竞价量与昨日竞价量比',
    'auc_volume_ratio_vs_5d_avg': '竞价量与5日均竞价量比',
    
    # Volatility
    'volatility': '20日价格波动率'
}

# Chinese descriptions of the order-book derived columns, keyed by column name.
new_indicators = {
    'buy_total': '买1至买5总数量',
    'sell_total': '卖1至卖5总数量',
    'order_book_volume_ratio': '当日盘口量比（买盘总量/卖盘总量）',
    'obv_ratio_vs_yesterday': '盘口量比与昨日比',
    'obv_ratio_vs_5d_avg': '盘口量比与5日均比'
}
# Merge the three description dicts into one lookup table; on a duplicate key
# the later dict's value wins.
all_fields = {** original_fields, **technical_indicators, **new_indicators}

# The Chinese description of any column can be looked up by name,
# e.g. print(all_fields['close'])  # prints: 收盘价
print(all_fields['obv_ratio_vs_5d_avg'])  # prints: 盘口量比与5日均比

盘口量比与5日均比


In [57]:
import pandas as pd

# Keep rows ordered by stock code and date with a fresh 0..n-1 index.
df = df.sort_values(by=['stock_code', 'date'], ignore_index=True)

# 1. A row meets the single-day condition when all three day-over-day ratios
#    exceed 1: traded volume, auction volume, and order-book volume ratio.
#    NaN ratios compare as False and therefore never satisfy the condition.
ratio_columns = [
    'volume_ratio_vs_yesterday',
    'auc_volume_ratio_vs_yesterday',
    'obv_ratio_vs_yesterday',
]
df['condition_met'] = df[ratio_columns].gt(1).all(axis=1)

# 2. Within one stock's rows, flag every row that belongs to a run of at least
#    three consecutive rows meeting the condition.
def find_consecutive_days(group):
    """Flag rows that are part of a run of 3+ consecutive ``condition_met`` rows.

    Runs are detected by row position, so the result does not depend on the
    index labels being consecutive integers. The input frame is not modified;
    a new frame is returned.

    Note that the first rows of a run are flagged only because later rows
    also met the condition, so the flag is not a point-in-time signal for
    those rows.

    Args:
        group: DataFrame for a single stock, in chronological order, with a
            boolean ``condition_met`` column.

    Returns:
        A copy of ``group`` with an added boolean ``consecutive_3d`` column.
    """
    met = group['condition_met'].astype(bool)

    # A new run starts wherever the flag differs from the previous row; the
    # cumulative count of such changes gives every row a run identifier.
    run_id = met.astype(int).diff().ne(0).cumsum()
    # Length of the run each row belongs to.
    run_length = run_id.map(run_id.value_counts())

    return group.assign(consecutive_3d=met & (run_length >= 3))

# Evaluate the run detection per stock. Only `condition_met` is handed to the
# function, so the call does not operate on the grouping column (pandas
# deprecates that usage and excluding the column would drop `stock_code` from
# the result). The resulting flag is aligned back onto `df` by row index, which
# leaves every existing column of `df` in place.
flagged = (
    df.groupby('stock_code', group_keys=False)[['condition_met']]
    .apply(find_consecutive_days)
)
df['consecutive_3d'] = flagged['consecutive_3d']

# 3. Keep only the rows that belong to a run of three or more qualifying days
result = df[df['consecutive_3d'] == True]

# Report how many rows were found and preview the ratios behind them
print(f"共找到 {len(result)} 条连续3日满足条件的记录")
print(result[['stock_code', 'date', 'volume_ratio_vs_yesterday', 
             'auc_volume_ratio_vs_yesterday', 'obv_ratio_vs_yesterday']].head())

共找到 668 条连续3日满足条件的记录
       stock_code        date  volume_ratio_vs_yesterday  \
316   000002.XSHE  2025-07-21                   1.296294   
317   000002.XSHE  2025-07-22                   1.999714   
318   000002.XSHE  2025-07-23                   1.147005   
9073  000088.XSHE  2025-06-30                   1.256028   
9074  000088.XSHE  2025-07-01                   1.038599   

      auc_volume_ratio_vs_yesterday  obv_ratio_vs_yesterday  
316                        1.211321                1.107919  
317                        1.179128                2.088989  
318                        5.127917                1.036325  
9073                       4.912281                1.143004  
9074                       1.353571                1.510731  


  df = df.groupby('stock_code', group_keys=False).apply(find_consecutive_days)


In [58]:
# Mean of each return column (return_0d .. return_5d) over the flagged rows,
# printed one per line in horizon order.
for horizon in range(6):
    print(result[f'return_{horizon}d'].mean())



0.015375062678084191
0.025776693824181473
0.028822406787693242
0.0316775997910966
0.032953242759248436
0.0331234379511314


In [59]:
# Collect the distinct dates present in the flagged rows ...
dates = result['date'].unique()
# ... and print them in ascending order.
sorted_dates = sorted(dates)
print(sorted_dates)


['2025-01-03', '2025-01-06', '2025-01-07', '2025-01-08', '2025-01-09', '2025-01-10', '2025-01-13', '2025-01-14', '2025-01-15', '2025-01-16', '2025-01-17', '2025-01-20', '2025-01-21', '2025-01-22', '2025-01-23', '2025-01-24', '2025-01-27', '2025-02-05', '2025-02-06', '2025-02-07', '2025-02-10', '2025-02-11', '2025-02-12', '2025-02-13', '2025-02-14', '2025-02-17', '2025-02-18', '2025-02-19', '2025-02-20', '2025-02-21', '2025-02-24', '2025-02-25', '2025-02-26', '2025-02-27', '2025-02-28', '2025-03-03', '2025-03-04', '2025-03-05', '2025-03-06', '2025-03-07', '2025-03-10', '2025-03-11', '2025-03-12', '2025-03-13', '2025-03-14', '2025-03-17', '2025-03-19', '2025-03-20', '2025-03-21', '2025-03-24', '2025-03-26', '2025-03-27', '2025-03-28', '2025-03-31', '2025-04-01', '2025-04-02', '2025-04-03', '2025-04-07', '2025-04-08', '2025-04-09', '2025-04-18', '2025-04-21', '2025-04-22', '2025-04-23', '2025-04-24', '2025-04-25', '2025-04-28', '2025-04-29', '2025-04-30', '2025-05-06', '2025-05-07', '2025

In [60]:
# Sort the flagged rows by date and stock code, renumber the index, and write
# them to result.parquet in the current working directory.
# NOTE(review): the per-day aggregation of return_4d (mean, count, stock codes)
# that the original comment described is not performed in this cell.
result = result.sort_values(by=['date', 'stock_code']).reset_index(drop=True)
result.to_parquet('result.parquet', index=False)


In [64]:
# Average return_1d over the flagged rows of each date: one row per date with
# columns `date` and `return_1d`.
# NOTE: this rebinds `df`, which previously held the full indicator table, so
# earlier cells cannot be re-run against `df` after this point.
df = result.groupby('date', as_index=False)['return_1d'].mean()
# Compound (1 + 0.5 * return_1d) across dates to get the cumulative series.
df['收益率'] = df['return_1d'].mul(0.5).add(1).cumprod()

In [65]:
# Display the per-date mean return_1d together with the compounded series.
df

Unnamed: 0,date,return_1d,收益率
0,2025-01-03,0.077637,1.038818
1,2025-01-06,0.052237,1.065951
2,2025-01-07,0.009740,1.071142
3,2025-01-08,0.016452,1.079953
4,2025-01-09,-0.006915,1.076220
...,...,...,...
163,2025-09-22,0.097882,9.475111
164,2025-09-24,0.129274,10.087555
165,2025-09-25,0.041317,10.295947
166,2025-09-26,0.022556,10.412067
