In [None]:
import pandas as pd
import numpy as np
import talib as ta

# --------------------------
# 1. Load the data and prepare it
# --------------------------
file_path = r'D:\workspace\xiaoyao\data\widetable.parquet'
# Sort by stock and then by date, renumbering the rows 0..n-1 in that order,
# so the index is unique and each stock's rows are contiguous and date-ordered.
df = pd.read_parquet(file_path).sort_values(
    by=['stock_code', 'date'], ignore_index=True
)
print(f"数据量：{len(df)} 条，索引是否唯一：{df.index.is_unique}")

# --------------------------
# 2. Trend indicators (MA / EMA)
# --------------------------
# Each entry is (output column, TA-Lib function, window length). The indicator
# is evaluated separately for every stock on its 'close' series, and the
# columns are created in the order listed here.
for trend_col, trend_fn, trend_window in (
    ('ma5', ta.SMA, 5),
    ('ma10', ta.SMA, 10),
    ('ma20', ta.SMA, 20),
    ('ma60', ta.SMA, 60),
    ('ema12', ta.EMA, 12),
    ('ema26', ta.EMA, 26),
):
    df[trend_col] = df.groupby('stock_code')['close'].transform(
        # Defaults bind the current loop values to this particular lambda.
        lambda prices, fn=trend_fn, n=trend_window: fn(prices.values, timeperiod=n)
    )

# --------------------------
# 3. Oscillators (RSI)
# --------------------------
# TA-Lib RSI of the close, per stock, for three window lengths. The tuple
# order fixes the column order: rsi14, rsi6, rsi21.
for rsi_window in (14, 6, 21):
    df[f'rsi{rsi_window}'] = df.groupby('stock_code')['close'].transform(
        lambda prices, n=rsi_window: ta.RSI(prices.values, timeperiod=n)
    )

# --------------------------
# 4. MACD
# --------------------------
def macd_vectorized(group):
    """Return a copy of one stock's rows with MACD(12, 26, 9) columns appended.

    Args:
        group: DataFrame with the rows of a single stock (expected to be in
            date order); only its ``close`` column is read.

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``macd_line_calc``, ``signal_line_calc`` and ``macd_hist_calc``.
        ``group`` itself is not modified.
    """
    # TA-Lib only accepts float64 ("double") input arrays.
    close = group['close'].to_numpy(dtype='float64')
    macd_line, signal_line, macd_hist = ta.MACD(
        close, fastperiod=12, slowperiod=26, signalperiod=9
    )
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(
        macd_line_calc=macd_line,
        signal_line_calc=signal_line,
        macd_hist_calc=macd_hist,
    )

# Group by the key values (a NumPy array) instead of by column name, so pandas
# treats 'stock_code' as an ordinary column and keeps it in every group passed
# to the function. Grouping by name makes groupby.apply warn about operating on
# the grouping column and, on versions that exclude it, the result loses
# 'stock_code' and the next groupby('stock_code') fails with KeyError.
# group_keys=False keeps the original row index on the result.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(macd_vectorized)

# --------------------------
# 5. Bollinger Bands
# --------------------------
def bollinger_vectorized(group):
    """Return a copy of one stock's rows with 20-period Bollinger Bands appended.

    Args:
        group: DataFrame with the rows of a single stock (expected to be in
            date order); only its ``close`` column is read.

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``bollinger_upper_calc``, ``bollinger_mid_calc`` and
        ``bollinger_lower_calc``. ``group`` itself is not modified.
    """
    # TA-Lib only accepts float64 ("double") input arrays.
    close = group['close'].to_numpy(dtype='float64')
    upper, mid, lower = ta.BBANDS(
        close, timeperiod=20, nbdevup=2, nbdevdn=2, matype=0  # matype=0 -> SMA
    )
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(
        bollinger_upper_calc=upper,
        bollinger_mid_calc=mid,
        bollinger_lower_calc=lower,
    )

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(bollinger_vectorized)

# --------------------------
# 6. VWAP
# --------------------------
def vwap_vectorized(group):
    """Return a copy of one stock's rows with a running average price appended.

    For every row the day's average traded price is ``money / volume`` (a
    volume of exactly 0 is replaced by 0.0001 to avoid dividing by zero).
    The ``vwap`` column holds the arithmetic mean of those daily prices from
    the first row of ``group`` up to and including the current row.

    NOTE(review): every day carries equal weight here. A textbook cumulative
    VWAP would be ``money.cumsum() / volume.cumsum()``; confirm which one the
    consumers of this column expect before changing the formula.

    Args:
        group: DataFrame with ``volume`` and ``money`` columns for one stock.

    Returns:
        A new DataFrame with the index and columns of ``group`` plus ``vwap``.
        ``group`` itself is not modified.
    """
    safe_volume = group['volume'].replace(0, 0.0001).to_numpy()
    daily_price = group['money'].to_numpy() / safe_volume
    # Running sum divided by the running row count (1, 2, ..., n).
    running_mean = daily_price.cumsum() / np.arange(1, len(group) + 1)
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(vwap=running_mean)

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(vwap_vectorized)

# --------------------------
# 7. Momentum and ROC
# --------------------------
def momentum_vectorized(group):
    """Return a copy of one stock's rows with a 14-row momentum column appended.

    ``momentum14`` is the close minus the close 14 rows earlier; the first 14
    rows have no earlier value and are NaN.

    Args:
        group: DataFrame with a ``close`` column for one stock (expected to be
            in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``momentum14``. ``group`` itself is not modified.
    """
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(momentum14=group['close'].diff(14))

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(momentum_vectorized)

# TA-Lib ROC (rate of change) of the close with timeperiod=10, per stock
df['roc10'] = (
    df.groupby('stock_code')['close']
    .transform(lambda prices: ta.ROC(prices.values, timeperiod=10))
)

# --------------------------
# 8. Volume indicators
# --------------------------
# For the regular volume and the auction volume alike, build two per-stock
# ratios:
#   *_ratio_vs_yesterday : today's value / the previous row's value
#   *_ratio_vs_5d_avg    : today's value / the mean of up to 5 rows ending at
#                          the previous row (rolling mean, then shift(1))
# A denominator of exactly 0 is replaced by 0.0001 before dividing.
for volume_col in ('volume', 'auc_volume'):
    df[f'{volume_col}_ratio_vs_yesterday'] = df.groupby('stock_code')[volume_col].transform(
        lambda x: x.div(x.shift(1).replace(0, 0.0001))
    )
    df[f'{volume_col}_ratio_vs_5d_avg'] = df.groupby('stock_code')[volume_col].transform(
        lambda x: x.div(
            x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
        )
    )

# --------------------------
# 9. Volatility and ATR
# --------------------------
# Custom volatility measure
def volatility_vectorized(group):
    """Return a copy of one stock's rows with a 20-row mean intraday range.

    The daily range is ``(high - low) / open`` (an open of exactly 0 is
    replaced by 0.0001). ``volatility`` is the rolling mean of that range over
    up to 20 rows; with ``min_periods=1`` the early rows average whatever
    history exists.

    Args:
        group: DataFrame with ``open``, ``high`` and ``low`` columns for one
            stock (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``volatility``. ``group`` itself is not modified.
    """
    open_price = group['open'].replace(0, 0.0001)
    daily_range = (group['high'] - group['low']) / open_price
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(
        volatility=daily_range.rolling(window=20, min_periods=1).mean()
    )

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(volatility_vectorized)

# TA-Lib ATR (average true range)
def atr_vectorized(group):
    """Compute the TA-Lib ATR with timeperiod=14 for one stock's rows.

    Args:
        group: DataFrame with ``high``, ``low`` and ``close`` columns.

    Returns:
        A Series named ``atr14`` with one value per row of ``group``. It
        reuses ``group.index`` so it can be aligned back by row label.
    """
    price_arrays = [group[column].values for column in ('high', 'low', 'close')]
    atr_values = ta.ATR(*price_arrays, timeperiod=14)
    return pd.Series(atr_values, index=group.index, name='atr14')

# Compute ATR stock by stock and stitch the pieces back together. Each piece
# keeps its original row labels, so the column assignment aligns on the index.
# groupby.apply is avoided here: when every returned Series shares the same
# index (e.g. the table holds a single stock) apply can stack the results into
# a wide DataFrame instead of one long Series, and it also warns about / drops
# the grouping column. Iterating the groups always yields complete sub-frames.
df['atr14'] = pd.concat(
    [atr_vectorized(stock_rows) for _, stock_rows in df.groupby('stock_code', sort=False)]
)

# --------------------------
# 10. Order-book / activity indicators
# --------------------------
# Total resting volume over the five b*_v and the five a*_v columns
# (presumably bid and ask levels 1-5 -- confirm against the data dictionary).
df['buy_total'] = df[['b1_v', 'b2_v', 'b3_v', 'b4_v', 'b5_v']].sum(axis=1)
df['sell_total'] = df[['a1_v', 'a2_v', 'a3_v', 'a4_v', 'a5_v']].sum(axis=1)
# Order-book volume ratio: buy_total / sell_total, NaN where sell_total is 0.
# Done as one vectorized division; a row-by-row df.apply(axis=1) gives the
# same values but runs a Python-level call for every row of the table.
df['order_book_volume_ratio'] = df['buy_total'] / df['sell_total'].replace(0, np.nan)
# Order-book ratio vs. the previous row. NOTE: the 'obv_' prefix here stands
# for order-book volume, not for the On-Balance Volume stored in 'obv'.
df['obv_ratio_vs_yesterday'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)
# Order-book ratio vs. the mean of up to 5 rows ending at the previous row
df['obv_ratio_vs_5d_avg'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# Turnover-ratio comparisons (zero denominators become NaN)
df['turnover_ratio_vs_yesterday'] = df.groupby('stock_code')['turnover_ratio'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)
df['turnover_ratio_vs_5d_avg'] = df.groupby('stock_code')['turnover_ratio'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# Traded-amount comparisons (zero denominators are replaced by 0.0001)
df['money_ratio_vs_yesterday'] = df.groupby('stock_code')['money'].transform(
    lambda x: x / x.shift(1).replace(0, 0.0001)
)
df['money_ratio_vs_5d_avg'] = df.groupby('stock_code')['money'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
)

# Amplitude: the day's high-low range as a percentage of pre_close
df['amplitude'] = (df['high'] - df['low']) / df['pre_close'] * 100
df['amplitude_vs_yesterday'] = df.groupby('stock_code')['amplitude'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)
df['amplitude_vs_5d_avg'] = df.groupby('stock_code')['amplitude'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# --------------------------
# 11. Money flow / trend strength / volatility risk / price-volume structure
# --------------------------
# OBV (On-Balance Volume)
def obv_vectorized(group):
    """Return a copy of one stock's rows with an On-Balance Volume column.

    Each row contributes ``+volume`` when the close rose versus the previous
    row, ``-volume`` when it fell and 0 when it was unchanged; the first row
    has no predecessor and contributes 0. ``obv`` is the running total.

    Args:
        group: DataFrame with ``close`` and ``volume`` columns for one stock
            (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus ``obv``.
        ``group`` itself is not modified.
    """
    close = group['close'].to_numpy()
    volume = group['volume'].to_numpy()
    # direction[i] is +1 / -1 / 0 for up / down / flat; direction[0] stays 0.
    direction = np.zeros(len(close))
    direction[1:] = np.sign(np.diff(close))
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(obv=(direction * volume).cumsum())

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(obv_vectorized)

# "Main force" net inflow, taken here as buy_total minus sell_total
# (the summed order-book volumes computed earlier).
df['main_force_net_flow'] = df['buy_total'] - df['sell_total']

# ADX (Average Directional Index)
def calculate_adx(group):
    """Return a copy of one stock's rows with ADX, +DI and -DI appended.

    All smoothing uses a plain rolling mean over up to 14 rows
    (``min_periods=1``), not Wilder's recursive smoothing, so the values are
    not expected to match TA-Lib's ADX exactly.

    Args:
        group: DataFrame with ``high``, ``low`` and ``close`` columns for one
            stock (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus ``adx``,
        ``plus_di`` and ``minus_di``. ``group`` itself is not modified.
    """
    high = group['high']
    low = group['low']
    prev_close = group['close'].shift(1)

    # Directional movement. Both sides are compared using the *raw* moves, so
    # when the up move equals the down move neither side counts. (Comparing
    # the down move against an already-zeroed up move would let it win ties.)
    # The first row has no predecessor; its NaN comparisons are False -> 0.
    up_move = high - high.shift(1)
    down_move = low.shift(1) - low
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0)

    # True range: the largest of high-low, |high-prev_close|, |low-prev_close|
    # (NaN candidates on the first row are skipped by max()).
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    atr = true_range.rolling(window=14, min_periods=1).mean()
    plus_di = plus_dm.rolling(window=14, min_periods=1).mean() / atr * 100
    minus_di = minus_dm.rolling(window=14, min_periods=1).mean() / atr * 100

    # Guard the *sum* against zero. Guarding only minus_di would add 0.0001 to
    # the denominator whenever minus_di is 0 and push DX slightly below 100.
    di_sum = (plus_di + minus_di).replace(0, 0.0001)
    dx = (plus_di - minus_di).abs() / di_sum * 100

    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(
        adx=dx.rolling(window=14, min_periods=1).mean(),
        plus_di=plus_di,
        minus_di=minus_di,
    )

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(calculate_adx)

# MA20 slope: row-over-row percentage change of ma20 within each stock
# (a previous value of exactly 0 is replaced by 0.0001 before dividing).
df['ma20_slope'] = df.groupby('stock_code')['ma20'].transform(
    lambda x: (x - x.shift(1)) / x.shift(1).replace(0, 0.0001) * 100
)

# Rolling maximum drawdown (20 rows)
def calculate_rolling_max_drawdown(group):
    """Return a copy of one stock's rows with a rolling max-drawdown column.

    The drawdown of a row is ``(close - peak) / peak`` where ``peak`` is the
    highest close over up to 20 rows ending at that row, so it is always
    <= 0. ``rolling_max_drawdown_20d`` is the most negative drawdown over up
    to 20 rows ending at the current row.

    NOTE(review): the two 20-row windows are chained, so a value can reflect
    closes up to 38 rows back -- confirm that this is the intended lookback.

    Args:
        group: DataFrame with a ``close`` column for one stock (expected to be
            in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``rolling_max_drawdown_20d``. ``group`` itself is not modified.
    """
    close = group['close']
    trailing_peak = close.rolling(window=20, min_periods=1).max()
    drawdown = (close - trailing_peak) / trailing_peak
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(
        rolling_max_drawdown_20d=drawdown.rolling(window=20, min_periods=1).min()
    )

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(calculate_rolling_max_drawdown)

# Price-volume structure: today's volume relative to the mean volume of up to
# 5 rows ending today (today included; a zero mean is replaced by 0.0001).
df['volume_ratio'] = df.groupby('stock_code')['volume'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().replace(0, 0.0001)
)

# Price-volume divergence
def calculate_price_volume_divergence(group):
    """Return a copy of one stock's rows with a price/volume divergence column.

    Over a 5-row look-back the percentage change of the close and the
    percentage change of the volume are computed; ``price_volume_divergence``
    is the price change minus the volume change. A look-back volume of exactly
    0 is replaced by 0.0001. The first 5 rows have no look-back value and are
    NaN.

    Args:
        group: DataFrame with ``close`` and ``volume`` columns for one stock
            (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``price_volume_divergence``. ``group`` itself is not modified.
    """
    period = 5
    price_gain = (group['close'] / group['close'].shift(period) - 1) * 100
    past_volume = group['volume'].shift(period).replace(0, 0.0001)
    volume_gain = (group['volume'] / past_volume - 1) * 100
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(price_volume_divergence=price_gain - volume_gain)

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(calculate_price_volume_divergence)

# --------------------------
# 12. Additional TA-Lib based factors
# --------------------------
# 12.1 Volume-weighted RSI
def calculate_vol_rsi(group):
    """Return a copy of one stock's rows with a volume-weighted RSI appended.

    The RSI (timeperiod=14) is taken over ``close * volume`` divided by the
    total volume of all rows in ``group`` (floored at 0.0001).

    NOTE(review): the divisor is the sum over the whole group, i.e. it also
    includes rows later than the one being computed, and it is the same
    constant for every row -- confirm this is the intended weighting.

    Args:
        group: DataFrame with ``close`` and ``volume`` columns for one stock
            (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``vol_rsi14``. ``group`` itself is not modified.
    """
    # TA-Lib only accepts float64 ("double") input arrays.
    close = group['close'].to_numpy(dtype='float64')
    volume = group['volume'].to_numpy(dtype='float64')
    total_volume = np.maximum(volume.sum(), 0.0001)
    weighted_price = close * volume / total_volume
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(vol_rsi14=ta.RSI(weighted_price, timeperiod=14))

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(calculate_vol_rsi)

# 12.2 DMA: difference between the 10-period and the 50-period EMA of the
# close, computed per stock
df['dma'] = df.groupby('stock_code')['close'].transform(
    lambda x: ta.EMA(x.values, timeperiod=10) - ta.EMA(x.values, timeperiod=50)
)
# DMA signal line: 10-period EMA of the DMA itself
df['dma_signal'] = df.groupby('stock_code')['dma'].transform(
    lambda x: ta.EMA(x.values, timeperiod=10)
)

# 12.3 KDJ (stochastic oscillator)
def calculate_kdj(group):
    """Return a copy of one stock's rows with K, D and J columns appended.

    K and D are the two outputs of TA-Lib ``STOCH`` with fastk_period=9 and
    3-period SMA smoothing for both lines (matype 0); J is ``3 * K - 2 * D``.

    Args:
        group: DataFrame with ``high``, ``low`` and ``close`` columns for one
            stock (expected to be in date order).

    Returns:
        A new DataFrame with the index and columns of ``group`` plus
        ``kdj_k``, ``kdj_d`` and ``kdj_j``. ``group`` itself is not modified.
    """
    # TA-Lib only accepts float64 ("double") input arrays.
    high = group['high'].to_numpy(dtype='float64')
    low = group['low'].to_numpy(dtype='float64')
    close = group['close'].to_numpy(dtype='float64')
    k, d = ta.STOCH(
        high, low, close,
        fastk_period=9, slowk_period=3, slowk_matype=0,
        slowd_period=3, slowd_matype=0
    )
    # assign() builds a new frame: pandas does not support functions that
    # mutate the object handed to them by groupby.apply.
    return group.assign(kdj_k=k, kdj_d=d, kdj_j=3 * k - 2 * d)

# Group by the key values rather than by column name so that 'stock_code'
# stays an ordinary column in every group and in the result; grouping by name
# lets groupby.apply warn about / drop the grouping column, which later
# surfaces as KeyError: 'stock_code'.
df = df.groupby(df['stock_code'].to_numpy(), group_keys=False).apply(calculate_kdj)

# 12.4 10-period EMA of OBV (a smoothed version of the 'obv' column)
df['obv_ema10'] = df.groupby('stock_code')['obv'].transform(
    lambda x: ta.EMA(x.values, timeperiod=10)
)

# 12.5 Standard deviation of the close (TA-Lib STDDEV, timeperiod=20, nbdev=1)
df['price_std20'] = df.groupby('stock_code')['close'].transform(
    lambda x: ta.STDDEV(x.values, timeperiod=20, nbdev=1)
)

# 12.6 TRIX (TA-Lib TRIX of the close, timeperiod=12)
df['trix'] = df.groupby('stock_code')['close'].transform(
    lambda x: ta.TRIX(x.values, timeperiod=12)
)
# TRIX signal line: 9-period EMA of TRIX
df['trix_signal'] = df.groupby('stock_code')['trix'].transform(
    lambda x: ta.EMA(x.values, timeperiod=9)
)

# --------------------------
# 13. Outlier handling
# --------------------------
# Turn +/-inf (produced by divisions above) into NaN in every numeric column.
# 'number' matches all numeric widths; listing only int64/float64 would skip
# e.g. float32 columns and leave their infinities in the saved table.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

# --------------------------
# 14. Save the results
# --------------------------
output_parquet = r'D:\workspace\xiaoyao\data\factortable.parquet'
df.to_parquet(output_parquet, index=False)

# Export a small random sample for a quick visual check. DataFrame.sample
# raises ValueError when asked for more rows than exist, so the sample size
# is capped at the number of rows available.
df_sample = df.sample(min(5, len(df)), random_state=42)
df_sample.to_csv('./sample_factortable.csv', index=False)

print(f"所有指标计算完成！结果已保存至：")
print(f"- Parquet文件：{output_parquet}")
print(f"- 样本CSV：./sample_factortable.csv")

数据量：972326 条，索引是否唯一：True


  df = df.groupby('stock_code', group_keys=False).apply(macd_vectorized)


KeyError: 'stock_code'