In [None]:
# 从Jupyter Notebook转换而来的Python代码
# 原始文件：D:\workspace\xiaoyao\works\preprocessor\factors.ipynb

import pandas as pd
import numpy as np

# Load the wide table from its parquet file.
file_path = r'D:\workspace\xiaoyao\data\widetable.parquet'

# --------------------------
# Preparation: sort rows and reset the index
# --------------------------
# The per-stock shift()/rolling() computations in this script work on row
# order inside each stock_code group, so rows are ordered by stock and date
# and given a fresh 0..n-1 index.
df = (
    pd.read_parquet(file_path)
    .sort_values(by=['stock_code', 'date'])
    .reset_index(drop=True)
)

# --------------------------
# 1. Trend indicators: moving averages (MA)
# --------------------------
# min_periods=1: the first rows of each stock average over however many closes
# are available so far instead of producing NaN.
for ma_window in (5, 10, 20, 60):
    df[f'ma{ma_window}'] = df.groupby('stock_code')['close'].transform(
        # The window is bound as a default argument so the lambda never
        # depends on the loop variable's later values.
        lambda closes, span=ma_window: closes.rolling(window=span, min_periods=1).mean()
    )

# --------------------------
# 2. Oscillator: Relative Strength Index (RSI)
# --------------------------
def calculate_rsi(series, window=14):
    """Return an RSI built from simple rolling means of gains and losses.

    Args:
        series: Price series for a single instrument; consecutive rows are
            treated as consecutive periods.
        window: Number of rows averaged for the gain and loss means.

    Returns:
        Series aligned with ``series`` containing ``100 - 100 / (1 + RS)``.
    """
    change = series.diff()
    # Comparisons against NaN are False, so the first row (no previous price)
    # is counted as a zero gain and a zero loss.
    ups = change.where(change.gt(0), 0)
    downs = change.where(change.lt(0), 0).mul(-1)
    mean_up = ups.rolling(window=window, min_periods=1).mean()
    # A zero average loss is swapped for a small epsilon so the division
    # below stays finite.
    mean_down = downs.rolling(window=window, min_periods=1).mean().replace(0, 0.0001)
    relative_strength = mean_up / mean_down
    return 100 - 100 / (1 + relative_strength)

# 14-row RSI of the close price, computed separately for every stock and
# written back aligned to the original rows.
df['rsi14'] = (
    df.groupby('stock_code')['close']
    .transform(lambda closes: calculate_rsi(closes, window=14))
)

# --------------------------
# 3. Trend indicator: MACD
# --------------------------
def calculate_macd(series, fast_period=12, slow_period=26, signal_period=9):
    """Compute the MACD line, its signal line and the histogram.

    Args:
        series: Price series for a single instrument.
        fast_period: Span of the fast exponential moving average.
        slow_period: Span of the slow exponential moving average.
        signal_period: Span of the EMA applied to the MACD line.

    Returns:
        DataFrame indexed like ``series`` with the columns ``macd_line``,
        ``signal_line`` and ``macd_hist`` (MACD line minus signal line).
    """
    def _ema(values, span):
        # adjust=False selects the recursive EMA form.
        return values.ewm(span=span, adjust=False).mean()

    macd_line = _ema(series, fast_period) - _ema(series, slow_period)
    signal_line = _ema(macd_line, signal_period)
    return pd.DataFrame({
        'macd_line': macd_line,
        'signal_line': signal_line,
        'macd_hist': macd_line - signal_line,
    })

# One MACD frame per stock; the result carries stock_code as an extra outer
# index level, which is dropped again so the join aligns on the row index.
macd_results = df.groupby('stock_code')['close'].apply(calculate_macd)
macd_by_row = macd_results.reset_index(level=0, drop=True)
# rsuffix only matters if df already has a column with one of these names.
df = df.join(macd_by_row, rsuffix='_calc')

# --------------------------
# 4. Volatility indicator: Bollinger Bands
# --------------------------
def calculate_bollinger_bands(series, window=20, num_std=2):
    """Compute Bollinger Bands around a rolling mean.

    Args:
        series: Price series for a single instrument.
        window: Number of rows in the rolling window.
        num_std: Band half-width expressed in rolling standard deviations.

    Returns:
        DataFrame indexed like ``series`` with the columns ``bollinger_mid``,
        ``bollinger_upper`` and ``bollinger_lower``.
    """
    roll = series.rolling(window=window, min_periods=1)
    mid = roll.mean()
    # A rolling std of exactly 0 is replaced by 0.0001. A window holding a
    # single observation has no sample std, so its bands come out as NaN.
    half_width = roll.std().replace(0, 0.0001) * num_std
    return pd.DataFrame({
        'bollinger_mid': mid,
        'bollinger_upper': mid + half_width,
        'bollinger_lower': mid - half_width,
    })

# One Bollinger frame per stock; the stock_code index level added by the
# grouped apply is dropped so the join aligns on the row index.
bollinger_results = df.groupby('stock_code')['close'].apply(calculate_bollinger_bands)
bollinger_by_row = bollinger_results.reset_index(level=0, drop=True)
# rsuffix only matters if df already has a column with one of these names.
df = df.join(bollinger_by_row, rsuffix='_calc')

# --------------------------
# 5. Price/volume indicator: VWAP
# --------------------------
def calculate_vwap(group):
    """Return the running mean of each row's ``money / volume`` for one stock.

    NOTE(review): every row gets equal weight here; a cumulative
    volume-weighted price would be cumsum(money) / cumsum(volume). Confirm
    which of the two is intended.

    Args:
        group: Rows of a single stock with ``money`` and ``volume`` columns.

    Returns:
        Series aligned with ``group``.
    """
    # Zero volume is replaced by a small epsilon to avoid dividing by zero.
    safe_volume = group['volume'].replace(0, 0.0001)
    daily_price = group['money'] / safe_volume
    row_count = np.arange(1, len(group) + 1)
    return daily_price.cumsum() / row_count

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_vwap reads keeps the grouping column out of each group
# and works on every pandas version.
df['vwap'] = (
    df.groupby('stock_code', group_keys=False)[['money', 'volume']]
    .apply(calculate_vwap)
)

# --------------------------
# 6. Trend indicator: Momentum
# --------------------------
def calculate_momentum(series, period=14):
    """Return the difference between each value and the one ``period`` rows earlier.

    Args:
        series: Price series for a single instrument.
        period: Look-back distance in rows.

    Returns:
        Series aligned with ``series``; the first ``period`` rows are NaN
        because they have no earlier value to compare against.
    """
    lagged = series.shift(period)
    return series.sub(lagged)

df['momentum14'] = df.groupby('stock_code')['close'].transform(
    lambda closes: calculate_momentum(closes, period=14)
)

# --------------------------
# 7. Volume indicators: traded-volume comparisons
# 8. Volume indicators: auction-volume comparisons
# --------------------------
# Both source columns get the same two ratios, computed per stock:
#   *_ratio_vs_yesterday: value / previous row's value
#   *_ratio_vs_5d_avg:    value / mean of the (up to) 5 rows before it
# The rolling mean is shifted by one row so the current row is not part of its
# own baseline; a zero baseline is replaced by 0.0001 before dividing.
for qty_col in ('volume', 'auc_volume'):
    df[f'{qty_col}_ratio_vs_yesterday'] = df.groupby('stock_code')[qty_col].transform(
        lambda qty: qty / qty.shift(1).replace(0, 0.0001)
    )
    df[f'{qty_col}_ratio_vs_5d_avg'] = df.groupby('stock_code')[qty_col].transform(
        lambda qty: qty / qty.rolling(window=5, min_periods=1).mean().shift(1).replace(0, 0.0001)
    )

# --------------------------
# 9. Volatility indicator: rolling relative intraday range
# --------------------------
def calculate_volatility(series, window=20):
    """Return the rolling mean of ``(high - low) / open`` for one stock.

    Args:
        series: Despite the name, a DataFrame of one stock's rows with
            ``open``, ``high`` and ``low`` columns.
        window: Number of rows in the rolling window.

    Returns:
        Series aligned with the input rows.
    """
    # A zero open is replaced by a small epsilon to avoid dividing by zero.
    safe_open = series['open'].replace(0, 0.0001)
    intraday_span = series['high'].sub(series['low'])
    relative_range = intraday_span.div(safe_open)
    return relative_range.rolling(window=window, min_periods=1).mean()

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_volatility reads keeps the grouping column out of each
# group and works on every pandas version.
df['volatility'] = (
    df.groupby('stock_code', group_keys=False)[['open', 'high', 'low']]
    .apply(lambda group: calculate_volatility(group, window=20))
)

# --------------------------
# 10. Order-book indicators: five-level bid/ask volume ratio
# --------------------------
# Explicit additions (rather than sum(axis=1)) so a missing level makes the
# total NaN instead of being silently skipped.
df['buy_total'] = df['b1_v'] + df['b2_v'] + df['b3_v'] + df['b4_v'] + df['b5_v']
df['sell_total'] = df['a1_v'] + df['a2_v'] + df['a3_v'] + df['a4_v'] + df['a5_v']

# Vectorised form of "buy_total / sell_total, NaN when sell_total is 0":
# masking the zero denominators to NaN yields the same values as a row-by-row
# df.apply(axis=1) without building a Series for every row.
df['order_book_volume_ratio'] = df['buy_total'] / df['sell_total'].where(df['sell_total'] != 0)

# "obv" in the next two column names abbreviates order_book_volume_ratio; it is
# unrelated to the On-Balance Volume column `obv`.
df['obv_ratio_vs_yesterday'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda x: x / x.shift(1).replace(0, np.nan)
)
# Baseline is the mean of the (up to) 5 rows before the current one.
df['obv_ratio_vs_5d_avg'] = df.groupby('stock_code')['order_book_volume_ratio'].transform(
    lambda x: x / x.rolling(window=5, min_periods=1).mean().shift(1).replace(0, np.nan)
)

# --------------------------
# 11. Activity indicators
# --------------------------
def _ratio_to_previous(values, zero_fill):
    # Value divided by the previous row's value; a zero baseline becomes zero_fill.
    return values / values.shift(1).replace(0, zero_fill)


def _ratio_to_prior_5d_mean(values, zero_fill):
    # Value divided by the mean of the (up to) 5 rows before it; the shift
    # keeps the current row out of its own baseline.
    return values / values.rolling(window=5, min_periods=1).mean().shift(1).replace(0, zero_fill)


# Turnover ratio: a zero baseline yields NaN.
df['turnover_ratio_vs_yesterday'] = df.groupby('stock_code')['turnover_ratio'].transform(
    lambda s: _ratio_to_previous(s, np.nan)
)
df['turnover_ratio_vs_5d_avg'] = df.groupby('stock_code')['turnover_ratio'].transform(
    lambda s: _ratio_to_prior_5d_mean(s, np.nan)
)

# Traded amount: a zero baseline is replaced by 0.0001 instead of NaN.
df['money_ratio_vs_yesterday'] = df.groupby('stock_code')['money'].transform(
    lambda s: _ratio_to_previous(s, 0.0001)
)
df['money_ratio_vs_5d_avg'] = df.groupby('stock_code')['money'].transform(
    lambda s: _ratio_to_prior_5d_mean(s, 0.0001)
)

# Amplitude: the high-low range as a percentage of pre_close.
df['amplitude'] = (df['high'] - df['low']) / df['pre_close'] * 100
df['amplitude_vs_yesterday'] = df.groupby('stock_code')['amplitude'].transform(
    lambda s: _ratio_to_previous(s, np.nan)
)
df['amplitude_vs_5d_avg'] = df.groupby('stock_code')['amplitude'].transform(
    lambda s: _ratio_to_prior_5d_mean(s, np.nan)
)

# --------------------------
# 12. Money-flow indicators
# --------------------------
def calculate_obv(group):
    """Return the On-Balance Volume (OBV) running total for one stock.

    The first row is 0. Each later row adds its volume when the close is above
    the previous close, subtracts it when the close is below, and leaves the
    total unchanged otherwise (including when either close is NaN).

    Args:
        group: Rows of a single stock with ``close`` and ``volume`` columns.

    Returns:
        float64 Series indexed like ``group``.
    """
    closes = group['close'].to_numpy(dtype='float64')
    # Cast to float before negating so unsigned integer volumes cannot wrap.
    volumes = group['volume'].to_numpy(dtype='float64')

    steps = np.zeros(len(group), dtype='float64')
    if len(group) > 1:
        rose = closes[1:] > closes[:-1]
        fell = closes[1:] < closes[:-1]
        steps[1:] = np.where(rose, volumes[1:], np.where(fell, -volumes[1:], 0.0))

    # np.cumsum (unlike Series.cumsum) carries a NaN step into every later
    # total, which is what a plain running sum does.
    return pd.Series(np.cumsum(steps), index=group.index)

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_obv reads keeps the grouping column out of each group
# and works on every pandas version.
df['obv'] = (
    df.groupby('stock_code', group_keys=False)[['close', 'volume']]
    .apply(calculate_obv)
)
# Difference between the summed five-level bid and ask volumes.
df['main_force_net_flow'] = df['buy_total'] - df['sell_total']

# --------------------------
# 13. Trend-strength indicators
# --------------------------
def calculate_adx(group, window=14):
    """Compute ADX, +DI and -DI for one stock using simple rolling means.

    Args:
        group: Rows of a single stock with ``high``, ``low`` and ``close``
            columns.
        window: Number of rows in every rolling mean.

    Returns:
        DataFrame indexed like ``group`` with the columns ``adx``,
        ``plus_di`` and ``minus_di``.
    """
    high = group['high']
    low = group['low']
    prev_close = group['close'].shift(1)

    # Raw directional moves. Both DM series are derived from these untouched
    # values so that zeroing one side cannot influence the other; when the two
    # moves are equal neither side counts.
    up_move = high - high.shift(1)
    down_move = low.shift(1) - low
    plus_dm = up_move.where((up_move > down_move) & (up_move > 0), 0)
    minus_dm = down_move.where((down_move > up_move) & (down_move > 0), 0)

    # True range: the largest of the three candidate ranges (NaNs are skipped,
    # so the first row falls back to high - low).
    true_range = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)
    atr = true_range.rolling(window=window, min_periods=1).mean()

    plus_di = (plus_dm.rolling(window=window, min_periods=1).mean() / atr) * 100
    minus_di = (minus_dm.rolling(window=window, min_periods=1).mean() / atr) * 100

    # Guard the whole denominator: only a zero *sum* needs replacing. Guarding
    # minus_di alone would add 0.0001 to every denominator where -DI is zero.
    di_sum = (plus_di + minus_di).replace(0, 0.0001)
    dx = ((plus_di - minus_di).abs() / di_sum) * 100
    adx = dx.rolling(window=window, min_periods=1).mean()

    return pd.DataFrame({
        'adx': adx,
        'plus_di': plus_di,
        'minus_di': minus_di
    })

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_adx reads keeps the grouping column out of each group
# and works on every pandas version.
adx_results = (
    df.groupby('stock_code', group_keys=False)[['high', 'low', 'close']]
    .apply(calculate_adx)
)
df = df.join(adx_results)

# Row-over-row percentage change of ma20 within each stock; a zero previous
# value is replaced by 0.0001 before dividing.
df['ma20_slope'] = df.groupby('stock_code')['ma20'].transform(
    lambda x: (x - x.shift(1)) / x.shift(1).replace(0, 0.0001) * 100
)

# --------------------------
# 14. Volatility-risk indicators
# --------------------------
def calculate_atr(group, window=14):
    """Return the rolling mean of the true range for one stock.

    The true range of a row is the largest of ``high - low``,
    ``|high - pre_close|`` and ``|low - pre_close|``.

    Args:
        group: Rows of a single stock with ``high``, ``low`` and
            ``pre_close`` columns.
        window: Number of rows in the rolling mean.

    Returns:
        Series aligned with ``group``.
    """
    prior_close = group['pre_close']
    candidates = pd.DataFrame({
        'bar_range': group['high'] - group['low'],
        'high_gap': (group['high'] - prior_close).abs(),
        'low_gap': (group['low'] - prior_close).abs(),
    })
    true_range = candidates.max(axis=1)
    return true_range.rolling(window=window, min_periods=1).mean()

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_atr reads keeps the grouping column out of each group
# and works on every pandas version.
df['atr'] = (
    df.groupby('stock_code', group_keys=False)[['high', 'low', 'pre_close']]
    .apply(calculate_atr)
)

def calculate_rolling_max_drawdown(group, window=20):
    """Return each row's percentage distance below the rolling high of ``close``.

    NOTE(review): despite the name, this is the current drawdown from the
    highest close in the window, not the largest drawdown observed inside the
    window. Confirm which is intended.

    Args:
        group: Rows of a single stock with a ``close`` column.
        window: Number of rows considered for the rolling high.

    Returns:
        Series aligned with ``group``; 0 at a new rolling high, negative
        otherwise.
    """
    closes = group['close']
    peak = closes.rolling(window=window, min_periods=1).max()
    return (closes - peak) / peak * 100

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the column calculate_rolling_max_drawdown reads (as a one-column frame, so
# group['close'] still works) keeps the grouping column out of each group.
df['rolling_max_drawdown_20d'] = (
    df.groupby('stock_code', group_keys=False)[['close']]
    .apply(calculate_rolling_max_drawdown)
)

# --------------------------
# 15. Price/volume structure indicators
# --------------------------
# Volume relative to the mean of the current and up to 4 preceding rows of the
# same stock. Unlike volume_ratio_vs_5d_avg, the baseline is not shifted, so it
# includes the current row. A zero baseline is replaced by 0.0001.
df['volume_ratio'] = df.groupby('stock_code')['volume'].transform(
    lambda vol: vol.div(vol.rolling(window=5, min_periods=1).mean().replace(0, 0.0001))
)

def calculate_price_volume_divergence(group, period=5):
    """Return price % change minus volume % change over ``period`` rows.

    Args:
        group: Rows of a single stock with ``close`` and ``volume`` columns.
        period: Look-back distance in rows for both percentage changes.

    Returns:
        Series aligned with ``group``; the first ``period`` rows are NaN.
    """
    closes = group['close']
    volumes = group['volume']
    # A zero look-back volume is replaced by a small epsilon before dividing.
    past_volume = volumes.shift(period).replace(0, 0.0001)
    price_change_pct = (closes / closes.shift(period) - 1) * 100
    volume_change_pct = (volumes / past_volume - 1) * 100
    return price_change_pct - volume_change_pct

# `include_groups` is a keyword of DataFrameGroupBy.apply(), not of
# DataFrame.groupby(); passing it to groupby() raises TypeError. Selecting only
# the columns calculate_price_volume_divergence reads keeps the grouping column
# out of each group and works on every pandas version.
df['price_volume_divergence'] = (
    df.groupby('stock_code', group_keys=False)[['close', 'volume']]
    .apply(calculate_price_volume_divergence)
)

# --------------------------
# Outlier handling: turn +/-inf into NaN
# --------------------------
# Only numeric columns are touched: replace() on object columns (such as
# concept_name_list) fails when it compares their values with floats.
# 'number' selects every numeric width, so float32/float16 columns, which can
# hold inf just like float64, are cleaned as well.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)

# --------------------------
# Save the factor table and export a small sample
# --------------------------
factor_table_path = r'D:\workspace\xiaoyao\data\factortable.parquet'
df.to_parquet(factor_table_path, index=False)

# The sample is drawn from the file that was just written (not from the
# in-memory frame); the fixed random_state makes the 5 sampled rows reproducible.
df_sample = pd.read_parquet(factor_table_path)
df_sample.sample(n=5, random_state=42).to_csv('./sample.csv', index=False)

  df['vwap'] = df.groupby('stock_code', group_keys=False).apply(calculate_vwap)
  df['volatility'] = df.groupby('stock_code', group_keys=False).apply(
  df['obv'] = df.groupby('stock_code', group_keys=False).apply(calculate_obv)
  adx_results = df.groupby('stock_code', group_keys=False).apply(calculate_adx)
  df['atr'] = df.groupby('stock_code', group_keys=False).apply(calculate_atr)
  df['rolling_max_drawdown_20d'] = df.groupby('stock_code', group_keys=False).apply(
  df['price_volume_divergence'] = df.groupby('stock_code', group_keys=False).apply(


TypeError: Cannot compare types 'ndarray(dtype=object)' and 'float'