In [46]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.pipeline import Pipeline
import joblib


In [28]:
file_path = "data/final_df.csv"
df_ori = pd.read_csv(file_path)

df_ori['datetime'] = pd.to_datetime(df_ori['datetime'])
df_ori = df_ori.sort_values(by=['datetime'])
df_ori = df_ori.reset_index(drop=True)
df_ori.head()

Unnamed: 0,datetime,symbol,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,f_minsin,f_mincos,...,div_amount_iqr_outlier,shares_out_nonpos_flag,split_nonpos_flag,shares_out,log_shares_out,eps_surp_pct_final,div_amount,log_shares_out_iqr_outlier,eps_estimate_rz_8,eps_actual
0,2024-04-30 12:50:00,AMAT,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,830897024,20.538016,0.0,0.0,0,0.0,0.0
1,2024-04-30 12:50:00,AVGO,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,465308000,19.95821,0.0,0.0,0,0.0,0.0
2,2024-04-30 12:50:00,MU,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,1107369984,20.825254,0.0,0.0,0,0.0,0.0
3,2024-04-30 12:50:00,AMD,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,1616140032,21.203306,2.04,0.0,0,0.0,0.62
4,2024-04-30 12:50:00,NVDA,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,2500000000,21.639557,0.0,0.0,0,0.0,0.0


In [29]:
df = df_ori.copy()

In [30]:
df['y_target'] = df.groupby(['symbol', df['datetime'].dt.date])['fz_lret_1_rolling'].shift(-1)
df[["datetime", "symbol", "y_target"]]

Unnamed: 0,datetime,symbol,y_target
0,2024-04-30 12:50:00,AMAT,-0.169358
1,2024-04-30 12:50:00,AVGO,-0.620658
2,2024-04-30 12:50:00,MU,2.225385
3,2024-04-30 12:50:00,AMD,0.932911
4,2024-04-30 12:50:00,NVDA,-1.457090
...,...,...,...
324895,2025-10-28 15:59:00,MU,
324896,2025-10-28 15:59:00,AMD,
324897,2025-10-28 15:59:00,AMAT,
324898,2025-10-28 15:59:00,AVGO,


In [67]:
features_to_drop = ['y_target', 'fz_lret_1_rolling','year', 'month', 'day', 'minute', 'minute_of_day', 'dow_5', 'dow_6']

all_cols_set = set(df.columns)
drop_cols_set = set(features_to_drop)
keep_cols_set = all_cols_set - drop_cols_set - {'datetime', 'symbol'}
feature_cols = sorted(list(keep_cols_set))

df_model = df.dropna(subset=['y_target'])
df_model[["datetime", "symbol", 'fz_lret_1_rolling', "y_target"]]

Unnamed: 0,datetime,symbol,fz_lret_1_rolling,y_target
0,2024-04-30 12:50:00,AMAT,0.974098,-0.169358
1,2024-04-30 12:50:00,AVGO,1.712327,-0.620658
2,2024-04-30 12:50:00,MU,2.725425,2.225385
3,2024-04-30 12:50:00,AMD,4.291575,0.932911
4,2024-04-30 12:50:00,NVDA,2.057602,-1.457090
...,...,...,...,...
324890,2025-10-28 15:58:00,AMD,-2.102746,-3.877713
324891,2025-10-28 15:58:00,MU,0.872353,-0.494423
324892,2025-10-28 15:58:00,AVGO,0.209849,1.057159
324893,2025-10-28 15:58:00,NVDA,0.363492,-0.542356


In [68]:


try:
    # --- 1. 检查 NaN (空值) ---
    nan_counts = df_model.isnull().sum().sum()
    print("--- 检查 NaN (空值) ---")
    if nan_counts == 0:
        print("✅ 恭喜！'df_model' 中没有任何 NaN 值。")
    else:
        print(f"⚠️ 警告！'df_model' 中总共发现 {nan_counts} 个 NaN 值。")

    # --- 2. 检查 ±Inf (无穷大) ---
    numeric_cols = df_model.select_dtypes(include=[np.number]).columns
    pos_inf_count = np.isinf(df_model[numeric_cols]).values.sum()
    neg_inf_count = np.isneginf(df_model[numeric_cols]).values.sum()

    print("\n--- 检查 ±Inf (无穷大) ---")
    if pos_inf_count == 0 and neg_inf_count == 0:
        print("✅ 恭喜！'df_model' 中没有任何 Inf 值。")
    else:
        print(f"⚠️ 警告！发现 {pos_inf_count} 个 +Inf 值。")
        print(f"⚠️ 警告！发现 {neg_inf_count} 个 -Inf 值。")

    # --- 3. 检查极端大值 (描述性统计) ---
    print("\n--- 检查极端大值 (描述性统计) ---")
    
    # (!! 这是修复你问题的代码 !!)
    # 我们设置 'display.float_format' 来禁用科学记数法
    # 我们设置 'display.max_rows' 来显示所有列
    
    # 获取数值型列
    numeric_cols_for_describe = df_model.select_dtypes(include=[np.number]).columns
    
    with pd.option_context(
        'display.float_format', '{:.4f}'.format, # 格式化为 4 位小数
        'display.max_rows', None                 # 显示所有行 (所有特征)
    ):
        print("（已禁用科学记数法并显示所有特征）")
        
        # .T (转置) 使其更易于阅读
        description = df_model[numeric_cols_for_describe].describe(percentiles=[.01, .25, .5, .75, .99]).T
        
        # 打印描述性统计
        print(description)

except NameError:
    print("\n*** 错误: 'df_model' 未定义。***")
    print("请确保你已经运行了之前的单元格来创建 'df_model'。")
except Exception as e:
    print(f"\n*** 检查时出错: {e} ***")

--- 检查 NaN (空值) ---
✅ 恭喜！'df_model' 中没有任何 NaN 值。

--- 检查 ±Inf (无穷大) ---
✅ 恭喜！'df_model' 中没有任何 Inf 值。

--- 检查极端大值 (描述性统计) ---
（已禁用科学记数法并显示所有特征）
                                     count  \
dow_0                          323190.0000   
dow_1                          323190.0000   
dow_2                          323190.0000   
dow_3                          323190.0000   
dow_4                          323190.0000   
dow_5                          323190.0000   
f_minsin                       323190.0000   
f_mincos                       323190.0000   
close                          323190.0000   
f_volu_z_15_clipped            323190.0000   
f_volu_z_30_clipped            323190.0000   
f_volu_z_5_clipped             323190.0000   
f_volu_z_60_clipped            323190.0000   
f_volume_norm                  323190.0000   
fz_bb_pos                      323190.0000   
fz_dev_close_ema20             323190.0000   
fz_dev_close_ema200            323190.0000   
fz_dev_close_vwap_15         

In [63]:
X = df_model[feature_cols]
y = df_model['y_target']

print(f"X (Features) shape: {X.shape}")
print(f"y (Target) shape: {y.shape}")

X (Features) shape: (323190, 159)
y (Target) shape: (323190,)


In [64]:
split_ratio = 1.0 / 1.5 
split_index = int(len(df_model) * split_ratio)

X_val = X.iloc[:split_index]
y_val = y.iloc[:split_index]

X_test = X.iloc[split_index:]
y_test = y.iloc[split_index:]

print(f"Validation set (X_val, y_val) shape: {X_val.shape}, {y_val.shape}")
print(f"Testing set (X_test, y_test) shape: {X_test.shape}, {y_test.shape}")

Validation set (X_val, y_val) shape: (215460, 159), (215460,)
Testing set (X_test, y_test) shape: (107730, 159), (107730,)


In [None]:


# --- 1. Window Param ---
n_splits = 5
n_total_samples = len(X_val)
n_chunks = n_splits + 1

# Make sure every training chunk has at least 1 row
chunk_size = max(1, n_total_samples // n_chunks) 

print(f"--- Preparing {n_splits}-fold Rolling Window Validation ---")
print(f"X_val total rows: {n_total_samples}")
print(f"Rolling window trainin set size (max_train_size): {chunk_size}")

# --- 2. Create CV (TimeSeriesSplit) ---
tscv_rolling = TimeSeriesSplit(
    n_splits=n_splits,
    max_train_size=chunk_size
)

# 3. Create Pipeline
pipe = Pipeline([
    ('scaler', MaxAbsScaler()),
    ('ridge', Ridge())
])

# 4. Param Grid
param_grid = {
    'ridge__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]
}

# 5. Grid Search
print(f"\n--- Having Grid Search CV on 'X_val' ... ---")
search = GridSearchCV(pipe, param_grid, 
                      cv=tscv_rolling,
                      scoring='neg_mean_squared_error', 
                      n_jobs=-1,
                      verbose=1)
search.fit(X_val, y_val)

# 6. Print Best Result
best_alpha = search.best_params_['ridge__alpha']
best_score = -search.best_score_

print("\n--- COMPLETED! ---")
print(f"Best Alpha (ridge__alpha): {best_alpha}")
print(f"Best MSE: {best_score}")


# Save Model
# joblib.dump(search.best_estimator_, 'best_ridge_pipeline.pkl')

🧹 Dropping 8 zero-variance cols globally.
--- Preparing 5-fold Rolling Window Validation ---
X_val total rows: 215460
Rolling window trainin set size (max_train_size): 35910

--- Having Grid Search CV on 'X_val' ... ---
Fitting 5 folds for each of 5 candidates, totalling 25 fits

--- COMPLETED! ---
Best Alpha (ridge__alpha): 1000.0
Best MSE: 1.5430425472182314e+37


In [59]:
# 1. 手动去掉全程零方差列
var = X_val.var()
drop_cols = var[var == 0].index.tolist()
print(f"🧹 Dropping {len(drop_cols)} zero-variance cols globally.")
X_val = X_val.drop(columns=drop_cols)

# 2. 定义稳定的 pipeline
pipe = Pipeline([
    ('scaler', MaxAbsScaler()),
    ('ridge', Ridge())
])

# 3. Ridge alpha grid
param_grid = {'ridge__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0, 1e4]}

# 4. TimeSeries CV
tscv = TimeSeriesSplit(n_splits=10)

search = GridSearchCV(
    pipe, param_grid,
    cv=tscv,
    scoring='neg_mean_squared_error',
    n_jobs=-1, verbose=1
)

search.fit(X_val, y_val)

# 6. Print Best Result
best_alpha = search.best_params_['ridge__alpha']
best_score = -search.best_score_

print("\n--- COMPLETED! ---")
print(f"Best Alpha (ridge__alpha): {best_alpha}")
print(f"Best MSE: {best_score}")

🧹 Dropping 0 zero-variance cols globally.
Fitting 10 folds for each of 6 candidates, totalling 60 fits

--- COMPLETED! ---
Best Alpha (ridge__alpha): 10000.0
Best MSE: 1.137128768895639e+22


In [49]:

print("--- 正在运行诊断程序：检查零方差列 ---")

# --- 1. 确保 X_val 存在 ---
try:
    X_val
    y_val
except NameError:
    print("*** 错误: 'X_val' 未定义。***")
    print("请先运行 单元格 1、2、3 来创建 'X_val'。")
    # 如果 X_val 不存在，就停止
    raise

# --- 2. 重新创建你的滚动窗口CV ---
n_splits = 5
n_total_samples = len(X_val)
n_chunks = n_splits + 1
chunk_size = max(1, n_total_samples // n_chunks) 

tscv_rolling = TimeSeriesSplit(
    n_splits=n_splits,
    max_train_size=chunk_size
)

print(f"X_val 总行数: {n_total_samples}")
print(f"滚动窗口训练集大小 (max_train_size): {chunk_size} 行")
print("-" * 30)

# --- 3. 循环遍历每一折，找出问题列 ---
# 我们用一个 set 来存储所有出过问题的列名
problem_columns_set = set()
fold_count = 1

for train_index, test_index in tscv_rolling.split(X_val):
    # a. 获取这一折的训练数据
    X_train_chunk = X_val.iloc[train_index]
    
    # b. 计算这一"块"数据中每一列的"标准差" (Standard Deviation)
    chunk_std = X_train_chunk.std()
    
    # c. 找出标准差为 0 (或 NaN) 的列
    constant_cols = chunk_std[ (chunk_std == 0) | (chunk_std.isnull()) ].index.tolist()
    
    if len(constant_cols) > 0:
        print(f"[第 {fold_count}/{n_splits} 折]: ⚠️ 发现 {len(constant_cols)} 个恒定列!")
        # 把这些列名添加到我们的总集合中
        problem_columns_set.update(constant_cols)
    else:
        print(f"[第 {fold_count}/{n_splits} 折]: ✅ 正常。")
    
    fold_count += 1

# --- 4. 打印最终的总结报告 ---
print("\n" + "=" * 30)
print("--- 诊断完成 ---")

if not problem_columns_set:
    print("✅ 恭喜！在任何一折中都没有发现零方差列。")
    print("   (如果之前报错，问题可能出在别处)")
else:
    print(f"🔥 总共在 {len(problem_columns_set)} 个列中发现了问题 (在至少一折中方差为0):")
    
    # 排序后打印
    sorted_problem_cols = sorted(list(problem_columns_set))
    for col_name in sorted_problem_cols:
        print(f"  - {col_name}")

    print("\n这些列 (很可能是基本面或情绪特征) 导致 Scaler 除以零而崩溃。")
    print("使用 'VarianceThreshold(threshold=0.0)' 管道步骤是正确的解决方案。")

--- 正在运行诊断程序：检查零方差列 ---
X_val 总行数: 215460
滚动窗口训练集大小 (max_train_size): 35910 行
------------------------------
[第 1/5 折]: ⚠️ 发现 10 个恒定列!
[第 2/5 折]: ⚠️ 发现 10 个恒定列!
[第 3/5 折]: ⚠️ 发现 15 个恒定列!
[第 4/5 折]: ⚠️ 发现 15 个恒定列!
[第 5/5 折]: ⚠️ 发现 15 个恒定列!

--- 诊断完成 ---
🔥 总共在 15 个列中发现了问题 (在至少一折中方差为0):
  - div_amount_rz_8
  - div_negative_flag
  - eps_actual_iqr_outlier
  - eps_actual_rz_8
  - eps_estimate_iqr_outlier
  - eps_estimate_rz_8
  - eps_surp_pct_final_iqr_outlier
  - eps_surp_winsor_iqr_outlier
  - shares_out_nonpos_flag
  - split_flag
  - split_mult
  - split_mult_iqr_outlier
  - split_nonpos_flag
  - split_ratio
  - split_ratio_iqr_outlier

这些列 (很可能是基本面或情绪特征) 导致 Scaler 除以零而崩溃。
使用 'VarianceThreshold(threshold=0.0)' 管道步骤是正确的解决方案。
