In [11]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MaxAbsScaler
from sklearn.compose import TransformedTargetRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import joblib


In [2]:
# Load the prepared feature table, parse the timestamp column, and put the
# rows in a deterministic order: by `datetime` first, then by `symbol`.
file_path = "data/final_df.csv"
df_ori = (
    pd.read_csv(file_path)
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by=['datetime', 'symbol'])
    .reset_index(drop=True)  # positional index 0..n-1 after sorting
)
df_ori.head()

Unnamed: 0,datetime,symbol,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,f_minsin,f_mincos,...,div_amount_iqr_outlier,shares_out_nonpos_flag,split_nonpos_flag,shares_out,log_shares_out,eps_surp_pct_final,div_amount,log_shares_out_iqr_outlier,eps_estimate_rz_8,eps_actual
0,2024-04-30 12:50:00,AMAT,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,830897024,20.538016,0.0,0.0,0,0.0,0.0
1,2024-04-30 12:50:00,AMD,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,1616140032,21.203306,2.04,0.0,0,0.0,0.62
2,2024-04-30 12:50:00,AVGO,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,465308000,19.95821,0.0,0.0,0,0.0,0.0
3,2024-04-30 12:50:00,MU,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,1107369984,20.825254,0.0,0.0,0,0.0,0.0
4,2024-04-30 12:50:00,NVDA,0,1,0,0,0,0,-0.94693,0.321439,...,0,0,1,2500000000,21.639557,0.0,0.0,0,0.0,0.0


In [3]:
# Work on an independent copy so the loaded frame `df_ori` stays untouched
# when columns are added to `df` in the following cells.
df = df_ori.copy(deep=True)

In [4]:
# Target = the *next row's* `fz_lret_1_rolling` for the same symbol on the
# same calendar date. The shift is applied inside each (symbol, date) group,
# so the last row of every group has no successor and receives NaN.
calendar_date = df['datetime'].dt.date
per_symbol_day = df.groupby(['symbol', calendar_date])
df['y_target'] = per_symbol_day['fz_lret_1_rolling'].shift(-1)

df[["datetime", "symbol", "y_target"]]

Unnamed: 0,datetime,symbol,y_target
0,2024-04-30 12:50:00,AMAT,-0.194539
1,2024-04-30 12:50:00,AMD,0.722399
2,2024-04-30 12:50:00,AVGO,-0.248127
3,2024-04-30 12:50:00,MU,0.793029
4,2024-04-30 12:50:00,NVDA,-1.007944
...,...,...,...
324895,2025-10-28 15:59:00,AMAT,
324896,2025-10-28 15:59:00,AMD,
324897,2025-10-28 15:59:00,AVGO,
324898,2025-10-28 15:59:00,MU,


In [5]:
# Columns that are never fed to the model: the target itself, the series the
# target is shifted from, and a handful of calendar columns.
features_to_drop = [
    'y_target', 'fz_lret_1_rolling',
    'year', 'month', 'day', 'minute', 'minute_of_day',
    'dow_5', 'dow_6',
]

# Feature set = every column minus the drop list minus the two key columns.
# Names in the drop list that are absent from `df` are simply ignored by the
# set difference. Sorting gives a stable column order.
all_cols_set = set(df.columns)
drop_cols_set = set(features_to_drop)
keep_cols_set = all_cols_set.difference(drop_cols_set, {'datetime', 'symbol'})
feature_cols = sorted(keep_cols_set)

# Rows without a target cannot be used for fitting or scoring.
df_model = df.dropna(subset=['y_target'])
df_model[["datetime", "symbol", 'fz_lret_1_rolling', "y_target"]]

Unnamed: 0,datetime,symbol,fz_lret_1_rolling,y_target
0,2024-04-30 12:50:00,AMAT,0.364913,-0.194539
1,2024-04-30 12:50:00,AMD,2.739025,0.722399
2,2024-04-30 12:50:00,AVGO,0.845791,-0.248127
3,2024-04-30 12:50:00,MU,1.085664,0.793029
4,2024-04-30 12:50:00,NVDA,1.273713,-1.007944
...,...,...,...,...
324890,2025-10-28 15:58:00,AMAT,-0.714656,-0.119255
324891,2025-10-28 15:58:00,AMD,-0.811072,-1.577809
324892,2025-10-28 15:58:00,AVGO,-0.057482,0.315530
324893,2025-10-28 15:58:00,MU,0.281664,-0.146954


In [6]:
# Separate the modelling frame into the design matrix and the target vector.
y = df_model['y_target']
X = df_model.loc[:, feature_cols]

for label, part in (("X (Features)", X), ("y (Target)", y)):
    print(f"{label} shape: {part.shape}")

X (Features) shape: (323190, 181)
y (Target) shape: (323190,)


In [7]:
# Positional hold-out split: the first 1/1.5 (= two thirds) of the rows become
# the set used for cross-validated model selection (`X_val`, `y_val`); the
# remaining third is kept aside as the test set.
# NOTE(review): the cut is by row position, not by timestamp, so it may fall
# between rows that share the same `datetime` -- confirm this is acceptable.
split_ratio = 1.0 / 1.5
split_index = int(len(df_model) * split_ratio)

X_val, X_test = X.iloc[:split_index], X.iloc[split_index:]
y_val, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Validation set (X_val, y_val) shape: {X_val.shape}, {y_val.shape}")
print(f"Testing set (X_test, y_test) shape: {X_test.shape}, {y_test.shape}")

Validation set (X_val, y_val) shape: (215460, 181), (215460,)
Testing set (X_test, y_test) shape: (107730, 181), (107730,)


In [8]:
# Rolling-window hyper-parameter search: MaxAbs-scaled Ridge, alpha chosen by
# mean squared error over a TimeSeriesSplit whose training window is capped.

# --- 1. Window parameters ---
n_splits = 5
n_chunks = n_splits + 1
n_total_samples = len(X_val)

# Floor-divide the rows into `n_chunks` pieces; the max() guard keeps the
# training window at one row or more even for a tiny input.
chunk_size = max(1, n_total_samples // n_chunks)

print(f"--- Preparing {n_splits}-fold Rolling Window Validation ---")
print(f"X_val total rows: {n_total_samples}")
print(f"Rolling window trainin set size (max_train_size): {chunk_size}")

# --- 2. Cross-validation splitter ---
# `max_train_size` caps each training fold at `chunk_size` rows, turning the
# default expanding window into a rolling one.
tscv_rolling = TimeSeriesSplit(max_train_size=chunk_size, n_splits=n_splits)

# --- 3. Pipeline: scale each feature by its max absolute value, then Ridge ---
pipe = Pipeline(steps=[
    ('scaler', MaxAbsScaler()),
    ('ridge', Ridge()),
])

# --- 4. Candidate regularisation strengths ---
param_grid = {'ridge__alpha': [0.1, 1.0, 10.0, 100.0, 1000.0]}

# --- 5. Grid search (scores are negated MSE, so larger is better) ---
print("\n--- Having Grid Search CV on 'X_val' ... ---")
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv_rolling,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
search.fit(X_val, y_val)

# --- 6. Report the winner; flip the sign back to a plain MSE ---
best_alpha = search.best_params_['ridge__alpha']
best_score = -search.best_score_

print("\n--- COMPLETED! ---")
print(f"Best Alpha (ridge__alpha): {best_alpha}")
print(f"Best MSE: {best_score}")


# Persisting the refit pipeline is currently disabled:
# joblib.dump(search.best_estimator_, 'best_ridge_pipeline.pkl')

--- Preparing 5-fold Rolling Window Validation ---
X_val total rows: 215460
Rolling window trainin set size (max_train_size): 35910

--- Having Grid Search CV on 'X_val' ... ---
Fitting 5 folds for each of 5 candidates, totalling 25 fits

--- COMPLETED! ---
Best Alpha (ridge__alpha): 100.0
Best MSE: 1.3238829672927097


In [12]:
## Pipeline
# median imputation -> MaxAbs scaling -> Ridge wrapped in a
# TransformedTargetRegressor that uses StandardScaler as the target transformer.
ridge_on_scaled_target = TransformedTargetRegressor(
    regressor=Ridge(solver="lsqr", max_iter=10000),
    transformer=StandardScaler(),
)
pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("sc", MaxAbsScaler()),
    ("reg", ridge_on_scaled_target),
])

# Hyper-parameter grid: only the Ridge penalty is tuned. The key walks
# pipeline step "reg" -> its `regressor` -> `alpha`.
param_grid = {"reg__regressor__alpha": [10.0, 50.0, 100.0, 1000.0, 1e4]}

# Ten time-ordered folds; no `max_train_size` is set here, unlike the
# rolling-window splitter used in the other search cell.
tscv = TimeSeriesSplit(n_splits=10)

# Grid search with two metrics; the model is refit on negated MSE, which is
# therefore also the metric behind `best_params_` / `best_score_`.
print("\n--- GridSearchCV ... ---")
search = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=tscv,
    scoring=["neg_mean_squared_error", 'r2'],
    refit='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
search.fit(X_val, y_val)

# Result (sign flipped back so the printed value is a plain MSE)
best_alpha = search.best_params_["reg__regressor__alpha"]
best_score = -search.best_score_

print(f"Best Alpha (reg__regressor__alpha): {best_alpha}")
print(f"Best MSE: {best_score}")

results_df = pd.DataFrame(search.cv_results_)

# Per-candidate summary columns; keep only those actually present in the
# results table so the selection below cannot raise a KeyError.
wanted_columns = (
    'param_reg__regressor__alpha',
    'mean_test_neg_mean_squared_error',
    'std_test_neg_mean_squared_error',
    'mean_test_r2',
    'std_test_r2',
)
columns_to_see = [name for name in wanted_columns if name in results_df.columns]

print("\n--- CV Details ---")
with pd.option_context('display.float_format', '{:,.8f}'.format):
    # Best candidate first: negated MSE is "higher is better".
    ranked = results_df[columns_to_see].sort_values(
        by='mean_test_neg_mean_squared_error', ascending=False
    )
    print(ranked)

# Persisting the refit pipeline is currently disabled:
# joblib.dump(search.best_estimator_, 'final_ridge_pipeline.pkl')


--- GridSearchCV ... ---
Fitting 10 folds for each of 5 candidates, totalling 50 fits
Best Alpha (reg__regressor__alpha): 1000.0
Best MSE: 1.3287037654002272

--- CV Details ---
  param_reg__regressor__alpha  mean_test_neg_mean_squared_error  \
3              1,000.00000000                       -1.32870377   
4             10,000.00000000                       -1.33094826   
2                100.00000000                       -1.34158309   
1                 50.00000000                       -1.34575956   
0                 10.00000000                       -1.35925524   

   std_test_neg_mean_squared_error  mean_test_r2  std_test_r2  
3                       0.05255392    0.00331073   0.00861178  
4                       0.05407863    0.00170564   0.00171273  
2                       0.06254999   -0.00687274   0.04333493  
1                       0.06973931   -0.01014474   0.05250447  
0                       0.09953937   -0.02074385   0.08291822  


In [9]:
# Diagnostic: rebuild the rolling-window splitter and check, fold by fold,
# which feature columns have a training-window standard deviation of zero
# (or NaN), i.e. carry no variation inside that window.
n_splits = 5
n_chunks = n_splits + 1
n_total_samples = len(X_val)
chunk_size = max(1, n_total_samples // n_chunks)

tscv_rolling = TimeSeriesSplit(max_train_size=chunk_size, n_splits=n_splits)

print(f"X_val total rows: {n_total_samples}")
print(f"(max_train_size): {chunk_size} ")
print("-" * 30)

# Union of the offending column names over all folds.
problem_columns_set = set()

# Only the training indices matter here; the test indices are ignored.
for fold_count, (train_index, _) in enumerate(tscv_rolling.split(X_val), start=1):
    chunk_std = X_val.iloc[train_index].std()
    is_degenerate = chunk_std.eq(0) | chunk_std.isna()
    constant_cols = chunk_std.index[is_degenerate.to_numpy()].tolist()

    if not constant_cols:
        print(f"[Fold {fold_count}/{n_splits}]: Normal")
        continue

    print(f"[Fold {fold_count}/{n_splits}]: found {len(constant_cols)} constant column!")
    problem_columns_set.update(constant_cols)

# --- Result ---
print("\n" + "=" * 30)
print("--- Done ---")

if problem_columns_set:
    print(f"In {len(problem_columns_set)} columns found issues (at least 1 fold variance = 0):")

    # Print the names in sorted order
    for col_name in sorted(problem_columns_set):
        print(f"  - {col_name}")
else:
    print("✅ No fold has zero variance。")

X_val total rows: 215460
(max_train_size): 35910 
------------------------------
[Fold 1/5]: found 21 constant column!
[Fold 2/5]: found 21 constant column!
[Fold 3/5]: found 26 constant column!
[Fold 4/5]: found 26 constant column!
[Fold 5/5]: found 26 constant column!

--- Done ---
In 26 columns found issues (at least 1 fold variance = 0):
  - div_amount_rz_8
  - div_negative_flag
  - eps_actual_iqr_outlier
  - eps_actual_rz_8
  - eps_estimate_iqr_outlier
  - eps_estimate_rz_8
  - eps_surp_pct_final_iqr_outlier
  - eps_surp_winsor_iqr_outlier
  - fz_vol_ratio_60
  - morning_n_rs
  - morning_source_div_rs
  - morning_tone_mean_rs
  - morning_tone_sum_rs
  - n_news_ewm_hl15_is_zero
  - n_news_ewm_hl5_is_zero
  - overnight_n_rs
  - overnight_source_div_rs
  - overnight_tone_mean_rs
  - overnight_tone_sum_rs
  - shares_out_nonpos_flag
  - split_flag
  - split_mult
  - split_mult_iqr_outlier
  - split_nonpos_flag
  - split_ratio
  - split_ratio_iqr_outlier


In [10]:
# Sanity checks on `df_model`: total NaN count, +Inf / -Inf counts over the
# numeric columns, and a percentile summary of every numeric column to help
# spot extreme values. Errors are reported as a printed message instead of a
# traceback so the cell always runs to completion.
try:
    # --- 1. Checking NaN  ---
    nan_counts = df_model.isnull().sum().sum()
    print("--- Checking NaN ---")
    if nan_counts == 0:
        print("✅ 'df_model' has no NaN")
    else:
        print(f"⚠️ 'df_model' found {nan_counts} NaN")

    # --- 2. Checking ±Inf ---
    numeric_cols = df_model.select_dtypes(include=[np.number]).columns
    numeric_frame = df_model[numeric_cols]

    # Count each sign separately. np.isinf() is True for BOTH +Inf and -Inf,
    # so using it for the "+Inf" figure would also include every -Inf value.
    pos_inf_count = numeric_frame.eq(np.inf).to_numpy().sum()
    neg_inf_count = numeric_frame.eq(-np.inf).to_numpy().sum()

    print("\n--- Checking ±Inf ---")
    if pos_inf_count == 0 and neg_inf_count == 0:
        print("✅ 'df_model' has no Inf")
    else:
        print(f"⚠️ Found {pos_inf_count} +Inf")
        print(f"⚠️ Found {neg_inf_count} -Inf")

    # --- Check for extreme values ---
    print("\n--- Check for extreme values ---")

    # Same numeric selection as above; the name is kept for any later cell
    # that refers to it.
    numeric_cols_for_describe = numeric_cols

    with pd.option_context(
        'display.float_format', '{:.4f}'.format,
        'display.max_rows', None  # show every column's row, no truncation
    ):
        # One row per column; the 1st / 99th percentiles expose heavy tails.
        description = numeric_frame.describe(percentiles=[.01, .25, .5, .75, .99]).T
        print(description)

except NameError:
    print("\n*** Error: 'df_model' not defined***")
except Exception as e:
    print(f"\n*** Error: {e} ***")

--- Checking NaN ---
✅ 'df_model' has no NaN

--- Checking ±Inf ---
✅ 'df_model' has no Inf

--- Check for extreme values ---
                                     count            mean             std  \
dow_0                          323190.0000          0.1901          0.3923   
dow_1                          323190.0000          0.2105          0.4077   
dow_2                          323190.0000          0.1988          0.3991   
dow_3                          323190.0000          0.1988          0.3991   
dow_4                          323190.0000          0.2018          0.4013   
dow_5                          323190.0000          0.0000          0.0000   
f_minsin                       323190.0000         -0.5751          0.1901   
f_mincos                       323190.0000         -0.7794          0.1602   
close                          323190.0000        158.6059         53.0292   
f_volu_z_15_clipped            323190.0000          0.1450          1.0751   
f_volu_z_30_clip