In [None]:
import copy
import json
import re
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

# Sklearn imports
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MaxAbsScaler
from sklearn.linear_model import Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.utils.validation import check_is_fitted

from tqdm.auto import tqdm

In [None]:
# Location of the hand-curated feature list (a JSON array of column names).
SELECTED_FEATURES_PATH = "../data/selected_features.json"

if Path(SELECTED_FEATURES_PATH).exists():
    # Explicit encoding so the file parses the same way on every platform.
    with open(SELECTED_FEATURES_PATH, "r", encoding="utf-8") as f:
        SELECTED_FEATURES = json.load(f)
    print(f"Loaded {len(SELECTED_FEATURES)} fixed features.")
else:
    # The candidate columns come from the feature frame `X`, which is only built
    # in the data-cleaning cell further down. On a fresh kernel it does not exist
    # yet, so guard the lookup instead of dying with a NameError.
    if "X" in globals():
        all_cols = sorted(X.columns.tolist())
        print(all_cols)  # remove the lagged columns by hand, then write the list to the JSON file
    else:
        print("Feature frame `X` is not defined yet; run the data-cleaning cell, then re-run this cell.")
    raise SystemExit("请根据打印的列手动编辑，并保存到 JSON")


In [None]:
def calculate_sharpe(pnl_series, periods_per_year=390*252):
    """Return the annualized Sharpe ratio of a per-period PnL series.

    Computed as ``mean / std * sqrt(periods_per_year)`` using the pandas
    sample standard deviation.

    Parameters
    ----------
    pnl_series : pandas.Series
        Per-period PnL observations.
    periods_per_year : int or float, default 390*252
        Number of periods in a year, used for annualization. The default
        presumably corresponds to one-minute bars (390 per session, 252
        sessions per year) -- confirm against the data frequency.

    Returns
    -------
    float
        The annualized Sharpe ratio, or ``0.0`` when the standard deviation
        is zero or undefined (constant, single-observation, or empty series).
    """
    std = pnl_series.std()
    # The sample std is NaN for fewer than two observations; `std == 0` alone
    # would let that NaN propagate into the reported ratio.
    if not np.isfinite(std) or std == 0:
        return 0.0
    raw = pnl_series.mean() / std
    return raw * np.sqrt(periods_per_year)

## Load Data

In [None]:
# Path of the merged feature/price table, relative to the notebook directory.
file_path = '../data/final_df.csv'
try:
    df_ori = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"*** ERROR: Cannot find {file_path} ***")
    # Re-raise: without the file `df_ori` is never bound, and every later cell
    # would fail with a confusing NameError instead of this root cause.
    raise


In [None]:
# Work on a fresh frame (df_ori stays untouched), parse timestamps and order
# rows per symbol chronologically so the per-symbol diffs/shifts below are valid.
df = (
    df_ori
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(by=['symbol', 'datetime'])
    .reset_index(drop=True)
)

# One-row log return of `close` within each symbol, and the next row's return as label.
# NOTE(review): shift(-1) pairs each row with the following row of the same symbol,
# whatever the time gap between them -- confirm that is intended across session breaks.
df['lret_1m'] = np.log(df['close']).groupby(df['symbol']).diff()
df['y_target'] = df.groupby('symbol')['lret_1m'].shift(-1)

initial_rows = len(df)
# Keep rows whose label is within +/-0.2 (a NaN label fails the comparison and is
# dropped as well) and whose own return is defined.
df = df.loc[df['y_target'].abs().le(0.2) & df['lret_1m'].notna()]

print(f"(Dropped {initial_rows - len(df)} unusal/NaN y value)")

# --- Re-sort by datetime, symbol ---
df = df.sort_values(by=['datetime', 'symbol']).reset_index(drop=True)

df

(Dropped 12 unusal/NaN y value)


Unnamed: 0,datetime,symbol,dow_0,dow_1,dow_2,dow_3,dow_4,dow_5,f_minsin,f_mincos,...,split_nonpos_flag,shares_out,log_shares_out,eps_surp_pct_final,div_amount,log_shares_out_iqr_outlier,eps_estimate_rz_8,eps_actual,lret_1m,y_target
0,2024-04-30 12:51:00,AMAT,0,1,0,0,0,0,-0.220697,-0.975342,...,1,830897024,20.538016,0.00,0.0,0,0.0,0.00,-0.000199,-0.000100
1,2024-04-30 12:51:00,AMD,0,1,0,0,0,0,-0.220697,-0.975342,...,1,1616140032,21.203306,2.04,0.0,0,0.0,0.62,0.000314,-0.000126
2,2024-04-30 12:51:00,AVGO,0,1,0,0,0,0,-0.220697,-0.975342,...,1,465308000,19.958210,0.00,0.0,0,0.0,0.00,-0.000334,0.000721
3,2024-04-30 12:51:00,MU,0,1,0,0,0,0,-0.220697,-0.975342,...,1,1107369984,20.825254,0.00,0.0,0,0.0,0.00,0.000698,-0.000698
4,2024-04-30 12:51:00,NVDA,0,1,0,0,0,0,-0.220697,-0.975342,...,1,2500000000,21.639557,0.00,0.0,0,0.0,0.00,-0.000769,-0.000907
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324883,2025-10-28 15:58:00,AMAT,0,1,0,0,0,0,-0.861629,-0.507538,...,1,796642427,20.495916,0.00,0.0,0,0.0,0.00,-0.000461,-0.000132
324884,2025-10-28 15:58:00,AMD,0,1,0,0,0,0,-0.861629,-0.507538,...,1,1633284837,21.213859,0.00,0.0,0,0.0,0.00,-0.000698,-0.001316
324885,2025-10-28 15:58:00,AVGO,0,1,0,0,0,0,-0.861629,-0.507538,...,1,4722365022,22.275576,0.00,0.0,0,0.0,0.00,0.000161,0.000375
324886,2025-10-28 15:58:00,MU,0,1,0,0,0,0,-0.861629,-0.507538,...,1,1122466035,20.838794,0.00,0.0,0,0.0,0.00,0.000270,-0.000135


In [None]:
# Display the key columns side by side: one rolling feature, the current-row
# return (lret_1m) and the next-row label (y_target).
df[["datetime", "symbol", 'fz_lret_1_rolling', "lret_1m", "y_target"]]

Unnamed: 0,datetime,symbol,fz_lret_1_rolling,lret_1m,y_target
0,2024-04-30 12:51:00,AMAT,-0.194539,-0.000199,-0.000100
1,2024-04-30 12:51:00,AMD,0.722399,0.000314,-0.000126
2,2024-04-30 12:51:00,AVGO,-0.248127,-0.000334,0.000721
3,2024-04-30 12:51:00,MU,0.793029,0.000698,-0.000698
4,2024-04-30 12:51:00,NVDA,-1.007944,-0.000769,-0.000907
...,...,...,...,...,...
324883,2025-10-28 15:58:00,AMAT,-0.714656,-0.000461,-0.000132
324884,2025-10-28 15:58:00,AMD,-0.811072,-0.000698,-0.001316
324885,2025-10-28 15:58:00,AVGO,-0.057482,0.000161,0.000375
324886,2025-10-28 15:58:00,MU,0.281664,0.000270,-0.000135


## Data Cleaning

In [None]:
# Label vector and feature frame: everything except the label, the raw return,
# identifiers and calendar helper columns (missing names are skipped silently).
y = df['y_target']
X = df.drop(
    columns=[
        'y_target', 'lret_1m', 'datetime', 'symbol',
        'year', 'month', 'day', 'minute', 'minute_of_day'
    ],
    errors='ignore',
)

print(f"X original shape: {X.shape}")

# --- Find bad Cols with wrong data ---
# Summary statistics (numeric columns only), one row per feature.
desc = X.describe(percentiles=[0.99]).T

# A column is flagged when any rule fires; a column hit by several rules appears
# several times here and is de-duplicated right below.
bad_cols = [
    name
    for mask in (
        desc['std'] > 1e3,           # std > 1000
        desc['99%'].abs() > 1e3,     # 99% > 1000
        desc['max'].abs() > 1e6,     # Max > 1,000,000
        desc['std'] == 0,            # Constant columns
    )
    for name in desc.index[mask]
]

bad_cols_set = sorted(set(bad_cols))
print(f"--- Dropping {len(bad_cols_set)} bad cols ---")
for col in bad_cols_set:
    print(f"  - {col}")

# --- Drop bad cols ---
X_cleaned = X.drop(columns=bad_cols_set)

print(f"\n--- X cleaned ---")
print(f"X (cleaned) shape: {X_cleaned.shape}")

X original shape: (324888, 184)
--- Dropping 37 bad cols ---
  - cnt_15m_rs
  - cnt_30m_rs
  - cnt_5m_rs
  - cnt_60m_rs
  - div_negative_flag
  - dow_5
  - dow_6
  - eps_actual_iqr_outlier
  - eps_estimate_iqr_outlier
  - fz_vol_ratio_60
  - mins_since_last_news_rs
  - morning_n_rs
  - morning_source_div_rs
  - morning_tone_mean_rs
  - morning_tone_sum_rs
  - n_news_ewm_hl15_is_zero
  - n_news_ewm_hl15_rs
  - n_news_ewm_hl5_is_zero
  - n_news_ewm_hl5_rs
  - n_news_rs
  - n_pos_raw_rs
  - overnight_n_rs
  - overnight_source_div_rs
  - overnight_tone_mean_rs
  - overnight_tone_sum_rs
  - shares_out
  - shares_out_ffill
  - shares_out_nonpos_flag
  - split_flag
  - surprise_tone_mean_rs
  - tone_ewm15_rs
  - tone_ewm30_rs
  - tone_mean_delta_rs
  - tone_mean_ewm_hl15_rs
  - tone_mean_rs
  - tone_sum_rs
  - volume

--- X cleaned ---
X (cleaned) shape: (324888, 147)


## Split Data

In [None]:
# Positional hold-out split: the first 1/1.5 (= two thirds) of the rows form the
# development pool, the remainder is the test set. Rows were sorted by
# (datetime, symbol) earlier, so the cut is by row position in that order.
split_ratio = 1.0 / 1.5
split_index = int(len(X_cleaned) * split_ratio)

# "val" here is the pool that is later sliced into train/validation folds.
X_val, X_test = X_cleaned.iloc[:split_index], X_cleaned.iloc[split_index:]
y_val, y_test = y.iloc[:split_index], y.iloc[split_index:]

print("--- Data Splitting done ---")
print(f" Validation set (X_val, y_val) shape: {X_val.shape}, {y_val.shape}")
print(f" Testing set (X_test, y_test) shape: {X_test.shape}, {y_test.shape}")

--- Data Splitting done ---
 Validation set (X_val, y_val) shape: (216592, 147), (216592,)
 Testing set (X_test, y_test) shape: (108296, 147), (108296,)


In [None]:
class QuantileClipper(BaseEstimator, TransformerMixin):
    """Clip every feature to quantile bounds learned during ``fit``.

    Parameters
    ----------
    lower : float, default 0.005
        Lower quantile (as a fraction in [0, 1]) used as the clipping floor.
    upper : float, default 0.995
        Upper quantile (as a fraction in [0, 1]) used as the clipping ceiling.

    Attributes
    ----------
    q_low_, q_high_ : numpy.ndarray
        Per-column bounds computed by ``fit`` with NaNs ignored. They exist
        only after fitting.
    """

    def __init__(self, lower=0.005, upper=0.995):
        # Only hyper-parameters are stored here. Fitted attributes (trailing
        # underscore) must not exist before fit(), otherwise an unfitted
        # clipper is indistinguishable from a fitted one.
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Learn the per-column lower/upper quantiles of ``X`` (NaNs ignored).

        Raises
        ------
        ValueError
            If ``lower``/``upper`` are not ordered fractions within [0, 1].
        """
        if not 0.0 <= self.lower <= self.upper <= 1.0:
            raise ValueError(
                f"Expected 0 <= lower <= upper <= 1, got lower={self.lower}, upper={self.upper}"
            )
        self.q_low_ = np.nanpercentile(X, self.lower * 100, axis=0)
        self.q_high_ = np.nanpercentile(X, self.upper * 100, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` with every column clipped to the fitted bounds."""
        # Fail loudly if fit() was never called instead of clipping with
        # undefined bounds.
        check_is_fitted(self, ("q_low_", "q_high_"))
        return np.clip(X, self.q_low_, self.q_high_)

In [None]:
# Narrow every feature frame to the curated SELECTED_FEATURES columns; each
# result is an independent copy, so later edits do not touch the parent frames.
X_cleaned = X_cleaned.loc[:, SELECTED_FEATURES].copy()
X_val = X_val.loc[:, SELECTED_FEATURES].copy()
X_test = X_test.loc[:, SELECTED_FEATURES].copy()

### Apply the scaler only to raw features

In [None]:
def split_columns_for_scaling(X: pd.DataFrame):
    """Partition the columns of ``X`` into "leave as is" and "scale" groups.

    A column is left unscaled when its name matches one of the
    "already processed" naming patterns below, or when it holds at most two
    distinct non-null values. Every other column is scaled.

    Returns
    -------
    (no_scale_cols, scale_cols) : tuple of lists
        ``no_scale_cols`` is sorted alphabetically; ``scale_cols`` keeps the
        column order of ``X``.

    NOTE(review): the sin/cos patterns require the token to sit at the start of
    the name or directly after an underscore, so names such as ``f_minsin`` /
    ``f_mincos`` are not matched by them -- confirm that is intended.
    """
    # Naming conventions that mark a column as already transformed.
    processed_name = re.compile('|'.join([
        r'^fz_', r'_z_', r'(?:^|_)tanh', r'(?:^|_)arctanh',
        r'(?:^|_)sin$', r'(?:^|_)cos$', r'^dow_\d+$',
        r'^has_news$', r'(?:^|_)log', r'(?:^|_)ln', r'(?:^|_)ewm',
        r'^tone_', r'_flag$', r'_iqr_outlier$'
    ]))

    # Name-matched columns plus columns with at most two distinct values.
    skip = {
        col
        for col in X.columns
        if processed_name.search(col) or X[col].nunique(dropna=True) <= 2
    }

    no_scale_cols = sorted(skip)
    scale_cols = [col for col in X.columns if col not in skip]
    return no_scale_cols, scale_cols

# Decide the scale / no-scale partition from the development pool (X_val) only,
# so the test rows play no part in the decision.
no_scale_cols, scale_cols = split_columns_for_scaling(X_val)

# Report how many columns landed in each group.
print(f"--- Feature separation complete ---")
print(f"Will scale {len(scale_cols)} 'raw' columns (e.g., 'close', 'high'...)")
print(f"Will skip {len(no_scale_cols)} 'processed' columns (e.g., 'fz_...', 'dow_0'...)")

--- Feature separation complete ---
Will scale 44 'raw' columns (e.g., 'close', 'high'...)
Will skip 103 'processed' columns (e.g., 'fz_...', 'dow_0'...)


In [None]:
# Selected features
# The list is loaded from SELECTED_FEATURES_PATH near the top of the notebook.
# An assignment with no right-hand side is a SyntaxError and stops the cell from
# running, so re-bind the already-loaded names as a plain list instead.
SELECTED_FEATURES = list(SELECTED_FEATURES)

# Sequentially apply transformations

In [None]:
SEQ_LEN = 60  # e.g. use the previous 60 minutes
PRED_HORIZON = 1


def build_sequences(df, feature_cols, target_col, seq_len=SEQ_LEN, pred_horizon=None):
    """Turn a long (symbol, datetime) table into fixed-length windows per symbol.

    For every symbol the rows are sorted by ``datetime``; window ``i`` holds the
    feature rows ``i .. i + seq_len - 1`` and is labelled with the target at row
    ``i + seq_len + pred_horizon - 1``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``symbol``, ``datetime``, ``feature_cols`` and ``target_col``.
    feature_cols : list of str
        Feature column names, in the order they appear on the last axis.
    target_col : str
        Label column name.
    seq_len : int, default SEQ_LEN
        Number of consecutive rows per window.
    pred_horizon : int or None, default None
        Row offset of the label past the window's last row, minus one. ``None``
        uses the module-level ``PRED_HORIZON``.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray, both float32
        ``X`` has shape ``(n_windows, seq_len, len(feature_cols))`` and ``y`` has
        shape ``(n_windows,)``. When no symbol has enough rows, the arrays are
        empty but keep those shapes (``n_windows == 0``).

    Raises
    ------
    ValueError
        If ``seq_len < 1`` or ``pred_horizon < 0``.

    Notes
    -----
    * Windows are emitted symbol by symbol, so the output is ordered by symbol
      first and time second, not chronologically across symbols.
    * Windows are built from consecutive rows; nothing here checks that the
      rows are evenly spaced in time.
    * NOTE(review): ``y_target`` is already the next-row return (built with
      ``shift(-1)``), so with the default ``PRED_HORIZON = 1`` each window is
      labelled with the target one row after its last row. Confirm the extra
      one-bar gap is intended; ``pred_horizon=0`` labels a window with its last
      row's own target.
    """
    if pred_horizon is None:
        pred_horizon = PRED_HORIZON
    if seq_len < 1 or pred_horizon < 0:
        raise ValueError(
            f"seq_len must be >= 1 and pred_horizon >= 0, got {seq_len} and {pred_horizon}"
        )

    # Index of the first usable label within a symbol's sorted rows.
    offset = seq_len + pred_horizon - 1

    X_parts, y_parts = [], []
    for _, g in df.groupby("symbol"):
        g = g.sort_values("datetime")
        n_windows = len(g) - offset
        if n_windows <= 0:
            continue  # not enough rows for a single (window, label) pair
        feat = np.asarray(g[feature_cols].values, dtype=np.float32)
        target = np.asarray(g[target_col].values, dtype=np.float32)
        # sliding_window_view puts the window axis last: (n, n_features, seq_len).
        # Swap the last two axes to get (n, seq_len, n_features).
        windows = np.lib.stride_tricks.sliding_window_view(feat, seq_len, axis=0)
        X_parts.append(windows[:n_windows].transpose(0, 2, 1))
        y_parts.append(target[offset:])

    if not X_parts:
        return (
            np.empty((0, seq_len, len(feature_cols)), dtype=np.float32),
            np.empty((0,), dtype=np.float32),
        )
    # concatenate copies the read-only strided views into ordinary writable arrays.
    return np.concatenate(X_parts, axis=0), np.concatenate(y_parts, axis=0)

In [None]:
# Window length (rows per sequence) and mini-batch size used by the loaders below.
# SEQ_LEN repeats the value set in the cell that defines build_sequences.
SEQ_LEN = 60
BATCH_SIZE = 256

# Build (window, label) arrays separately for the development pool and the test
# set; rows are looked up in df through the index shared with X_val / X_test.
X_seq_val, y_seq_val = build_sequences(df.loc[X_val.index], SELECTED_FEATURES, "y_target", seq_len=SEQ_LEN)
X_seq_test, y_seq_test = build_sequences(df.loc[X_test.index], SELECTED_FEATURES, "y_target", seq_len=SEQ_LEN)

def build_prefix_loaders(X_seq, y_seq, n_splits=10, batch_size=None):
    """Build expanding-prefix (train, validation) DataLoader pairs.

    The rows are cut at ``n_splits + 1`` evenly spaced boundaries. Fold ``i``
    trains on ``X_seq[:boundary_i]`` and validates on the chunk that follows
    it, one boundary step long.

    Parameters
    ----------
    X_seq, y_seq : numpy.ndarray
        Sequence features and labels with matching first dimension.
    n_splits : int, default 10
        Number of boundary steps.
    batch_size : int or None, default None
        Mini-batch size; ``None`` uses the module-level ``BATCH_SIZE``.

    Returns
    -------
    list of (DataLoader, DataLoader)
        ``(train_loader, val_loader)`` per fold. Folds whose train or
        validation slice would be empty are skipped, so at most
        ``n_splits - 1`` folds are returned. The prefix ending at the last row
        has nothing after it to validate on.

    Raises
    ------
    ValueError
        If ``n_splits < 1``.

    Notes
    -----
    Folds are prefixes of the row order of ``X_seq``; they are chronological
    only if the caller passes time-ordered sequences.
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")
    if batch_size is None:
        batch_size = BATCH_SIZE

    fold_sizes = np.linspace(0, len(X_seq), n_splits + 1, dtype=int)
    # Validation length: one boundary step, at least one row.
    val_len = max(1, int(fold_sizes[1]))

    folds = []
    for train_end in fold_sizes[1:]:
        train_end = int(train_end)
        val_end = train_end + val_len  # validation window sits right after the training prefix
        X_tr, y_tr = X_seq[:train_end], y_seq[:train_end]
        X_va, y_va = X_seq[train_end:val_end], y_seq[train_end:val_end]
        if len(X_tr) == 0 or len(X_va) == 0:
            # An empty training set cannot be shuffled by DataLoader, and an
            # empty validation set has no loss to average.
            continue
        folds.append((
            DataLoader(TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr)),
                       batch_size=batch_size, shuffle=True),
            DataLoader(TensorDataset(torch.from_numpy(X_va), torch.from_numpy(y_va)),
                       batch_size=batch_size, shuffle=False)
        ))
    return folds

# Expanding-prefix folds over the development sequences (10 boundary steps).
prefix_folds = build_prefix_loaders(X_seq_val, y_seq_val, n_splits=10)


# Transformer Model (PyTorch)

In [None]:
import torch
from torch import nn

class ReturnTransformer(nn.Module):
    """Transformer encoder that maps a feature sequence to one scalar per sample.

    Each time step is projected linearly to ``d_model`` dimensions, the sequence
    is passed through a stack of ``num_layers`` encoder layers (batch-first),
    and the encoding of the last time step is mapped to a single output value.
    No positional encoding is added to the inputs.

    Parameters
    ----------
    feature_dim : int
        Size of the last input axis.
    d_model, nhead, num_layers, ff_dim, dropout
        Forwarded to ``nn.TransformerEncoderLayer`` / ``nn.TransformerEncoder``
        (``ff_dim`` is the layer's ``dim_feedforward``).
    """

    def __init__(self, feature_dim, d_model=64, nhead=4, num_layers=2, ff_dim=128, dropout=0.1):
        super().__init__()
        # Per-time-step embedding of the raw feature vector.
        self.input_proj = nn.Linear(feature_dim, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model,
                nhead,
                dim_feedforward=ff_dim,
                dropout=dropout,
                batch_first=True,
            ),
            num_layers=num_layers,
        )
        # Regression head applied to a single time step's encoding.
        self.head = nn.Linear(d_model, 1)

    def forward(self, x):
        """Return one prediction per sequence for a batch-first ``x``."""
        hidden = self.encoder(self.input_proj(x))
        last_step = hidden[:, -1, :]  # encoding of the final time step
        return self.head(last_step).squeeze(-1)
def fit_transformer(folds, feature_dim, epochs=10, lr=1e-3):
    """Train one ReturnTransformer per fold and keep its best-validation weights.

    For each ``(train_loader, val_loader)`` pair a fresh model is trained with
    AdamW and MSE loss for ``epochs`` epochs. After every epoch the validation
    loss is measured, and the weights with the lowest validation loss are
    restored at the end of the fold.

    ``train_epoch`` and ``eval_epoch`` must already be defined in the notebook
    namespace when this function is called.

    Parameters
    ----------
    folds : iterable of (DataLoader, DataLoader)
        Training and validation loaders per fold.
    feature_dim : int
        Size of the last input axis, passed to ``ReturnTransformer``.
    epochs : int, default 10
        Epochs per fold.
    lr : float, default 1e-3
        AdamW learning rate.

    Returns
    -------
    list of ReturnTransformer
        One model per fold, in fold order.
    """
    best_models = []
    for fold_id, (train_loader, val_loader) in enumerate(folds, 1):
        model = ReturnTransformer(feature_dim)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
        criterion = nn.MSELoss()
        best_loss = float("inf")
        best_state = None
        for _ in range(epochs):
            train_epoch(model, train_loader, optimizer, criterion)
            val_loss = eval_epoch(model, val_loader, criterion)
            if val_loss < best_loss:
                best_loss = val_loss
                # Deep copy: state_dict() returns references to the live
                # tensors, which later optimizer steps keep mutating.
                best_state = copy.deepcopy(model.state_dict())
        # best_state stays None when no epoch produced a comparable loss
        # (epochs == 0 or a NaN validation loss). Keep the current weights
        # rather than crashing in load_state_dict(None).
        if best_state is not None:
            model.load_state_dict(best_state)
        best_models.append(model)
        print(f"Fold {fold_id} best val MSE: {best_loss:.6f}")
    return best_models

# Train one model per prefix fold (8 epochs each).
# NOTE(review): fit_transformer calls train_epoch and eval_epoch, which must be
# defined before this line runs; train_epoch is defined in a later cell, so on a
# fresh kernel this call has to be executed after that cell.
best_models = fit_transformer(prefix_folds, feature_dim=len(SELECTED_FEATURES), epochs=8)

## Training Loop using MSE Loss

In [None]:
def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Parameters
    ----------
    model : torch.nn.Module
        Model to train; switched to training mode.
    loader : iterable of (inputs, targets) batches
    optimizer : torch.optim.Optimizer
    criterion : callable
        Loss taking ``(predictions, targets)`` and returning a scalar tensor.

    Returns
    -------
    float
        Sample-weighted mean of the per-batch losses, or NaN if the loader
        yielded no samples.
    """
    model.train()
    total_loss = 0.0
    n_seen = 0
    for xb, yb in loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = criterion(pred, yb)
        loss.backward()
        optimizer.step()
        # Weight by batch size so a smaller final batch does not skew the mean.
        total_loss += loss.item() * len(xb)
        n_seen += len(xb)
    # Normalise by the samples actually processed; this stays correct even when
    # the loader drops an incomplete last batch.
    return total_loss / n_seen if n_seen else float("nan")


def eval_epoch(model, loader, criterion):
    """Return the mean loss of ``model`` over ``loader`` without updating it.

    Counterpart of ``train_epoch``: the model is put in evaluation mode and the
    forward passes run under ``torch.no_grad()``.

    Returns
    -------
    float
        Sample-weighted mean of the per-batch losses, or NaN if the loader
        yielded no samples.
    """
    model.eval()
    total_loss = 0.0
    n_seen = 0
    with torch.no_grad():
        for xb, yb in loader:
            total_loss += criterion(model(xb), yb).item() * len(xb)
            n_seen += len(xb)
    return total_loss / n_seen if n_seen else float("nan")


## Predicting and Evaluating

In [None]:
def predict_sequences(model, X_seq, batch_size=1024):
    """Run ``model`` over ``X_seq`` in evaluation mode and return a NumPy array.

    The input is fed through the model in chunks of ``batch_size`` rows, so
    peak memory does not grow with the number of sequences. In evaluation mode
    the chunked outputs, concatenated in order, correspond row for row to a
    single full-array forward pass.

    Parameters
    ----------
    model : torch.nn.Module
        Model whose output for a batch has the batch as its first dimension.
    X_seq : numpy.ndarray
        Input sequences; converted with ``torch.from_numpy``.
    batch_size : int, default 1024
        Rows per forward pass.

    Returns
    -------
    numpy.ndarray
        Model outputs for every row of ``X_seq``, in order.

    Raises
    ------
    ValueError
        If ``batch_size < 1``.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")
    model.eval()
    outputs = []
    with torch.no_grad():
        X_tensor = torch.from_numpy(X_seq)
        # max(..., 1) keeps a single pass for an empty input, so the model
        # still sees the empty batch exactly once.
        for start in range(0, max(len(X_tensor), 1), batch_size):
            outputs.append(model(X_tensor[start:start + batch_size]))
    return torch.cat(outputs).cpu().numpy()

# Score the model of the last prefix fold. PnL per row is prediction * label,
# i.e. the prediction is used directly as the position size.
# NOTE(review): best_models[-1] was fitted on a prefix of X_seq_val, so the
# "Val" figure below is largely in-sample -- only the test figure is out-of-sample.
# NOTE(review): the sequences stack several symbols, so each row is one
# (symbol, time) pair rather than one period; calculate_sharpe annualises as if
# every row were a separate period -- confirm whether PnL should first be
# aggregated per timestamp.
val_preds = predict_sequences(best_models[-1], X_seq_val)
val_pnl = val_preds * y_seq_val
print("Val Sharpe:", calculate_sharpe(pd.Series(val_pnl)))

# Same evaluation on the held-out test sequences.
test_preds = predict_sequences(best_models[-1], X_seq_test)
test_pnl = test_preds * y_seq_test
print("Test Sharpe:", calculate_sharpe(pd.Series(test_pnl)))
