In [1]:
import json
import math
import re
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from tqdm.auto import tqdm


ModuleNotFoundError: No module named 'torch'

In [None]:
def calculate_sharpe(pnl_series, periods_per_year=390 * 252):
    """Annualised Sharpe ratio of a per-period PnL series.

    Parameters
    ----------
    pnl_series : object exposing ``.mean()`` and ``.std()`` (e.g. ``pd.Series``)
        One PnL observation per period.
    periods_per_year : int or float, default ``390 * 252``
        Number of periods in a year; the raw ratio is scaled by its square root.

    Returns
    -------
    float
        ``mean / std * sqrt(periods_per_year)``, or ``0.0`` when the standard
        deviation is zero or undefined (empty / single-observation input).
    """
    std = pnl_series.std()
    # A NaN std (fewer than two observations) is just as degenerate as a zero
    # std; without this check the function would silently return NaN.
    if not np.isfinite(std) or std == 0:
        return 0.0
    raw = pnl_series.mean() / std
    return raw * np.sqrt(periods_per_year)


## Load data

In [None]:
# Fail fast with a clear message when the input CSV is not where we expect it.
DATA_PATH = Path('../data/final_df.csv')
if not DATA_PATH.exists():
    raise FileNotFoundError(f"Cannot find {DATA_PATH}")


def _log_diff(prices):
    """Row-to-row difference of the natural log of ``prices``."""
    return np.log(prices).diff()


# df_ori stays untouched; df is the working copy, ordered per symbol through time
# so that the group-wise diff/shift below operate on consecutive rows.
df_ori = pd.read_csv(DATA_PATH)
df = (
    df_ori
    .assign(datetime=lambda frame: pd.to_datetime(frame['datetime']))
    .sort_values(['symbol', 'datetime'])
    .reset_index(drop=True)
)

# Per-symbol log return of `close`, and the following row's return as the target.
df['lret_1m'] = df.groupby('symbol')['close'].transform(_log_diff)
df['y_target'] = df.groupby('symbol')['lret_1m'].shift(-1)

initial_rows = len(df)
# NaN targets fail the <= comparison, so this mask also removes missing targets.
target_in_range = df['y_target'].abs().le(0.2)
return_present = df['lret_1m'].notna()
df = (
    df.loc[target_in_range & return_present]
    .sort_values(['datetime', 'symbol'])
    .reset_index(drop=True)
)
print(f"Dropped {initial_rows - len(df)} rows with abnormal/NaN targets")
print(df[['datetime', 'symbol', 'close', 'lret_1m', 'y_target']].head())


## Feature cleaning

In [None]:
# Target vector, and the candidate feature matrix with bookkeeping columns removed
# (errors='ignore' tolerates calendar columns that are not present).
y = df['y_target']
X = df.drop(
    columns=['y_target', 'lret_1m', 'datetime', 'symbol',
             'year', 'month', 'day', 'minute', 'minute_of_day'],
    errors='ignore',
)
print(f"Original feature shape: {X.shape}")

# Flag columns whose scale is extreme (std, 99th percentile, max) or that are constant.
# NOTE(review): these statistics are computed on all rows, before the train/test split.
stats = X.describe(percentiles=[0.99]).T
is_problematic = (
    (stats['std'] > 1e3)
    | (stats['99%'].abs() > 1e3)
    | (stats['max'].abs() > 1e6)
    | (stats['std'] == 0)
)
bad_cols = sorted(stats.index[is_problematic])

print(f"Dropping {len(bad_cols)} problematic columns")
X_cleaned = X.drop(columns=bad_cols)
print(f"Cleaned feature shape: {X_cleaned.shape}")


## Train / test split

In [None]:
# Positional split: the first 1/1.5 (two thirds) of the rows train, the rest test.
split_ratio = 1.0 / 1.5
split_index = int(split_ratio * len(X_cleaned))

X_train, X_test = X_cleaned.iloc[:split_index], X_cleaned.iloc[split_index:]
y_train, y_test = y.iloc[:split_index], y.iloc[split_index:]

print(f"Train shape: {X_train.shape}")
print(f"Test shape:  {X_test.shape}")


## Preprocessing pipeline

In [None]:
class QuantileClipper(BaseEstimator, TransformerMixin):
    """Clip every feature column to quantile bounds learned during ``fit``.

    Parameters
    ----------
    lower : float, default 0.005
        Lower quantile, as a fraction in ``[0, 1]``.
    upper : float, default 0.995
        Upper quantile, as a fraction in ``[0, 1]``; must be >= ``lower``.

    Attributes
    ----------
    q_low_, q_high_ : per-column bounds computed by ``fit`` (NaNs ignored).
    """

    def __init__(self, lower=0.005, upper=0.995):
        # Only hyper-parameters are stored here. Fitted attributes (trailing
        # underscore) are created in fit(), so scikit-learn's fitted-state
        # checks do not mistake a fresh instance for a fitted one.
        self.lower = lower
        self.upper = upper

    def fit(self, X, y=None):
        """Learn the per-column clipping bounds from ``X``; returns ``self``.

        Raises
        ------
        ValueError
            If the quantiles are not ordered fractions within ``[0, 1]``.
        """
        if not 0.0 <= self.lower <= self.upper <= 1.0:
            raise ValueError(
                f"Expected 0 <= lower <= upper <= 1, got lower={self.lower}, upper={self.upper}"
            )
        self.q_low_ = np.nanpercentile(X, self.lower * 100, axis=0)
        self.q_high_ = np.nanpercentile(X, self.upper * 100, axis=0)
        return self

    def transform(self, X):
        """Return ``X`` clipped column-wise to the bounds learned in ``fit``.

        Raises ``sklearn.exceptions.NotFittedError`` when called before ``fit``.
        """
        # Local import keeps this class self-contained within its cell.
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, ('q_low_', 'q_high_'))
        return np.clip(X, self.q_low_, self.q_high_)

# Median imputation -> quantile clipping -> max-abs scaling.
feature_pipeline = Pipeline(steps=[
    ('imp', SimpleImputer(strategy='median')),
    ('clip', QuantileClipper(lower=0.005, upper=0.995)),
    ('scale', MaxAbsScaler()),
])

# Statistics are learned from the training rows only, then applied to every row.
feature_pipeline.fit(X_train)

X_all_processed = pd.DataFrame(
    data=feature_pipeline.transform(X_cleaned),
    index=X_cleaned.index,
    columns=X_cleaned.columns,
)

print("Preprocessing complete")


In [None]:
# Re-attach the identifying columns and the target to the transformed features.
feature_cols = list(X_cleaned.columns)
df_processed = df.loc[:, ['datetime', 'symbol', 'y_target']].copy()
df_processed[feature_cols] = X_all_processed

# Same positional cut as the feature split, taken as independent copies.
train_df, test_df = (
    df_processed.iloc[:split_index].copy(),
    df_processed.iloc[split_index:].copy(),
)


## Sequence construction

In [None]:
# Number of consecutive rows (per symbol) that make up one model input sequence.
SEQ_LEN = 60
# Row offset of the label relative to the end of a sequence: build_sequences reads
# the target at index `end + PRED_HORIZON - 1`, where `end` is one past the last
# feature row.
PRED_HORIZON = 1


def build_sequences(dataframe, feature_cols, target_col, seq_len=SEQ_LEN):
    """Slice each symbol's rows into overlapping fixed-length windows.

    For every symbol (rows ordered by ``datetime``) a window of ``seq_len``
    feature rows is paired with the ``target_col`` value found
    ``PRED_HORIZON - 1`` rows after the window's end.

    Returns
    -------
    tuple of np.ndarray
        ``(X, y, label_datetimes, symbols)`` where ``X`` and ``y`` are float32,
        and the last two arrays carry the label row's timestamp and the symbol
        of each sequence.
    """
    # NOTE(review): with PRED_HORIZON == 1 the label comes from the row right
    # after the last feature row. If `target_col` is already a forward-shifted
    # return this leaves a one-row gap between features and label — confirm
    # that is intended.
    label_offset = seq_len + PRED_HORIZON - 1
    windows, labels, label_times, tickers = [], [], [], []

    for symbol, rows in dataframe.groupby('symbol'):
        rows = rows.sort_values('datetime')
        feature_matrix = rows[feature_cols].values
        target_values = rows[target_col].values
        timestamps = rows['datetime'].values

        # An empty range (too few rows for even one window) contributes nothing.
        starts = range(len(rows) - seq_len - PRED_HORIZON + 1)
        windows.extend(feature_matrix[s:s + seq_len] for s in starts)
        labels.extend(target_values[s + label_offset] for s in starts)
        label_times.extend(timestamps[s + label_offset] for s in starts)
        tickers.extend([symbol] * len(starts))

    return (
        np.array(windows, dtype=np.float32),
        np.array(labels, dtype=np.float32),
        np.array(label_times),
        np.array(tickers),
    )

# Sequences are built separately per split so that no window straddles the boundary.
train_X_seq, train_y_seq, train_meta_dt, train_meta_sym = build_sequences(
    dataframe=train_df, feature_cols=feature_cols, target_col='y_target', seq_len=SEQ_LEN
)
test_X_seq, test_y_seq, test_meta_dt, test_meta_sym = build_sequences(
    dataframe=test_df, feature_cols=feature_cols, target_col='y_target', seq_len=SEQ_LEN
)

print(f"Train sequences: {train_X_seq.shape}")
print(f"Test sequences:  {test_X_seq.shape}")


In [None]:
# build_sequences emits sequences grouped by symbol; re-order them chronologically
# by label timestamp so that positional (prefix) splits respect time.
# A stable sort keeps the order of sequences sharing a timestamp deterministic,
# and every parallel array — including the symbol labels — is permuted
# identically so that row i always describes the same sequence.
train_order = np.argsort(train_meta_dt, kind='stable')
train_X_seq = train_X_seq[train_order]
train_y_seq = train_y_seq[train_order]
train_meta_dt = train_meta_dt[train_order]
train_meta_sym = train_meta_sym[train_order]

test_order = np.argsort(test_meta_dt, kind='stable')
test_X_seq = test_X_seq[test_order]
test_y_seq = test_y_seq[test_order]
test_meta_dt = test_meta_dt[test_order]
test_meta_sym = test_meta_sym[test_order]


## Prefix K-fold DataLoaders

In [None]:
# Mini-batch size used by default for the DataLoaders built in this notebook.
BATCH_SIZE = 256

def create_prefix_folds(X_seq, y_seq, n_splits=10, batch_size=BATCH_SIZE):
    """Build (train_loader, val_loader) pairs from ``TimeSeriesSplit`` folds.

    ``X_seq`` / ``y_seq`` are indexed positionally, so they should already be
    in chronological order. Training loaders shuffle; validation loaders do not.

    Returns
    -------
    list of (DataLoader, DataLoader)
        One pair per split, in split order.
    """
    def _subset(indices):
        # Wrap the selected rows of both arrays as a tensor dataset.
        return TensorDataset(
            torch.from_numpy(X_seq[indices]),
            torch.from_numpy(y_seq[indices]),
        )

    splitter = TimeSeriesSplit(n_splits=n_splits)
    folds = []
    for fold_idx, (train_idx, val_idx) in enumerate(splitter.split(X_seq), 1):
        train_ds, val_ds = _subset(train_idx), _subset(val_idx)
        loaders = (
            DataLoader(train_ds, batch_size=batch_size, shuffle=True),
            DataLoader(val_ds, batch_size=batch_size, shuffle=False),
        )
        folds.append(loaders)
        print(f"Fold {fold_idx}: train {len(train_ds)} seq, val {len(val_ds)} seq")
    return folds

# Ten TimeSeriesSplit folds over the training sequences (test sequences are not used here).
prefix_folds = create_prefix_folds(train_X_seq, train_y_seq, n_splits=10)


## Transformer model

In [None]:
class ReturnTransformer(nn.Module):
    """Transformer encoder that maps a feature sequence to one scalar per sample.

    Pipeline: linear projection to ``d_model`` -> additive sinusoidal positional
    encoding -> ``num_layers`` encoder layers (GELU, ``batch_first``) -> linear
    head applied to the last time step.

    Parameters
    ----------
    feature_dim : int
        Size of the last input dimension.
    d_model, nhead, num_layers, ff_dim, dropout
        Forwarded to ``nn.TransformerEncoderLayer`` / ``nn.TransformerEncoder``.
    """

    def __init__(self, feature_dim, d_model=128, nhead=4, num_layers=2, ff_dim=256, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.input_proj = nn.Linear(feature_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff_dim,
            dropout=dropout,
            batch_first=True,
            activation='gelu'
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.head = nn.Linear(d_model, 1)

    def _positional_encoding(self, seq_len, device, dtype):
        """Sinusoidal position table of shape ``(seq_len, d_model)``.

        Computed on the fly (no parameters or buffers), so the module's
        ``state_dict`` layout is unaffected.
        """
        position = torch.arange(seq_len, device=device, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, self.d_model, 2, device=device, dtype=torch.float32)
            * (-math.log(10000.0) / self.d_model)
        )
        angles = position * div_term
        table = torch.zeros(seq_len, self.d_model, device=device, dtype=torch.float32)
        table[:, 0::2] = torch.sin(angles)
        # For an odd d_model there is one fewer cosine column than sine columns.
        table[:, 1::2] = torch.cos(angles)[:, : self.d_model // 2]
        return table.to(dtype)

    def forward(self, x):
        """Map ``x`` of shape ``(batch, seq, feature_dim)`` to shape ``(batch,)``."""
        x = self.input_proj(x)
        # Self-attention has no built-in notion of order: without positions the
        # last-step read-out would be invariant to shuffling the earlier steps.
        x = x + self._positional_encoding(x.size(1), x.device, x.dtype)
        encoded = self.encoder(x)
        return self.head(encoded[:, -1, :]).squeeze(-1)


## Training utilities

In [None]:
# Use the GPU when torch can see one; otherwise everything runs on the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Gradients are clipped to a total norm of 1.0 before each optimiser step.

    Returns
    -------
    float
        Sum of per-batch losses weighted by batch size, divided by
        ``len(loader.dataset)``.
    """
    model.train()
    weighted_loss_sum = 0.0
    for features, targets in loader:
        features, targets = features.to(DEVICE), targets.to(DEVICE)
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # Weight by batch size so a short final batch does not skew the average.
        weighted_loss_sum += batch_loss.item() * len(features)
    return weighted_loss_sum / len(loader.dataset)


def eval_epoch(model, loader, criterion):
    """Average loss of ``model`` over ``loader`` with gradients disabled.

    The model is switched to eval mode. Per-batch losses are weighted by batch
    size and divided by ``len(loader.dataset)``.
    """
    model.eval()
    with torch.no_grad():
        weighted_losses = [
            criterion(model(features.to(DEVICE)), targets.to(DEVICE)).item() * len(features)
            for features, targets in loader
        ]
    return sum(weighted_losses, 0.0) / len(loader.dataset)


def predict_batches(model, X_array, batch_size=512):
    """Run ``model`` over ``X_array`` in chunks and return the stacked outputs.

    Parameters
    ----------
    model : torch.nn.Module
        Switched to eval mode; called under ``torch.no_grad()``.
    X_array : np.ndarray
        Inputs sliced along the first axis; each slice is moved to ``DEVICE``.
    batch_size : int, default 512
        Number of rows per forward pass.

    Returns
    -------
    np.ndarray
        Concatenated model outputs; an empty float32 array for empty input.
    """
    model.eval()
    # np.concatenate raises on an empty list, so short-circuit empty input.
    if len(X_array) == 0:
        return np.empty((0,), dtype=np.float32)
    preds = []
    with torch.no_grad():
        for start in range(0, len(X_array), batch_size):
            batch = torch.from_numpy(X_array[start:start + batch_size]).to(DEVICE)
            preds.append(model(batch).cpu().numpy())
    return np.concatenate(preds)


## Prefix CV training

In [None]:
import copy

def fit_with_prefix_cv(folds, feature_dim, epochs=15, lr=1e-3, weight_decay=1e-4, patience=3):
    """Train a fresh ``ReturnTransformer`` on every fold with early stopping.

    Parameters
    ----------
    folds : iterable of (train_loader, val_loader)
    feature_dim : int
        Passed to ``ReturnTransformer``.
    epochs : int
        Maximum number of epochs per fold.
    lr, weight_decay : float
        AdamW hyper-parameters.
    patience : int
        Stop a fold after this many consecutive epochs without the validation
        MSE improving by more than 1e-6.

    Returns
    -------
    (histories, best_states)
        ``histories`` holds one ``{'fold', 'best_val_mse'}`` dict per fold;
        ``best_states`` holds a deep copy of the best ``state_dict`` per fold
        (``None`` if no epoch ever improved on ``inf``).
    """
    histories, best_states = [], []
    fold_idx = 0
    for train_loader, val_loader in folds:
        fold_idx += 1
        model = ReturnTransformer(feature_dim).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        criterion = nn.MSELoss()

        best_val, best_state = math.inf, None
        epochs_without_gain = 0
        for epoch in range(1, epochs + 1):
            train_loss = train_epoch(model, train_loader, optimizer, criterion)
            val_loss = eval_epoch(model, val_loader, criterion)

            improved = val_loss < best_val - 1e-6
            if improved:
                # Snapshot the weights: state_dict() returns live references.
                best_val, best_state = val_loss, copy.deepcopy(model.state_dict())
                epochs_without_gain = 0
            else:
                epochs_without_gain += 1

            print(f"Fold {fold_idx} | Epoch {epoch} | train {train_loss:.6f} | val {val_loss:.6f}")
            if epochs_without_gain >= patience:
                print(f"Stopping early on fold {fold_idx}")
                break

        histories.append({'fold': fold_idx, 'best_val_mse': best_val})
        best_states.append(best_state)
    return histories, best_states

# Cross-validate over the prefix folds: at most 12 epochs per fold, stopping a fold
# after 3 consecutive epochs without validation improvement.
cv_histories, cv_states = fit_with_prefix_cv(
    prefix_folds,
    feature_dim=len(feature_cols),
    epochs=12,
    lr=1e-3,
    weight_decay=1e-4,
    patience=3,
)
# Display the per-fold best validation MSE as the cell output.
cv_histories


## Final training on full training window

In [None]:
# Hold out the last 10% of the (time-ordered) training sequences for model selection.
VALID_SPLIT = int(len(train_X_seq) * 0.9)
fit_rows, holdout_rows = slice(None, VALID_SPLIT), slice(VALID_SPLIT, None)

full_train_ds = TensorDataset(
    torch.from_numpy(train_X_seq[fit_rows]),
    torch.from_numpy(train_y_seq[fit_rows]),
)
full_val_ds = TensorDataset(
    torch.from_numpy(train_X_seq[holdout_rows]),
    torch.from_numpy(train_y_seq[holdout_rows]),
)
full_train_loader = DataLoader(full_train_ds, batch_size=BATCH_SIZE, shuffle=True)
full_val_loader = DataLoader(full_val_ds, batch_size=BATCH_SIZE, shuffle=False)

final_model = ReturnTransformer(len(feature_cols)).to(DEVICE)
optimizer = torch.optim.AdamW(final_model.parameters(), lr=1e-3, weight_decay=1e-4)
criterion = nn.MSELoss()

# Train for a fixed 20 epochs and remember the weights with the best hold-out loss.
best_state, best_val = None, math.inf
for epoch in range(1, 21):
    train_loss = train_epoch(final_model, full_train_loader, optimizer, criterion)
    val_loss = eval_epoch(final_model, full_val_loader, criterion)
    print(f"Epoch {epoch} | train {train_loss:.6f} | val {val_loss:.6f}")
    improved = val_loss < best_val - 1e-6
    if improved:
        best_val, best_state = val_loss, copy.deepcopy(final_model.state_dict())

# Roll the model back to its best epoch.
final_model.load_state_dict(best_state)


## Sharpe evaluation

In [None]:
# PnL of holding a position equal to the prediction over the realised return.
# calculate_sharpe annualises with a per-minute period count, so it needs ONE
# observation per timestamp: sum the per-symbol PnL within each label timestamp
# first. Feeding it one row per (symbol, minute) would mis-state the annualised
# figure whenever several symbols share a timestamp.
train_preds = predict_batches(final_model, train_X_seq)
train_pnl = pd.Series(train_preds * train_y_seq).groupby(train_meta_dt).sum()
train_sharpe = calculate_sharpe(train_pnl)

test_preds = predict_batches(final_model, test_X_seq)
test_pnl = pd.Series(test_preds * test_y_seq).groupby(test_meta_dt).sum()
test_sharpe = calculate_sharpe(test_pnl)

print(f"Train Sharpe: {train_sharpe:.6f}")
print(f"Test Sharpe:  {test_sharpe:.6f}")


## Rolling 30-minute backtest (optional heavy cell)

In [None]:
# Length, in minutes, of the rolling refit window used by the backtest loop.
ROLLING_WINDOW_MINUTES = 30
# Minimum number of training sequences a window must yield; timestamps whose
# window yields fewer are skipped by the backtest loop.
MIN_SEQ_PER_WINDOW = 50

def build_window_sequences(window_df, feature_cols, seq_len=SEQ_LEN):
    """Turn the rows of one rolling window into training sequences.

    For each symbol (rows ordered by ``datetime``) every run of ``seq_len``
    consecutive feature rows is paired with the ``'y_target'`` value of the row
    immediately after it. Symbols with ``seq_len`` rows or fewer are skipped.

    Returns
    -------
    (np.ndarray, np.ndarray) or (None, None)
        float32 arrays ``(X, y)``; ``(None, None)`` when no sequence could be built.
    """
    samples, labels = [], []
    for _, rows in window_df.groupby('symbol'):
        rows = rows.sort_values('datetime')
        n_rows = len(rows)
        if n_rows <= seq_len:
            continue
        feature_matrix = rows[feature_cols].values
        next_returns = rows['y_target'].values
        # `stop` is both the exclusive end of the window and the label row.
        for stop in range(seq_len, n_rows):
            samples.append(feature_matrix[stop - seq_len:stop])
            labels.append(next_returns[stop])
    if len(samples) == 0:
        return None, None
    return np.array(samples, dtype=np.float32), np.array(labels, dtype=np.float32)

backtest_results = []
unique_test_minutes = np.sort(test_df['datetime'].unique())

# build_window_sequences needs more than SEQ_LEN rows per symbol before it can
# emit a single sequence, so the lookback must cover SEQ_LEN rows of context
# *plus* ROLLING_WINDOW_MINUTES of labelled rows. Looking back only
# ROLLING_WINDOW_MINUTES (30 < SEQ_LEN = 60) can never produce a sequence when
# there is one bar per minute, so every timestamp would be skipped.
# NOTE(review): assumes 1-minute bars — confirm against the data.
lookback = pd.Timedelta(minutes=ROLLING_WINDOW_MINUTES + SEQ_LEN)

for current_dt in tqdm(unique_test_minutes, desc='Rolling backtest'):
    start_dt = current_dt - lookback
    window_df = df_processed[(df_processed['datetime'] >= start_dt) & (df_processed['datetime'] < current_dt)]
    X_tr, y_tr = build_window_sequences(window_df, feature_cols, seq_len=SEQ_LEN)
    if X_tr is None or len(X_tr) < MIN_SEQ_PER_WINDOW:
        continue

    # Refit a fresh model on this window only (3 quick epochs).
    train_loader = DataLoader(TensorDataset(torch.from_numpy(X_tr), torch.from_numpy(y_tr)), batch_size=128, shuffle=True)
    temp_model = ReturnTransformer(len(feature_cols)).to(DEVICE)
    optimizer = torch.optim.AdamW(temp_model.parameters(), lr=1e-3, weight_decay=1e-4)
    criterion = nn.MSELoss()
    for _ in range(3):
        train_epoch(temp_model, train_loader, optimizer, criterion)

    # Realised targets of the rows stamped current_dt, keyed by symbol.
    pred_rows = df_processed[df_processed['datetime'] == current_dt]
    actual_map = dict(zip(pred_rows['symbol'], pred_rows['y_target']))

    # Prediction inputs mirror the training layout of build_window_sequences:
    # SEQ_LEN feature rows strictly before the row whose y_target is the label
    # (here: the row at current_dt). Feeding windows that end *at* current_dt
    # would shift the inputs by one row relative to what the model was fit on.
    symbol_preds = []
    for symbol, group in window_df.groupby('symbol'):
        if symbol not in actual_map or len(group) < SEQ_LEN:
            continue
        group = group.sort_values('datetime')
        symbol_preds.append((symbol, group[feature_cols].values[-SEQ_LEN:]))

    if not symbol_preds:
        continue

    # Cast to float32 to match the model parameters and the training inputs;
    # the frame's values may be float64, which the float32 layers reject.
    stacked = np.stack([seq for _, seq in symbol_preds]).astype(np.float32)
    batch = torch.from_numpy(stacked).to(DEVICE)
    temp_model.eval()
    with torch.no_grad():
        preds = temp_model(batch).cpu().numpy()

    for (symbol, _), pred in zip(symbol_preds, preds):
        backtest_results.append({
            'datetime': current_dt,
            'symbol': symbol,
            'predicted_log_return': float(pred),
            'actual_log_return': actual_map[symbol]
        })

backtest_df = pd.DataFrame(backtest_results)
if not backtest_df.empty:
    # Long-only relative weights: positive predictions normalised within each
    # minute; minutes with no positive prediction get zero weight everywhere.
    positive_prediction = backtest_df['predicted_log_return'].clip(lower=0)
    minute_sum = positive_prediction.groupby(backtest_df['datetime']).transform('sum')
    backtest_df['weight_relative'] = np.where(minute_sum == 0, 0.0, positive_prediction / minute_sum)
    # Directional weight: +1 / 0 / -1 according to the sign of the prediction.
    backtest_df['weight_sign'] = np.sign(backtest_df['predicted_log_return'])
    print(backtest_df.head())
    print(f"Backtest rows: {len(backtest_df)}")
else:
    print("Backtest skipped due to insufficient sequences")
