# Hull Tactical - Gen5 "Lean & Fast" + Meta-Learning

### STRATEGY UPDATE: Gen5
This notebook implements the **"Lean & Fast"** optimization with an added **Meta-Learning** layer.

### CORE UPGRADES:
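1. **Automated Hyperparameter Optimization (HPO)**: Uses `optuna`, when it is installed, to tune the volatility windows and the LightGBM hyperparameters together. If Optuna is unavailable (e.g. when running offline) or the notebook is executing as a competition rerun, the search is skipped and the robust Gen5 defaults are used.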
2. **Walk-Forward Validation**: Guards against overfitting by scoring every candidate with a 3-fold expanding-window time-series split, so each validation fold lies strictly after the data the model was trained on.
3. **Dynamic Feature Engineering**: Feature windows are no longer hardcoded; they adapt based on the meta-learner's findings.
4. **"Flash" Regime Detection**: Optimizes the ratio between short-term and long-term volatility to switch between **Aggressive** and **Defensive** modes instantly.

In [None]:
import os
import sys
import time
import warnings
import numpy as np
import pandas as pd
import polars as pl
from datetime import datetime
from sklearn.linear_model import SGDRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from lightgbm import LGBMRegressor
import kaggle_evaluation.default_inference_server

# Optuna is optional: it only drives the hyperparameter search. When the
# package cannot be imported, OPTUNA_AVAILABLE is set to False so the search
# can be skipped further down.
try:
    import optuna
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not found. Using Gen5 Defaults (Meta-Learning skipped).")
else:
    OPTUNA_AVAILABLE = True
    # Keep per-trial log lines out of the notebook output.
    optuna.logging.set_verbosity(optuna.logging.WARNING)

# Silence every Python warning for the rest of the run.
warnings.filterwarnings("ignore")

# -----------------------------------------------------------------------------------------
# 1. CONFIGURATION (DYNAMIC)
# -----------------------------------------------------------------------------------------
class Config:
    """Mutable, class-level settings shared by feature engineering, tuning and inference.

    The attributes are read directly from the class (``Config.VOL_SHORT`` etc.),
    so assigning to them changes the behaviour of every later call that reads
    them. The hyperparameter search overwrites the ``VOL_*`` windows and
    selected ``LGBM_PARAMS`` entries when it runs.
    """

    # Seed shared by the models.
    SEED = 42

    # Gen5 Defaults (Optimized for "Lean & Fast")
    # Rolling-window lengths, in rows, for the volatility/momentum features.
    VOL_SHORT = 5
    VOL_LONG = 22
    VOL_QUARTERLY = 66

    # Spans of the fast and slow exponential moving averages.
    EMA_FAST = 5
    EMA_SLOW = 26

    # Model Params (Gen5 Robust Defaults)
    LGBM_PARAMS = {
        'n_estimators': 1000,
        'learning_rate': 0.01,
        'max_depth': 5,
        'num_leaves': 31,
        'subsample': 0.8,
        # LightGBM ignores `subsample` unless bagging is performed every
        # `subsample_freq` iterations; the default of 0 disables it.
        'subsample_freq': 1,
        'colsample_bytree': 0.5,  # Gen5: Reduced to force feature diversity
        'reg_alpha': 0.1,         # Gen5: Increased L1 Regularization
        'n_jobs': -1,
        'verbose': -1,
        # Reuse SEED so the seed is defined in exactly one place.
        'random_state': SEED,
    }

    # Trading Logic
    BASE_W_LINEAR = 0.4   # default ensemble weight of the linear model
    TARGET_VOL = 0.005    # volatility target used when sizing the position
    MAX_LEVERAGE = 2.0    # upper bound on the allocation
    SGD_LR = 0.001        # constant learning rate (eta0) of the SGD regressor
    SGD_ALPHA = 0.001     # L2 penalty strength of the SGD regressor

# -----------------------------------------------------------------------------------------
# 2. FEATURE ENGINEERING (ADAPTIVE)
# -----------------------------------------------------------------------------------------
def calculate_ema(series, period):
    """Return the exponential moving average of ``series`` with span ``period``.

    Uses the recursive (``adjust=False``) form of the exponentially weighted
    mean, so the first output equals the first input value.
    """
    smoother = series.ewm(span=period, adjust=False)
    return smoother.mean()

def calculate_rsi(series, period=14):
    """Return a relative-strength index of ``series`` on a 0-100 scale.

    Gains and losses are averaged with a simple rolling mean over ``period``
    rows, so the first ``period - 1`` outputs are NaN. The undefined first
    difference is counted as a zero move because ``where`` replaces every
    entry that fails the comparison, NaN included, with 0.
    """
    change = series.diff()
    ups = change.where(change > 0, 0)
    downs = -change.where(change < 0, 0)

    avg_gain = ups.rolling(window=period).mean()
    avg_loss = downs.rolling(window=period).mean()

    # The small constant keeps the ratio finite when there were no losses.
    relative_strength = avg_gain / (avg_loss + 1e-10)
    return 100 - (100 / (1 + relative_strength))

def calculate_macd(series, fast=12, slow=26, signal=9):
    """Return ``(macd_line, signal_line, histogram)`` for ``series``.

    The MACD line is the fast EMA minus the slow EMA, the signal line is the
    EMA of the MACD line with span ``signal``, and the histogram is their
    difference.
    """
    macd_line = calculate_ema(series, fast) - calculate_ema(series, slow)
    signal_line = calculate_ema(macd_line, signal)
    return macd_line, signal_line, macd_line - signal_line

def feature_engineering(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with lag, volatility and momentum features.

    ``df`` must contain ``forward_returns`` and ``risk_free_rate``. Window
    lengths are read from ``Config`` at call time, so changing ``Config``
    changes the output. All technical features are derived from the one-row
    lag of ``forward_returns``. Every NaN in the result (original columns
    included) is replaced by 0.
    """
    out = df.copy()
    base_col = 'lag_forward_returns_1'
    lag_steps = (1, 2, 3, 5, 10, 22)

    # 1. Lagged copies of the two source columns.
    for source_col in ('forward_returns', 'risk_free_rate'):
        source = out[source_col]
        for step in lag_steps:
            out[f'lag_{source_col}_{step}'] = source.shift(step)

    base = out[base_col]

    # 2. Rolling volatility over the three configurable windows.
    short_window = base.rolling(Config.VOL_SHORT)
    out['vol_short'] = short_window.std()
    out['vol_long'] = base.rolling(Config.VOL_LONG).std()
    out['vol_quarterly'] = base.rolling(Config.VOL_QUARTERLY).std()

    # 3. Momentum and technical indicators.
    out['mom_short'] = short_window.mean()
    fast_ema = calculate_ema(base, Config.EMA_FAST)
    slow_ema = calculate_ema(base, Config.EMA_SLOW)
    out['ema_fast'] = fast_ema
    out['ema_slow'] = slow_ema
    out['ema_cross'] = fast_ema - slow_ema

    out['rsi'] = calculate_rsi(base, 14)
    out['macd'] = calculate_macd(base)[0]

    # 4. Regime ratios; the small constant avoids division by zero.
    out['vol_ratio'] = out['vol_long'] / (out['vol_quarterly'] + 1e-8)
    out['flash_crash_signal'] = out['vol_short'] / (out['vol_long'] + 1e-8)

    # Rows without a full window of history become zeros rather than NaN.
    return out.fillna(0)

# -----------------------------------------------------------------------------------------
# 3. DATA LOADING & PREP
# -----------------------------------------------------------------------------------------
def load_data(path):
    """Read the CSV at ``path`` with polars and return it as a pandas DataFrame.

    Every column except ``date_id`` is cast to Float64 (values that cannot be
    cast become null because ``strict=False``) and nulls are replaced by 0.
    """
    print(f"Loading {path}...")
    frame = pl.read_csv(path)
    numeric_casts = [
        pl.col(name).cast(pl.Float64, strict=False).fill_null(0)
        for name in frame.columns
        if name != 'date_id'
    ]
    return frame.with_columns(numeric_casts).to_pandas()

# Training data is loaded once at import time and reused by tuning, final
# training and the inference history buffer.
TRAIN_PATH = "/kaggle/input/hull-tactical-market-prediction/train.csv"
raw_train_df = load_data(TRAIN_PATH)

# Column the models are fitted to predict.
TARGET = "forward_returns"
# Columns removed from the feature matrix: the identifier, the scoring flag,
# the target itself and the other non-feature return columns.
DROP_COLS = [
    'date_id',
    'is_scored',
    'forward_returns',
    'risk_free_rate',
    'market_forward_excess_returns',
]

# -----------------------------------------------------------------------------------------
# 4. META-LEARNING (OPTUNA)
# -----------------------------------------------------------------------------------------
def objective(trial):
    """Optuna objective: mean walk-forward MSE of a LightGBM model.

    Side effect: the suggested volatility windows are written to
    ``Config.VOL_SHORT`` / ``VOL_LONG`` / ``VOL_QUARTERLY`` because
    ``feature_engineering`` reads them from ``Config``. They therefore hold
    the values of the most recent trial when this function returns.

    Args:
        trial: the Optuna trial used to sample the parameters.

    Returns:
        The mean squared error averaged over three time-ordered
        validation folds (lower is better).
    """
    # Upper bound of the 'vol_quarterly' search range. It also sizes the
    # warm-up below so every trial is scored on exactly the same rows.
    vol_quarterly_max = 80

    # 1. Suggest Parameters
    Config.VOL_SHORT = trial.suggest_int('vol_short', 3, 10)
    Config.VOL_LONG = trial.suggest_int('vol_long', 15, 30)
    Config.VOL_QUARTERLY = trial.suggest_int('vol_quarterly', 50, vol_quarterly_max)

    # Start from the final-model configuration so every setting that is not
    # tuned here stays identical between the search and the final fit.
    lgbm_params = {
        **Config.LGBM_PARAMS,
        'n_estimators': 500,  # Lower for speed during optimization
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 7),
        'num_leaves': trial.suggest_int('num_leaves', 15, 63),
        'subsample': trial.suggest_float('subsample', 0.5, 0.9),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.9),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 1.0, log=True),
        'random_state': Config.SEED,
    }

    # 2. Generate Features (Dynamic)
    df = feature_engineering(raw_train_df)

    # 3. Walk-Forward Validation
    # Skip the rows whose rolling windows are not fully populated yet (they
    # are zero-filled by feature_engineering). The rolling features are built
    # on a one-row lag, hence the extra row on top of the largest window.
    train_start = max(75, vol_quarterly_max + 1)

    cols_to_drop = [c for c in DROP_COLS if c in df.columns]
    usable = df.iloc[train_start:]
    X = usable.drop(columns=cols_to_drop)
    y = usable[TARGET]

    tscv = TimeSeriesSplit(n_splits=3)
    scores = []

    for train_idx, val_idx in tscv.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = LGBMRegressor(**lgbm_params)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        scores.append(mean_squared_error(y_val, preds))

    return float(np.mean(scores))

# Check: Optuna Available + Not Rerun
if OPTUNA_AVAILABLE and not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    print("Starting Optuna Optimization...")

    # objective() overwrites the Config windows on every trial, so remember
    # the defaults in order to really fall back to them if the search fails.
    default_windows = (Config.VOL_SHORT, Config.VOL_LONG, Config.VOL_QUARTERLY)
    try:
        # Seed the sampler so repeated runs explore the same trial sequence.
        study = optuna.create_study(
            direction='minimize',
            sampler=optuna.samplers.TPESampler(seed=Config.SEED),
        )
        study.optimize(objective, n_trials=20)

        best_params = study.best_params
        print("Best Params:", best_params)

        best_windows = (
            best_params['vol_short'],
            best_params['vol_long'],
            best_params['vol_quarterly'],
        )
        # Only keys that already exist in LGBM_PARAMS are model settings;
        # the window sizes are handled separately above.
        tuned_lgbm = {k: v for k, v in best_params.items() if k in Config.LGBM_PARAMS}
        # The search used fewer trees for speed; the final model uses 1000.
        tuned_lgbm['n_estimators'] = 1000
    except Exception as e:
        # Undo the per-trial mutations so "Using Defaults" is actually true.
        Config.VOL_SHORT, Config.VOL_LONG, Config.VOL_QUARTERLY = default_windows
        print(f"Optimization Failed: {e}. Using Defaults.")
    else:
        # Update Config only once the whole search has succeeded.
        Config.VOL_SHORT, Config.VOL_LONG, Config.VOL_QUARTERLY = best_windows
        Config.LGBM_PARAMS.update(tuned_lgbm)
else:
    print("Skipping optimization (Offline/Rerun). Using Gen5 Defaults.")

# -----------------------------------------------------------------------------------------
# 5. FINAL MODEL TRAINING
# -----------------------------------------------------------------------------------------
# Re-generate features with FINAL BEST windows
train_df = feature_engineering(raw_train_df)

# Drop the warm-up rows whose rolling windows are not fully populated
# (feature_engineering zero-fills them). The rolling features are built on a
# one-row lag, so one row more than the largest window is needed; 75 is kept
# as the minimum so the default configuration trains on the same rows as before.
warmup_rows = max(
    75,
    max(Config.VOL_SHORT, Config.VOL_LONG, Config.VOL_QUARTERLY) + 1,
)
train_df = train_df.iloc[warmup_rows:].reset_index(drop=True)

cols_to_drop = [c for c in DROP_COLS if c in train_df.columns]
X = train_df.drop(columns=cols_to_drop)
y = train_df[TARGET]
# Column order used for training; inference selects the same columns by name.
FEATURES = X.columns.tolist()

print(f"Training Final Model on {len(X)} rows...")

# Linear Model (Base Trend)
# Standardised inputs; the scaler is reused unchanged at inference time.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
linear_model = SGDRegressor(
    loss='squared_error', penalty='l2', alpha=Config.SGD_ALPHA,
    learning_rate='constant', eta0=Config.SGD_LR,
    random_state=Config.SEED, max_iter=2000
)
linear_model.fit(X_scaled, y)

# Tree Model (Optimized)
# Trained on the unscaled features with the (possibly tuned) parameters.
lgbm_model = LGBMRegressor(**Config.LGBM_PARAMS)
lgbm_model.fit(X, y)

print("Gen5 Models Ready.")

# -----------------------------------------------------------------------------------------
# 6. INFERENCE LOOP (Optimized)
# -----------------------------------------------------------------------------------------
# Raw rows that the inference features are computed from, seeded with the
# last 150 training rows so the rolling windows are populated at step 0.
GLOBAL_HISTORY = raw_train_df.tail(150).copy()
# Number of predict() calls completed so far.
STEP = 0

# Recent short/long volatility ratios, used by get_adaptive_weights() to
# compute a rolling z-score (that function caps the list at 100 entries).
ratio_history = []

def get_adaptive_weights(current_vol, long_term_vol, crash_sensitivity=2.0):
    """Return ``(w_linear, w_tree)`` ensemble weights for the current regime.

    The ratio ``current_vol / long_term_vol`` is appended to the module-level
    ``ratio_history`` (capped at the 100 most recent values). Once more than
    20 ratios are stored, the current ratio is z-scored against that history,
    which already includes the current value:

    * z-score above ``crash_sensitivity``: defensive, linear weight 0.7
    * z-score below -1.0: calm, linear weight 0.2
    * otherwise: ``Config.BASE_W_LINEAR``

    The two weights always sum to 1.
    """
    global ratio_history

    vol_ratio = current_vol / (long_term_vol + 1e-8)

    # Record the observation and drop the oldest one beyond 100 entries.
    ratio_history.append(vol_ratio)
    if len(ratio_history) > 100:
        del ratio_history[0]

    linear_weight = Config.BASE_W_LINEAR

    # Not enough context for a meaningful z-score yet: keep the base weights.
    if len(ratio_history) <= 20:
        return linear_weight, 1.0 - linear_weight

    window = pd.Series(ratio_history)
    center = window.mean()
    spread = window.std() + 1e-8  # guard against a zero standard deviation
    z_score = (vol_ratio - center) / spread

    if z_score > crash_sensitivity:
        # Volatility is far above its recent norm: lean on the linear model.
        linear_weight = 0.7
        print(f"Defensive Mode Triggered! Z-Score: {z_score:.2f}")
    elif z_score < -1.0:
        # Volatility is unusually low: lean on the tree model.
        linear_weight = 0.2

    return linear_weight, 1.0 - linear_weight

# Scaled feature row produced by the previous predict() call. It is kept so
# the online update can pair those features with the return that only becomes
# available one step later.
PREV_X_SCALED = None


def _backfill_realised_values(history: pd.DataFrame, new_row: pd.DataFrame) -> None:
    """Write ``lagged_<name>`` values of ``new_row`` into the last row of ``history``.

    A value is copied only when ``history`` has a ``<name>`` column whose last
    entry is missing, so rows that already carry a realised value are left
    untouched. ``history`` is modified in place.

    NOTE(review): assumes each ``lagged_<name>`` column holds the value that
    ``<name>`` took on the immediately preceding row - confirm against the
    competition's data description.
    """
    if history.empty or new_row.empty:
        return
    prefix = 'lagged_'
    last = history.index[-1]
    for col in new_row.columns:
        if not col.startswith(prefix):
            continue
        realised = col[len(prefix):]
        if realised in history.columns and pd.isna(history.at[last, realised]):
            history.at[last, realised] = new_row[col].iloc[0]


def predict(test_pl: pl.DataFrame) -> float:
    """Return the allocation, clipped to ``[0, Config.MAX_LEVERAGE]``, for one served row.

    Steps: append the row to ``GLOBAL_HISTORY``, rebuild the features, blend
    the linear and tree predictions with regime-dependent weights, size the
    position from the forecast and the short-term volatility, then update the
    linear model online with the newly revealed previous return.

    Mutates the module-level ``GLOBAL_HISTORY``, ``STEP`` and
    ``PREV_X_SCALED``, and (through ``partial_fit``) ``linear_model``.
    """
    global GLOBAL_HISTORY, STEP, PREV_X_SCALED

    # 1. Update History
    cols = [c for c in test_pl.columns if c != 'date_id']
    test_pl = test_pl.with_columns([pl.col(c).cast(pl.Float64, strict=False).fill_null(0) for c in cols])
    test_df_raw = test_pl.to_pandas()

    # Rows appended on earlier calls have no realised returns of their own;
    # without this the lag/volatility features degrade to zero-filled values.
    _backfill_realised_values(GLOBAL_HISTORY, test_df_raw)
    GLOBAL_HISTORY = pd.concat([GLOBAL_HISTORY, test_df_raw], axis=0, ignore_index=True)

    # 2. Features (Uses Best Params from Config)
    full_features = feature_engineering(GLOBAL_HISTORY)
    current_features = full_features.iloc[[-1]][FEATURES]

    # 3. Prediction
    curr_X_scaled = scaler.transform(current_features)
    pred_linear = linear_model.predict(curr_X_scaled)[0]
    pred_tree = lgbm_model.predict(current_features)[0]

    # 4. Regime Ensemble
    curr_vol = current_features['vol_short'].values[0]
    long_vol = current_features['vol_quarterly'].values[0]

    w_lin, w_tree = get_adaptive_weights(curr_vol, long_vol)
    raw_pred = (pred_linear * w_lin) + (pred_tree * w_tree)

    # 5. Risk Control
    # Fall back to a fixed volatility when the estimate is (near) zero.
    safe_vol = curr_vol if curr_vol > 1e-5 else 0.005
    vol_scalar = Config.TARGET_VOL / safe_vol
    sharpe_forecast = abs(raw_pred) / safe_vol

    allocation_size = sharpe_forecast * vol_scalar * 50
    sign = np.sign(raw_pred)

    # RSI Sanity Check: halve the bet when it chases an already extreme RSI.
    rsi = current_features['rsi'].values[0]
    if rsi > 75 and sign > 0:
        allocation_size *= 0.5
    elif rsi < 25 and sign < 0:
        allocation_size *= 0.5

    allocation = np.clip(1.0 + (sign * allocation_size), 0.0, Config.MAX_LEVERAGE)

    # 6. Online Learning
    # The lagged return is the target of the PREVIOUS row, so it is paired
    # with the features stored on the previous call, not with the current
    # ones. On the first call there is no previous row and nothing is learned.
    if PREV_X_SCALED is not None and 'lagged_forward_returns' in test_df_raw.columns:
        prev_target = test_df_raw['lagged_forward_returns'].values[0]
        if np.isfinite(prev_target):
            try:
                linear_model.partial_fit(PREV_X_SCALED, [prev_target])
            except ValueError as exc:
                # Best effort: a failed update must never break inference.
                print(f"Online update skipped: {exc}")
    PREV_X_SCALED = curr_X_scaled

    # Bound memory and per-call cost: once past 300 rows keep the newest 200.
    if len(GLOBAL_HISTORY) > 300:
        GLOBAL_HISTORY = GLOBAL_HISTORY.iloc[-200:].reset_index(drop=True)

    STEP += 1
    return float(allocation)

# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Outside a competition rerun, exercise predict() through the local gateway
# on the competition input directory; during a rerun, serve real requests.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()