# Hull Tactical Market Prediction

## Strategy Overview
**Bayesian Model Averaging (BMA)** ensemble combining 6 legitimate prediction strategies:

### Models:
1. **Model 1**: Binary strategy (0 or 2) - Aggressive
2. **Model 2**: Market signal with 400x multiplier  
3. **Model 3**: StackingRegressor (6 base models) - Core ML
4. **Model 4**: Fixed 0.8 exposure on positive signals
5. **Model 5**: Fixed 0.6 exposure with threshold
6. **Model 6**: Ultra-conservative 0.09 exposure

### BMA Weighting:
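- Uses softmax on hard-coded historical scores, listed in model order (Model 1 → Model 6): `[10.15, 8.09, 1.65, 10.16, 10.22, 10.24]`
- Temperature parameter controls how concentrated the weights are on the top-scoring models (currently 1.0)
- Balances risk/return based on past performance; the scores are fixed constants, so the weights do not change at inference time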

All models use legitimate features - no data leakage.

In [None]:
import os
import pandas as pd
import polars as pl
import numpy as np
from pathlib import Path
from gc import collect 
from tqdm import tqdm
from dataclasses import dataclass, asdict
from scipy.optimize import minimize, Bounds
from warnings import filterwarnings; filterwarnings("ignore")

from sklearn.linear_model import ElasticNet, ElasticNetCV, LinearRegression
from sklearn.linear_model import RidgeCV

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import StackingRegressor

from catboost import CatBoostRegressor, Pool
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

import kaggle_evaluation.default_inference_server

## Model_3

In [None]:
# Load the competition CSVs with pandas; .dropna() discards every row that
# contains at least one missing value.
train = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/train.csv').dropna()
test = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv').dropna()

def preprocessing(data, typ):
    """Select the model's feature columns and replace missing values with 0.

    Args:
        data: pandas DataFrame holding at least the feature columns listed in
            ``main_feature`` (plus ``forward_returns`` when ``typ == "train"``).
        typ: ``"train"`` keeps the ``forward_returns`` target next to the
            features; any other value keeps the features only.

    Returns:
        A new DataFrame with the selected columns, in the listed order, where
        every missing value has been replaced by 0. ``data`` is not modified.

    Raises:
        KeyError: if a required column is absent from ``data``.
    """
    main_feature = ['E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19',
                    'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9',
                    "S2", "P9", "S1", "S5", "I2", "P8",
                    "P10", "P12", "P13"]

    if typ == "train":
        columns = main_feature + ["forward_returns"]
    else:
        columns = main_feature

    # Fill on the selected frame in one call. The previous per-column
    # `data[col].fillna(0, inplace=True)` was a chained in-place assignment,
    # which pandas warns about and which has no effect under Copy-on-Write.
    return data[columns].fillna(0)

# Build the Model_3 training frame (selected features + forward_returns),
# hold out 1% of the rows, and fit the stacked ensemble on the rest.
train = preprocessing(train, "train")
# NOTE(review): train_test_split shuffles by default, so the hold-out is a
# random sample rather than the most recent dates - confirm that is intended.
train_split, val_split = train_test_split(
    train, test_size=0.01, random_state=42
)

# X_test / y_test are the 1% hold-out; they are not referenced again in the
# visible code.
X_train = train_split.drop(columns=["forward_returns"])
X_test = val_split.drop(columns=["forward_returns"])
y_train = train_split['forward_returns']
y_test = val_split['forward_returns']

# NOTE(review): 'MultiRMSE' is CatBoost's multi-target regression loss while
# y_train is a single column - confirm 'RMSE' was not intended.
improved_catboost_params = {'iterations': 3000,
                            'learning_rate': 0.01,
                            'depth': 6,
                            'l2_leaf_reg': 5.0,
                            'min_child_samples': 100,
                            'colsample_bylevel': 0.7,
                            'od_wait': 100,
                            'random_state': 42,
                            'od_type': 'Iter',
                            'bootstrap_type': 'Bayesian',
                            'grow_policy': 'Depthwise',
                            'logging_level': 'Silent',
                            'loss_function': 'MultiRMSE'}

# Hyperparameters for the RandomForestRegressor base model.
R_Forest_parm = {'n_estimators': 100,
                 'min_samples_split': 5,
                 'max_depth': 15,
                 'min_samples_leaf': 3,
                 'max_features': 'sqrt',
                 'random_state': 42}
        
# Hyperparameters for the ExtraTreesRegressor base model.
Extra_parm = {'n_estimators': 100,
              'min_samples_split': 5,
              'max_depth': 12,
              'min_samples_leaf': 3,
              'max_features': 'sqrt',
              'random_state': 42}
        
# Hyperparameters for the XGBRegressor base model.
XGB_R_parm = {"n_estimators": 1500,
              "learning_rate": 0.05, 
              "max_depth": 6,
              "subsample": 0.8, 
              "colsample_bytree": 0.7,
              "reg_alpha": 1.0,
              "reg_lambda": 1.0,
              "random_state": 42}

# Hyperparameters for the LGBMRegressor base model.
LGBM_R_parm = {"n_estimators": 1500,
               "learning_rate": 0.05,
               "num_leaves": 50,
               "max_depth": 8,
               "reg_alpha": 1.0,
               "reg_lambda": 1.0,
               "random_state": 42,
               'verbosity': -1}

# Not passed to any estimator in the visible code.
DecisionTree = {'criterion': 'poisson',
                'max_depth': 6}

# Hyperparameters for the GradientBoostingRegressor base model.
GB_parm = {"learning_rate": 0.1,
           "min_samples_split": 500,
           "min_samples_leaf": 50,
           "max_depth": 8,
           "max_features": 'sqrt',
           "subsample": 0.8,
           "random_state": 10}

# Instantiate the six base regressors from the parameter dicts above.
CatBoost = CatBoostRegressor(**improved_catboost_params)
XGBoost = XGBRegressor(**XGB_R_parm)
LGBM = LGBMRegressor(**LGBM_R_parm)
RandomForest = RandomForestRegressor(**R_Forest_parm)
ExtraTrees = ExtraTreesRegressor(**Extra_parm)
GBRegressor = GradientBoostingRegressor(**GB_parm)

# (name, estimator) pairs in the form StackingRegressor expects.
estimators = [('CatBoost', CatBoost), ('XGBoost', XGBoost), ('LGBM', LGBM), ('RandomForest', RandomForest),
              ('ExtraTrees', ExtraTrees), ('GBRegressor', GBRegressor)]

# Stack the base models under a RidgeCV final estimator (cv=3 is the stacking
# cross-validation setting), then fit on the 99% training split. model_3 is
# the object the predict_* functions call.
model_3 = StackingRegressor(estimators, 
                          final_estimator = RidgeCV(alphas=[0.1, 1.0, 10.0, 100.0]), 
                          cv=3)
model_3.fit(X_train, y_train)

## Model_4

In [None]:
# Lower and upper bound of the daily exposure.
MIN_INVESTMENT, MAX_INVESTMENT = 0.0, 2.0

DATA_PATH = Path("/kaggle/input/hull-tactical-market-prediction/")

# Read train.csv with schema inference disabled (infer_schema_length=0) and
# cast only the two columns that are kept.
train_m4 = (
    pl.read_csv(DATA_PATH / "train.csv", infer_schema_length=0)
    .select(
        [
            pl.col("date_id").cast(pl.Int64),
            pl.col("forward_returns").cast(pl.Float64),
        ]
    )
)
date_ids_m4 = np.asarray(train_m4["date_id"].to_list(), dtype=np.int64)
rets_m4 = np.asarray(train_m4["forward_returns"].to_list(), dtype=np.float64)

# date_id -> forward_returns lookup table.
true_targets4 = {
    day: ret for day, ret in zip(date_ids_m4.tolist(), rets_m4.tolist())
}

# Exposure used on positive signals; the value is a fixed result of an earlier
# optimization.
ALPHA_BEST_m4 = 0.80007


def exposure_for_m4(r: float) -> float:
    """Return 0.0 when ``r`` is zero or negative, otherwise ``ALPHA_BEST_m4``."""
    return 0.0 if r <= 0.0 else ALPHA_BEST_m4

## Model_5

In [None]:
# Lower and upper bound of the daily exposure.
MIN_INVESTMENT, MAX_INVESTMENT = 0.0, 2.0

DATA_PATH = Path("/kaggle/input/hull-tactical-market-prediction/")

# Read train.csv with schema inference disabled (infer_schema_length=0) and
# cast only the two columns that are kept.
train_m5 = (
    pl.read_csv(DATA_PATH / "train.csv", infer_schema_length=0)
    .select(
        [
            pl.col("date_id").cast(pl.Int64),
            pl.col("forward_returns").cast(pl.Float64),
        ]
    )
)
date_ids_m5 = np.asarray(train_m5["date_id"].to_list(), dtype=np.int64)
rets_m5 = np.asarray(train_m5["forward_returns"].to_list(), dtype=np.float64)

# date_id -> forward_returns lookup table.
true_targets_m5 = {
    day: ret for day, ret in zip(date_ids_m5.tolist(), rets_m5.tolist())
}

# ---- Parameters found with Optuna ----
# Exposure applied when the signal clears the threshold.
ALPHA_BEST_m5 = 0.6001322487531852
# When True the risk-free rate is subtracted from the return before thresholding.
USE_EXCESS_m5 = False
# Absolute threshold on the signal, roughly 0.01%.
TAU_ABS_m5 = 9.437170708744412e-05


def exposure_for_m5(r: float, rf: float = 0.0) -> float:
    """Return the exposure for a return ``r`` (optionally net of ``rf``).

    The signal is ``r - rf`` when ``USE_EXCESS_m5`` is set, else ``r``.
    A signal at or below ``TAU_ABS_m5`` yields 0.0; anything above yields
    ``ALPHA_BEST_m5``.
    """
    if USE_EXCESS_m5:
        effective = r - rf
    else:
        effective = r
    return 0.0 if effective <= TAU_ABS_m5 else ALPHA_BEST_m5

## Model_2

In [None]:
# Load both competition CSVs with polars and show them in the notebook.
# This assigns the module-level names `train` and `test`; predict_Model_2
# reads `train` as a polars frame.
train = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/train.csv")
display(train)
test = pl.read_csv("/kaggle/input/hull-tactical-market-prediction/test.csv")
display(test)

# Bounds of the daily signal.
MIN_SIGNAL: float = 0.0
MAX_SIGNAL: float = 2.0
# Factor applied to the OLS market forward excess return prediction when
# converting it to a signal.
SIGNAL_MULTIPLIER: float = 400.0

# Model-fitting settings.
CV: int = 10  # number of cross-validation folds
L1_RATIO: float = 0.5  # ElasticNet mixing parameter
ALPHAS: np.ndarray = np.logspace(-4, 2, 100)  # constants multiplying the penalty terms
MAX_ITER: int = 1000000


@dataclass(frozen=True)
class RetToSignalParameters:
    """Immutable settings for turning a return into a bounded signal."""

    signal_multiplier: float
    min_signal: float = MIN_SIGNAL
    max_signal: float = MAX_SIGNAL


ret_signal_params = RetToSignalParameters(signal_multiplier=SIGNAL_MULTIPLIER)

In [None]:
def predict_Model_1(test: pl.DataFrame) -> float:
    """Binary strategy driven by the Model_3 prediction.

    Returns ``MAX_INVESTMENT`` when Model_3's prediction for the first row of
    ``test`` is strictly positive, otherwise ``MIN_INVESTMENT``.
    """
    print('Model_1')
    # Same feature preparation as Model_3: drop the non-feature columns, then
    # select the model features and fill missing values.
    features = preprocessing(
        test.to_pandas().drop(columns=["lagged_forward_returns", "date_id", "is_scored"]),
        "test",
    )
    model_output = model_3.predict(features)[0]
    if model_output > 0:
        pred_1 = MAX_INVESTMENT
    else:
        pred_1 = MIN_INVESTMENT
    print(f'{pred_1}')
    return pred_1

def predict_Model_2(test: pl.DataFrame) -> float:
    """Convert a market excess-return value for the row's date into a signal.

    The value of ``market_forward_excess_returns`` is looked up in the
    module-level ``train`` frame for the ``date_id`` of the first row of
    ``test``. When the date is not present, or the stored value is null/NaN,
    the Model_3 prediction divided by ``SIGNAL_MULTIPLIER`` is used instead.
    The value is mapped with ``ret * signal_multiplier + 1`` and clipped to
    ``[min_signal, max_signal]`` from ``ret_signal_params``.

    NOTE(review): the looked-up column is named as a *forward* return for the
    same ``date_id`` that is being predicted, so it looks like the target
    rather than a lagged value, and the lookup can only succeed when test
    date_ids also occur in train.csv. Confirm this is intended.

    Args:
        test: polars frame for the day to predict; must contain ``date_id``.

    Returns:
        The clipped signal as a plain Python float.
    """
    print('Model_2')

    def convert_ret_to_signal(ret_arr: np.ndarray, params: RetToSignalParameters) -> np.ndarray:
        # Linear map centred on 1, bounded to the allowed signal range.
        return np.clip(
            ret_arr * params.signal_multiplier + 1, params.min_signal, params.max_signal)

    # Read date_id straight from the input; no column rename is needed for this.
    date_id = test.select("date_id").to_series()[0]

    raw_pred = None
    train_row = train.filter(pl.col("date_id") == date_id)
    if len(train_row) > 0:
        raw_pred = train_row.select(["market_forward_excess_returns"]).to_series()[0]

    # A null would raise on multiplication and a NaN would propagate into the
    # ensemble, so both are treated like a missing date.
    if raw_pred is None or np.isnan(raw_pred):
        test_pd = test.to_pandas().drop(columns=["lagged_forward_returns", "date_id", "is_scored"])
        test_pd = preprocessing(test_pd, "test")
        raw_pred = model_3.predict(test_pd)[0] / SIGNAL_MULTIPLIER

    # np.clip returns a numpy scalar; convert to match the annotated return type.
    pred = float(convert_ret_to_signal(raw_pred, ret_signal_params))
    print(f'{pred}')
    return pred

def predict_Model_3(test: pl.DataFrame) -> float:
    """Return Model_3's raw prediction for the first row of ``test``."""
    print('Model_3')
    frame = test.to_pandas()
    frame = frame.drop(columns=["lagged_forward_returns", "date_id", "is_scored"])
    features = preprocessing(frame, "test")
    # The first element of the prediction is returned as-is (not clipped).
    return model_3.predict(features)[0]

def predict_Model_4(test: pl.DataFrame) -> float:
    """Exposure from ``exposure_for_m4`` applied to the Model_3 prediction.

    The result is clipped to ``[MIN_INVESTMENT, MAX_INVESTMENT]`` and returned
    as a Python float.
    """
    print('Model_4')
    features = preprocessing(
        test.to_pandas().drop(columns=["lagged_forward_returns", "date_id", "is_scored"]),
        "test",
    )
    model_signal = model_3.predict(features)[0]
    exposure = exposure_for_m4(model_signal)
    bounded = np.clip(exposure, MIN_INVESTMENT, MAX_INVESTMENT)
    return float(bounded)

def predict_Model_5(test: pl.DataFrame) -> float:
    """Exposure from ``exposure_for_m5`` applied to the Model_3 prediction.

    The result is clipped to ``[MIN_INVESTMENT, MAX_INVESTMENT]`` and returned
    as a Python float.
    """
    print('Model_5')
    features = preprocessing(
        test.to_pandas().drop(columns=["lagged_forward_returns", "date_id", "is_scored"]),
        "test",
    )
    model_signal = model_3.predict(features)[0]
    exposure = exposure_for_m5(model_signal)
    bounded = np.clip(exposure, MIN_INVESTMENT, MAX_INVESTMENT)
    return float(bounded)

def predict_Model_6(test: pl.DataFrame) -> float:
    """Return 0.09 when Model_3's prediction is strictly positive, else 0.0."""
    print('Model_6')
    features = preprocessing(
        test.to_pandas().drop(columns=["lagged_forward_returns", "date_id", "is_scored"]),
        "test",
    )
    model_signal = model_3.predict(features)[0]
    if model_signal > 0:
        return 0.09
    return 0.0

def predict(test: pl.DataFrame) -> float:
    """Bayesian Model Averaging ensemble using historical performance.

    Calls the six member models in order, weights their outputs with a
    softmax over fixed historical scores, and returns the weighted sum
    clipped to ``[MIN_INVESTMENT, MAX_INVESTMENT]``.

    Args:
        test: polars frame for the day to predict, passed unchanged to every
            member model.

    Returns:
        The blended exposure as a plain Python float within the bounds.
    """
    # Member predictions, in model order (the call order is kept so the
    # members' log output appears in the same sequence).
    member_preds = [
        predict_Model_1(test),  # Binary: 10.15
        predict_Model_2(test),  # Market signal: 8.09
        predict_Model_3(test),  # Raw ML: 1.65
        predict_Model_4(test),  # 0.8 threshold: 10.16
        predict_Model_5(test),  # 0.6 threshold: 10.22
        predict_Model_6(test),  # 0.09 threshold: 10.24
    ]

    # Softmax over the historical scores with temperature = 1.0. Subtracting
    # the maximum leaves the weights unchanged but keeps exp() from
    # overflowing if the scores or the temperature are changed later.
    scores = np.array([10.15, 8.09, 1.65, 10.16, 10.22, 10.24])
    temperature = 1.0
    scaled = scores / temperature
    weights = np.exp(scaled - scaled.max())
    weights = weights / weights.sum()

    blended = sum(w * p for w, p in zip(weights, member_preds))

    # Model_3 contributes an unbounded raw prediction, so the blend can fall
    # outside the exposure bounds (e.g. slightly negative when every other
    # member returns 0). Clip it to the range the other members respect.
    return float(np.clip(blended, MIN_INVESTMENT, MAX_INVESTMENT))

# Wrap `predict` in the Kaggle evaluation inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the KAGGLE_IS_COMPETITION_RERUN environment variable, run the local
# gateway against the competition input directory; with it, serve requests.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(('/kaggle/input/hull-tactical-market-prediction/',))
else:
    inference_server.serve()