<a href="https://colab.research.google.com/github/comet-ctrl/SP500_prediction/blob/main/CMT2ull_Tactical_Market_Prediction_Demo_Submission.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
# Log in through kagglehub before the competition download in the next cell;
# per the note above, the login is needed because some sources are private.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the 'hull-tactical-market-prediction' competition data.
# NOTE(review): the returned value is stored here but the later cells read
# from the hard-coded '/kaggle/input/...' paths instead of this variable.
hull_tactical_market_prediction_path = kagglehub.competition_download('hull-tactical-market-prediction')

print('Data source import complete.')


In [None]:
import numpy as np
import pandas as pd
from collections import deque
import os

# Optional sanity check: print the full path of every file under /kaggle/input.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

In [None]:
# Load the competition tables. The training frame is put in chronological
# order with a fresh 0..N-1 index, because the lag features, the
# train/validation split and the sliding windows below all rely on row order.
train_df = pd.read_csv(
    '/kaggle/input/hull-tactical-market-prediction/train.csv'
).sort_values(by="date_id", ignore_index=True)
test_df = pd.read_csv('/kaggle/input/hull-tactical-market-prediction/test.csv')

In [None]:
# Show the last rows of the date-sorted training frame as a quick load check.
train_df.tail()

In [None]:
# Column the model is trained to predict.
target_col = "forward_returns"

# Columns excluded from the raw feature set: the time index plus the three
# return/rate columns (lagged copies of those three are added as features
# in a later cell).
drop_cols = ["date_id", "forward_returns", "risk_free_rate", "market_forward_excess_returns"]

In [None]:
# Every training column that is not listed in `drop_cols` becomes a feature;
# the original column order is preserved.
feature_cols = train_df.columns[~train_df.columns.isin(drop_cols)].tolist()

# Turn +inf / -inf into NaN so they are treated like any other missing value.
train_df = train_df.replace(to_replace=[np.inf, -np.inf], value=np.nan)

In [None]:
# For each return/rate column, add its one-row lag as a feature: the lagged
# value on row t is the original column's value on row t-1 (NaN on the
# first row, which the NaN filter below removes).
for _base in ("forward_returns", "risk_free_rate", "market_forward_excess_returns"):
    train_df[f"lagged_{_base}"] = train_df[_base].shift(1)
    feature_cols.append(f"lagged_{_base}")

In [None]:
# Keep only rows where every feature and the target are present.
cols_to_keep = [*feature_cols, target_col]
mask = ~train_df[cols_to_keep].isna().any(axis=1)
train_df_clean = train_df[mask].reset_index(drop=True)

# Report how much data survived the filter and what the feature set looks like.
print("Original shape:", train_df.shape)
print("Clean shape   :", train_df_clean.shape)
print("Number of features:", len(feature_cols))
print("Example feature columns:", feature_cols[:10])

In [None]:
from sklearn.preprocessing import StandardScaler

# Extract features and target from the cleaned frame as NumPy arrays.
X_raw = train_df_clean[feature_cols].to_numpy()
y_raw = train_df_clean[target_col].to_numpy()
N, F = X_raw.shape  # N = number of rows, F = number of feature columns
print("N, F =", N, F)


In [None]:
# Chronological hold-out: the first 80% of rows train the model and the
# remaining rows validate it (no shuffling).
split_ratio = 0.8
split_idx = int(split_ratio * N)

X_train_raw, X_val_raw = X_raw[:split_idx], X_raw[split_idx:]
y_train_raw, y_val_raw = y_raw[:split_idx], y_raw[split_idx:]

print("Train/Val shapes:", X_train_raw.shape, X_val_raw.shape)

In [None]:
# Estimate the scaling statistics from the training slice only, then apply
# that same transform to both slices.
scaler = StandardScaler().fit(X_train_raw)
X_train_scaled = scaler.transform(X_train_raw)
X_val_scaled = scaler.transform(X_val_raw)

# Scaled copy of all clean rows; its last `window` rows warm-start inference.
X_full_scaled = scaler.transform(X_raw)

# Per-feature medians of the clean rows, used to fill missing test values.
train_feature_medians = train_df_clean.loc[:, feature_cols].median()

In [None]:
window = 20  # timesteps per model input: the current day plus the 19 days before it


def make_sequences(X, y, window):
    """Build sliding-window samples whose last timestep is the target's own row.

    Sample k consists of rows ``k .. k + window - 1`` of ``X`` and is paired
    with ``y[k + window - 1]``, i.e. the target belonging to the final row
    of the window. This is the alignment `predict` uses at inference time:
    it appends the newest row to its rolling buffer first and then predicts
    for that row. The earlier ``X[i-window:i] -> y[i]`` pairing left the
    target's own row out of the window, so the model was trained on inputs
    shifted by one step relative to what it receives when serving.

    Parameters
    ----------
    X : array-like, shape (n_rows, ...)
        Row-ordered features.
    y : array-like, shape (n_rows, ...)
        Targets aligned row-by-row with ``X``.
    window : int
        Number of consecutive rows per sample; must be >= 1.

    Returns
    -------
    (Xs, ys) : tuple of numpy.ndarray
        ``Xs`` has shape ``(n_rows - window + 1, window, ...)`` and ``ys``
        has one entry per sample. When there are fewer than ``window``
        rows, both are empty but keep the trailing dimensions, so
        downstream ``.shape`` lookups still work.

    Raises
    ------
    ValueError
        If ``window < 1`` or ``X`` and ``y`` differ in length.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if window < 1:
        raise ValueError(f"window must be >= 1, got {window}")
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of rows, got {len(X)} and {len(y)}"
        )

    n_samples = len(X) - window + 1
    if n_samples <= 0:
        # Too few rows for even one window: return correctly shaped empties.
        return (
            np.empty((0, window) + X.shape[1:], dtype=X.dtype),
            np.empty((0,) + y.shape[1:], dtype=y.dtype),
        )

    Xs = np.array([X[start:start + window] for start in range(n_samples)])
    # The target of each window is the target of its last row.
    ys = y[window - 1:].copy()
    return Xs, ys

In [None]:
# Window the two splits independently so that no training sample contains
# validation rows and vice versa.
X_train_seq, y_train_seq = make_sequences(X=X_train_scaled, y=y_train_raw, window=window)
X_val_seq, y_val_seq = make_sequences(X=X_val_scaled, y=y_val_raw, window=window)

print("Train seq shape:", X_train_seq.shape, y_train_seq.shape)
print("Val seq shape  :", X_val_seq.shape,   y_val_seq.shape)

In [None]:
# Cast all four arrays to float32 before handing them to the Keras model.
X_train_seq, y_train_seq = (arr.astype("float32") for arr in (X_train_seq, y_train_seq))
X_val_seq, y_val_seq = (arr.astype("float32") for arr in (X_val_seq, y_val_seq))

# Axis 1 is the window length, axis 2 the number of features per timestep.
timesteps, n_features = X_train_seq.shape[1], X_train_seq.shape[2]

In [None]:
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

# Sequence regressor built layer by layer:
# (timesteps, n_features) -> LSTM(64) -> Dropout(0.2) -> Dense(32, relu) -> Dense(1)
model = models.Sequential()
model.add(layers.Input(shape=(timesteps, n_features)))
model.add(layers.LSTM(64, return_sequences=False))  # only the final LSTM state is passed on
model.add(layers.Dropout(0.2))                      # light regularization
model.add(layers.Dense(32, activation="relu"))
model.add(layers.Dense(1))                          # single output: the forward return


In [None]:
# Train against mean squared error and also track mean absolute error;
# Adam runs with a learning rate of 1e-4.
model.compile(
    loss="mse",
    metrics=["mae"],
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4),
)

model.summary()

In [None]:
# Stop when validation loss has not improved for 10 epochs and roll the
# weights back to the best epoch seen.
es = callbacks.EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)

# Up to 100 epochs in mini-batches of 64, validated on the held-out windows.
history = model.fit(
    x=X_train_seq,
    y=y_train_seq,
    batch_size=64,
    epochs=100,
    validation_data=(X_val_seq, y_val_seq),
    callbacks=[es],
    verbose=1,
)

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Score the model on the validation windows: error magnitudes plus the
# Pearson correlation between predicted and actual forward returns.
y_val_pred = model.predict(X_val_seq, verbose=0).reshape(-1)
mse = mean_squared_error(y_true=y_val_seq, y_pred=y_val_pred)
mae = mean_absolute_error(y_true=y_val_seq, y_pred=y_val_pred)
corr = np.corrcoef(y_val_seq, y_val_pred)[0, 1]  # off-diagonal entry of the 2x2 matrix

print("Validation MSE:", mse)
print("Validation MAE:", mae)
print("Correlation (signal quality):", corr)

In [None]:
# Save the trained model as a .keras file in the Kaggle working directory.
model.save("/kaggle/working/lstm_model.keras")

In [None]:
import matplotlib.pyplot as plt
# In-sample diagnostic: predictions on the TRAINING windows against the
# actual training targets.
y_train_pred = model.predict(X_train_seq, verbose=0).ravel()

plt.figure(figsize=(10, 5))
plt.plot(y_train_seq, label="Actual returns", alpha=0.8)
plt.plot(y_train_pred, label="Predicted returns", alpha=0.8)
plt.title("LSTM: Training Predicted vs Actual Forward Returns")
# This figure shows the training split, so the axis label says "training"
# (it previously read "validation", copied from the validation plot).
plt.xlabel("Time (days in training period)")
plt.ylabel("Forward return")
plt.legend()
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Out-of-sample diagnostic: overlay predicted and actual forward returns
# for the validation windows on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_val_seq, label="Actual returns", alpha=0.8)
ax.plot(y_val_pred, label="Predicted returns", alpha=0.8)
ax.set_title("LSTM: Validation Predicted vs Actual Forward Returns")
ax.set_xlabel("Time (days in validation period)")
ax.set_ylabel("Forward return")
ax.legend()
ax.grid(True)
plt.show()


In [None]:
# Correlation between actual and predicted validation returns, displayed as
# the cell output.
# NOTE(review): this repeats the `corr` value already computed and printed
# in the metrics cell, and numpy is already imported at the top of the
# notebook, so the import here is redundant but harmless.
import numpy as np
np.corrcoef(y_val_seq, y_val_pred)[0,1]


In [None]:
# Scale for turning a predicted return into leverage: the standard deviation
# of the training targets, nudged away from zero so it is safe to divide by.
y_train_std = float(np.std(y_train_seq) + 1e-8)
k_leverage = 5.0  # how strongly a prediction moves the allocation away from 1.0

# Warm start: seed the rolling buffer with the final `window` scaled rows of
# the full clean dataset, so it is already full on the first predict() call.
# `maxlen=window` makes each append drop the oldest row automatically.
last_window_scaled = X_full_scaled[-window:]
history_buffer = deque(last_window_scaled, maxlen=window)

In [None]:
import polars as pl
import kaggle_evaluation.default_inference_server

def predict(test: pl.DataFrame) -> float:
    """Return the allocation for the newest row of `test`.

    Called repeatedly by the evaluation API with a Polars DataFrame that
    holds one (or a few) rows. Each row is cleaned and scaled the same way
    as the training data and pushed onto the module-level `history_buffer`.
    The model is then run on the buffer, whose last timestep is the row
    just added.

    Parameters
    ----------
    test : pl.DataFrame
        New observation(s); must contain every column in `feature_cols`.

    Returns
    -------
    float
        Allocation in [0, 2] for the last row processed. The neutral value
        1.0 is returned when `test` has no rows, when the buffer is not yet
        full, or when the model output is not a finite number.

    Notes
    -----
    `history_buffer` is mutated in place and never rebound, so no `global`
    declaration is needed.
    """
    # Same columns and cleaning as in training: +/-inf -> NaN -> training medians.
    features = (
        test.to_pandas()[feature_cols]
        .replace([np.inf, -np.inf], np.nan)
        .fillna(train_feature_medians)
    )

    # Reuse the scaler that was fitted on the training slice.
    X_scaled = scaler.transform(features.values)

    # Neutral allocation; stays in effect if no usable prediction is made.
    alloc = 1.0

    # Usually a single row, but handle several in arrival order.
    for row in X_scaled:
        # The deque has maxlen=window, so this also evicts the oldest row.
        history_buffer.append(row)

        # Not expected after the warm start, but stay neutral if the
        # buffer is somehow short.
        if len(history_buffer) < window:
            alloc = 1.0
            continue

        # Model input of shape (1, window, n_features).
        seq = np.array(history_buffer, dtype="float32").reshape(1, window, n_features)
        pred_ret = float(model.predict(seq, verbose=0)[0, 0])

        # np.clip passes NaN straight through, so a non-finite prediction
        # would otherwise be returned as an invalid allocation.
        if not np.isfinite(pred_ret):
            alloc = 1.0
            continue

        # Centre at 1.0 and scale the predicted return by the training
        # target std, then clamp to the allowed range [0, 2].
        raw_alloc = 1.0 + k_leverage * (pred_ret / (2.0 * y_train_std))
        alloc = float(np.clip(raw_alloc, 0.0, 2.0))

    print("Pred allocation:", alloc)
    return float(alloc)



In [None]:
# ======================
# 8. Start inference server
# ======================
# Wrap predict() in the competition's inference server.
inference_server = kaggle_evaluation.default_inference_server.DefaultInferenceServer(predict)

# Without the competition-rerun flag in the environment, replay the local
# input files through a local gateway; with it, serve the real evaluation.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway(
        ('/kaggle/input/hull-tactical-market-prediction/',)
    )
else:
    inference_server.serve()