In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import json

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score

from hull_tactical.backtest import run_backtest
from hull_tactical.config import TARGET_COL
from hull_tactical.data_loading import load_train_test
from hull_tactical.feature_engineering import build_features
from hull_tactical.paths import RESULTS_DIR, ARTIFACTS_DIR
from hull_tactical.preprocessing import build_cleaned_data

In [None]:
# Load the raw data, clean it and build the engineered feature splits.
# This pipeline runs exactly once: the notebook previously contained two
# byte-identical copies of this cell, which only repeated the work.
train, _ = load_train_test()                             # raw train.csv

full_cleaned, train_clean, val_clean, test_clean, high_na_cols = (
    build_cleaned_data(train)
)

for caption, frame in (
    ("Full cleaned shape :", full_cleaned),
    ("Train cleaned shape:", train_clean),
    ("Val cleaned shape  :", val_clean),
    ("Test cleaned shape :", test_clean),
):
    print(caption, frame.shape)

# Features are derived from the full cleaned frame; build_features returns
# the full feature frame together with its train/val/test splits.
full_feat_df, feat_train_set, feat_val_set, feat_test_set, new_cols = (
    build_features(full_cleaned)
)

for caption, frame in (
    ("Full feat shape :", full_feat_df),
    ("Train feat shape:", feat_train_set),
    ("Val feat shape  :", feat_val_set),
    ("Test feat shape :", feat_test_set),
):
    print(caption, frame.shape)

In [None]:
# Locations of the persisted model and of its JSON configuration file.
model_path = ARTIFACTS_DIR / "lgbm_directional.joblib"
config_path = RESULTS_DIR / "lgbm_halving_config.json"

print("Model path :", model_path)
print("Config path:", config_path)

# CAUTION: joblib.load unpickles the file, which can execute arbitrary code.
# Only load artifacts that were produced by this project.
best_model = joblib.load(model_path)

# `json` is imported in the import cell at the top of the notebook.
with open(config_path, "r", encoding="utf-8") as config_file:
    model_config = json.load(config_file)

# The config stores the list of feature columns under "feature_cols".
feature_cols = model_config["feature_cols"]
print("Number of model feature columns:", len(feature_cols))
print("First 10 feature columns:", feature_cols[:10])

In [None]:
# NOTE(review): `y_label_test` and `proba_test` are not assigned in any cell
# visible in this notebook. Confirm they are defined before this point,
# otherwise Restart & Run All fails here with a NameError.

# Hard labels come from a 0.5 cut-off on the predicted probabilities;
# the AUC is computed on the raw probabilities themselves.
pred_label = (proba_test >= 0.5).astype(int)
acc = accuracy_score(y_label_test, pred_label)
auc = roc_auc_score(y_label_test, proba_test)

print("Test ROC AUC:", auc)
print("Test Accuracy:", acc)

In [None]:
# ROC curve of the predicted probabilities against the test labels.
fpr, tpr, thresholds = roc_curve(y_label_test, proba_test)

fig, ax = plt.subplots(figsize=(6, 6))
ax.plot(fpr, tpr, label=f"ROC curve (AUC = {auc:.3f})")
# Diagonal reference line, labelled "Random" in the legend.
ax.plot([0, 1], [0, 1], "--", color="grey", label="Random")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("Test ROC Curve")
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

In [None]:
# Histogram of the predicted probabilities (50 bins, no KDE overlay).
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(proba_test, bins=50, kde=False, ax=ax)
ax.set_xlabel("Predicted p(up)")
ax.set_ylabel("Count")
ax.set_title("Distribution of predicted probabilities (test set)")
fig.tight_layout()
plt.show()

In [None]:
# Run the backtest on the predicted probabilities.
# NOTE(review): `y_test` and `risk_free_test` (like `proba_test`) are not
# assigned in any cell visible in this notebook. Confirm where they come from.
bt_res = run_backtest(proba_test, y_test.values, risk_free_test)

In [None]:
# Pull the headline numbers out of the backtest result dictionary.
baseline_adj = bt_res["baseline"]["adjusted_sharpe"]

best_result = bt_res["best"]
best_name = best_result["name"]
# For the best strategy the adjusted Sharpe is nested under "kaggle_details".
best_adj = best_result["kaggle_details"]["adjusted_sharpe"]
best_stats = best_result["basic_stats"]

for caption, value in (
    ("Baseline adjusted Sharpe:", baseline_adj),
    ("Best strategy name       :", best_name),
    ("Best strategy adj Sharpe :", best_adj),
    ("Best strategy basic stats:", best_stats),
):
    print(caption, value)

In [None]:
# Baseline equity: compounded per-period returns reported by the backtest.
baseline_returns = bt_res["baseline"]["returns"]
baseline_equity = np.cumprod(1 + baseline_returns)

# Best-strategy equity: returns are rebuilt here from the positions, blending
# `fwd` (weight = position) with `rf` (weight = 1 - position).
best_pos = bt_res["best"]["positions"]
fwd = y_test.values
rf = risk_free_test

best_ret = best_pos * fwd + rf * (1 - best_pos)
best_equity = np.cumprod(1 + best_ret)

fig, ax = plt.subplots(figsize=(8, 4))
ax.plot(baseline_equity, label=f"Baseline (adj Sharpe={baseline_adj:.3f})")
ax.plot(best_equity, label=f"Best strategy ({best_name}, adj Sharpe={best_adj:.3f})")
ax.set_xlabel("Time (test index)")
ax.set_ylabel("Equity (normalised)")
ax.set_title("Equity curve: baseline vs best strategy (test set)")
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()