In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from final_project.modeling import load_models, EVENT_WEIGHT
from final_project.data import read_data, split_data
from final_project.preprocessing import NUM_FEATURES, CAT_FEATURES, RESPONDER
from final_project.evaluation import evaluate_predictions
from final_project.plotting import plot_pred_vs_true, plot_day_predictions, plot_feature_relevance, plot_pdps

## Load Models

In [None]:
# Load the dataset stored under the name "clean_data".
df = read_data("clean_data")

# Partition the rows.
# NOTE(review): the 0 fraction in [0.6, 0, 0.4] lands on whatever split_data
# returns second (bound here to df_test) — confirm that its return order really
# is (train, test, val) and that an empty test split is intended.
df_train, df_test, df_val = split_data(df, [0.6, 0, 0.4])

# Model inputs: numeric features followed by categorical features.
feature_cols = NUM_FEATURES + CAT_FEATURES

# Training inputs (copied so later edits cannot touch df_train) and target.
X_train, y_train = df_train[feature_cols].copy(), df_train[RESPONDER]

# Rows whose event_code is anything other than "NONE" get EVENT_WEIGHT;
# every other row gets a weight of 1.
has_event = X_train["event_code"] != "NONE"
sample_weight_train = np.where(has_event, EVENT_WEIGHT, 1)

# Validation inputs and target, built the same way from df_val.
X_val, y_val = df_val[feature_cols].copy(), df_val[RESPONDER]

# load_models receives the training data and weights and hands back both models.
glm, lgbm = load_models(X_train, y_train, sample_weight_train)

In [None]:
# Score the validation features with each model (GLM first, then LGBM).
glm_y_pred, lgbm_y_pred = (model.predict(X_val) for model in (glm, lgbm))

# Collect everything the evaluation and plotting cells read into one frame:
# the observed target, both model predictions, a baseline that is simply the
# past_50m_span_ewm_vol feature column, a constant weight of 1 for every row,
# and the date / time_of_day columns taken from df_val.
df_pred = pd.DataFrame(
    {
        "y_true": y_val,
        "glm_y_pred": glm_y_pred,
        "lgbm_y_pred": lgbm_y_pred,
        "baseline_y_pred": X_val["past_50m_span_ewm_vol"],
        "weight": 1,
        "date": df_val["date"],
        "time_of_day": df_val["time_of_day"],
    }
)


In [None]:
# Score the GLM predictions against the validation target, using the
# per-row weights stored in df_pred.
glm_eval = evaluate_predictions(
    y_val,
    df_pred["glm_y_pred"],
    df_pred["weight"],
)
# Bare name on the last line so the notebook renders the result.
glm_eval

In [None]:
# Score the LGBM predictions against the validation target, using the
# per-row weights stored in df_pred.
lgbm_eval = evaluate_predictions(
    y_val,
    df_pred["lgbm_y_pred"],
    df_pred["weight"],
)
# Bare name on the last line so the notebook renders the result.
lgbm_eval

In [None]:
# Evaluate baseline
# Score the baseline column against the validation target with the same weights
# as the model evaluations. The result gets its own name so that `lgbm_eval`
# keeps holding the LGBM metrics instead of being overwritten by the baseline's.
baseline_eval = evaluate_predictions(y_val, df_pred["baseline_y_pred"], df_pred["weight"])
# Bare name on the last line so the notebook renders the result.
baseline_eval

Looks like the GBT (the `lgbm` model) outperformed the GLM on all measures!

In [None]:
# Plot pred vs. true for glm
# No `log` argument is passed in this call; the return value (presumably the
# figure — confirm against plot_pred_vs_true) is kept in `fig`.
fig = plot_pred_vs_true(df_pred, "glm")

This should look better on log-scaled axes!

In [None]:
# Plot pred vs. true for glm, log axes
# Same call as the plain GLM plot, but with log=True; rebinds `fig`.
fig = plot_pred_vs_true(df_pred, "glm", log=True)

In [None]:
# Plot pred vs. true for lgbm, log axes
# "lgbm" presumably selects the lgbm_y_pred column of df_pred — confirm against
# plot_pred_vs_true. Rebinds `fig`.
fig = plot_pred_vs_true(df_pred, "lgbm", log=True)

In [None]:
# Plot pred vs. true for baseline, log axes
# "baseline" presumably selects the baseline_y_pred column of df_pred — confirm
# against plot_pred_vs_true. Rebinds `fig`.
fig = plot_pred_vs_true(df_pred, "baseline", log=True)

In [None]:
# Call plot_day_predictions once per date of interest; `fig` ends up bound to
# the result of the last call.
for day in ("2025-08-31", "2025-07-16"):
    fig = plot_day_predictions(df_pred, day)

In [None]:
# Plot glm features
# Feature relevance is computed from the validation inputs and target. The return
# value is kept as glm_top_5 (presumably the five most relevant features —
# confirm against plot_feature_relevance).
glm_top_5 = plot_feature_relevance(glm, X_val, y_val)

In [None]:
# Plot lgbm features
# Feature relevance is computed from the validation inputs and target. The return
# value is kept as lgbm_top_5 (presumably the five most relevant features —
# confirm against plot_feature_relevance).
lgbm_top_5 = plot_feature_relevance(lgbm, X_val, y_val)

In [None]:
# PDP plots for the GLM on the validation data.
# NOTE(review): n_top=5 presumably limits the plots to the five most relevant
# features — confirm against plot_pdps.
plot_pdps(glm, X_val, y_val, n_top=5)

In [None]:
# PDP plots for the LGBM model on the validation data.
# NOTE(review): n_top=5 presumably limits the plots to the five most relevant
# features — confirm against plot_pdps.
plot_pdps(lgbm, X_val, y_val, n_top=5)