In [1]:
# ============================================================
# 10_testing_pipeline.ipynb
# Testing & Validation Notebook for Final Ensemble Model
# ============================================================

import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import (
    roc_auc_score, accuracy_score,
    classification_report, confusion_matrix
)

import xgboost as xgb
import lightgbm as lgb

# Apply the "seaborn-v0_8-whitegrid" matplotlib style sheet to figures drawn after this call.
plt.style.use("seaborn-v0_8-whitegrid")

# Banner printed once the import cell has executed to this point.
print("ðŸ”¥ Testing notebook initialized.")


ðŸ”¥ Testing notebook initialized.


In [2]:
# ============================================================
# Load trained models
# ============================================================

# Artifact files, listed in the same order as the names they are bound to below.
# NOTE(review): joblib.load unpickles arbitrary Python objects - only point these
# paths at files you trust.
# NOTE(review): the .json/.txt artifacts are read through joblib here, so they are
# presumably joblib dumps rather than native XGBoost/LightGBM exports - confirm.
_artifact_paths = (
    "../models/tab_scaler.pkl",
    "../models/xgb_final.json",
    "../models/lgb_final.txt",
    "../models/ensemble_final.pkl",
)
scaler, xgb_model, lgb_model, meta_model = [
    joblib.load(artifact) for artifact in _artifact_paths
]

print("âœ… Models loaded successfully!")


âœ… Models loaded successfully!


In [10]:
def engineer_features(df):
    """Add per-ticker return, sentiment and RSI features to a price/sentiment frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Ticker``, ``Date``, ``Close`` and
        ``avg_sentiment_score``. The caller's frame is not modified.

    Returns
    -------
    tuple[pandas.DataFrame, list[str]]
        * A copy of ``df`` sorted by ``Ticker`` then ``Date``, with the helper
          column ``Return`` and all feature columns added, restricted to rows
          where every feature is non-NaN, and re-indexed from 0.
        * The ordered list of feature column names.

    Raises
    ------
    KeyError
        If one of the required input columns is missing.

    Notes
    -----
    Every statistic is computed inside each ticker. Lags and rolling windows
    need warm-up rows, so the earliest rows of each ticker are dropped by the
    final ``dropna``.
    """
    df = df.copy()
    df = df.sort_values(["Ticker", "Date"]).reset_index(drop=True)

    def per_ticker(column):
        # Re-group on every call so columns created earlier in this function are visible.
        return df.groupby("Ticker")[column]

    def rolling_stat(column, window, stat):
        # Rolling mean/std evaluated within each ticker and aligned back to df's index.
        return per_ticker(column).transform(
            lambda s: getattr(s.rolling(window), stat)()
        )

    # --- Returns and their lags -------------------------------------------
    df["Return"] = per_ticker("Close").pct_change()
    for lag in (1, 2, 3):
        df[f"Return_lag{lag}"] = per_ticker("Return").shift(lag)

    # Rolling statistics are built on the lagged return, not the same-day return.
    df["return_ma5"] = rolling_stat("Return_lag1", 5, "mean")
    df["Volatility"] = rolling_stat("Return_lag1", 5, "std")
    df["Volatility_10"] = rolling_stat("Return_lag1", 10, "std")

    # --- Price momentum / trend -------------------------------------------
    df["price_mom5"] = per_ticker("Close").pct_change(5)
    df["price_trend5"] = rolling_stat("Close", 5, "mean")

    # --- Sentiment lags, smoothing and momentum ---------------------------
    for lag in (1, 2, 3):
        df[f"sentiment_lag{lag}"] = per_ticker("avg_sentiment_score").shift(lag)

    df["sentiment_ma3"] = rolling_stat("avg_sentiment_score", 3, "mean")
    df["sentiment_ma5"] = rolling_stat("avg_sentiment_score", 5, "mean")
    df["sentiment_std5"] = rolling_stat("avg_sentiment_score", 5, "std")
    df["sentiment_mom"] = per_ticker("avg_sentiment_score").diff(1)
    df["sentiment_mom2"] = per_ticker("avg_sentiment_score").diff(2)

    # --- Interactions -----------------------------------------------------
    df["sentiment_vol_interact"] = df["avg_sentiment_score"] * df["Volatility"]
    df["sentiment_return_interact"] = df["avg_sentiment_score"] * df["Return_lag1"]

    # 5-row rolling correlation between lagged return and sentiment, per ticker.
    # Computed with an explicit loop instead of groupby.apply: when the frame
    # holds a single ticker, apply() stacks the returned Series into a wide
    # one-row DataFrame, which cannot be assigned to a column. The loop always
    # yields values aligned to df's index and avoids the grouping-column
    # deprecation warning that apply() emits.
    rolling_corr = pd.Series(float("nan"), index=df.index, dtype="float64")
    for _, grp in df.groupby("Ticker"):
        rolling_corr.loc[grp.index] = (
            grp["Return_lag1"].rolling(5).corr(grp["avg_sentiment_score"])
        )
    df["return_sent_corr"] = rolling_corr

    def calc_rsi(series, window=10):
        # Simple-moving-average RSI; the 1e-9 keeps the ratio finite when
        # there were no losses in the window.
        delta = series.diff()
        gain = delta.clip(lower=0).rolling(window).mean()
        loss = -delta.clip(upper=0).rolling(window).mean()
        rs = gain / (loss + 1e-9)
        return 100 - (100 / (1 + rs))

    df["RSI_10"] = per_ticker("Close").transform(calc_rsi)

    FEATURES = [
        "Return_lag1","Return_lag2","Return_lag3",
        "return_ma5","Volatility","Volatility_10",
        "price_mom5","price_trend5",
        "sentiment_lag1","sentiment_lag2","sentiment_lag3",
        "sentiment_ma3","sentiment_ma5","sentiment_std5",
        "sentiment_mom","sentiment_mom2",
        "sentiment_return_interact","sentiment_vol_interact","return_sent_corr",
        "RSI_10"
    ]

    # Drop rows only when a *feature* is NaN; NaNs in other columns are kept.
    df = df.dropna(subset=FEATURES).reset_index(drop=True)

    return df, FEATURES


### Load a test dataset

In [13]:
# ============================================================
# Load NEW dataset for testing
# ============================================================

# Read the merged price/news file and parse the Date column; the raw frame is
# kept under its own name so re-running this cell never filters twice.
merged_all = pd.read_csv("../data/processed/stocks_news_merged.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

# Hold-out slice: rows dated strictly after the 0.60 quantile of Date, i.e.
# roughly the most recent 40% of rows.
# NOTE(review): earlier comments described this as the "last 20%"; the code
# uses 0.60 - confirm which cutoff matches the split used for training.
cutoff = merged_all["Date"].quantile(0.60)
is_after_cutoff = merged_all["Date"] > cutoff
df_test = merged_all.loc[is_after_cutoff].copy()

print("Test dataset shape:", df_test.shape)
df_test.head()


Test dataset shape: (1004, 10)


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker,avg_sentiment_score,avg_sentiment_numeric,article_count
753,2023-09-14,172.312197,174.391833,171.896273,174.035324,60895800,AAPL,0.84332,0.0,37.0
754,2023-09-15,174.76812,174.78793,172.133934,173.312378,109259500,AAPL,0.823998,0.047619,21.0
755,2023-09-18,174.768146,177.640025,174.461155,176.243698,67257600,AAPL,0.830727,-0.272727,22.0
756,2023-09-19,175.798055,177.887589,175.411839,177.333023,51826900,AAPL,0.819387,0.263158,19.0
757,2023-09-20,177.521186,177.95692,173.698627,173.787766,58436200,AAPL,0.843843,0.115385,26.0


### Generate Ensemble Predictions

In [14]:
# ============================================================
# Generate Predictions from Ensemble
# ============================================================

# Feature engineering drops each ticker's warm-up rows, so df_fe is shorter than df_test.
df_fe, FEATURES = engineer_features(df_test)

# Level 0: both base learners score the same scaled feature matrix.
X = df_fe[FEATURES].to_numpy()
X_scaled = scaler.transform(X)

xgb_prob, lgb_prob = (
    base_model.predict_proba(X_scaled)[:, 1]
    for base_model in (xgb_model, lgb_model)
)

# Level 1: the meta model takes the two class-1 probabilities as its inputs.
meta_input = np.column_stack((xgb_prob, lgb_prob))
ensemble_prob = meta_model.predict_proba(meta_input)[:, 1]

# Attach the probability and the hard label (1 when probability > 0.5).
df_fe = df_fe.assign(
    Pred_Prob=ensemble_prob,
    Prediction=np.where(ensemble_prob > 0.5, 1, 0),
)

df_fe.head()


  df["return_sent_corr"] = df.groupby("Ticker").apply(


Unnamed: 0,Date,Open,High,Low,Close,Volume,Ticker,avg_sentiment_score,avg_sentiment_numeric,article_count,...,sentiment_ma5,sentiment_std5,sentiment_mom,sentiment_mom2,sentiment_vol_interact,sentiment_return_interact,return_sent_corr,RSI_10,Pred_Prob,Prediction
0,2023-09-29,170.351427,171.391245,168.687715,169.549286,51861100,AAPL,0.831837,0.16,25.0,...,0.841557,0.020226,-0.017331,-0.022881,0.010521,0.001269,-0.388677,39.314015,0.321961,0
1,2023-10-02,169.559178,172.609304,169.271983,172.064636,52164500,AAPL,0.876877,0.0,26.0,...,0.854726,0.016471,0.045039,0.027709,0.010828,0.002671,0.158197,37.845653,0.252339,0
2,2023-10-03,170.589081,171.945802,169.163061,170.727722,49594600,AAPL,0.903082,-0.190476,21.0,...,0.863137,0.027523,0.026205,0.071245,0.012971,0.013398,0.574236,31.061942,0.035566,0
3,2023-10-04,169.430416,172.520162,169.311585,171.975494,53020300,AAPL,0.900334,-0.25,24.0,...,0.87226,0.031332,-0.002749,0.023457,0.008656,-0.006995,0.329479,44.015644,0.146432,0
4,2023-10-05,172.104248,173.74815,171.005014,173.213394,48527900,AAPL,0.798173,0.0,19.0,...,0.862061,0.045722,-0.10216,-0.104909,0.006602,0.005833,-0.103576,53.271104,0.961128,1


### Save the test predictions

In [16]:
# Write the whole df_fe frame to CSV, leaving out the row index.
predictions_path = "../results/testing_predictions.csv"
df_fe.to_csv(predictions_path, index=False)
print("Saved test predictions")


Saved test predictions


In [17]:
# ============================================================
# 7) Clean Output for Final Predictions
# ============================================================

# Slim report: identifying columns, the probability, the hard label and a
# human-readable direction ("Up" for 1, "Down" for 0), ordered by ticker and date.
direction_labels = {1: "Up", 0: "Down"}

df_clean = (
    df_fe.loc[:, ["Date", "Ticker", "Pred_Prob", "Prediction"]]
    .assign(
        Predicted_Direction=lambda frame: frame["Prediction"].map(direction_labels)
    )
    .sort_values(["Ticker", "Date"])
)

display(df_clean.head(20))

# Persist the slim report without the row index.
df_clean.to_csv("../results/testing_predictions_clean.csv", index=False)

print("ðŸ’¾ Saved clean predictions â†’ testing_predictions_clean.csv")


Unnamed: 0,Date,Ticker,Pred_Prob,Prediction,Predicted_Direction
0,2023-09-29,AAPL,0.321961,0,Down
1,2023-10-02,AAPL,0.252339,0,Down
2,2023-10-03,AAPL,0.035566,0,Down
3,2023-10-04,AAPL,0.146432,0,Down
4,2023-10-05,AAPL,0.961128,1,Up
5,2023-10-06,AAPL,0.977879,1,Up
6,2023-10-09,AAPL,0.068314,0,Down
7,2023-10-10,AAPL,0.910856,1,Up
8,2023-10-11,AAPL,0.982249,1,Up
9,2023-10-12,AAPL,0.468377,0,Down


ðŸ’¾ Saved clean predictions â†’ testing_predictions_clean.csv
