09_inference_pipeline.ipynb notebook

In [1]:
# ============================================================
# Inference Pipeline Notebook
# Final Ensemble: XGB + LGBM + Logistic Regression
# ============================================================

import pandas as pd
import numpy as np
import joblib

import xgboost as xgb
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler

# Locations of the persisted artifacts, keyed by their role in the ensemble.
# NOTE(review): the XGBoost/LightGBM files carry .json/.txt extensions but are
# read with joblib here, so they are presumably joblib dumps -- confirm against
# the training notebook that wrote them.
MODEL_PATHS = {
    "scaler": "../models/tab_scaler.pkl",
    "xgb": "../models/xgb_final.json",
    "lgb": "../models/lgb_final.txt",
    "meta": "../models/ensemble_final.pkl",
}

# joblib.load unpickles its input; only point these paths at trusted files.
scaler = joblib.load(MODEL_PATHS["scaler"])
xgb_model = joblib.load(MODEL_PATHS["xgb"])
lgb_model = joblib.load(MODEL_PATHS["lgb"])
meta_model = joblib.load(MODEL_PATHS["meta"])

print("ðŸ”¥ All models successfully loaded!")


ðŸ”¥ All models successfully loaded!


### Reusable feature engineering function
This function mirrors the entire training feature pipeline.

In [2]:
# ============================================================
# Feature Engineering For Inference
# ============================================================

def engineer_features(df):
    """Compute the per-ticker model features from raw price/sentiment rows.

    The input is copied, sorted by ``Ticker`` then ``Date``, and extended with
    return, volatility, momentum, sentiment, interaction, rolling-correlation
    and RSI columns. Every statistic is computed within a ticker, so values
    never bleed across tickers. Rows that have a NaN in any feature (the
    warm-up rows at the start of each ticker's history) are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Ticker``, ``Date``, ``Close`` and
        ``avg_sentiment_score``. The frame is not modified.

    Returns
    -------
    (pandas.DataFrame, list[str])
        The feature-augmented frame with a fresh 0..n-1 index, and the ordered
        list of feature column names to feed to the models.

    Raises
    ------
    KeyError
        If any required column is missing.
    """
    required = ("Ticker", "Date", "Close", "avg_sentiment_score")
    missing = [col for col in required if col not in df.columns]
    if missing:
        # Fail up front with one clear message rather than part-way through.
        raise KeyError(f"engineer_features: missing required column(s): {missing}")

    df = df.copy()
    df = df.sort_values(["Ticker", "Date"]).reset_index(drop=True)

    def by_ticker(col):
        # Regroup on every call so columns added below are always visible.
        return df.groupby("Ticker")[col]

    # --- Core features ---
    df["Return"] = by_ticker("Close").pct_change()

    df["Return_lag1"] = by_ticker("Return").shift(1)
    df["Return_lag2"] = by_ticker("Return").shift(2)
    df["Return_lag3"] = by_ticker("Return").shift(3)

    # Rolling statistics are taken over the lagged return, not the same-row return.
    df["return_ma5"] = by_ticker("Return_lag1").transform(lambda x: x.rolling(5).mean())
    df["Volatility"] = by_ticker("Return_lag1").transform(lambda x: x.rolling(5).std())
    df["Volatility_10"] = by_ticker("Return_lag1").transform(lambda x: x.rolling(10).std())

    df["price_mom5"] = by_ticker("Close").pct_change(5)
    df["price_trend5"] = by_ticker("Close").transform(lambda x: x.rolling(5).mean())

    # --- Sentiment ---
    df["sentiment_lag1"] = by_ticker("avg_sentiment_score").shift(1)
    df["sentiment_lag2"] = by_ticker("avg_sentiment_score").shift(2)
    df["sentiment_lag3"] = by_ticker("avg_sentiment_score").shift(3)

    df["sentiment_ma3"] = by_ticker("avg_sentiment_score").transform(lambda x: x.rolling(3).mean())
    df["sentiment_ma5"] = by_ticker("avg_sentiment_score").transform(lambda x: x.rolling(5).mean())
    df["sentiment_std5"] = by_ticker("avg_sentiment_score").transform(lambda x: x.rolling(5).std())
    df["sentiment_mom"] = by_ticker("avg_sentiment_score").diff(1)
    df["sentiment_mom2"] = by_ticker("avg_sentiment_score").diff(2)

    df["sentiment_vol_interact"] = df["avg_sentiment_score"] * df["Volatility"]
    df["sentiment_return_interact"] = df["avg_sentiment_score"] * df["Return_lag1"]

    # Rolling correlation between lagged return and sentiment, per ticker.
    # Built group by group and concatenated instead of groupby(...).apply(...):
    # with a single ticker, apply() returns one wide DataFrame row rather than
    # a long Series, and the column assignment fails.
    corr_parts = [
        group["Return_lag1"].rolling(5).corr(group["avg_sentiment_score"])
        for _, group in df.groupby("Ticker")
    ]
    if corr_parts:
        # Each part keeps its original row labels, so assignment aligns by index.
        df["return_sent_corr"] = pd.concat(corr_parts)
    else:
        df["return_sent_corr"] = np.nan

    # --- RSI ---
    def calc_rsi(series, window=10):
        """RSI from simple rolling means of gains and losses."""
        delta = series.diff()
        gain = delta.clip(lower=0).rolling(window).mean()
        loss = -delta.clip(upper=0).rolling(window).mean()
        rs = gain / (loss + 1e-9)  # epsilon avoids division by zero when there are no losses
        return 100 - (100 / (1 + rs))

    df["RSI_10"] = by_ticker("Close").transform(calc_rsi)

    # Same feature list as training
    FEATURES = [
        "Return_lag1","Return_lag2","Return_lag3","return_ma5","Volatility","Volatility_10",
        "price_mom5","price_trend5",
        "sentiment_lag1","sentiment_lag2","sentiment_lag3",
        "sentiment_ma3","sentiment_ma5","sentiment_std5",
        "sentiment_mom","sentiment_mom2",
        "sentiment_return_interact","sentiment_vol_interact","return_sent_corr",
        "RSI_10"
    ]

    # Drop warm-up rows where any lag/rolling feature is still undefined.
    df = df.dropna(subset=FEATURES).reset_index(drop=True)

    return df, FEATURES


### Prediction Function

In [3]:
# ============================================================
# 3) Predict Function — Final Ensemble
# ============================================================

def ensemble_predict(df, threshold=0.5):
    """Score raw rows with the stacked XGBoost + LightGBM + meta-model ensemble.

    Features are built with ``engineer_features`` and scaled with the
    module-level ``scaler``. ``xgb_model`` and ``lgb_model`` each produce a
    class-1 probability, and ``meta_model`` combines the two into the final
    probability.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input rows in the schema ``engineer_features`` expects.
    threshold : float, default 0.5
        ``Prediction`` is 1 where the ensemble probability is strictly
        greater than this value, else 0.

    Returns
    -------
    pandas.DataFrame
        The feature-engineered frame with two added columns:
        ``Ensemble_Prob`` and ``Prediction``.

    Raises
    ------
    ValueError
        If no rows survive feature engineering, so there is nothing to score.
    """
    df_fe, FEATURES = engineer_features(df)

    if df_fe.empty:
        # Without this guard the scaler rejects the 0-row matrix with an
        # opaque message; same exception type, clearer cause.
        raise ValueError(
            "ensemble_predict: no rows left after feature engineering; "
            "each ticker needs enough history to fill the lag and rolling windows."
        )

    X = df_fe[FEATURES].values
    X_scaled = scaler.transform(X)

    xgb_prob = xgb_model.predict_proba(X_scaled)[:, 1]
    lgb_prob = lgb_model.predict_proba(X_scaled)[:, 1]

    # NOTE(review): the meta-model presumably was fit on columns in this
    # (xgb, lgb) order -- confirm against the training notebook.
    meta_input = np.column_stack([xgb_prob, lgb_prob])
    ensemble_prob = meta_model.predict_proba(meta_input)[:, 1]

    df_fe["Ensemble_Prob"] = ensemble_prob
    df_fe["Prediction"] = (ensemble_prob > threshold).astype(int)

    return df_fe


### Run Inference on new data

In [None]:
# ============================================================
# 4) Load new data & run inference
# ============================================================

# Read the new (future) data and parse the Date column to datetimes in one chain.
df_new = pd.read_csv("../data/raw/new_stock_data.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"])
)

# Score every row that survives feature engineering.
pred_df = ensemble_predict(df_new)

# Preview the first few scored rows.
display(pred_df.head())


### Export predictions

In [None]:
# Write the scored rows to disk without the positional index column.
predictions_path = "../results/inference_predictions.csv"
pred_df.to_csv(predictions_path, index=False)
print("ðŸ’¾ Saved predictions â†’ inference_predictions.csv")
