# Phase 2 Feature Engineering Demo

This notebook walks through the Phase 2 pipeline for technical indicators, windowed sequences, and FinBERT embeddings described in `docs/metrics_and_evaluation.md`. It uses the same utilities as the CLI scripts (`scripts/feature_engineering.py` and `scripts/text_embeddings.py`) to ensure consistency and reproducibility.


In [None]:
import json
import math
from pathlib import Path
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA

# Repo imports
import sys
sys.path.insert(0, str(Path('..').resolve()))

from backend.utils.config import config
from backend.utils.file_operations import get_latest_file, load_existing_data
from backend.utils.preprocessing import (
    temporal_train_test_split,
    fit_scaler,
    transform_with_scaler,
    validate_no_leakage,
)
from scripts.feature_engineering import (
    compute_technical_indicators,
    create_windowed_sequences,
    split_and_scale,
)
from scripts import text_embeddings as text_embed

# Apply seaborn's "whitegrid" theme globally so the plots below share one look.
sns.set_theme(style="whitegrid")


## 1. Configuration

Define the ticker, data directories, and defaults inherited from `.env` via `backend.utils.config.Config`. The notebook automatically searches for the most recent raw snapshot and falls back to synthetic demo data when none is found.


In [None]:
# Instrument analysed throughout the notebook.
TICKER = "RELIANCE.NS"

# Data directories, anchored one level above the working directory and
# resolved to absolute paths.
_DATA_ROOT = Path("..") / "data"
RAW_PRICES_DIR = (_DATA_ROOT / "raw" / "prices").resolve()
RAW_NEWS_DIR = (_DATA_ROOT / "raw" / "news").resolve()
PROCESSED_DIR = (_DATA_ROOT / "processed").resolve()

# Window length (in rows) comes from the shared project configuration.
LOOKBACK = config.lookback_window


def load_latest_price_df(ticker: str) -> pd.DataFrame:
    """Load the most recent raw price snapshot for *ticker*, or synthesize one.

    Dots in the ticker are replaced with underscores and the result is used
    to look for ``*/<sanitized>.csv`` under ``RAW_PRICES_DIR`` via
    ``get_latest_file``. When that lookup returns ``None``, a deterministic
    400-business-day synthetic OHLCV frame is generated so the rest of the
    notebook can still run.

    Args:
        ticker: Ticker symbol, e.g. ``"RELIANCE.NS"``.

    Returns:
        A DataFrame with a ``date`` column. Real snapshots are sorted by date
        with a fresh integer index; the synthetic frame has the columns
        ``date``, ``open``, ``high``, ``low``, ``close`` and ``volume``.
    """
    sanitized = ticker.replace(".", "_")
    latest_file = get_latest_file(RAW_PRICES_DIR, f"*/{sanitized}.csv")
    if latest_file is None:
        print("⚠️ No raw price snapshot found. Generating synthetic demo data.")
        dates = pd.date_range("2022-01-01", periods=400, freq="B")
        base = np.linspace(900, 1400, num=len(dates))
        # Fixed seed keeps the demo data identical across runs.
        noise = np.random.default_rng(42).normal(0, 15, len(dates))
        open_ = base + noise
        close = base + noise / 2
        return pd.DataFrame(
            {
                "date": dates,
                "open": open_,
                # Derive high/low from both open and close so every bar
                # satisfies low <= open, close <= high. Using open +/- 10 alone
                # lets close escape the range whenever |noise| > 20.
                "high": np.maximum(open_, close) + 10,
                "low": np.minimum(open_, close) - 10,
                "close": close,
                "volume": np.linspace(1e6, 5e6, len(dates)),
            }
        )
    df = load_existing_data(latest_file, file_format="csv")
    df["date"] = pd.to_datetime(df["date"])
    # Chronological order is required by the temporal splits further down.
    df = df.sort_values("date").reset_index(drop=True)
    return df


# Load the price history once; the plots and indicator computation below read from it.
price_df = load_latest_price_df(TICKER)
# Trailing expression: rendered as the cell output when run in a notebook.
price_df.head()


In [None]:
# Single-panel line chart of the close price over the loaded history.
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(price_df["date"], price_df["close"], color="#2563eb", label="Close")
ax.set(title=f"{TICKER} Close Price", ylabel="Price (INR)")
ax.legend()
plt.show()


## 2. Technical Indicators

Use the helper from `scripts/feature_engineering.py` (`compute_technical_indicators`) so the notebook stays aligned with the CLI workflow.


In [None]:
# Run the shared indicator helper with the indicator list from the project config.
indicator_result = compute_technical_indicators(price_df, config.technical_indicators)
indicator_df, indicator_cols = indicator_result
print(f"Computed indicators: {indicator_cols}")
# Preview the last five rows (tail() already returns five rows by default).
indicator_df.tail()


In [None]:
# Three stacked panels sharing the x-axis: price/Bollinger, RSI, MACD.
fig, axes = plt.subplots(3, 1, figsize=(12, 10), sharex=True)
price_ax, rsi_ax, macd_ax = axes
latest = indicator_df.tail(200)  # only the most recent 200 rows are plotted
x_dates = latest["date"]

# Panel 1: close price; the bands are drawn only when all three columns exist.
price_ax.plot(x_dates, latest["close"], label="Close", color="#0ea5e9")
bollinger_styles = {
    "bb_upper_20": ("BB Upper", "--", "#f97316"),
    "bb_middle_20": ("BB Mid", ":", "#22c55e"),
    "bb_lower_20": ("BB Lower", "--", "#f97316"),
}
if set(bollinger_styles).issubset(latest.columns):
    for band_col, (band_label, band_ls, band_color) in bollinger_styles.items():
        price_ax.plot(
            x_dates,
            latest[band_col],
            label=band_label,
            linestyle=band_ls,
            color=band_color,
        )
price_ax.set_title("Close with Bollinger Bands")
price_ax.legend()

# Panel 2: RSI with dashed guide lines at 70 and 30.
if "rsi_14" in latest.columns:
    rsi_ax.plot(x_dates, latest["rsi_14"], color="#f43f5e")
    for guide_level in (70, 30):
        rsi_ax.axhline(guide_level, color="#94a3b8", linestyle="--")
    rsi_ax.set_title("RSI (14)")
    rsi_ax.set_ylim(0, 100)

# Panel 3: MACD line, signal line and histogram bars (needs all three columns).
macd_cols = {"macd_line", "macd_signal", "macd_hist"}
if macd_cols.issubset(latest.columns):
    macd_ax.plot(x_dates, latest["macd_line"], label="MACD", color="#6366f1")
    macd_ax.plot(x_dates, latest["macd_signal"], label="Signal", color="#22d3ee")
    macd_ax.bar(x_dates, latest["macd_hist"], label="Hist", color="#a855f7")
    macd_ax.set_title("MACD (12-26-9)")
    macd_ax.legend()

plt.tight_layout()
plt.show()


## 3. Train/Val/Test Split & Scaling

We call `split_and_scale` here, and `temporal_train_test_split` and `fit_scaler` directly in the sections below, so the notebook honors the same leakage constraints as the CLI. This cell prints the number of windows in each split.


In [None]:
# Feature set: whichever raw OHLCV columns are present, followed by the indicator columns.
ohlcv_names = ("open", "high", "low", "close", "volume")
feature_cols = [name for name in ohlcv_names if name in indicator_df.columns]
feature_cols.extend(indicator_cols)

# Split, scale and window the data using the ratios and scaler type from the config.
split_outputs = split_and_scale(
    indicator_df,
    feature_cols=feature_cols,
    lookback=LOOKBACK,
    train_ratio=config.train_split_ratio,
    val_ratio=config.val_split_ratio,
    test_ratio=config.test_split_ratio,
    scaler_type=config.scaler_type,
)
(
    split_tensors,
    split_targets,
    split_indices,
    split_ranges,
    feature_names,
    scaler,
    window_counts,
) = split_outputs

print("Window counts:", json.dumps(window_counts, indent=2))


In [None]:
# Show the first training window as a heatmap; it is transposed before
# plotting so the axis labels below read features (rows) by timesteps (columns).
sample_train_window = split_tensors["train"][0]
heat_fig, heat_ax = plt.subplots(figsize=(8, 4))
sns.heatmap(sample_train_window.T, cmap="mako", cbar=True, ax=heat_ax)
heat_ax.set_title("Sample Train Window (features × timesteps)")
heat_ax.set_xlabel("Timesteps (days)")
heat_ax.set_ylabel("Features")
plt.show()


## 4. Leakage Validation

Per docs §4.3, validation/test samples must chronologically follow the training period. The helper will raise if indices overlap.


In [None]:
# Re-split the indicator frame and run the project's leakage check on the result.
# NOTE(review): the split is called with its default arguments here, not the
# config ratios passed to split_and_scale in Section 3 — confirm the defaults match.
sorted_df, splits = temporal_train_test_split(indicator_df)
validate_no_leakage(sorted_df, splits)
# Reached only if the call above did not raise.
print("✅ Temporal ordering check passed")


## 5. Scaling Diagnostics

Fit the scaler on training rows only, then compare original vs scaled distributions to confirm values are bounded and centered as expected.


In [None]:
# Select the training rows using the index bounds reported by split_and_scale.
train_bounds = split_indices["train"]
train_slice = indicator_df.iloc[train_bounds["start_idx"]:train_bounds["end_idx"]]
train_features = train_slice[feature_cols]

# NOTE: this rebinds `scaler`, replacing the object returned by split_and_scale.
scaler = fit_scaler(train_features, scaler_type=config.scaler_type)
train_scaled = transform_with_scaler(scaler, train_features)

# Side-by-side histograms of the close price before and after scaling.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
raw_ax, scaled_ax = axes
raw_ax.hist(train_slice["close"], bins=30, color="#f59e0b")
raw_ax.set_title("Raw Close Distribution")
close_position = feature_cols.index("close")
scaled_ax.hist(train_scaled[:, close_position], bins=30, color="#10b981")
scaled_ax.set_title("Scaled Close Distribution")
plt.show()


## 6. FinBERT Embeddings (News)

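To keep the demo fast, the cell embeds at most 32 news samples on CPU using the FinBERT checkpoint named by `config.finbert_model_name` (e.g. ProsusAI/finbert). Loading the checkpoint may require a long download; if the model cannot be loaded or run, the cell falls back to dummy 768-dimensional vectors so downstream shapes still match.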


In [None]:
def load_news_records(ticker: str):
    """Return the latest raw news records for *ticker*, or two demo headlines.

    Dots in the ticker are replaced with underscores and the result is used
    to look for ``*/<sanitized>_news.json`` under ``RAW_NEWS_DIR`` via
    ``get_latest_file``. When that lookup returns ``None``, two synthesized
    headline records stamped with the current UTC time are returned.

    Args:
        ticker: Symbol used to build the news file name.

    Returns:
        A list of record dicts when the loaded data is a DataFrame or the
        fallback is used; otherwise whatever ``load_existing_data`` returned.
    """
    # Local import: the notebook's import cell only brings in `datetime`.
    from datetime import timezone

    sanitized = ticker.replace(".", "_")
    latest_file = get_latest_file(RAW_NEWS_DIR, f"*/{sanitized}_news.json")
    if latest_file is None:
        print("⚠️ No raw news snapshot found. Using synthesized headlines for demo.")
        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" and drop the tzinfo so the ISO string keeps its naive-UTC format.
        now = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()
        return [
            {"title": "Reliance gains on earnings", "description": "Q2 beat estimates", "published_at": now, "url": "demo-1"},
            {"title": "Oil prices dip", "description": "Potential drag on refining margins", "published_at": now, "url": "demo-2"},
        ]
    data = load_existing_data(latest_file, file_format="json")
    # Normalize DataFrame results to a list of per-row dicts.
    if isinstance(data, pd.DataFrame):
        return data.to_dict("records")
    return data


# News is looked up and tagged under the base symbol: the text before the
# first "." in TICKER.
base_symbol = TICKER.split(".")[0]
news_records = load_news_records(base_symbol)
news_samples = text_embed.prepare_news_samples(news_records, ticker=base_symbol)
# Cap the demo at 32 samples; slicing already handles shorter inputs.
samples_subset = news_samples[:32]

try:
    import torch

    device = torch.device("cpu")
    # classify=False: only embeddings are needed here, not sentiment labels.
    tokenizer, embed_model, classifier, label_map = text_embed.load_models(
        config.finbert_model_name,
        device=device,
        classify=False,
    )
    news_embeddings, _, _ = text_embed.generate_embeddings(
        samples_subset,
        tokenizer=tokenizer,
        embed_model=embed_model,
        classifier=None,
        label_map=None,
        device=device,
        batch_size=min(config.finbert_batch_size, len(samples_subset)),
        max_length=config.finbert_max_length,
    )
except Exception as exc:  # noqa: BLE001
    # Deliberate best-effort: any failure to import, load or run the model
    # degrades to placeholder vectors instead of stopping the demo.
    print(f"⚠️ FinBERT unavailable ({exc}); using random demo embeddings instead.")
    # Seeded so the fallback is reproducible, like the synthetic price data.
    news_embeddings = np.random.default_rng(42).normal(size=(len(samples_subset), 768))

print("Embedding matrix shape:", news_embeddings.shape)

# A 2-component PCA needs at least two samples.
if len(samples_subset) >= 2:
    pca = PCA(n_components=2)
    coords = pca.fit_transform(news_embeddings)
    plt.figure(figsize=(6, 5))
    # Colour encodes each sample's position in the subset.
    plt.scatter(coords[:, 0], coords[:, 1], c=np.arange(len(coords)), cmap="viridis")
    plt.title("News Embeddings (PCA)")
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.show()


## 7. Metadata Snapshot

Feature + embedding scripts persist metadata JSON files. The snippet below shows the structure stored inside `data/processed/{ticker}/YYYY-MM-DD/metadata.json`.


In [None]:
# Summarize the scaler separately, then assemble the preview dict in the
# same key order as before.
scaler_summary = {"type": config.scaler_type, "n_features": scaler.n_features_in_}
metadata_preview = dict(
    ticker=TICKER,
    lookback_window=LOOKBACK,
    split_indices=split_indices,
    split_date_ranges=split_ranges,
    window_counts=window_counts,
    indicators=indicator_cols,
    scaler=scaler_summary,
)
print(json.dumps(metadata_preview, indent=2))


## 8. Summary & Next Steps

- Technical indicators, splits, scaling, and window generation mirror `scripts/feature_engineering.py`.
- FinBERT embeddings (with fallback) demonstrate how `scripts/text_embeddings.py` feeds downstream sentiment-aware models.
- Metadata captures split indices, date ranges, scaler params, and embedding paths for leakage audits.

Next: feed `train_features.npy` / embeddings into the Phase 3 model notebooks once modeling scripts are implemented.
