# 01 - Exploratory Data Analysis (EDA)

This notebook performs data sanity checks for proxy-hedging inputs, explores cross-market correlations, and identifies simple volatility regimes.

In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import yaml

# Locate the project root so the notebook runs both from the repository root and
# from a direct sub-folder of it: prefer the working directory when it contains
# `src/`, otherwise fall back to its parent if `src/` lives there.
launch_dir = Path.cwd()
started_in_subfolder = (
    not (launch_dir / "src").exists() and (launch_dir.parent / "src").exists()
)
PROJECT_ROOT = launch_dir.parent if started_in_subfolder else launch_dir

# Put `src/` at the front of the import path (once) so the project-local imports
# that follow resolve against this checkout.
SRC_ROOT = PROJECT_ROOT / "src"
src_entry = str(SRC_ROOT)
if src_entry not in sys.path:
    sys.path.insert(0, src_entry)

from data.loaders import load_proxy_hedging_prices
from data.preprocess import MissingDataPolicy, preprocess_prices_and_returns


In [None]:
# Load the run configuration, then build the price and return panels that every
# later cell in this notebook reads (`prices`, `cleaned_prices`, `returns`).
config_path = PROJECT_ROOT / "src/config/default.yaml"
config = yaml.safe_load(config_path.read_text(encoding="utf-8"))

data_cfg = config["data"]
target_name = data_cfg["target"]["name"]
# Map each proxy's name to its file, as listed under `data.proxies` in the config.
proxy_files = {item["name"]: item["file"] for item in data_cfg["proxies"]}

# Optional sections: a YAML key that is present but left empty loads as None, in
# which case `config.get(key, {})` returns None rather than the default. `or {}`
# handles both the "key missing" and the "key empty" case.
date_range = config.get("date_range") or {}
missing_policy_kwargs = config.get("missing_data_policy") or {}

prices = load_proxy_hedging_prices(
    raw_dir=PROJECT_ROOT / data_cfg["raw_dir"],
    target_file=data_cfg["target"]["file"],
    proxy_files=proxy_files,
    target_name=target_name,
    date_column=data_cfg.get("date_column", "Date"),
    price_column=data_cfg.get("price_column", "Price"),
)

# The third return value of the preprocessing helper is not used in this notebook.
cleaned_prices, simple_returns, _ = preprocess_prices_and_returns(
    prices=prices,
    frequency=config.get("frequency", "B"),
    start_date=date_range.get("start"),
    end_date=date_range.get("end"),
    missing_data_policy=MissingDataPolicy(**missing_policy_kwargs),
)
# Drop missing return observations so the statistics below run on complete data
# (presumably row-wise on a DataFrame, i.e. dates where any series is NaN).
returns = simple_returns.dropna()

print("Prices shape:", prices.shape)
print("Cleaned prices shape:", cleaned_prices.shape)
print("Returns shape:", returns.shape)


In [None]:
# Sanity checks: collect per-series diagnostics into a single table.
# Price-level diagnostics come from the cleaned prices ...
price_diagnostics = {
    "missing_count": cleaned_prices.isna().sum(),
    "min_price": cleaned_prices.min(),
    "max_price": cleaned_prices.max(),
}
# ... and return-level diagnostics from the NaN-free returns (sample std, ddof=1).
return_diagnostics = {
    "mean_return": returns.mean(),
    "vol_return": returns.std(ddof=1),
}
quality = pd.DataFrame({**price_diagnostics, **return_diagnostics})
quality


In [None]:
# Correlation matrix: pairwise correlation of the return series, drawn as a heatmap
# on a fixed [-1, 1] colour scale so the diverging colormap is centred on zero.
corr = returns.corr()
series_labels = list(corr.columns)
tick_positions = range(len(series_labels))

fig, ax = plt.subplots(figsize=(7, 6))
im = ax.imshow(corr.to_numpy(), cmap="coolwarm", vmin=-1, vmax=1)
ax.set_title("Return Correlation Matrix")

# One tick per series on both axes; x labels are slanted so long names do not overlap.
ax.set_xticks(tick_positions)
ax.set_xticklabels(series_labels, rotation=45, ha="right")
ax.set_yticks(tick_positions)
ax.set_yticklabels(series_labels)

fig.colorbar(im, ax=ax, shrink=0.8)
fig.tight_layout()
plt.show()


In [None]:
# Regime proxy: rolling volatility quantiles on target returns.
# Each date is labelled low/mid/high volatility by comparing its trailing-window
# volatility with two quantile cut-offs. The cut-offs are computed over the whole
# sample, so the labels are descriptive (in-sample), not a real-time signal.
VOL_WINDOW = 20            # observations per rolling volatility estimate
LOW_VOL_QUANTILE = 0.33    # at or below this quantile -> "low_vol"
HIGH_VOL_QUANTILE = 0.67   # at or above this quantile -> "high_vol"

target_ret = returns[target_name]
rolling_vol = target_ret.rolling(VOL_WINDOW).std(ddof=1)
q_low = rolling_vol.quantile(LOW_VOL_QUANTILE)
q_high = rolling_vol.quantile(HIGH_VOL_QUANTILE)

regime = pd.Series("mid_vol", index=rolling_vol.index)
regime[rolling_vol <= q_low] = "low_vol"
regime[rolling_vol >= q_high] = "high_vol"
# Until the rolling window is full the volatility is NaN; NaN fails both
# comparisons above and would otherwise keep the default "mid_vol" label.
# Mark those dates as missing so they are not counted as a regime.
regime = regime.where(rolling_vol.notna())

fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)
axes[0].plot(target_ret.index, target_ret.values, label=target_name)
axes[0].set_title("Target Return Series")
axes[0].grid(alpha=0.25)
axes[0].legend()  # the series label above is only visible once a legend is drawn

axes[1].plot(
    rolling_vol.index,
    rolling_vol.values,
    color="tab:orange",
    label=f"{VOL_WINDOW}d vol",
)
axes[1].axhline(
    q_low, color="green", linestyle="--", label=f"{LOW_VOL_QUANTILE:.0%} quantile"
)
axes[1].axhline(
    q_high, color="red", linestyle="--", label=f"{HIGH_VOL_QUANTILE:.0%} quantile"
)
axes[1].set_title("Rolling Volatility Regime Proxy")
axes[1].grid(alpha=0.25)
axes[1].legend()

plt.tight_layout()
plt.show()

# dropna=False keeps the warm-up dates (no volatility estimate yet) visible as NaN.
regime.value_counts(dropna=False)


## Interpretation Notes

- Check that proxy-return correlations remain economically plausible over the sample.
- Use regime splits to test whether hedge performance deteriorates in high-volatility windows.
- If severe regime dependence appears, prioritize dynamic hedge models (e.g., rolling-window regression or a Kalman filter).