# Download SPX Data to `market_data` and Run agent-alpha

This notebook does two things:

1. Downloads real SPX universe + prices data using `agent_alpha.data.download_spx_data`.
2. Runs factor evaluation: features and factor values are computed on all downloaded prices, while RankIC/ICIR/ex-ante IR are evaluated on the rows selected by the universe mask (an all-rows evaluation is also printed for comparison).

Optional: run the full `AgentAlphaWorkflow` if `OPENAI_API_KEY` is set.


In [None]:
from __future__ import annotations

import sys
from pathlib import Path

import pandas as pd

# Probe the working directory and its two nearest ancestors for a directory
# entry named `agent_alpha`; the first match is treated as the repo root.
CURRENT_DIR = Path.cwd().resolve()
parent_dir = CURRENT_DIR.parent
candidate_roots = [CURRENT_DIR, parent_dir, parent_dir.parent]

REPO_ROOT = None
for candidate in candidate_roots:
    if (candidate / "agent_alpha").exists():
        REPO_ROOT = candidate
        break
else:
    # None of the probed directories contains `agent_alpha`.
    raise FileNotFoundError(
        "Could not find project root containing 'agent_alpha'. Open this notebook from agent-alpha/notebooks."
    )

NOTEBOOK_DIR = REPO_ROOT / "notebooks"
if not NOTEBOOK_DIR.exists():
    raise FileNotFoundError(f"Expected notebooks directory at: {NOTEBOOK_DIR}")

# Put the repo root at the front of the import path (once) so that
# `agent_alpha` can be imported by the cells below.
repo_root_str = str(REPO_ROOT)
if repo_root_str not in sys.path:
    sys.path.insert(0, repo_root_str)

# Downloaded CSVs live next to the notebooks; create the folder if needed.
MARKET_DATA_DIR = NOTEBOOK_DIR / "market_data"
MARKET_DATA_DIR.mkdir(parents=True, exist_ok=True)

for label, location in (
    ("NOTEBOOK_DIR:", NOTEBOOK_DIR),
    ("REPO_ROOT:", REPO_ROOT),
    ("MARKET_DATA_DIR:", MARKET_DATA_DIR),
):
    print(label, location)

If your environment is missing the data-download dependencies, run this once in a notebook cell:

```python
%pip install yfinance requests beautifulsoup4 lxml
```

In [None]:
from agent_alpha.data.download_spx_data import build_spx_data

# Download window: a fixed start date through today's local date (YYYY-MM-DD).
START_DATE = "2015-01-01"
END_DATE = pd.Timestamp.today().date().isoformat()

# Forwarded unchanged to `build_spx_data` as `batch_size` / `pause_seconds`.
# NOTE(review): presumably tickers-per-request and the delay between requests;
# confirm against the downloader's documentation.
BATCH_SIZE = 100
PAUSE_SECONDS = 0.2

download_options = {
    "start_date": START_DATE,
    "end_date": END_DATE,
    "output_dir": MARKET_DATA_DIR,
    "batch_size": BATCH_SIZE,
    "pause_seconds": PAUSE_SECONDS,
    "auto_adjust": True,
}
summary = build_spx_data(**download_options)

# Bare expression so the notebook renders the downloader's return value.
summary

In [None]:
from IPython.display import display

# Print the size in MB of every `spx_*.csv` file in MARKET_DATA_DIR, in
# sorted path order.
bytes_per_mb = 1024 * 1024
csv_files = sorted(MARKET_DATA_DIR.glob("spx_*.csv"))
for csv_file in csv_files:
    megabytes = csv_file.stat().st_size / bytes_per_mb
    print(f"{csv_file.name}: {megabytes:.2f} MB")

prices_csv_path = MARKET_DATA_DIR / "spx_prices.csv"
universe_csv_path = MARKET_DATA_DIR / "spx_universe_filtered.csv"

# Preview only the first five rows of each file rather than loading it fully.
for preview_path in (prices_csv_path, universe_csv_path):
    display(pd.read_csv(preview_path, nrows=5))


In [None]:
prices_csv_path = MARKET_DATA_DIR / "spx_prices.csv"
universe_csv_path = MARKET_DATA_DIR / "spx_universe_filtered.csv"

# Load both CSVs in full, parsing the `date` column as timestamps.
prices_df = pd.read_csv(prices_csv_path, parse_dates=["date"])
universe_mask = pd.read_csv(universe_csv_path, parse_dates=["date"])

# Map the CSV column names onto the `datetime` / `instrument` index names and
# `$`-prefixed field names used for the panel.
column_renames = {
    "date": "datetime",
    "ticker": "instrument",
    "open": "$open",
    "high": "$high",
    "low": "$low",
    "close": "$close",
    "volume": "$volume",
}
field_columns = ["$open", "$high", "$low", "$close", "$volume"]

# Build the panel step by step: rename, index by (datetime, instrument),
# sort the index, then keep only the OHLCV fields in a fixed column order.
renamed_prices = prices_df.rename(columns=column_renames)
indexed_prices = renamed_prices.set_index(["datetime", "instrument"]).sort_index()
panel = indexed_prices[field_columns]

dates = panel.index.get_level_values("datetime")
instruments = panel.index.get_level_values("instrument")

# Quick sanity summary of the panel and of the universe table.
panel_summary = {
    "panel_rows": int(len(panel)),
    "panel_tickers": int(instruments.nunique()),
    "panel_dates": int(dates.nunique()),
    "start": dates.min().date().isoformat(),
    "end": dates.max().date().isoformat(),
    "universe_rows": int(len(universe_mask)),
    "universe_tickers": int(universe_mask["ticker"].nunique()),
    "universe_snapshot_dates": int(universe_mask["date"].nunique()),
}
print(panel_summary)
panel.head()


In [None]:
import json

from agent_alpha.evaluator import FactorEvaluator

# Example expression RANK(DELTA($close, 5)), assembled from the inside out.
close_var = {"type": "var", "name": "$close"}
lookback_const = {"type": "const", "value": 5}
delta_call = {"type": "call", "op": "DELTA", "args": [close_var, lookback_const]}
rank_call = {"type": "call", "op": "RANK", "args": [delta_call]}
example_ast = {"version": "1", "root": rank_call}

# The same periods are given to the evaluator and to the forward-return
# computation; each call receives its own list object.
eval_periods = (1, 5, 10)
evaluator = FactorEvaluator(periods=list(eval_periods), min_cross_section=5)

# Factor values and forward returns are computed on the full panel.
factor = evaluator.calculate_factor(panel, example_ast)
forward_returns = evaluator.calculate_forward_returns(panel, periods=list(eval_periods))

# Evaluate twice: once without a mask, once with the universe mask supplied.
metrics_all = evaluator.calculate_ex_ante_ir(factor, forward_returns)
metrics_universe = evaluator.calculate_ex_ante_ir(
    factor, forward_returns, universe_mask=universe_mask
)

for label, metrics in (
    ("Example AST rank_ic_ir (all rows):", metrics_all),
    ("Example AST rank_ic_ir (universe-filtered):", metrics_universe),
):
    print(label, metrics["rank_ic_ir"])

# `evaluation_scope` may be absent from the metrics; fall back to an empty dict.
scope_json = json.dumps(metrics_universe.get("evaluation_scope", {}), indent=2)
print("Universe evaluation scope:", scope_json)

display(factor.dropna().head(10))
metrics_universe


Optional full workflow with LLM (`gpt-5-mini`). This requires `OPENAI_API_KEY` in your environment.

In [None]:
import os

from agent_alpha.workflow import AgentAlphaWorkflow
# The LLM workflow only runs when OPENAI_API_KEY is set to a non-empty value;
# otherwise the cell prints a notice and does nothing else.
api_key_present = bool(os.environ.get("OPENAI_API_KEY"))
if api_key_present:
    workflow = AgentAlphaWorkflow(model_name="gpt-5-mini", periods=[1], max_attempts=2)
    user_goal = (
        "Generate a robust SPX cross-sectional alpha from OHLCV data. "
        "Keep the expression compact and interpretable."
    )
    run_options = {
        "user_goal": user_goal,
        "panel": panel,
        "max_attempts": 2,
        "universe_mask": universe_mask,
    }
    state = workflow.run(**run_options)

    # Report selected fields of the returned state; missing keys print as None.
    report_fields = (
        ("error:", "error"),
        ("hypothesis:", "hypothesis"),
        ("ast_expression:", "ast_expression"),
        ("metrics:", "metrics"),
    )
    for label, key in report_fields:
        print(label, state.get(key))
else:
    print("OPENAI_API_KEY is not set. Skipping LLM workflow run.")
