# agent-alpha SPX Experiment (All Prices + Universe-Filtered Metrics)

This notebook runs the hypothesis -> blueprint -> AST pipeline using your real SPX inputs:

- `prices` file: OHLCV bars for every ticker that has ever been a member of the universe
- `universe` file: constituent membership snapshots (daily or month-end)

Evaluation logic:

1. Compute feature components and factor scores on **all** tickers in the price panel.
2. Compute RankIC / ICIR / Ex-ante IR only on rows included by the universe mask.


In [None]:
import os
import sys
from pathlib import Path

# Locate the checkout root: the nearest directory -- starting at the current
# working directory and walking up through every ancestor -- that contains an
# "agent-alpha" folder. Searching all ancestors (instead of only the parent and
# grandparent) lets the notebook run from any depth inside the repository.
cwd = Path.cwd().resolve()
repo_root = next(
    (
        candidate
        for candidate in (cwd, *cwd.parents)
        if (candidate / "agent-alpha").exists()
    ),
    cwd,  # nothing found: fall back to the working directory
)

# Make the project importable by putting its root at the front of sys.path,
# without adding a duplicate entry when the cell is re-run.
agent_root = repo_root / "agent-alpha"
if str(agent_root) not in sys.path:
    sys.path.insert(0, str(agent_root))

print("agent_root:", agent_root)
print("data_dir:", agent_root / "agent_alpha" / "data")


In [None]:
from __future__ import annotations

import re
from typing import Iterable

import numpy as np
import pandas as pd

from agent_alpha.workflow import AgentAlphaWorkflow

# Default directory searched for input files: the `agent_alpha/data` folder
# under `agent_root`.
DATA_DIR = agent_root / "agent_alpha" / "data"
# File extensions `_find_data_file` accepts; `_read_table` has a reader branch
# for each of them.
ALLOWED_DATA_EXTS = {".csv", ".txt", ".parquet"}


def _read_table(path: Path) -> pd.DataFrame:
    """Load a tabular file into a DataFrame, choosing the reader by extension.

    Args:
        path: File to read. The extension is matched case-insensitively.

    Returns:
        The parsed table: `.parquet` via `pd.read_parquet`, `.csv` / `.txt`
        via `pd.read_csv` with default settings.

    Raises:
        ValueError: If the extension is none of `.csv`, `.txt`, `.parquet`.
    """
    ext = path.suffix.lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    if ext not in (".csv", ".txt"):
        raise ValueError(f"Unsupported file extension: {path.suffix}")
    return pd.read_csv(path)


def _normalize_name(name: str) -> str:
    """Reduce a column label to a comparison key.

    The label is converted to `str`, stripped, lower-cased, and every
    character other than `a-z` / `0-9` is removed, so "Adj_Close" and
    "adj close" both map to "adjclose".
    """
    lowered = str(name).strip().lower()
    return re.sub(r"[^a-z0-9]+", "", lowered)


def _pick_column(df: pd.DataFrame, aliases: Iterable[str], required: bool = True) -> str | None:
    """Return the first column of `df` whose normalized name matches an alias.

    Column labels and aliases are both compared through `_normalize_name`, so
    case, whitespace and punctuation differences are ignored. Aliases are
    tried in the order given; the first one that matches wins.

    Args:
        df: Table whose columns are searched.
        aliases: Candidate names, in priority order. Any iterable is accepted,
            including one-shot iterators.
        required: When True, a missing column is an error; when False, `None`
            is returned instead.

    Returns:
        The original (un-normalized) column label, or `None` when nothing
        matches and `required` is False.

    Raises:
        ValueError: If nothing matches and `required` is True.
    """
    # Materialize once: `aliases` may be a generator, and it is needed a second
    # time for the error message. Re-listing an exhausted iterator would
    # report an empty alias list.
    alias_list = list(aliases)
    lookup = {_normalize_name(col): col for col in df.columns}
    for alias in alias_list:
        key = _normalize_name(alias)
        if key in lookup:
            return lookup[key]
    if required:
        raise ValueError(f"Missing required column. Tried aliases: {alias_list}")
    return None


def _find_data_file(kind: str, data_dir: Path = DATA_DIR) -> Path:
    """Locate the input file for `kind` ("universe" or "prices") in `data_dir`.

    Resolution order:
      1. The first well-known filename for `kind` that exists directly in
         `data_dir`.
      2. Otherwise, a recursive glob for the kind's filename patterns; the
         lexicographically first match is returned, and all matches are
         printed when there is more than one.

    Only regular files with an extension in `ALLOWED_DATA_EXTS` qualify.

    Args:
        kind: Which input to look for.
        data_dir: Directory to search.

    Returns:
        Path of the selected file.

    Raises:
        ValueError: If `kind` is not a supported kind.
        FileNotFoundError: If no qualifying file is found.
    """
    preferred_names = {
        "universe": [
            "spx_universe_filtered.csv",
            "spx_universe.csv",
        ],
        "prices": [
            "spx_prices.csv",
            "spx_agent_panel.csv",
        ],
    }
    patterns = {
        "universe": ["*universe*", "*constituent*", "*membership*", "*spx*univ*"],
        "prices": ["*prices*", "*price*", "*ohlcv*", "*panel*", "*bars*", "*spx*price*"],
    }
    if not (kind in patterns and kind in preferred_names):
        raise ValueError(f"Unsupported kind: {kind}")

    def _is_data_file(candidate: Path) -> bool:
        # A usable input is a regular file with a supported extension.
        return candidate.is_file() and candidate.suffix.lower() in ALLOWED_DATA_EXTS

    # Pass 1: exact, well-known filenames at the top level of data_dir.
    for name in preferred_names[kind]:
        exact = data_dir / name
        if _is_data_file(exact):
            return exact

    # Pass 2: recursive pattern search; a set removes files hit by several
    # overlapping patterns.
    found: set[Path] = set()
    for glob in patterns[kind]:
        found.update(hit for hit in data_dir.rglob(glob) if _is_data_file(hit))

    matches = sorted(found)
    if not matches:
        raise FileNotFoundError(
            f"No {kind} file found in {data_dir}. "
            f"Expected one of extensions {sorted(ALLOWED_DATA_EXTS)} and a filename containing: {patterns[kind]}"
        )

    if len(matches) > 1:
        print(f"[{kind}] multiple files found; using first candidate after sort:")
        for hit in matches:
            print(" -", hit.relative_to(data_dir))

    return matches[0]


def _parse_membership(series: pd.Series) -> pd.Series:
    """Convert a membership-flag column into 0/1 integers.

    A value counts as "member" (1) when either of these holds:
      * it parses as a number and that number is non-zero, or
      * its stripped, lower-cased text is one of
        "1", "true", "t", "yes", "y", "in", "member".

    Everything else, including missing values, is 0. Both rules are applied
    per element, so a column that mixes numeric and textual flags (e.g.
    "1" and "yes") is handled correctly instead of treating every
    non-numeric entry as 0 as soon as one value happens to be numeric.

    Args:
        series: Raw flag column of any dtype.

    Returns:
        Integer Series of 0/1 with the same index as `series`.
    """
    if series.dtype == bool:
        return series.astype(int)

    # Numeric rule: unparseable entries become NaN and are treated as zero.
    numeric = pd.to_numeric(series, errors="coerce")
    numeric_member = numeric.fillna(0) != 0

    # Text rule: catches truthy words that the numeric rule cannot parse.
    text = series.astype(str).str.strip().str.lower()
    true_values = {"1", "true", "t", "yes", "y", "in", "member"}
    text_member = text.isin(true_values)

    return (numeric_member | text_member).astype(int)


def standardize_universe(universe_raw: pd.DataFrame) -> pd.DataFrame:
    """Normalize a raw constituent table to columns `ticker`, `date`, `in_universe`.

    Column names are resolved through `_pick_column` aliases. Only the ticker
    column is mandatory:
      * no date column   -> `date` is NaT for every row (undated universe);
      * no flag column   -> every listed row counts as a member;
      * a `weight` flag  -> a row is a member when its weight is > 0;
      * other flags      -> parsed by `_parse_membership`.

    Tickers are upper-cased, "." is replaced by "-", and surrounding
    whitespace is stripped. Dates are made tz-naive and truncated to midnight.

    Args:
        universe_raw: Table as read from disk.

    Returns:
        Member rows only (`in_universe == 1`), de-duplicated and sorted by
        (`date`, `ticker`) when any date is present, otherwise by `ticker`,
        with a fresh 0..n-1 index.

    Raises:
        ValueError: If no ticker column can be found.
    """
    date_col = _pick_column(universe_raw, ["date", "datetime", "timestamp", "trading_date"], required=False)
    ticker_col = _pick_column(universe_raw, ["ticker", "symbol", "instrument", "ric", "asset"], required=True)
    flag_col = _pick_column(
        universe_raw,
        ["in_universe", "is_member", "in_index", "member", "active", "weight"],
        required=False,
    )

    raw_ticker = universe_raw[ticker_col]
    ticker = raw_ticker.astype(str).str.upper().str.replace(".", "-", regex=False).str.strip()

    out = pd.DataFrame()
    # astype(str) turns a missing ticker into the literal text "nan", which
    # would survive dropna() as a bogus "NAN" symbol. Mask on the raw column's
    # missingness (and on blank strings) so those rows really are dropped.
    out["ticker"] = ticker.where(raw_ticker.notna() & (ticker != ""))

    if date_col is None:
        out["date"] = pd.NaT
    else:
        out["date"] = pd.to_datetime(universe_raw[date_col], errors="coerce").dt.tz_localize(None).dt.normalize()

    if flag_col is None:
        out["in_universe"] = 1
    elif _normalize_name(flag_col) == "weight":
        # A weight column is numeric by nature: positive weight means member.
        weights = pd.to_numeric(universe_raw[flag_col], errors="coerce").fillna(0.0)
        out["in_universe"] = (weights > 0).astype(int)
    else:
        out["in_universe"] = _parse_membership(universe_raw[flag_col]).astype(int)

    out = out.dropna(subset=["ticker"])
    out = out[out["in_universe"] == 1].copy()

    if out["date"].notna().any():
        # Dated universe: rows whose date failed to parse are unusable.
        out = out.dropna(subset=["date"])
        out = out.drop_duplicates(["date", "ticker"])
        out = out.sort_values(["date", "ticker"])
    else:
        # Undated universe: one row per ticker.
        out = out.drop_duplicates(["ticker"])
        out = out.sort_values(["ticker"])

    return out.reset_index(drop=True)


def standardize_prices(prices_raw: pd.DataFrame) -> pd.DataFrame:
    """Normalize a raw OHLCV table to `date, ticker, open, high, low, close, volume`.

    Column names are resolved through `_pick_column` aliases; all seven are
    required. Tickers are upper-cased, "." is replaced by "-", and whitespace
    is stripped. Dates are made tz-naive and truncated to midnight. Price and
    volume fields are coerced to numbers (unparseable -> NaN).

    Rows are removed when:
      * date, ticker, or any of open/high/low/close is missing or infinite;
      * high < low, or volume is negative;
      * they repeat an earlier (`date`, `ticker`) pair.
    Missing volume is treated as 0.

    Args:
        prices_raw: Table as read from disk.

    Returns:
        Cleaned rows sorted by (`date`, `ticker`) with a fresh 0..n-1 index.

    Raises:
        ValueError: If a required column cannot be found.
    """
    date_col = _pick_column(prices_raw, ["date", "datetime", "timestamp", "trading_date"], required=True)
    ticker_col = _pick_column(prices_raw, ["ticker", "symbol", "instrument", "ric", "asset"], required=True)
    open_col = _pick_column(prices_raw, ["open", "$open", "px_open", "o"], required=True)
    high_col = _pick_column(prices_raw, ["high", "$high", "px_high", "h"], required=True)
    low_col = _pick_column(prices_raw, ["low", "$low", "px_low", "l"], required=True)
    close_col = _pick_column(prices_raw, ["close", "adj_close", "$close", "px_close", "c"], required=True)
    volume_col = _pick_column(prices_raw, ["volume", "$volume", "vol", "v"], required=True)

    raw_ticker = prices_raw[ticker_col]
    ticker = raw_ticker.astype(str).str.upper().str.replace(".", "-", regex=False).str.strip()
    # astype(str) turns a missing ticker into the literal text "nan", so the
    # dropna() below could never remove it. Mask on the raw column's
    # missingness (and on blank strings) to restore real missing values.
    ticker = ticker.where(raw_ticker.notna() & (ticker != ""))

    out = pd.DataFrame(
        {
            "date": pd.to_datetime(prices_raw[date_col], errors="coerce").dt.tz_localize(None).dt.normalize(),
            "ticker": ticker,
            "open": pd.to_numeric(prices_raw[open_col], errors="coerce"),
            "high": pd.to_numeric(prices_raw[high_col], errors="coerce"),
            "low": pd.to_numeric(prices_raw[low_col], errors="coerce"),
            "close": pd.to_numeric(prices_raw[close_col], errors="coerce"),
            "volume": pd.to_numeric(prices_raw[volume_col], errors="coerce"),
        }
    )

    # Treat infinities like missing values so the dropna below removes them.
    out = out.replace([np.inf, -np.inf], np.nan)
    out = out.dropna(subset=["date", "ticker", "open", "high", "low", "close"])
    out["volume"] = out["volume"].fillna(0.0)
    # Basic sanity filter on each bar.
    out = out[(out["high"] >= out["low"]) & (out["volume"] >= 0)]
    out = out.drop_duplicates(["date", "ticker"]).sort_values(["date", "ticker"])
    return out.reset_index(drop=True)


def build_panel_for_agent(prices_df: pd.DataFrame) -> pd.DataFrame:
    """Reshape normalized prices into the `$`-prefixed, multi-indexed panel.

    Args:
        prices_df: Table with columns `date`, `ticker`, `open`, `high`,
            `low`, `close`, `volume`. Any other columns are discarded.

    Returns:
        DataFrame indexed by (`datetime`, `instrument`), sorted by that index,
        with exactly the columns `$open`, `$high`, `$low`, `$close`, `$volume`.

    Raises:
        ValueError: If `prices_df` has no rows.
    """
    if prices_df.empty:
        raise ValueError("Prices input is empty after normalization")

    index_names = ["datetime", "instrument"]
    field_map = {
        "open": "$open",
        "high": "$high",
        "low": "$low",
        "close": "$close",
        "volume": "$volume",
    }

    renamed = prices_df.rename(columns={"date": "datetime", "ticker": "instrument", **field_map})
    indexed = renamed.set_index(index_names).sort_index()
    indexed.index = indexed.index.set_names(index_names)
    # Selecting by name fixes the column order and drops any extras.
    return indexed[list(field_map.values())]


def build_universe_mask_for_agent(universe_df: pd.DataFrame, panel: pd.DataFrame) -> pd.DataFrame:
    """Restrict the universe table to tickers that exist in the price panel.

    Args:
        universe_df: Table with columns `date`, `ticker`, `in_universe`.
        panel: Price panel whose index has an `instrument` level.

    Returns:
        The `date`, `ticker`, `in_universe` columns for tickers present in the
        panel, with tickers upper-cased, "." replaced by "-", and stripped.
        When any date is present, undated rows are dropped and the result is
        de-duplicated and sorted by (`date`, `ticker`); otherwise by `ticker`.
        The index is reset to 0..n-1.

    Raises:
        ValueError: If `universe_df` is empty, or no universe ticker appears
            in the panel.
    """
    if universe_df.empty:
        raise ValueError("Universe input is empty after normalization")

    mask = universe_df[["date", "ticker", "in_universe"]].copy()
    cleaned = mask["ticker"].astype(str).str.upper()
    cleaned = cleaned.str.replace(".", "-", regex=False)
    mask["ticker"] = cleaned.str.strip()

    instruments = panel.index.get_level_values("instrument")
    known = set(instruments.astype(str).str.upper())
    mask = mask[mask["ticker"].isin(known)].copy()
    if mask.empty:
        raise ValueError("No overlap between universe tickers and price panel tickers")

    dated = mask["date"].notna().any()
    keys = ["date", "ticker"] if dated else ["ticker"]
    if dated:
        # In a dated universe, rows without a date cannot be placed in time.
        mask = mask.dropna(subset=["date"])
    mask = mask.drop_duplicates(keys).sort_values(keys)

    return mask.reset_index(drop=True)


def panel_profile(panel: pd.DataFrame) -> dict[str, float | int | str]:
    dates = panel.index.get_level_values("datetime")
    instruments = panel.index.get_level_values("instrument")
    close = panel["$close"].astype(float)
    rets = close.groupby(level="instrument", sort=False).pct_change()
    dollar_vol = (panel["$close"] * panel["$volume"]).groupby(level="datetime").median()

    return {
        "rows": int(len(panel)),
        "n_dates": int(dates.nunique()),
        "n_tickers": int(instruments.nunique()),
        "start": str(dates.min().date()),
        "end": str(dates.max().date()),
        "median_abs_1d_return_pct": float(rets.abs().median() * 100.0),
        "median_daily_dollar_volume": float(dollar_vol.median()),
    }


def universe_profile(universe_mask: pd.DataFrame) -> dict[str, int | str]:
    has_dates = bool(universe_mask["date"].notna().any())
    snapshot_dates = int(universe_mask["date"].dropna().nunique()) if has_dates else 0
    return {
        "rows": int(len(universe_mask)),
        "n_tickers": int(universe_mask["ticker"].nunique()),
        "has_dates": str(has_dates),
        "snapshot_dates": snapshot_dates,
    }


In [None]:
# Resolve the two input files inside DATA_DIR (the default search directory).
universe_path = _find_data_file("universe")
prices_path = _find_data_file("prices")

print("universe_path:", universe_path)
print("prices_path:", prices_path)

# Read the raw tables as stored on disk.
universe_raw = _read_table(universe_path)
prices_raw = _read_table(prices_path)

# Normalize both inputs. The panel is built from ALL price rows; the universe
# table is only restricted to tickers that occur in the panel.
universe_df = standardize_universe(universe_raw)
prices_df = standardize_prices(prices_raw)
panel = build_panel_for_agent(prices_df)
universe_mask = build_universe_mask_for_agent(universe_df, panel)

# Headline statistics; `profile` is reused when the LLM prompt is assembled.
profile = panel_profile(panel)
univ = universe_profile(universe_mask)
print("panel_profile:", profile)
print("universe_profile:", univ)

# NOTE(review): `display` is not imported anywhere in this file; presumably it
# is the IPython/Jupyter built-in, so this cell needs a notebook kernel.
display(universe_mask.head(3))
display(prices_df.head(3))
display(panel.head(5))


In [None]:
# Fail fast, before any LLM step runs, when the API key is absent or empty.
if not os.environ.get("OPENAI_API_KEY"):
    raise EnvironmentError("OPENAI_API_KEY is not set. Export it before running LLM steps.")

# Model identifier passed to AgentAlphaWorkflow.
MODEL_NAME = "gpt-5-mini"

# One-paragraph description of the dataset, filled in from `profile`
# (the dict returned by panel_profile).
llm_data_context = (
    f"SPX panel from {profile['start']} to {profile['end']}; "
    f"{profile['n_tickers']} names, {profile['n_dates']} trading days, {profile['rows']} rows. "
    f"Median abs 1D return is {profile['median_abs_1d_return_pct']:.3f}%. "
    "Focus on robust, interpretable cross-sectional OHLCV factors."
)

# Final prompt: the task statement followed by the dataset description.
user_goal = (
    "Generate a robust SPX cross-sectional alpha hypothesis and express it as a compact factor blueprint. "
    + llm_data_context
)

print(user_goal)


In [None]:
# Configure the pipeline.
# NOTE(review): `periods` presumably lists the evaluation horizons and
# `max_attempts` the retry budget -- confirm against AgentAlphaWorkflow.
workflow = AgentAlphaWorkflow(
    model_name=MODEL_NAME,
    periods=(1, 5, 10),
    max_attempts=2,
)

# Run on the full price panel and hand over the universe table separately.
# NOTE(review): `max_attempts=2` is passed both to the constructor and to
# `run`; check whether one of the two is redundant.
state = workflow.run(
    user_goal=user_goal,
    panel=panel,
    max_attempts=2,
    universe_mask=universe_mask,
)


In [None]:
import json

# Report the outcome of the workflow run. Each section is printed as a label
# on its own line, preceded by a blank line, followed by the value.
# The labels use "\n" escapes: a raw line break inside a single-quoted string
# literal is a syntax error.
metrics = state.get("metrics", {})

print("error:", state.get("error"))
print("\nhypothesis:\n", state.get("hypothesis"))
print("\nrationale:\n", state.get("rationale"))
print("\nAST expression:\n", state.get("ast_expression"))
print("\nAST summary:\n", state.get("ast_summary"))
# Structured results are pretty-printed as JSON; ensure_ascii=False keeps any
# non-ASCII text readable.
print("\nevaluation_scope:\n", json.dumps(metrics.get("evaluation_scope", {}), indent=2, ensure_ascii=False))
print("\nmetrics:\n", json.dumps(metrics, indent=2, ensure_ascii=False))
print("\nblueprint:\n", json.dumps(state.get("blueprint_json", {}), indent=2, ensure_ascii=False))


In [None]:
# Pull the computed factor and the evaluation-scope counters out of the run state.
factor = state.get("factor")
metrics = state.get("metrics", {})
scope = metrics.get("evaluation_scope", {})

if factor is None:
    print("No factor returned. Check state['error'].")
else:
    # Non-null count over the whole factor, i.e. before any universe filtering.
    # NOTE(review): int(...) assumes `factor` is a Series, so that
    # notna().sum() is a scalar -- confirm the type the workflow returns.
    print("factor_non_null_all_tickers:", int(factor.notna().sum()))
    # Counters reported by the workflow for the rows actually evaluated;
    # printed as None when the scope dict lacks them.
    print("rows_in_evaluation_scope:", scope.get("rows_in_scope"))
    print("n_tickers_in_evaluation_scope:", scope.get("n_tickers_in_scope"))
    display(factor.dropna().head(10))
