# 4.4 与 Polymarket 概率对比（仅目标公司）

目标：基于 4.2/4.3 产出的 OOS 文本概率，与 Polymarket 的 Yes 隐含概率对照。

最终对齐结果包含：
- `ticker`
- `quarter`
- `y_true`
- `p_text`
- `p_poly`

注意：必须是同一批 OOS 样本（`ticker + quarter` 对齐后交集）。

In [9]:
# ========== 配置 ==========

from pathlib import Path
import json
import time
import sqlite3
from datetime import datetime, timezone

import numpy as np
import pandas as pd
import requests
import matplotlib.pyplot as plt
from sklearn.metrics import log_loss, brier_score_loss
from sklearn.calibration import calibration_curve

# Project root is the parent of the current working directory; modelling
# artefacts live under <root>/data/modeling (created here if missing).
PROJECT_ROOT = Path("..").resolve()
MODEL_DIR = PROJECT_ROOT / "data" / "modeling"
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Outputs of 4.2/4.3, listed in priority order: the first path that exists on
# disk is used (best_oos_probabilities_run_012.csv first, then the fallbacks).
TEXT_OOS_CANDIDATES = [
    MODEL_DIR / "best_oos_probabilities_run_012.csv",
    MODEL_DIR / "oos_probabilities.csv",
    MODEL_DIR / "oos_probabilities_logit_l2.csv",
    MODEL_DIR / "oos_probabilities_logit_l1.csv",
]

# Pick the first existing candidate; abort early when none is available.
TEXT_OOS_CSV = next((p for p in TEXT_OOS_CANDIDATES if p.exists()), None)
if TEXT_OOS_CSV is None:
    raise FileNotFoundError("未找到 OOS 概率文件，请先运行 4.2/4.3")

# Target companies: None means "do not filter" (recommended, so that JPM is
# not dropped by accident).
TARGET_TICKERS = None

# p_poly data source: "gamma" / "manual" / "url_map".
# NOTE(review): in the cells visible here this value is only printed — confirm
# where (if anywhere) it is branched on.
POLY_SOURCE = "gamma"

# File for manual mode (required columns: ticker, quarter, p_poly).
POLY_DIRECT_CSV = MODEL_DIR / "polymarket_probabilities_manual.csv"

# File for URL-mapping mode (required columns: ticker, quarter, poly_api_url).
POLY_MAP_CSV = MODEL_DIR / "polymarket_market_map.csv"

# Settings for the automatic Gamma mode.
EARNINGS_DB = PROJECT_ROOT / "data" / "earnings_calls.db"
GAMMA_MARKETS_URL = "https://gamma-api.polymarket.com/markets"
GAMMA_EVENTS_URL = "https://gamma-api.polymarket.com/events"
# Price-history endpoints, tried in this order until one returns data.
GAMMA_HISTORY_URLS = [
    "https://clob.polymarket.com/prices-history",
    "https://data-api.polymarket.com/prices-history",
]

# Prefer prices within 14 days before the announcement; relax if that fails.
# NOTE(review): LOOKBACK_DAYS is not referenced by the cells visible here.
LOOKBACK_DAYS = 14
# Per-request timeout (seconds) passed to requests.get.
REQUEST_TIMEOUT = 20
# Pause (seconds) between consecutive ticker/quarter lookups.
SLEEP_SECONDS = 0.2

# Output artefacts.
# NOTE(review): OUT_GAP_CSV is declared but never written in the visible cells.
OUT_MERGED_CSV = MODEL_DIR / "oos_text_vs_poly_target_tickers.csv"
OUT_METRICS_JSON = MODEL_DIR / "oos_text_vs_poly_target_tickers_metrics.json"
OUT_GAP_CSV = MODEL_DIR / "oos_text_vs_poly_mapping_gaps.csv"
OUT_GAMMA_MAP_CSV = MODEL_DIR / "gamma_market_mapping.csv"

# Echo the effective configuration so the run is reproducible from the log.
print("TEXT_OOS_CSV:", TEXT_OOS_CSV)
print("POLY_SOURCE:", POLY_SOURCE)
print("EARNINGS_DB:", EARNINGS_DB)
print("TARGET_TICKERS:", TARGET_TICKERS)

TEXT_OOS_CSV: /Users/xinyuewang/Desktop/1.27/data/modeling/best_oos_probabilities_run_012.csv
POLY_SOURCE: gamma
EARNINGS_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls.db
TARGET_TICKERS: None


In [12]:
# ========== Gamma 自动：market id -> price history -> announcement 前最后价格 ==========
import re


# 1) Helper functions for normalising keys; the text probabilities (ticker, quarter, y_true, p_text) are loaded right after them
def normalize_quarter(q):
    """Bring a quarter label into the ``YYYY-Qn`` spelling where possible.

    The value is stringified, trimmed, upper-cased, underscores become
    hyphens and spaces are removed. A label that already contains ``-Q`` is
    returned as cleaned; a label of at least six characters that starts with
    four digits and contains a ``Q`` is rewritten as ``<year>-Q<rest after Q>``.
    Anything else is returned in its cleaned form.

    Returns:
        The normalised string, or ``None`` when ``q`` is missing (``pd.isna``).
    """
    if pd.isna(q):
        return None

    label = str(q).strip().upper().replace("_", "-").replace(" ", "")
    year = label[:4]
    q_pos = label.find("Q")

    has_separator = "-Q" in label
    is_compact = len(label) >= 6 and year.isdigit() and q_pos != -1
    if has_separator or not is_compact:
        return label
    return year + "-Q" + label[q_pos + 1:]


def to_utc_dt(x):
    """Parse ``x`` into a timezone-aware UTC timestamp.

    Missing input yields ``pd.NaT``. Otherwise the value is stringified,
    trimmed, every ``Z`` is replaced by ``+00:00`` and the result is handed to
    ``pd.to_datetime`` with ``utc=True``; unparseable text is coerced to NaT.
    """
    if not pd.isna(x):
        iso_text = str(x).strip().replace("Z", "+00:00")
        return pd.to_datetime(iso_text, utc=True, errors="coerce")
    return pd.NaT


def quarter_idx(q):
    """Map a quarter label to a sortable integer ``year * 4 + quarter``.

    Hyphens are stripped before matching, so ``2023-Q1`` and ``2023Q1`` are
    equivalent. Labels that do not match ``YYYYQn`` with n in 1..4 (including
    lower-case ``q``) yield ``np.nan``.
    """
    compact = str(q).replace("-", "")
    parsed = re.match(r"^(\d{4})Q([1-4])$", compact)
    if parsed is None:
        return np.nan
    year, quarter_no = (int(part) for part in parsed.groups())
    return year * 4 + quarter_no


# Load the OOS text-model probabilities and harmonise legacy column names
# ("actual" -> "y_true", "fiscal_quarter" -> "quarter") without clobbering a
# column that already carries the canonical name.
df_text = pd.read_csv(TEXT_OOS_CSV)
if "actual" in df_text.columns and "y_true" not in df_text.columns:
    df_text = df_text.rename(columns={"actual": "y_true"})
if "fiscal_quarter" in df_text.columns and "quarter" not in df_text.columns:
    df_text = df_text.rename(columns={"fiscal_quarter": "quarter"})

need_cols = ["ticker", "quarter", "y_true", "p_text"]
missing = [c for c in need_cols if c not in df_text.columns]
if missing:
    raise ValueError(f"OOS 文件缺少列: {missing}")

df_text = df_text[need_cols].copy()
# Drop rows without a ticker BEFORE casting to str: astype(str) turns a missing
# value into the literal text "nan" (-> "NAN"), which the dropna further down
# can no longer recognise as missing.
df_text = df_text[df_text["ticker"].notna()].copy()
df_text["ticker"] = df_text["ticker"].astype(str).str.upper().str.strip()
df_text["quarter"] = df_text["quarter"].apply(normalize_quarter)
df_text["y_true"] = pd.to_numeric(df_text["y_true"], errors="coerce")
df_text["p_text"] = pd.to_numeric(df_text["p_text"], errors="coerce")

# Keep only the OOS period (2023-Q1 onwards). Quarters that quarter_idx cannot
# parse map to NaN, compare False against the threshold and are dropped too.
df_text = df_text[df_text["quarter"].map(quarter_idx) >= (2023 * 4 + 1)].copy()
if TARGET_TICKERS is not None:
    target = [t.upper().strip() for t in TARGET_TICKERS]
    df_text = df_text[df_text["ticker"].isin(target)].copy()

# One row per (ticker, quarter); the first occurrence wins.
df_text = df_text.dropna(subset=["ticker", "quarter", "y_true", "p_text"]).drop_duplicates(["ticker", "quarter"])

print("text rows:", len(df_text))


# 2) 找 announcement 时间（来自 earnings_calls.db 的 segments.timestamp 最早值）
def get_announcement_map(pairs):
    """Attach an ``announcement_time`` column to ``pairs``.

    The announcement time of a (ticker, quarter) pair is taken to be the
    earliest non-empty ``segments.timestamp`` stored for it in ``EARNINGS_DB``.

    Args:
        pairs: DataFrame with at least ``ticker`` and ``quarter`` columns.

    Returns:
        A new DataFrame: ``pairs`` left-joined with ``announcement_time``.
        Pairs without a usable timestamp get NaT; when the database file does
        not exist, every row gets NaT.
    """
    if not EARNINGS_DB.exists():
        out = pairs.copy()
        out["announcement_time"] = pd.NaT
        return out

    conn = sqlite3.connect(EARNINGS_DB)
    try:
        seg = pd.read_sql_query(
            "SELECT ticker, quarter, timestamp FROM segments WHERE timestamp IS NOT NULL AND timestamp <> ''",
            conn,
        )
    finally:
        # Release the connection even when the query fails (e.g. missing table).
        conn.close()

    # Normalise the join keys the same way the text-probability table does.
    seg["ticker"] = seg["ticker"].astype(str).str.upper().str.strip()
    seg["quarter"] = seg["quarter"].apply(normalize_quarter)
    seg["timestamp"] = seg["timestamp"].apply(to_utc_dt)
    seg = seg.dropna(subset=["ticker", "quarter", "timestamp"])

    ann = seg.groupby(["ticker", "quarter"], as_index=False)["timestamp"].min()
    ann = ann.rename(columns={"timestamp": "announcement_time"})

    return pairs.merge(ann, on=["ticker", "quarter"], how="left")


pairs = df_text[["ticker", "quarter"]].drop_duplicates().copy()
pairs = get_announcement_map(pairs)
print("pairs:", len(pairs), "missing announcement_time:", int(pairs["announcement_time"].isna().sum()))


# 3) Gamma: 找 market id
def fetch_json(url, params=None):
    """GET ``url`` and return the decoded JSON body, or ``None`` on failure.

    This is a best-effort helper: transport errors, timeouts, non-2xx statuses
    and bodies that are not valid JSON all yield ``None`` so callers can fall
    through to the next endpoint. Only those expected failure modes are
    caught; programming errors are no longer silently hidden.

    Args:
        url: Endpoint to query.
        params: Optional query-string parameters.
    """
    try:
        r = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        return r.json()
    except (requests.RequestException, ValueError):
        # RequestException: connection/timeout/HTTP-status problems.
        # ValueError: body could not be decoded as JSON.
        return None


def score_market(m, ticker, quarter):
    """Heuristically score how well market ``m`` matches an earnings pair.

    The lower-cased question (or title) and slug are searched for:
      * the ticker as a standalone token ............. +3
      * the word "earnings" .......................... +2
      * "beat" or "miss" ............................. +2
      * the quarter, with or without its hyphen ...... +1

    Args:
        m: Market dict; ``question``/``title``/``slug`` are read if present.
        ticker: Company ticker (case-insensitive).
        quarter: Quarter label such as ``2024-Q1`` (case-insensitive).

    Returns:
        Integer score in the range 0..8.
    """
    question = str(m.get("question", "") or m.get("title", "") or "").lower()
    slug = str(m.get("slug", "") or "").lower()
    text = question + " " + slug

    ticker_l = ticker.lower()
    quarter_l = quarter.lower()

    # Require the ticker to stand alone (not flanked by letters/digits). A bare
    # substring test lets short tickers such as "T" or "MS" match nearly every
    # question ("Tesla", "teams", ...). Hyphens in slugs count as separators.
    ticker_hit = bool(ticker_l) and re.search(
        rf"(?<![a-z0-9]){re.escape(ticker_l)}(?![a-z0-9])", text
    ) is not None

    score = 0
    if ticker_hit:
        score += 3
    if "earnings" in text:
        score += 2
    if "beat" in text or "miss" in text:
        score += 2
    if quarter_l in text or quarter_l.replace("-", "") in text:
        score += 1
    return score


def find_gamma_market_id(ticker, quarter, min_score=1):
    """Look up the best-matching Polymarket market for a ticker/quarter pair.

    Candidates are collected from the markets endpoint with three search
    phrases; only if that yields nothing is the events endpoint queried and
    its nested ``markets`` used. Each candidate is ranked with
    ``score_market`` and the highest-scoring one wins (first seen on ties).

    Args:
        ticker: Company ticker.
        quarter: Quarter label such as ``2024-Q1``.
        min_score: Lowest score a candidate must reach to be accepted. The
            default of 1 rejects candidates with no relevance signal at all,
            which were previously returned as if they were a match.

    Returns:
        ``(market_id, question)`` as strings, or ``(None, None)`` when no
        acceptable candidate exists.
    """
    # NOTE(review): whether these endpoints honour the "search" parameter
    # cannot be verified from this file — confirm against the API docs.
    cands = []
    for query in (f"{ticker} earnings", f"{ticker} beat miss", ticker):
        payload = fetch_json(GAMMA_MARKETS_URL, params={"search": query, "limit": 200})
        if isinstance(payload, list):
            cands.extend(payload)

    if not cands:
        payload = fetch_json(GAMMA_EVENTS_URL, params={"search": f"{ticker} earnings", "limit": 200})
        if isinstance(payload, list):
            for event in payload:
                if isinstance(event, dict):
                    cands.extend(event.get("markets") or [])

    best = None  # (score, market_id, question)
    for m in cands:
        if not isinstance(m, dict):
            continue  # malformed entry; nothing to score
        mid = m.get("id") or m.get("marketId") or m.get("conditionId")
        if mid is None:
            continue
        sc = score_market(m, ticker, quarter)
        if sc < min_score:
            continue
        # Strict ">" keeps the earliest candidate among equal scores.
        if best is None or sc > best[0]:
            best = (sc, str(mid), str(m.get("question") or m.get("title") or ""))

    if best is None:
        return None, None
    return best[1], best[2]


# 4) Gamma: 抓 price history，并取 announcement 前最后一个价格

def parse_history(payload):
    """Turn a price-history payload into a time-sorted list of points.

    Accepts either a list of records or a dict that holds such a list under
    the first of ``history`` / ``prices`` / ``data`` whose value is a list.
    Each record must be a dict with a time (``t``, ``timestamp`` or ``time``,
    read as integer seconds since the epoch) and a price (``p`` or ``price``).
    Records that are malformed or fail conversion are skipped.

    Returns:
        List of ``(UTC timestamp, float price)`` tuples in ascending time
        order; empty for ``None`` or any unsupported payload.
    """
    if isinstance(payload, list):
        records = payload
    elif isinstance(payload, dict):
        records = next(
            (
                payload[key]
                for key in ("history", "prices", "data")
                if isinstance(payload.get(key), list)
            ),
            [],
        )
    else:
        # None and every other payload type carry no usable records.
        records = []

    points = []
    for rec in records:
        if not isinstance(rec, dict):
            continue
        stamp = rec.get("t") or rec.get("timestamp") or rec.get("time")
        price = rec.get("p")
        if price is None:
            price = rec.get("price")
        if stamp is None or price is None:
            continue
        try:
            when = pd.to_datetime(int(stamp), unit="s", utc=True)
            points.append((when, float(price)))
        except Exception:
            continue
    return sorted(points, key=lambda point: point[0])


def fetch_market_history(market_id):
    """Return the parsed price history for ``market_id``.

    Every endpoint in ``GAMMA_HISTORY_URLS`` is queried in order with
    ``interval="max"`` and ``fidelity=1``; the first non-empty parsed history
    is returned. If no endpoint yields data the result is an empty list.
    """
    # NOTE(review): market_id is sent as the "market" parameter — confirm that
    # these endpoints accept the id type produced by find_gamma_market_id.
    query = {"market": market_id, "interval": "max", "fidelity": 1}
    for endpoint in GAMMA_HISTORY_URLS:
        points = parse_history(fetch_json(endpoint, params=dict(query)))
        if points:
            return points
    return []


# For every (ticker, quarter) pair: locate a market, pull its price history and
# keep the last price quoted at or before the announcement as p_poly.
rows = []
map_rows = []
for pair in pairs.itertuples(index=False):
    ticker = pair.ticker
    quarter = pair.quarter
    ann = pair.announcement_time

    market_id, question = find_gamma_market_id(ticker, quarter)
    p_poly = np.nan
    price_ts = pd.NaT

    # Without an announcement time there is no way to tell pre-announcement
    # prices from post-resolution ones. Falling back to the last price of the
    # whole history would leak the outcome into p_poly, so the pair is left
    # unmatched instead (it still shows up in the mapping CSV for inspection).
    if market_id is not None and pd.notna(ann):
        hist = fetch_market_history(market_id)
        # hist is sorted ascending, so the last kept element is the most
        # recent quote at or before the announcement.
        pre = [(t, p) for (t, p) in hist if t <= ann]
        if pre:
            price_ts, p_poly = pre[-1]

    rows.append({"ticker": ticker, "quarter": quarter, "p_poly": p_poly})
    map_rows.append({
        "ticker": ticker,
        "quarter": quarter,
        "announcement_time": ann,
        "market_id": market_id,
        "market_question": question,
        "price_ts": price_ts,
        "p_poly": p_poly,
    })

    # Be polite to the API between pairs.
    time.sleep(SLEEP_SECONDS)


# Explicit columns keep the frames well-formed even when `pairs` is empty.
df_poly = pd.DataFrame(rows, columns=["ticker", "quarter", "p_poly"])
df_map = pd.DataFrame(
    map_rows,
    columns=[
        "ticker",
        "quarter",
        "announcement_time",
        "market_id",
        "market_question",
        "price_ts",
        "p_poly",
    ],
)
df_map.to_csv(OUT_GAMMA_MAP_CSV, index=False)
print("gamma map saved:", OUT_GAMMA_MAP_CSV)
print("non-null p_poly:", int(df_poly["p_poly"].notna().sum()), "/", len(df_poly))


# 5) Merge and export the final comparison table
#    (ticker, quarter, y_true, p_text, p_poly).
# The inner join plus dropna keeps exactly the samples for which BOTH
# probabilities exist, so both models are scored on the same OOS set.
df_cmp = df_text.merge(df_poly, on=["ticker", "quarter"], how="inner")
df_cmp = df_cmp.dropna(subset=["y_true", "p_text", "p_poly"]).copy()

print("comparable rows:", len(df_cmp))
if len(df_cmp) == 0:
    raise RuntimeError("Gamma 未匹配到可用 p_poly，请检查 OUT_GAMMA_MAP_CSV 的 market_id 与 price_ts")

out_cols = ["ticker", "quarter", "y_true", "p_text", "p_poly"]
df_cmp[out_cols].to_csv(OUT_MERGED_CSV, index=False)

y = df_cmp["y_true"].astype(int).values
p_text = df_cmp["p_text"].astype(float).values
p_poly = df_cmp["p_poly"].astype(float).values

# Pass the label set explicitly: with a small matched sample y may contain a
# single class, in which case log_loss cannot infer the labels and raises.
binary_labels = [0, 1]
metrics = {
    "n_samples": int(len(df_cmp)),
    "logloss_text": float(log_loss(y, p_text, labels=binary_labels)),
    "logloss_poly": float(log_loss(y, p_poly, labels=binary_labels)),
    "brier_text": float(brier_score_loss(y, p_text)),
    "brier_poly": float(brier_score_loss(y, p_poly)),
}
with open(OUT_METRICS_JSON, "w", encoding="utf-8") as f:
    json.dump(metrics, f, ensure_ascii=False, indent=2)

print("saved:", OUT_MERGED_CSV)
print("saved:", OUT_METRICS_JSON)
print("saved:", OUT_GAMMA_MAP_CSV)
display(df_cmp[out_cols].head(20))


text rows: 307
pairs: 307 missing announcement_time: 22


KeyboardInterrupt: 

In [23]:
import requests

BASE = "https://gamma-api.polymarket.com"

def search_ebay_markets():
    """Fetch closed markets and keep those that look like eBay "beat" markets.

    A single request is sent to ``{BASE}/markets`` with ``limit=1000`` and
    ``closed="true"``. A market is kept when its question mentions both
    "ebay" and "beat", compared case-insensitively so that "eBay", "EBAY"
    and "Ebay" are treated alike.

    Returns:
        List of matching market dicts; empty when the response is not a list.

    Raises:
        requests.HTTPError: If the endpoint answers with a non-2xx status.
    """
    # NOTE(review): only one page is requested — confirm whether the API caps
    # "limit" below 1000, in which case older markets are never inspected.
    url = f"{BASE}/markets"
    params = {
        "limit": 1000,
        "closed": "true"
    }

    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    markets = r.json()
    if not isinstance(markets, list):
        return []

    # Keep only eBay earnings-beat markets.
    ebay_markets = []
    for m in markets:
        if not isinstance(m, dict):
            continue
        question = str(m.get("question") or "").lower()
        if "ebay" in question and "beat" in question:
            ebay_markets.append(m)

    return ebay_markets


# Run the search once and keep the matches for interactive inspection.
data = search_ebay_markets()

# Preview the slug and end date of at most the first five matches.
for m in data[:5]:
    print(m["slug"], m["endDate"])



