In [None]:
import re
import time
import random
from io import StringIO
from typing import Optional, Tuple, List, Dict

import requests
import pandas as pd


# Symbols to scrape; each one is substituted into the per-ticker
# earnings-history URL by fetch_earnings_history_from_alphaquery().
# NOTE(review): the recorded run output lists 'SQ' as a failed ticker --
# confirm the symbol is still served by the source site.
TICKERS = [
    "JPM","BK","DAL","C","WFC","BAC","TSM","MS","GS","BLK","MTB","STT","PNC","BR",
    "AAPL","MSFT","GOOGL","AMZN","NVDA","META","TSLA","NFLX","AMD","AVGO","INTC","QCOM",
    "ARM","ASML","AMAT","SMCI","PLTR","CRM","SNOW","DDOG","PANW","NET","SHOP","NOW","ADBE",
    "COIN","MSTR","PYPL","V","MA","HOOD","SQ","WMT","COST","TGT","SBUX","DIS","BABA","PDD",
    "KO","PEP","UBER","ABNB","FDX","UPS","BA","F","GEV","GME","PFE"
]

# Reporting window as 'YYYYQn' labels; both ends are included when rows are
# filtered (the filter uses inclusive="both").
START_Q = "2023Q1"
END_Q   = "2025Q4"


def quarter_from_fq_end(dt: pd.Timestamp) -> Optional[str]:
    """Label a fiscal-quarter end date with its calendar quarter.

    The month is bucketed into calendar quarters (Jan-Mar -> Q1, Apr-Jun -> Q2,
    Jul-Sep -> Q3, Oct-Dec -> Q4), so any month is accepted, not only the
    usual quarter-end months 3/6/9/12.

    Args:
        dt: Fiscal quarter end date; may be NaT/NaN/None.

    Returns:
        A 'YYYYQn' string, or None when ``dt`` is missing.
    """
    if pd.isna(dt):
        return None
    quarter = (int(dt.month) - 1) // 3 + 1
    return f"{dt.year}Q{quarter}"


def q_to_index(q: str) -> int:
    """Turn a 'YYYYQn' label into an integer that orders quarters in time.

    Surrounding whitespace is ignored. Consecutive quarters map to
    consecutive integers (year * 4 + zero-based quarter).

    Raises:
        ValueError: if the label is not four digits, 'Q', and a digit 1-4.
    """
    label = q.strip()
    parsed = re.fullmatch(r"(\d{4})Q([1-4])", label)
    if parsed is None:
        raise ValueError(f"Bad quarter format: {q}")
    year_txt, quarter_txt = parsed.groups()
    return int(year_txt) * 4 + int(quarter_txt) - 1


# Window bounds converted to sortable quarter indices, so rows can be
# filtered with a plain numeric range check.
START_I = q_to_index(START_Q)
END_I   = q_to_index(END_Q)


def _clean_money(x) -> Optional[float]:
    """'$4.87' -> 4.87 ; handles '-', '--', None."""
    if pd.isna(x):
        return None
    s = str(x).strip()
    if s in {"", "-", "--", "None", "nan"}:
        return None
    s = s.replace("$", "").replace(",", "")
    try:
        return float(s)
    except:
        return None


def _match_earnings_columns(columns) -> Dict[str, str]:
    """Map raw table headers to canonical column names by keyword matching.

    Only the first header that matches a given canonical name is mapped, so
    renaming with the result can never create duplicate column names.
    """
    mapping: Dict[str, str] = {}
    taken = set()
    for raw in columns:
        low = raw.lower()
        if "announcement" in low and "date" in low:
            target = "announcement_date"
        elif "fiscal" in low and ("end" in low or "quarter" in low):
            target = "fiscal_quarter_end"
        elif "estimated" in low and "eps" in low:
            target = "estimated_eps"
        elif "actual" in low and "eps" in low:
            target = "actual_eps"
        else:
            continue
        if target not in taken:
            taken.add(target)
            mapping[raw] = target
    return mapping


def fetch_earnings_history_from_alphaquery(ticker: str, session: requests.Session) -> Optional[pd.DataFrame]:
    """Download and clean the earnings-history table for one ticker.

    The page is fetched through ``session`` and parsed with ``pd.read_html``.
    The first table whose headers contain an announcement date, a fiscal
    quarter end, an estimated EPS and an actual EPS column is used.

    Args:
        ticker: Symbol substituted into the earnings-history URL.
        session: Session used for the HTTP GET.

    Returns:
        A DataFrame restricted to fiscal quarters START_Q..END_Q (inclusive)
        with the canonical columns plus ``ticker``, ``fiscal_quarter``,
        ``fiscal_q_index``, ``surprise`` and ``result`` ('beat' / 'miss' /
        'meet' / None). The frame may be empty when no row falls inside the
        window. None is returned when the response status is not 200, when the
        page contains no HTML table, or when no table has the expected columns.

    Raises:
        Whatever ``session.get`` raises (e.g. connection errors or timeouts);
        these are not caught here.
    """
    url = f"https://www.alphaquery.com/stock/{ticker}/earnings-history"
    headers = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120 Safari/537.36"
    }
    r = session.get(url, headers=headers, timeout=30)
    if r.status_code != 200:
        return None

    # A page showing the subscription notice ("Please subscribe to continue
    # using AlphaQuery" / "Your free access has expired") sometimes still
    # embeds the table, so there is no early exit on it: always try to parse.
    try:
        tables = pd.read_html(StringIO(r.text))
    except ValueError:
        # read_html raises ValueError when the page has no tables.
        return None

    # Expected columns: Announcement Date, Fiscal Quarter End, Estimated EPS,
    # Actual EPS. Header names are matched fuzzily to tolerate small changes.
    need = {"announcement_date", "fiscal_quarter_end", "estimated_eps", "actual_eps"}

    # The earnings table is usually the first one, but scan all of them so an
    # extra table placed before it does not make the whole ticker fail.
    df = None
    for candidate in tables:
        names = [str(c).strip() for c in candidate.columns]
        col_map = _match_earnings_columns(names)
        if need.issubset(col_map.values()):
            df = candidate.copy()
            df.columns = names
            df = df.rename(columns=col_map)
            break

    if df is None:
        return None

    df["ticker"] = ticker
    df["announcement_date"] = pd.to_datetime(df["announcement_date"], errors="coerce")
    df["fiscal_quarter_end"] = pd.to_datetime(df["fiscal_quarter_end"], errors="coerce")
    df["estimated_eps"] = df["estimated_eps"].apply(_clean_money)
    df["actual_eps"] = df["actual_eps"].apply(_clean_money)

    df["fiscal_quarter"] = df["fiscal_quarter_end"].apply(quarter_from_fq_end)
    # to_numeric keeps the column numeric even when every date failed to parse
    # (an all-None object column would make the range comparison below raise).
    df["fiscal_q_index"] = pd.to_numeric(
        df["fiscal_quarter"].apply(lambda x: q_to_index(x) if isinstance(x, str) else None),
        errors="coerce",
    )

    # Keep only rows inside the START_Q..END_Q window; rows without a usable
    # quarter compare as False and are dropped.
    df = df[df["fiscal_q_index"].between(START_I, END_I, inclusive="both")].copy()

    # Classify each report as beat / miss / meet from the EPS surprise.
    df["surprise"] = df["actual_eps"] - df["estimated_eps"]
    df["result"] = df["surprise"].apply(
        lambda s: None if pd.isna(s) else ("beat" if s > 0 else ("miss" if s < 0 else "meet"))
    )

    return df


def build_summary(df_all: pd.DataFrame) -> pd.DataFrame:
    """Count beat / miss / meet reports per ticker and derive share columns.

    Args:
        df_all: Detail rows with at least ``ticker`` and ``result`` columns.
            Rows whose ``result`` is missing are counted in a separate NaN
            column and excluded from every total.

    Returns:
        One row per ticker, sorted by ticker, with the counts plus:
        ``total_reports`` (beat + miss + meet), ``beat_miss_total``
        (beat + miss), ``beat_share`` / ``miss_share`` (relative to
        beat + miss, meets ignored) and ``beat_share_all`` (beat relative to
        all classified reports). A share is NaN when its denominator is zero.
        An empty input yields an empty frame with the standard columns.
    """
    if df_all.empty:
        # Nothing was scraped: return an empty, correctly-shaped frame instead
        # of failing on the missing 'ticker' / 'result' columns.
        return pd.DataFrame(columns=[
            "ticker", "beat", "miss", "meet",
            "total_reports", "beat_miss_total",
            "beat_share", "miss_share", "beat_share_all",
        ])

    g = df_all.groupby(["ticker", "result"], dropna=False).size().unstack(fill_value=0)

    # Guarantee all three outcome columns exist even if an outcome never occurred.
    for c in ["beat", "miss", "meet"]:
        if c not in g.columns:
            g[c] = 0

    g["total_reports"] = g["beat"] + g["miss"] + g["meet"]
    g["beat_miss_total"] = g["beat"] + g["miss"]

    # Zero denominators are masked to NaN so the divisions give NaN, not inf.
    beat_miss = g["beat_miss_total"].where(g["beat_miss_total"] > 0)
    total = g["total_reports"].where(g["total_reports"] > 0)

    # Shares among beat + miss only (meets ignored).
    g["beat_share"] = g["beat"] / beat_miss
    g["miss_share"] = g["miss"] / beat_miss

    # Share of beats among all classified reports, meets included.
    g["beat_share_all"] = g["beat"] / total

    return g.reset_index().sort_values("ticker")


# ====== main run ======
# Scrape every ticker sequentially (best effort), write the raw detail rows
# to CSV, then write the per-ticker beat/miss summary to a second CSV.
all_rows = []
failed = []

# The context manager closes the session's pooled connections when done.
with requests.Session() as session:
    for i, t in enumerate(TICKERS, 1):
        try:
            df_t = fetch_earnings_history_from_alphaquery(t, session=session)
        except Exception as e:
            # One bad ticker must not stop the run, but report why it failed
            # instead of swallowing the error silently.
            print(f"[{i}/{len(TICKERS)}] {t}: fetch failed: {e!r}")
            failed.append(t)
        else:
            if df_t is None or df_t.empty:
                failed.append(t)
            else:
                all_rows.append(df_t)

        # Important: pace the requests (going too fast triggers rate limiting /
        # the subscription wall sooner). No need to wait after the last ticker.
        if i < len(TICKERS):
            time.sleep(random.uniform(1.0, 2.2))

df_all = pd.concat(all_rows, ignore_index=True) if all_rows else pd.DataFrame()

print("rows:", len(df_all))
print("failed tickers:", failed)

# Export the raw detail rows.
df_all.to_csv("alphaquery_earnings_history_2023Q1_2025Q4_raw.csv", index=False)

# Summary: beat/miss shares per ticker.
if df_all.empty:
    # Nothing was scraped, so there is nothing to summarise or export.
    print("no rows scraped; summary not written")
    summary = pd.DataFrame()
else:
    summary = build_summary(df_all)
    # Add beat_ratio BEFORE exporting so the CSV actually contains the column.
    # NOTE(review): the recorded run ended with KeyError: 'beat_ratio', which is
    # consistent with a consumer of the summary not finding it -- confirm.
    summary["beat_ratio"] = summary["beat"] / (summary["beat"] + summary["miss"])
    summary.to_csv("alphaquery_earnings_beat_miss_summary_2023Q1_2025Q4.csv", index=False)

summary.head(10)

rows: 747
failed tickers: ['SQ']


result,ticker,beat,meet,miss,NaN,total_reports,beat_miss_total,beat_share,miss_share,beat_share_all
0,AAPL,12,0,0,0,12,12,1.0,0.0,1.0
1,ABNB,7,0,5,0,12,12,0.583333,0.416667,0.583333
2,ADBE,10,1,1,0,12,11,0.909091,0.090909,0.833333
3,AMAT,12,0,0,0,12,12,1.0,0.0,1.0
4,AMD,8,3,1,0,12,9,0.888889,0.111111,0.666667
5,AMZN,11,0,1,0,12,12,0.916667,0.083333,0.916667
6,ARM,6,3,1,0,10,7,0.857143,0.142857,0.6
7,ASML,10,0,2,0,12,12,0.833333,0.166667,0.833333
8,AVGO,9,0,3,0,12,12,0.75,0.25,0.75
9,BA,5,0,7,0,12,12,0.416667,0.583333,0.416667


KeyError: 'beat_ratio'