# 3.6 从 yfinance 获取 EPS Beat/Miss 并写入 features DB

目标：
- 使用 `yfinance` 抓取 earnings history（保留 `EPS Estimate` / `Reported EPS` / `Earnings Date`）
- 生成 `eps_beat` 标签（下个季度 Actual EPS > Estimated EPS 则为 1，否则 0）
- 将标签合并到 `earnings_calls_features.db` 的 `segments_features`（按 `ticker + quarter`）

说明：
- 标签定义采用 one-quarter-ahead：当前季度 call 的标签来自“下一季度”的 beat/miss
- 若某个 ticker/quarter 在 yfinance 返回的数据中找不到下一季度 EPS，则标签为缺失
- 仅追加标签列，不删除原有特征列

In [1]:
# ========== 配置 ==========

import time
import sqlite3
from pathlib import Path

import pandas as pd

# Tunables: pause between yfinance requests and whether CSV snapshots are written.
SAVE_LABELS_CSV = True
SAVE_EARNINGS_CSV = True
REQUEST_SLEEP_SECONDS = 1.0

# Locations, all anchored at the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
LABEL_DIR = PROJECT_ROOT.joinpath("data", "labels")
FEATURE_DB = PROJECT_ROOT.joinpath("data", "earnings_calls_features.db")

# Create the label output folder up front (no error if it already exists).
LABEL_DIR.mkdir(exist_ok=True, parents=True)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("FEATURE_DB:", FEATURE_DB)
print("LABEL_DIR:", LABEL_DIR)

PROJECT_ROOT: /Users/xinyuewang/Desktop/1.27
FEATURE_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db
LABEL_DIR: /Users/xinyuewang/Desktop/1.27/data/labels


In [2]:
# ========== 1. yfinance 抓取函数 ==========

def format_quarter(dt_val):
    """Convert a date-like value into a ``YYYY-Qn`` quarter label.

    Returns ``None`` when the value cannot be converted; any exception raised
    while parsing or converting to a quarterly period is swallowed.
    """
    try:
        quarter_period = pd.to_datetime(dt_val).to_period("Q")
        year, quarter = quarter_period.year, quarter_period.quarter
        return f"{year}-Q{quarter}"
    except Exception:
        return None


def fetch_earnings_history_from_yfinance(ticker: str, limit: int = 80):
    """Download one ticker's earnings dates via yfinance and normalise them.

    Args:
        ticker: Symbol handed to ``yf.Ticker``.
        limit: Forwarded to ``Ticker.get_earnings_dates(limit=...)``.

    Returns:
        A DataFrame sorted by ``Earnings Date`` with the columns
        ``Earnings Date``, ``EPS Estimate``, ``Reported EPS``,
        ``fiscal_quarter_end``, ``estimated_eps``, ``actual_eps``,
        ``fiscal_quarter`` and ``ticker``. ``None`` is returned when yfinance
        cannot be imported, the request raises, no rows come back, or one of
        the expected columns is absent.
    """
    print(f"抓取 {ticker} (yfinance)")

    # Imported lazily so a missing package is reported instead of raising.
    try:
        import yfinance as yf
    except Exception as import_err:
        print("❌ 未安装 yfinance，请先: pip install yfinance")
        print("错误:", import_err)
        return None

    try:
        raw = yf.Ticker(ticker).get_earnings_dates(limit=limit)
    except Exception as fetch_err:
        print(f"❌ {ticker}: yfinance 抓取失败 -> {fetch_err}")
        return None

    if raw is None or len(raw) == 0:
        print(f"⚠️ {ticker}: 无 earnings dates")
        return None

    frame = raw.reset_index().copy()

    def pick(candidates):
        """Return the first candidate column name present in ``frame``."""
        for name in candidates:
            if name in frame.columns:
                return name
        return None

    # Column names have differed between yfinance versions; accept any variant.
    date_col = pick(["Earnings Date", "EarningsDate", "Date"])
    est_col = pick(["EPS Estimate", "EPSEstimate", "Estimate"])
    rep_col = pick(["Reported EPS", "ReportedEPS", "Reported"])

    if any(col is None for col in (date_col, est_col, rep_col)):
        print(f"⚠️ {ticker}: 列不完整, 当前列: {frame.columns.tolist()}")
        return None

    # Keep the three source fields under their canonical yfinance names.
    canonical = {
        date_col: "Earnings Date",
        est_col: "EPS Estimate",
        rep_col: "Reported EPS",
    }
    out = frame[list(canonical)].rename(columns=canonical)

    # Coerce types: timestamps become tz-naive, unparseable values become NaT/NaN.
    out["Earnings Date"] = pd.to_datetime(out["Earnings Date"], errors="coerce", utc=True).dt.tz_localize(None)
    for eps_col in ("EPS Estimate", "Reported EPS"):
        out[eps_col] = pd.to_numeric(out[eps_col], errors="coerce")

    # Standardised columns consumed by the later label-generation step.
    announced = out["Earnings Date"]
    out["fiscal_quarter_end"] = announced.dt.date.astype(str)
    out["estimated_eps"] = out["EPS Estimate"]
    out["actual_eps"] = out["Reported EPS"]
    out["fiscal_quarter"] = announced.apply(format_quarter)
    out["ticker"] = ticker.upper()

    # Rows whose date could not be turned into a quarter label are discarded.
    out = (
        out.dropna(subset=["fiscal_quarter"])
        .sort_values("Earnings Date")
        .reset_index(drop=True)
    )

    print(f"✅ {ticker}: {len(out)} 条 earnings")
    return out


In [3]:
# ========== 2. 批量抓取并保存 raw earnings ==========

# Collect the distinct tickers present in the feature table. try/finally
# guarantees the connection is released even when the query raises.
conn = sqlite3.connect(FEATURE_DB)
try:
    df_keys = pd.read_sql_query("SELECT DISTINCT ticker FROM segments_features ORDER BY ticker", conn)
finally:
    conn.close()

tickers = df_keys["ticker"].dropna().astype(str).tolist()
print(f"需抓取 ticker 数: {len(tickers)} -> {tickers}")

# Fetch one ticker at a time, pausing between requests. Tickers that yield
# nothing are remembered so the gap is visible instead of silently shrinking
# the dataset (their quarters end up without labels downstream).
all_rows = []
failed_tickers = []
for t in tickers:
    df_t = fetch_earnings_history_from_yfinance(t)
    if df_t is not None and len(df_t) > 0:
        all_rows.append(df_t)
    else:
        failed_tickers.append(t)
    time.sleep(REQUEST_SLEEP_SECONDS)

if not all_rows:
    raise RuntimeError("未抓取到任何 earnings 数据，请检查网络或 yfinance 可访问性")

if failed_tickers:
    print(f"⚠️ 未获取到 earnings 的 ticker ({len(failed_tickers)}): {failed_tickers}")

df_earnings = pd.concat(all_rows, ignore_index=True)
print(f"总 earnings 记录: {len(df_earnings)}")

if SAVE_EARNINGS_CSV:
    out_path = LABEL_DIR / "earnings_history_from_yfinance.csv"
    df_earnings.to_csv(out_path, index=False)
    print("已保存:", out_path)

# Preview: show the key columns first (only those that actually exist).
show_cols = [c for c in ["ticker", "fiscal_quarter", "Earnings Date", "EPS Estimate", "Reported EPS"] if c in df_earnings.columns]
df_earnings[show_cols].head()

需抓取 ticker 数: 28 -> ['AAPL', 'ADBE', 'AMD', 'AMZN', 'AVGO', 'BAC', 'BK', 'BLK', 'BR', 'C', 'CRM', 'DAL', 'GS', 'INTC', 'JPM', 'MS', 'MSFT', 'MTB', 'NFLX', 'NOW', 'NVDA', 'PNC', 'QCOM', 'SHOP', 'STT', 'TSLA', 'TSM', 'WFC']
抓取 AAPL (yfinance)
✅ AAPL: 100 条 earnings
抓取 ADBE (yfinance)
✅ ADBE: 100 条 earnings
抓取 AMD (yfinance)
✅ AMD: 100 条 earnings
抓取 AMZN (yfinance)
✅ AMZN: 100 条 earnings
抓取 AVGO (yfinance)
✅ AVGO: 67 条 earnings
抓取 BAC (yfinance)
✅ BAC: 100 条 earnings
抓取 BK (yfinance)
✅ BK: 100 条 earnings
抓取 BLK (yfinance)
✅ BLK: 26 条 earnings
抓取 BR (yfinance)
✅ BR: 77 条 earnings
抓取 C (yfinance)
✅ C: 100 条 earnings
抓取 CRM (yfinance)
✅ CRM: 87 条 earnings
抓取 DAL (yfinance)
✅ DAL: 76 条 earnings
抓取 GS (yfinance)
✅ GS: 100 条 earnings
抓取 INTC (yfinance)
✅ INTC: 100 条 earnings
抓取 JPM (yfinance)
✅ JPM: 100 条 earnings
抓取 MS (yfinance)
✅ MS: 100 条 earnings
抓取 MSFT (yfinance)
✅ MSFT: 100 条 earnings
抓取 MTB (yfinance)
✅ MTB: 100 条 earnings
抓取 NFLX (yfinance)
✅ NFLX: 96 条 earnings
抓取 NOW (yfinance)
✅ NO

Exception ignored from cffi callback <function buffer_callback at 0x1394a1760>:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/curl_cffi/curl.py", line 100, in buffer_callback
    @ffi.def_extern()
    
KeyboardInterrupt: 


❌ WFC: yfinance 抓取失败 -> Failed to perform, curl: (23) Failure writing output to destination, passed 13 returned 0. See https://curl.se/libcurl/c/libcurl-errors.html first for more details.
总 earnings 记录: 2392
已保存: /Users/xinyuewang/Desktop/1.27/data/labels/earnings_history_from_yfinance.csv


Unnamed: 0,ticker,fiscal_quarter,Earnings Date,EPS Estimate,Reported EPS
0,AAPL,2002-Q1,2002-01-16 21:00:00,,
1,AAPL,2002-Q2,2002-04-17 20:00:00,,
2,AAPL,2002-Q2,2002-04-17 20:00:00,,
3,AAPL,2002-Q3,2002-07-16 20:00:00,,
4,AAPL,2002-Q3,2002-07-16 20:00:00,,


In [None]:
# ========== 3. 生成 one-quarter-ahead 的 eps_beat label ==========

def _next_quarter_label(label):
    """Return the ``YYYY-Qn`` label that follows ``label``, or None if malformed."""
    try:
        year_txt, quarter_txt = str(label).split("-Q")
        year, quarter = int(year_txt), int(quarter_txt)
    except ValueError:
        return None
    if not 1 <= quarter <= 4:
        return None
    if quarter == 4:
        return f"{year + 1}-Q1"
    return f"{year}-Q{quarter + 1}"


def generate_eps_labels(df_earnings: pd.DataFrame) -> pd.DataFrame:
    """Build one-quarter-ahead ``eps_beat`` labels per ticker and quarter.

    For every ``(ticker, fiscal_quarter)`` present in ``df_earnings`` the label
    is taken from the result recorded for the *following* quarter
    (``YYYY-Qn`` + 1): ``eps_beat`` is 1 when that quarter's ``actual_eps`` is
    strictly greater than its ``estimated_eps``, otherwise 0.

    The next quarter is looked up by label rather than by row position, so:
    - duplicate rows for the same quarter can never make a quarter be labelled
      with its own result;
    - a gap in the history leaves the label missing instead of borrowing the
      result of a later quarter;
    - rows lacking either EPS value (e.g. not yet reported) never yield a label.

    Args:
        df_earnings: Frame with columns ``ticker``, ``fiscal_quarter``
            (``YYYY-Qn`` strings), ``fiscal_quarter_end``, ``estimated_eps``
            and ``actual_eps``.

    Returns:
        DataFrame with columns ``ticker``, ``quarter``, ``eps_beat``,
        ``next_quarter_estimated``, ``next_quarter_actual``; at most one row
        per ``(ticker, quarter)``.

    Raises:
        RuntimeError: If no label could be generated at all.
    """
    rows = []
    for ticker, grp in df_earnings.groupby("ticker"):
        # Stable sort keeps the input order among rows sharing the same date.
        grp = grp.dropna(subset=["fiscal_quarter"]).sort_values(
            "fiscal_quarter_end", kind="stable"
        )

        # Latest complete (estimate, actual) pair per quarter; later rows win.
        reported = grp.dropna(subset=["estimated_eps", "actual_eps"])
        results_by_quarter = {
            fq: (float(est), float(act))
            for fq, est, act in zip(
                reported["fiscal_quarter"],
                reported["estimated_eps"],
                reported["actual_eps"],
            )
        }

        # dict.fromkeys de-duplicates quarters while preserving chronological order.
        for fq in dict.fromkeys(grp["fiscal_quarter"]):
            following = results_by_quarter.get(_next_quarter_label(fq))
            if following is None:
                continue
            est, act = following
            rows.append({
                "ticker": ticker,
                "quarter": fq,
                "eps_beat": 1 if act > est else 0,
                "next_quarter_estimated": est,
                "next_quarter_actual": act,
            })

    out = pd.DataFrame(rows)
    if len(out) == 0:
        raise RuntimeError("未生成任何 label，请检查 earnings 数据")

    return out


# Derive the one-quarter-ahead labels from the fetched earnings history.
df_labels = generate_eps_labels(df_earnings)

# Quick class-balance summary: rows with eps_beat == 1 versus eps_beat == 0.
print(f"labels 行数: {len(df_labels)}")
print(f"Beat={int(df_labels['eps_beat'].sum())}, Miss={int(df_labels['eps_beat'].eq(0).sum())}")

# Optionally persist the labels next to the raw earnings dump.
if SAVE_LABELS_CSV:
    out_path = LABEL_DIR.joinpath("eps_labels_from_yfinance.csv")
    df_labels.to_csv(out_path, index=False)
    print("已保存:", out_path)

df_labels.head()

labels 行数: 1217
Beat=957, Miss=260
已保存: /Users/xinyuewang/Desktop/1.27/data/labels/eps_labels_from_alphaquery.csv


Unnamed: 0,ticker,quarter,eps_beat,next_quarter_estimated,next_quarter_actual
0,AAPL,2014-Q4,1,0.55,0.58
1,AAPL,2015-Q1,1,0.45,0.46
2,AAPL,2015-Q2,1,0.47,0.49
3,AAPL,2015-Q3,1,0.81,0.82
4,AAPL,2015-Q4,0,0.49,0.47


In [None]:
# ========== 4. 合并标签到 segments_features 并写回 ==========

# Load the whole feature table; try/finally releases the connection even if
# the query raises.
conn = sqlite3.connect(FEATURE_DB)
try:
    df_features = pd.read_sql_query("SELECT * FROM segments_features", conn)
finally:
    conn.close()

# The merge keys must exist in the feature table.
required = {"ticker", "quarter"}
missing_req = sorted(required - set(df_features.columns))
if missing_req:
    raise ValueError(f"segments_features 缺少必要列: {missing_req}")

# Drop label columns left over from a previous run so the merge does not
# produce suffixed duplicates (_x / _y).
base_cols = [c for c in ["eps_beat", "next_quarter_estimated", "next_quarter_actual"] if c in df_features.columns]
if base_cols:
    df_features = df_features.drop(columns=base_cols)

# validate="many_to_one" makes pandas raise MergeError if df_labels contains a
# (ticker, quarter) pair more than once; without it such duplicates would
# silently multiply feature rows before the table is overwritten below.
df_out = df_features.merge(
    df_labels,
    on=["ticker", "quarter"],
    how="left",
    validate="many_to_one",
)

coverage = df_out["eps_beat"].notna().mean()
print(f"合并后行数: {len(df_out)}")
print(f"eps_beat 覆盖率: {coverage:.2%}")

# Overwrite the table with the label-augmented frame.
conn = sqlite3.connect(FEATURE_DB)
try:
    df_out.to_sql("segments_features", conn, if_exists="replace", index=False)
finally:
    conn.close()

print("已写回:", FEATURE_DB)
print(f"当前列数: {len(df_out.columns)}")

合并后行数: 2374
eps_beat 覆盖率: 96.93%
已写回: /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db
当前列数: 43


In [None]:
# ========== 5. 覆盖率检查 + 缺失清单导出 ==========

# Run both read-only checks on one connection; try/finally closes it even if
# either query raises.
conn = sqlite3.connect(FEATURE_DB)
try:
    # First 20 rows with their label columns, for a visual sanity check.
    preview = pd.read_sql_query(
        """
    SELECT id, ticker, quarter, section, eps_beat, next_quarter_estimated, next_quarter_actual
    FROM segments_features
    LIMIT 20
    """,
        conn
    )
    # Every distinct (ticker, quarter) that still has no eps_beat label.
    missing_pairs = pd.read_sql_query(
        """
    SELECT DISTINCT ticker, quarter
    FROM segments_features
    WHERE eps_beat IS NULL
    ORDER BY ticker, quarter
    """,
        conn
    )
finally:
    conn.close()

# Export the unlabeled pairs for follow-up.
missing_path = LABEL_DIR / "missing_eps_labels_pairs.csv"
missing_pairs.to_csv(missing_path, index=False)

print("缺失 ticker-quarter 数:", len(missing_pairs))
print("缺失清单已保存:", missing_path)
display(preview)
display(missing_pairs.head(30))

缺失 ticker-quarter 数: 37
缺失清单已保存: /Users/xinyuewang/Desktop/1.27/data/labels/missing_eps_labels_pairs.csv


Unnamed: 0,id,ticker,quarter,section,eps_beat,next_quarter_estimated,next_quarter_actual
0,75,AAPL,2015-Q1,Prepared Remarks,1.0,0.45,0.46
1,77,AAPL,2015-Q1,Prepared Remarks,1.0,0.45,0.46
2,76,AAPL,2015-Q1,Q&A,1.0,0.45,0.46
3,78,AAPL,2015-Q1,Q&A,1.0,0.45,0.46
4,21,AAPL,2015-Q3,Prepared Remarks,1.0,0.81,0.82
5,22,AAPL,2015-Q3,Q&A,1.0,0.81,0.82
6,79,AAPL,2015-Q4,Prepared Remarks,0.0,0.49,0.47
7,80,AAPL,2015-Q4,Q&A,0.0,0.49,0.47
8,71,AAPL,2016-Q1,Prepared Remarks,0.0,0.35,0.35
9,72,AAPL,2016-Q1,Q&A,0.0,0.35,0.35


Unnamed: 0,ticker,quarter
0,AAPL,2025-Q4
1,ADBE,2025-Q4
2,AMD,2025-Q4
3,AMZN,2025-Q4
4,AVGO,2025-Q4
5,BAC,2025-Q4
6,BK,2025-Q4
7,BLK,2025-Q4
8,BR,2025-Q4
9,C,2025-Q4
