# 5. 检测 Transcript 完整性

检查每家公司的 earnings call transcript 是否覆盖 **2015 Q1 - 2025 Q4**，如有缺失则输出到文件。

In [1]:
import re
from pathlib import Path
from collections import defaultdict

# Paths are resolved against the parent of the current working directory.
# NOTE(review): presumably the notebook is launched from a folder one level
# below the project root -- confirm before running from elsewhere.
PROJECT_ROOT = Path("..").resolve()
TRANSCRIPTS_DIR = PROJECT_ROOT / "data" / "transcripts"
OUTPUT_FILE = PROJECT_ROOT / "data" / "missing_transcripts.txt"

# Expected coverage window: MIN_YEAR Q1 through MAX_YEAR Q4 (inclusive).
MIN_YEAR, MAX_YEAR = 2015, 2025
QUARTERS = [1, 2, 3, 4]

# Regex that pulls the quarter and the year out of a transcript file name.
# Matches e.g. "Q1 2015", "Q2 2016", "on Q3 2020 Results", "Q4 2025 Earnings".
# Group 1 is the quarter digit, group 2 the last two digits of a 20xx year.
# The year window is deliberately NOT encoded here: MIN_YEAR / MAX_YEAR are the
# single source of truth and the range filter is applied where matches are
# collected, so changing those constants cannot silently desync the regex.
PATTERN = re.compile(r"Q([1-4])\s+20(\d{2})", re.IGNORECASE)

print(f"Transcript 目录: {TRANSCRIPTS_DIR}")
print(f"期望范围: {MIN_YEAR}Q1 - {MAX_YEAR}Q4")
print(f"输出文件: {OUTPUT_FILE}")

Transcript 目录: /Users/xinyuewang/Desktop/1.27/data/transcripts
期望范围: 2015Q1 - 2025Q4
输出文件: /Users/xinyuewang/Desktop/1.27/data/missing_transcripts.txt


In [2]:
def extract_quarter_year(filename: str) -> list[tuple[int, int]]:
    """Return every (quarter, year) pair that PATTERN finds in a file name.

    Args:
        filename: Name of a transcript file (not its contents).

    Returns:
        One ``(quarter, year)`` tuple per regex match, in match order. The
        list is empty when the name contains no match.
    """
    # Each findall hit is (quarter digit, last two digits of the year);
    # the "20" century prefix is restored before converting to int.
    return [
        (int(quarter_digit), int("20" + year_suffix))
        for quarter_digit, year_suffix in PATTERN.findall(filename)
    ]


def get_expected_quarters() -> set[tuple[int, int]]:
    """Build the set of (quarter, year) pairs every company should have.

    Returns:
        All combinations of the values in ``QUARTERS`` with each year from
        ``MIN_YEAR`` to ``MAX_YEAR`` inclusive.
    """
    return {
        (quarter, year)
        for year in range(MIN_YEAR, MAX_YEAR + 1)
        for quarter in QUARTERS
    }


# Every non-hidden sub-directory of the transcripts folder is one company.
ticker_dirs = [
    entry
    for entry in TRANSCRIPTS_DIR.iterdir()
    if entry.is_dir() and not entry.name.startswith(".")
]
tickers = sorted(entry.name for entry in ticker_dirs)

print(f"共有 {len(tickers)} 家公司: {tickers}")

expected = get_expected_quarters()
# ticker -> set of (quarter, year) pairs found among that company's files.
company_quarters = defaultdict(set)

for ticker in tickers:
    for transcript in (TRANSCRIPTS_DIR / ticker).glob("*.txt"):
        for quarter, year in extract_quarter_year(transcript.name):
            # Ignore anything outside the expected coverage window.
            if year < MIN_YEAR or year > MAX_YEAR:
                continue
            company_quarters[ticker].add((quarter, year))

共有 28 家公司: ['AAPL', 'ADBE', 'AMD', 'AMZN', 'AVGO', 'BAC', 'BK', 'BLK', 'BR', 'C', 'CRM', 'DAL', 'GS', 'INTC', 'JPM', 'MS', 'MSFT', 'MTB', 'NFLX', 'NOW', 'NVDA', 'PNC', 'QCOM', 'SHOP', 'STT', 'TSLA', 'TSM', 'WFC']


In [3]:
# Work out which expected quarters each company lacks.
# Only companies with at least one gap get an entry; gaps are ordered
# chronologically (by year, then quarter).
missing_by_company = {}

for ticker in tickers:
    gaps = expected - company_quarters.get(ticker, set())
    if gaps:
        missing_by_company[ticker] = sorted(gaps, key=lambda pair: (pair[1], pair[0]))

# Per-company coverage summary.
total_expected = len(expected)
separator = "=" * 60
print(separator)
print("各公司 transcript 覆盖情况 (2015Q1 - 2025Q4)")
print(separator)

for ticker in tickers:
    found = len(company_quarters.get(ticker, set()))
    gap_count = len(missing_by_company.get(ticker, []))
    if gap_count == 0:
        status = "✓ 完整"
    else:
        status = f"✗ 缺失 {gap_count} 个季度"
    print(f"{ticker:6} | 已有 {found:2}/{total_expected} | {status}")

if missing_by_company:
    print(f"\n有 {len(missing_by_company)} 家公司存在缺失。")
else:
    print("\n所有公司 transcript 均完整！")

各公司 transcript 覆盖情况 (2015Q1 - 2025Q4)
AAPL   | 已有 44/44 | ✓ 完整
ADBE   | 已有 44/44 | ✓ 完整
AMD    | 已有 44/44 | ✓ 完整
AMZN   | 已有 44/44 | ✓ 完整
AVGO   | 已有 41/44 | ✗ 缺失 3 个季度
BAC    | 已有 44/44 | ✓ 完整
BK     | 已有 43/44 | ✗ 缺失 1 个季度
BLK    | 已有 44/44 | ✓ 完整
BR     | 已有 44/44 | ✓ 完整
C      | 已有 44/44 | ✓ 完整
CRM    | 已有 40/44 | ✗ 缺失 4 个季度
DAL    | 已有 44/44 | ✓ 完整
GS     | 已有 44/44 | ✓ 完整
INTC   | 已有 44/44 | ✓ 完整
JPM    | 已有 44/44 | ✓ 完整
MS     | 已有 44/44 | ✓ 完整
MSFT   | 已有 43/44 | ✗ 缺失 1 个季度
MTB    | 已有 44/44 | ✓ 完整
NFLX   | 已有 42/44 | ✗ 缺失 2 个季度
NOW    | 已有 42/44 | ✗ 缺失 2 个季度
NVDA   | 已有 42/44 | ✗ 缺失 2 个季度
PNC    | 已有 44/44 | ✓ 完整
QCOM   | 已有 43/44 | ✗ 缺失 1 个季度
SHOP   | 已有 40/44 | ✗ 缺失 4 个季度
STT    | 已有 43/44 | ✗ 缺失 1 个季度
TSLA   | 已有 44/44 | ✓ 完整
TSM    | 已有 43/44 | ✗ 缺失 1 个季度
WFC    | 已有 44/44 | ✓ 完整

有 11 家公司存在缺失。


In [4]:
# Assemble the gap report in memory, then write it to OUTPUT_FILE in one go.
report_lines = [
    "# Transcript 缺失季度清单 (2015 Q1 - 2025 Q4)\n",
    "# 格式: TICKER | 缺失季度列表\n",
    "=" * 60 + "\n\n",
]

if missing_by_company:
    # One line per company, tickers in alphabetical order.
    for ticker in sorted(missing_by_company):
        quarters_str = ", ".join(
            f"{year}Q{quarter}" for quarter, year in missing_by_company[ticker]
        )
        report_lines.append(f"{ticker} | {quarters_str}\n")
else:
    report_lines.append("所有公司 transcript 均完整，无缺失。\n")

with open(OUTPUT_FILE, "w", encoding="utf-8") as report:
    report.writelines(report_lines)

print(f"缺失清单已保存到: {OUTPUT_FILE}")

# Echo the file back as a preview, but only when there is something to show.
if missing_by_company:
    print("\n--- 文件内容预览 ---")
    print(OUTPUT_FILE.read_text(encoding="utf-8"))

缺失清单已保存到: /Users/xinyuewang/Desktop/1.27/data/missing_transcripts.txt

--- 文件内容预览 ---
# Transcript 缺失季度清单 (2015 Q1 - 2025 Q4)
# 格式: TICKER | 缺失季度列表

AVGO | 2015Q2, 2019Q1, 2022Q2
BK | 2021Q3
CRM | 2015Q1, 2015Q2, 2018Q1, 2018Q3
MSFT | 2022Q1
NFLX | 2021Q3, 2021Q4
NOW | 2015Q1, 2016Q4
NVDA | 2019Q3, 2022Q3
QCOM | 2019Q2
SHOP | 2015Q1, 2015Q2, 2015Q3, 2015Q4
STT | 2017Q4
TSM | 2017Q4

