# 3.1 文字切割

将 Earnings Call Transcript 切割为 **Prepared Remarks** 与 **Q&A** 两段，写入 SQLite。

**输出表 `segments` 结构：**
| 字段 | 说明 |
|------|------|
| id | 主键 |
| ticker | 公司代码 |
| quarter | 财季，如 2024-Q1 |
| section | Prepared Remarks / Q&A |
| timestamp | 电话会议时间 (ET) |
| url | Seeking Alpha 原文链接 |
| content | 文本内容 |
| source_file | 源文件路径（可追溯） |

In [8]:
# ========== 配置 ==========

import json
import re
import sqlite3
from pathlib import Path
from datetime import datetime

PROJECT_ROOT = Path("..").resolve()
TRANSCRIPTS_DIR = PROJECT_ROOT / "data" / "transcripts"
INDEX_DIR = PROJECT_ROOT / "data" / "transcript_index"
DB_PATH = PROJECT_ROOT / "data" / "earnings_calls.db"

# 只保留此范围内的季度
MIN_QUARTER = "2015-Q1"
MAX_QUARTER = "2025-Q4"

print("PROJECT_ROOT:", PROJECT_ROOT)
print("TRANSCRIPTS_DIR:", TRANSCRIPTS_DIR)
print("INDEX_DIR:", INDEX_DIR)
print("DB_PATH:", DB_PATH)

PROJECT_ROOT: /Users/xinyuewang/Desktop/1.27
TRANSCRIPTS_DIR: /Users/xinyuewang/Desktop/1.27/data/transcripts
INDEX_DIR: /Users/xinyuewang/Desktop/1.27/data/transcript_index
DB_PATH: /Users/xinyuewang/Desktop/1.27/data/earnings_calls.db


In [9]:
# ========== 1. 元数据解析 & 分段函数 ==========

FISCAL_QTR_PATTERN = re.compile(
    r"Q(?P<qtr>[1-4])\s+(?P<year>\d{4})",
    re.IGNORECASE,
)

DATE_LINE_PATTERN = re.compile(
    r"(?P<month>[A-Za-z]+\.?)\s+(?P<day>\d{1,2}),\s+(?P<year>\d{4}),?\s+"
    r"(?P<hour>\d{1,2}):(?P<minute>\d{2})\s*(?P<ampm>AM|PM)\s*ET",
    re.IGNORECASE,
)

MONTH_MAP = {
    "jan": 1, "january": 1, "jan.": 1,
    "feb": 2, "february": 2, "feb.": 2,
    "mar": 3, "march": 3, "mar.": 3,
    "apr": 4, "april": 4, "apr.": 4,
    "may": 5,
    "jun": 6, "june": 6, "jun.": 6,
    "jul": 7, "july": 7, "jul.": 7,
    "aug": 8, "august": 8, "aug.": 8,
    "sep": 9, "september": 9, "sept": 9, "sep.": 9,
    "oct": 10, "october": 10, "oct.": 10,
    "nov": 11, "november": 11, "nov.": 11,
    "dec": 12, "december": 12, "dec.": 12,
}


def parse_fiscal_quarter_from_text(text: str):
    """从文本前15行提取 Fiscal Quarter（如 2024-Q1）"""
    lines = text.split('\n')[:15]
    filtered_lines = [line for line in lines if "Call Start" not in line]
    search_text = '\n'.join(filtered_lines)
    matches = list(FISCAL_QTR_PATTERN.finditer(search_text))
    if not matches:
        return None
    valid_matches = [(m, int(m.group("year"))) for m in matches if 2000 <= int(m.group("year")) <= 2030]
    if not valid_matches:
        return None
    best_match, _ = max(valid_matches, key=lambda x: x[1])
    year, qtr = int(best_match.group("year")), int(best_match.group("qtr"))
    return f"{year}-Q{qtr}"


def parse_call_datetime_et(text: str):
    """从文本前15行提取 Call Date（如 2024-04-16 08:30）"""
    lines = text.split('\n')[:15]
    filtered_lines = [line for line in lines if "Call Start" not in line]
    search_text = '\n'.join(filtered_lines)
    matches = list(DATE_LINE_PATTERN.finditer(search_text))
    if not matches:
        return None
    valid_matches = [(m, int(m.group("year"))) for m in matches if 2000 <= int(m.group("year")) <= 2030]
    if not valid_matches:
        return None
    best_match, _ = max(valid_matches, key=lambda x: x[1])
    m = best_match
    month_str = m.group("month").lower()
    month = MONTH_MAP.get(month_str)
    if month is None:
        return None
    day, year = int(m.group("day")), int(m.group("year"))
    hour, minute = int(m.group("hour")), int(m.group("minute"))
    ampm = m.group("ampm").upper()
    if ampm == "PM" and hour != 12:
        hour += 12
    if ampm == "AM" and hour == 12:
        hour = 0
    try:
        return datetime(year, month, day, hour, minute).strftime("%Y-%m-%d %H:%M")
    except ValueError:
        return None


QA_MARKERS = [
    r"Question-and-Answer Session",
    r"Questions and Answers",
    r"Question and Answer Session",
    r"Q&A Session",
    r"Q & A Session",
]


def find_qa_marker(text: str, skip_first_n_sentences: int = 30):
    """找到第 30 句话以后的第一个 Q&A 标记位置"""
    sentence_endings = re.finditer(r'[.!?]+\s+', text)
    sentence_positions = [m.end() for m in sentence_endings]
    search_start_pos = sentence_positions[skip_first_n_sentences - 1] if len(sentence_positions) >= skip_first_n_sentences else 0
    search_text = text[search_start_pos:]
    for marker in QA_MARKERS:
        m = re.search(marker, search_text, flags=re.IGNORECASE)
        if m:
            return search_start_pos + m.start()
    return None


def split_transcript_into_segments(text: str):
    """切割为 Prepared Remarks 与 Q&A"""
    segments = []
    qa_start_pos = find_qa_marker(text)
    if qa_start_pos is not None:
        prepared_text = text[:qa_start_pos].strip()
        qa_text = text[qa_start_pos:].strip()
        if prepared_text:
            segments.append({"segment_type": "Prepared Remarks", "text": prepared_text})
        if qa_text:
            segments.append({"segment_type": "Q&A", "text": qa_text})
    else:
        segments.append({"segment_type": "Prepared Remarks", "text": text.strip()})
    return segments

In [10]:
# ========== 2. 从 transcript_index 查找 URL ==========

def load_index_url_map() -> dict:
    """
    加载 transcript_index，构建 (ticker, quarter) -> url 映射。
    quarter 格式: 2024-Q1，title 中匹配 Q1 2024
    """
    url_map = {}
    qtr_pattern = re.compile(r"Q([1-4])\s+20(\d{2})", re.IGNORECASE)

    for json_path in INDEX_DIR.glob("*.json"):
        if json_path.name.startswith("."):
            continue
        ticker = json_path.stem.upper()
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                data = json.load(f)
        except Exception:
            continue
        for item in data:
            title = item.get("title", "")
            link = item.get("link", "")
            if not link or "Earnings Call Transcript" not in title:
                continue
            for m in qtr_pattern.finditer(title):
                q, yy = int(m.group(1)), int(m.group(2))
                year = 2000 + yy
                quarter = f"{year}-Q{q}"
                key = (ticker, quarter)
                if key not in url_map:
                    url_map[key] = link.split("#")[0]
    return url_map


INDEX_URL_MAP = load_index_url_map()
print(f"已加载 {len(INDEX_URL_MAP)} 个 (ticker, quarter) -> url 映射")

已加载 1218 个 (ticker, quarter) -> url 映射


In [11]:
# ========== 3. 初始化 SQLite 表 ==========

def init_segments_table(db_path: Path):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    cur.execute("""
        CREATE TABLE IF NOT EXISTS segments (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            ticker TEXT,
            quarter TEXT,
            section TEXT,
            timestamp TEXT,
            url TEXT,
            content TEXT,
            source_file TEXT
        )
    """)
    conn.commit()
    conn.close()
    print("segments 表初始化完成")


init_segments_table(DB_PATH)

segments 表初始化完成


In [12]:
# ========== 4. 读取 transcript 并写入 segments ==========

def insert_segments_from_transcripts(db_path: Path, transcripts_dir: Path, url_map: dict, clear_existing: bool = True, min_quarter: str = MIN_QUARTER, max_quarter: str = MAX_QUARTER):
    conn = sqlite3.connect(db_path)
    cur = conn.cursor()
    if clear_existing:
        cur.execute("DELETE FROM segments")
    count_files, count_segments, failed, skipped = 0, 0, [], 0

    for ticker_dir in sorted(transcripts_dir.glob("*")):
        if not ticker_dir.is_dir():
            continue
        ticker = ticker_dir.name.upper()

        for txt_path in sorted(ticker_dir.glob("*.txt")):
            count_files += 1
            try:
                with open(txt_path, "r", encoding="utf-8") as f:
                    text = f.read()
                fiscal_quarter = parse_fiscal_quarter_from_text(text)
                timestamp = parse_call_datetime_et(text)
                if not fiscal_quarter:
                    print(f"⚠️ 无法解析 Quarter: {txt_path.name}")
                    failed.append(txt_path.name)
                    continue
                # 只保留 2015-Q1 至 2025-Q4
                if fiscal_quarter < min_quarter or fiscal_quarter > max_quarter:
                    skipped += 1
                    continue
                url = url_map.get((ticker, fiscal_quarter), "")
                source_file = str(txt_path.relative_to(transcripts_dir.parent))
                segments = split_transcript_into_segments(text)

                for seg in segments:
                    cur.execute(
                        """
                        INSERT INTO segments (ticker, quarter, section, timestamp, url, content, source_file)
                        VALUES (?, ?, ?, ?, ?, ?, ?)
                        """,
                        (ticker, fiscal_quarter, seg["segment_type"], timestamp or "", url, seg["text"], source_file),
                    )
                    count_segments += 1
            except Exception as e:
                print(f"❌ 处理失败 {txt_path.name}: {e}")
                failed.append(txt_path.name)

    conn.commit()
    conn.close()
    print(f"处理文件: {count_files}, 跳过(超出范围): {skipped}, 写入 segment: {count_segments}")
    if failed:
        print(f"⚠️ 失败 {len(failed)} 个: {failed[:5]}")


insert_segments_from_transcripts(DB_PATH, TRANSCRIPTS_DIR, INDEX_URL_MAP, clear_existing=True)

⚠️ 无法解析 Quarter: Morgan Stanley. (MS) Management on Bank of America Future of Financials Virtual Conference 2020 - Earnings Call Transcript.txt
⚠️ 无法解析 Quarter: Microsoft Corporation (MSFT) CEO Satya Nadella on Q1 Fiscal 2022 Results - Earnings Call Transcript.txt
⚠️ 无法解析 Quarter: The PNC Financial Services Group, Inc. (PNC) Q2 2024 Earnings Call Transcript.txt
处理文件: 1234, 跳过(超出范围): 18, 写入 segment: 2374
⚠️ 失败 3 个: ['Morgan Stanley. (MS) Management on Bank of America Future of Financials Virtual Conference 2020 - Earnings Call Transcript.txt', 'Microsoft Corporation (MSFT) CEO Satya Nadella on Q1 Fiscal 2022 Results - Earnings Call Transcript.txt', 'The PNC Financial Services Group, Inc. (PNC) Q2 2024 Earnings Call Transcript.txt']


In [13]:
# ========== 5. 预览结果 ==========

import pandas as pd

conn = sqlite3.connect(DB_PATH)
df = pd.read_sql_query(
    "SELECT id, ticker, quarter, section, timestamp, url, length(content) as content_len, source_file FROM segments LIMIT 10",
    conn
)
conn.close()
df

Unnamed: 0,id,ticker,quarter,section,timestamp,url,content_len,source_file
0,1,AAPL,2017-Q1,Prepared Remarks,2017-01-31 17:00,https://seekingalpha.com/article/4041266-apple...,19170,transcripts/AAPL/Apple (AAPL) Q1 2017 Results ...
1,2,AAPL,2017-Q1,Q&A,2017-01-31 17:00,https://seekingalpha.com/article/4041266-apple...,26897,transcripts/AAPL/Apple (AAPL) Q1 2017 Results ...
2,3,AAPL,2017-Q2,Prepared Remarks,2017-05-02 17:00,https://seekingalpha.com/article/4068153-apple...,21308,transcripts/AAPL/Apple (AAPL) Q2 2017 Results ...
3,4,AAPL,2017-Q2,Q&A,2017-05-02 17:00,https://seekingalpha.com/article/4068153-apple...,24682,transcripts/AAPL/Apple (AAPL) Q2 2017 Results ...
4,5,AAPL,2018-Q2,Prepared Remarks,2018-05-01 17:00,https://seekingalpha.com/article/4168271-apple...,21303,transcripts/AAPL/Apple (AAPL) Q2 2018 Results ...
5,6,AAPL,2018-Q2,Q&A,2018-05-01 17:00,https://seekingalpha.com/article/4168271-apple...,25190,transcripts/AAPL/Apple (AAPL) Q2 2018 Results ...
6,7,AAPL,2023-Q2,Prepared Remarks,2023-05-04 17:00,https://seekingalpha.com/article/4600254-apple...,20536,transcripts/AAPL/Apple (AAPL) Q2 2023 Earnings...
7,8,AAPL,2023-Q2,Q&A,2023-05-04 17:00,https://seekingalpha.com/article/4600254-apple...,26620,transcripts/AAPL/Apple (AAPL) Q2 2023 Earnings...
8,9,AAPL,2016-Q3,Prepared Remarks,2016-07-26 17:00,https://seekingalpha.com/article/3991811-apple...,21274,transcripts/AAPL/Apple (AAPL) Q3 2016 Results ...
9,10,AAPL,2016-Q3,Q&A,2016-07-26 17:00,https://seekingalpha.com/article/3991811-apple...,28958,transcripts/AAPL/Apple (AAPL) Q3 2016 Results ...


In [14]:
# ========== 6. 统计完整性 & 生成缺失清单 ==========
# 检查每家公司 2015-Q1 至 2025-Q4 是否 Prepared Remarks / Q&A 齐全

import pandas as pd

def generate_quarters(min_q: str, max_q: str):
    """生成季度列表，如 2015-Q1, 2015-Q2, ..., 2025-Q4"""
    min_year, min_qn = int(min_q.split("-")[0]), int(min_q.split("-")[1][1])
    max_year, max_qn = int(max_q.split("-")[0]), int(max_q.split("-")[1][1])
    quarters = []
    for y in range(min_year, max_year + 1):
        for q in range(1, 5):
            if (y == min_year and q < min_qn) or (y == max_year and q > max_qn):
                continue
            quarters.append(f"{y}-Q{q}")
    return quarters

EXPECTED_QUARTERS = generate_quarters(MIN_QUARTER, MAX_QUARTER)
REQUIRED_SECTIONS = ["Prepared Remarks", "Q&A"]

conn = sqlite3.connect(DB_PATH)
df = pd.read_sql_query(
    "SELECT ticker, quarter, section FROM segments",
    conn
)
conn.close()

# 获取所有 ticker
all_tickers = sorted(df["ticker"].unique())

# 检查每个 (ticker, quarter) 是否有 Prepared Remarks 和 Q&A
missing_records = []
summary = []

for ticker in all_tickers:
    ticker_df = df[df["ticker"] == ticker]
    have_quarters = set()
    for _, row in ticker_df.iterrows():
        q = row["quarter"]
        s = row["section"]
        if q in EXPECTED_QUARTERS:
            have_quarters.add((q, s))
    
    complete = 0
    for q in EXPECTED_QUARTERS:
        has_pr = (q, "Prepared Remarks") in have_quarters
        has_qa = (q, "Q&A") in have_quarters
        if has_pr and has_qa:
            complete += 1
        elif not has_pr and not has_qa:
            missing_records.append({"ticker": ticker, "quarter": q, "missing": "Prepared Remarks + Q&A"})
        elif not has_pr:
            missing_records.append({"ticker": ticker, "quarter": q, "missing": "Prepared Remarks"})
        else:
            missing_records.append({"ticker": ticker, "quarter": q, "missing": "Q&A"})
    
    total = len(EXPECTED_QUARTERS)
    status = "✓ 完整" if complete == total else f"✗ 缺失 {total - complete} 个季度"
    summary.append({"ticker": ticker, "complete": complete, "total": total, "status": status})

# 输出统计
print("=" * 60)
print("各公司 segments 完整性 (2015-Q1 ~ 2025-Q4)")
print("=" * 60)
for s in summary:
    print(f"{s['ticker']:6} | {s['complete']:2}/{s['total']} 季度齐全 | {s['status']}")
print()

# 写入缺失清单文件
OUTPUT_MISSING = PROJECT_ROOT / "data" / "missing_segments.txt"
with open(OUTPUT_MISSING, "w", encoding="utf-8") as f:
    f.write("# earnings_calls.db segments 缺失清单\n")
    f.write("# 期望范围: 2015-Q1 ~ 2025-Q4，每季度需有 Prepared Remarks 和 Q&A\n")
    f.write("# 格式: TICKER | 季度 | 缺失内容\n")
    f.write("=" * 60 + "\n\n")
    if not missing_records:
        f.write("所有公司 Prepared Remarks 与 Q&A 均齐全。\n")
    else:
        for r in missing_records:
            f.write(f"{r['ticker']} | {r['quarter']} | {r['missing']}\n")

print(f"缺失清单已保存: {OUTPUT_MISSING}")
print(f"共 {len(missing_records)} 条缺失记录")

各公司 segments 完整性 (2015-Q1 ~ 2025-Q4)
AAPL   | 43/44 季度齐全 | ✗ 缺失 1 个季度
ADBE   | 44/44 季度齐全 | ✓ 完整
AMD    | 44/44 季度齐全 | ✓ 完整
AMZN   | 38/44 季度齐全 | ✗ 缺失 6 个季度
AVGO   | 41/44 季度齐全 | ✗ 缺失 3 个季度
BAC    | 43/44 季度齐全 | ✗ 缺失 1 个季度
BK     | 43/44 季度齐全 | ✗ 缺失 1 个季度
BLK    | 43/44 季度齐全 | ✗ 缺失 1 个季度
BR     | 42/44 季度齐全 | ✗ 缺失 2 个季度
C      | 43/44 季度齐全 | ✗ 缺失 1 个季度
CRM    | 39/44 季度齐全 | ✗ 缺失 5 个季度
DAL    | 44/44 季度齐全 | ✓ 完整
GS     | 44/44 季度齐全 | ✓ 完整
INTC   | 44/44 季度齐全 | ✓ 完整
JPM    | 44/44 季度齐全 | ✓ 完整
MS     | 43/44 季度齐全 | ✗ 缺失 1 个季度
MSFT   | 42/44 季度齐全 | ✗ 缺失 2 个季度
MTB    | 44/44 季度齐全 | ✓ 完整
NFLX   |  0/44 季度齐全 | ✗ 缺失 44 个季度
NOW    | 42/44 季度齐全 | ✗ 缺失 2 个季度
NVDA   | 41/44 季度齐全 | ✗ 缺失 3 个季度
PNC    | 43/44 季度齐全 | ✗ 缺失 1 个季度
QCOM   | 43/44 季度齐全 | ✗ 缺失 1 个季度
SHOP   | 40/44 季度齐全 | ✗ 缺失 4 个季度
STT    | 42/44 季度齐全 | ✗ 缺失 2 个季度
TSLA   | 43/44 季度齐全 | ✗ 缺失 1 个季度
TSM    | 43/44 季度齐全 | ✗ 缺失 1 个季度
WFC    | 44/44 季度齐全 | ✓ 完整

缺失清单已保存: /Users/xinyuewang/Desktop/1.27/data/missing_segments.txt
共 83 条缺失记录
