# 3. 文本入库与特征工程

本 notebook 目标：
- 从 `data/transcripts` 读取 earnings call 文本
- 解析 `Ticker / Fiscal Quarter / Call Datetime(ET)`
- 按 `Prepared Remarks` / `Q&A` 分段，写入 SQLite
- 基于 segment 文本计算文本特征（长度、pronoun、adverb、textstat、LM 字典）

In [28]:
import os
import re
import sqlite3
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from textstat import (
    automated_readability_index,
    coleman_liau_index,
    dale_chall_readability_score,
    flesch_reading_ease,
    flesch_kincaid_grade,
    gunning_fog,
    smog_index,
)

# All paths are derived from the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
# Directory scanned for one sub-directory per ticker, each holding *.txt transcripts.
TRANSCRIPTS_DIR = PROJECT_ROOT / "data" / "transcripts"
# SQLite file holding the `segments` and `segment_features` tables.
DB_PATH = PROJECT_ROOT / "data" / "earnings_calls.db"

# Echo the resolved locations so a wrong working directory is obvious.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("TRANSCRIPTS_DIR:", TRANSCRIPTS_DIR)
print("DB_PATH:", DB_PATH)

PROJECT_ROOT: /Users/xinyuewang/Desktop/1.27
TRANSCRIPTS_DIR: /Users/xinyuewang/Desktop/1.27/data/transcripts
DB_PATH: /Users/xinyuewang/Desktop/1.27/data/earnings_calls.db


In [None]:
# 0. 检查每个 ticker 的 transcript 文件名与财季完整性

from typing import List


def _fq_to_index(fq: str) -> int:
    """Convert a 'YYYY-Qn' label into a sortable integer: year * 4 + (n - 1).

    Any value that cannot be split and parsed this way maps to -1.
    """
    try:
        year_part, quarter_part = fq.split("-Q")
        year_base = int(year_part) * 4
        quarter_offset = int(quarter_part) - 1
    except Exception:
        # Best-effort sort key: malformed labels sort first instead of raising.
        return -1
    return year_base + quarter_offset


def inspect_transcript_files(
    base_dir: Path = TRANSCRIPTS_DIR,
    delete_mismatched: bool = True,
):
    """Report, per ticker sub-directory, which fiscal quarters have a transcript.

    For every sub-directory of ``base_dir`` (its upper-cased name is the ticker):

    - ``*.txt`` files whose name does not contain ``"(TICKER)"`` are treated as
      belonging to another company. They are deleted from disk when
      ``delete_mismatched`` is true, otherwise only reported.
    - The fiscal quarter is parsed from the remaining file names
      (``"Q1 2024"`` -> ``"2024-Q1"``).
    - The first/last quarter found and any quarters missing in between are
      printed.

    Args:
        base_dir: Root directory containing one sub-directory per ticker.
        delete_mismatched: When true (the default, matching the historical
            behaviour) mismatched files are removed with ``Path.unlink``.
            Pass ``False`` for a read-only inspection.

    Returns:
        None. All findings are written to stdout.
    """
    print("=" * 60)
    print("检查每个 ticker 的 transcript 文件名与财季完整性")
    print("根目录:", base_dir)
    print("=" * 60)

    if not base_dir.exists():
        print(f"目录不存在: {base_dir}")
        return

    ticker_dirs = sorted(p for p in base_dir.iterdir() if p.is_dir())
    if not ticker_dirs:
        print("该目录下没有任何 ticker 子目录。")
        return

    for ticker_dir in ticker_dirs:
        ticker = ticker_dir.name.upper()
        ticker_tag = f"({ticker})"
        print("\n" + "-" * 60)
        print(f"Ticker: {ticker}")
        txt_files = sorted(ticker_dir.glob("*.txt"))
        print(f"总文件数: {len(txt_files)}")

        valid_files = []
        removed_files = []
        quarters_present = set()  # e.g. {"2019-Q1", ...}

        for fp in txt_files:
            name = fp.name

            # 1) Files that do not carry this ticker's "(TICKER)" tag
            #    (e.g. an EV call stored in the MS directory).
            if ticker_tag not in name:
                removed_files.append(name)
                if not delete_mismatched:
                    print(f"跳过非 {ticker} 文件（未删除）: {name}")
                    continue
                try:
                    fp.unlink()
                    print(f"删除非 {ticker} 文件: {name}")
                except OSError as e:
                    # Deletion is best-effort; report and keep going.
                    print(f"无法删除 {name}: {e}")
                continue

            valid_files.append(name)

            # 2) Fiscal quarter from the file name, e.g. "Q1 2024" or "Q3 2019".
            m = re.search(r"Q([1-4])\s+(\d{4})", name)
            if m:
                quarters_present.add(f"{int(m.group(2))}-Q{int(m.group(1))}")

        print(f"保留文件数: {len(valid_files)}")
        if removed_files:
            print(f"剔除 {len(removed_files)} 个非 {ticker} 文件。")

        if not quarters_present:
            print("无法从文件名中解析出任何财季信息。")
            continue

        # Chronological order via the year*4 + (q-1) index.
        fq_unique = sorted(quarters_present, key=_fq_to_index)
        first_fq, last_fq = fq_unique[0], fq_unique[-1]
        print(f"财季范围: {first_fq} -> {last_fq}")

        # Walk every quarter index between the first and last quarter found
        # and list the ones that have no transcript.
        full_range = [
            f"{idx // 4}-Q{idx % 4 + 1}"
            for idx in range(_fq_to_index(first_fq), _fq_to_index(last_fq) + 1)
        ]
        missing = [fq for fq in full_range if fq not in quarters_present]
        if missing:
            print(f"中间缺失财季 {len(missing)} 个: {missing}")
        else:
            print("中间没有缺失的财季（按文件名解析）。")


# Run the check. Results are only printed (the function returns nothing).
# NOTE: with its default arguments this call deletes *.txt files whose name
# lacks the "(TICKER)" tag of the directory they are stored in.
inspect_transcript_files()




检查每个 ticker 的 transcript 文件名与财季完整性
根目录: /Users/xinyuewang/Desktop/1.27/data/transcripts

------------------------------------------------------------
Ticker: BAC
总文件数: 48
保留文件数: 48
财季范围: 2013-Q2 -> 2025-Q4
中间缺失财季 3 个: ['2013-Q3', '2013-Q4', '2014-Q1']

------------------------------------------------------------
Ticker: BK
总文件数: 43
保留文件数: 43
财季范围: 2015-Q1 -> 2025-Q4
中间缺失财季 1 个: ['2021-Q3']

------------------------------------------------------------
Ticker: BLK
总文件数: 44
保留文件数: 44
财季范围: 2015-Q1 -> 2025-Q4
中间没有缺失的财季（按文件名解析）。

------------------------------------------------------------
Ticker: BR
总文件数: 44
保留文件数: 44
财季范围: 2015-Q1 -> 2026-Q1
中间缺失财季 1 个: ['2022-Q1']

------------------------------------------------------------
Ticker: C
总文件数: 45
保留文件数: 45
财季范围: 2015-Q1 -> 2025-Q4
中间没有缺失的财季（按文件名解析）。

------------------------------------------------------------
Ticker: DAL
总文件数: 40
保留文件数: 40
财季范围: 2015-Q3 -> 2025-Q3
中间缺失财季 1 个: ['2016-Q1']

---------------------------------------------------

In [39]:
# 1. 元数据解析 & 分段函数

# Fiscal quarter as written in the transcript header, e.g. "Q1 2024".
FISCAL_QTR_PATTERN = re.compile(
    r"Q(?P<qtr>[1-4])\s+(?P<year>\d{4})",
    re.IGNORECASE,
)

# Call date line, e.g. "April 16, 2024 8:30 AM ET" or
# "January 15, 2025, 08:30 AM ET" (a comma after the year is tolerated).
# The month may be spelled out or abbreviated, with or without a trailing dot.
DATE_LINE_PATTERN = re.compile(
    r"(?P<month>[A-Za-z]+\.?)\s+(?P<day>\d{1,2}),\s+(?P<year>\d{4}),?\s+"
    r"(?P<hour>\d{1,2}):(?P<minute>\d{2})\s*(?P<ampm>AM|PM)\s*ET",
    re.IGNORECASE,
)

# Lower-cased month spellings -> month number. Every abbreviation is listed
# with and without the trailing dot because DATE_LINE_PATTERN captures the dot.
MONTH_MAP = {
    "jan": 1, "january": 1, "jan.": 1,
    "feb": 2, "february": 2, "feb.": 2,
    "mar": 3, "march": 3, "mar.": 3,
    "apr": 4, "april": 4, "apr.": 4,
    "may": 5,
    "jun": 6, "june": 6, "jun.": 6,
    "jul": 7, "july": 7, "jul.": 7,
    "aug": 8, "august": 8, "aug.": 8,
    "sep": 9, "september": 9, "sept": 9, "sep.": 9, "sept.": 9,
    "oct": 10, "october": 10, "oct.": 10,
    "nov": 11, "november": 11, "nov.": 11,
    "dec": 12, "december": 12, "dec.": 12,
}

def parse_fiscal_quarter_from_text(text: str):
    """Extract the fiscal quarter (e.g. "2024-Q1") from the first 15 lines.

    Lines containing "Call Start" are ignored, and only matches whose year is
    in 2000-2030 are considered. When several matches remain, the one with
    the largest year wins (the earliest such match on a tie).

    Returns:
        A "YYYY-Qn" string, or None when nothing usable is found.
    """
    header_lines = [
        line for line in text.split("\n")[:15] if "Call Start" not in line
    ]

    chosen = None
    chosen_year = None
    for hit in FISCAL_QTR_PATTERN.finditer("\n".join(header_lines)):
        hit_year = int(hit.group("year"))
        if hit_year < 2000 or hit_year > 2030:
            continue
        # Strict ">" keeps the first match among equal years.
        if chosen is None or hit_year > chosen_year:
            chosen, chosen_year = hit, hit_year

    if chosen is None:
        return None

    quarter = int(chosen.group("qtr"))
    return f"{chosen_year}-Q{quarter}"


def parse_call_datetime_et(text: str):
    """Extract the call date/time (e.g. "2024-04-16 08:30") from the first 15 lines.

    Lines containing "Call Start" are ignored, and only dates whose year is
    in 2000-2030 are considered. When several dates remain, the one with the
    largest year wins (the earliest such match on a tie).

    Returns:
        A "YYYY-MM-DD HH:MM" string in 24-hour form, or None when no date is
        found, the month name is unknown, or the date is not a valid
        calendar date.
    """
    header_lines = [
        line for line in text.split("\n")[:15] if "Call Start" not in line
    ]

    chosen = None
    chosen_year = None
    for hit in DATE_LINE_PATTERN.finditer("\n".join(header_lines)):
        hit_year = int(hit.group("year"))
        if hit_year < 2000 or hit_year > 2030:
            continue
        # Strict ">" keeps the first match among equal years.
        if chosen is None or hit_year > chosen_year:
            chosen, chosen_year = hit, hit_year

    if chosen is None:
        return None

    month = MONTH_MAP.get(chosen.group("month").lower())
    if month is None:
        return None

    day = int(chosen.group("day"))
    minute = int(chosen.group("minute"))
    hour = int(chosen.group("hour"))
    meridiem = chosen.group("ampm").upper()

    # 12-hour -> 24-hour clock.
    if meridiem == "PM" and hour != 12:
        hour += 12
    elif meridiem == "AM" and hour == 12:
        hour = 0

    try:
        stamp = datetime(chosen_year, month, day, hour, minute)
    except ValueError:
        # Impossible date or time (e.g. February 30).
        return None
    return stamp.strftime("%Y-%m-%d %H:%M")


# Headings that open the Q&A section, as regex source strings (several
# spellings occur). Matching is case-insensitive.
QA_MARKERS = [
    r"Question-and-Answer Session",
    r"Questions and Answers",
    r"Question and Answer Session",
    r"Q&A Session",
    r"Q & A Session",
]


def find_qa_marker(text: str, skip_first_n_sentences: int = 10):
    """Return the offset in ``text`` of the earliest Q&A heading, or None.

    The search starts after the first ``skip_first_n_sentences`` sentence
    boundaries (runs of ``.``/``!``/``?`` followed by whitespace) so that a
    mention of the Q&A in the opening lines is not mistaken for the heading.
    If the text has fewer boundaries than that, or the argument is not
    positive, the whole text is searched.

    All spellings in ``QA_MARKERS`` are tried at once, so the result is the
    marker that appears first in the text regardless of its position in the
    list.

    Args:
        text: Full transcript text.
        skip_first_n_sentences: Number of leading sentences to skip.

    Returns:
        Absolute character offset of the heading in ``text``, or None when no
        marker occurs in the searched part.
    """
    search_start_pos = 0
    if skip_first_n_sentences > 0:
        # Stop scanning as soon as the N-th boundary is reached.
        boundaries_seen = 0
        for boundary in re.finditer(r"[.!?]+\s+", text):
            boundaries_seen += 1
            if boundaries_seen == skip_first_n_sentences:
                search_start_pos = boundary.end()
                break

    if not QA_MARKERS:
        # An empty alternation would match the empty string at offset 0.
        return None

    # One alternation gives the leftmost match across every spelling.
    combined = "|".join(f"(?:{marker})" for marker in QA_MARKERS)
    hit = re.search(combined, text[search_start_pos:], flags=re.IGNORECASE)
    if hit is None:
        return None
    # Translate the offset within the slice back to an offset in ``text``.
    return search_start_pos + hit.start()


def split_transcript_into_segments(text: str):
    """Split a transcript into "Prepared Remarks" and "Q&A" segments.

    Everything before the position reported by ``find_qa_marker`` becomes
    "Prepared Remarks" and everything from that position on becomes "Q&A".
    Parts that are empty after stripping are dropped. When no marker is
    found, the whole stripped text is returned as one "Prepared Remarks"
    segment.

    Returns:
        A list of ``{"segment_type": ..., "text": ...}`` dicts.
    """
    boundary = find_qa_marker(text)

    if boundary is None:
        # No Q&A heading: the whole transcript counts as prepared remarks.
        return [{"segment_type": "Prepared Remarks", "text": text.strip()}]

    labelled_parts = (
        ("Prepared Remarks", text[:boundary].strip()),
        ("Q&A", text[boundary:].strip()),
    )
    return [
        {"segment_type": label, "text": body}
        for label, body in labelled_parts
        if body
    ]

In [41]:
# 2. 初始化 SQLite 表

def init_db(db_path: Path):
    """Create the ``segments`` and ``segment_features`` tables if missing.

    Safe to call repeatedly: both statements use ``CREATE TABLE IF NOT
    EXISTS``, so existing tables and their rows are left untouched.

    Args:
        db_path: Location of the SQLite database file.
    """
    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # One row per transcript segment (prepared remarks or Q&A).
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS segments (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                ticker TEXT,
                fiscal_quarter TEXT,
                call_datetime_et TEXT,
                segment_type TEXT,
                text_content TEXT,
                source_file TEXT
            )
            """
        )

        # One feature row per segment, keyed by the segment id.
        # NOTE: the column is spelled "lm_litigous" (sic); the feature dicts
        # written to this table use the same key, so keep them in sync.
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS segment_features (
                segment_id INTEGER PRIMARY KEY,
                n_words INTEGER,
                n_sentences INTEGER,
                avg_words_per_sentence REAL,
                pronoun_plural_ratio REAL,
                adverb_ratio REAL,
                ari REAL,
                coleman_liau REAL,
                dale_chall REAL,
                flesch_ease REAL,
                flesch_kincaid REAL,
                gunning_fog REAL,
                smog REAL,
                lm_positive REAL,
                lm_negative REAL,
                lm_uncertainty REAL,
                lm_litigous REAL,
                lm_superfluous REAL,
                lm_interesting REAL,
                lm_modal_weak REAL,
                lm_modal_moderate REAL,
                lm_modal_strong REAL,
                lm_constraining REAL,
                lm_complexity REAL,
                lm_net_sentiment REAL,
                lm_polarity REAL,
                lm_subjectivity REAL,
                FOREIGN KEY(segment_id) REFERENCES segments(id)
            )
            """
        )

        conn.commit()
    finally:
        # Always release the file handle, even if a statement fails.
        conn.close()

# Create the tables in the database at DB_PATH (no-op if they already exist).
init_db(DB_PATH)
print("DB 初始化完成")

DB 初始化完成


In [42]:
# 3. 将 transcripts 写入 segments 表

def insert_segments_from_transcripts(db_path: Path, transcripts_dir: Path):
    """Rebuild the ``segments`` table from the transcript text files.

    Both ``segments`` and ``segment_features`` are emptied first. Then every
    ``*.txt`` file in every sub-directory of ``transcripts_dir`` is read,
    its fiscal quarter and call date are parsed from the text, the text is
    split into segments, and one row per segment is inserted. The ticker is
    the upper-cased sub-directory name, and ``source_file`` is the file path
    relative to the parent of ``transcripts_dir``.

    Files whose fiscal quarter cannot be parsed, or that raise while being
    processed, are skipped and reported; they do not stop the import. All
    changes are committed once at the end, and the connection is closed even
    if the run is interrupted (uncommitted changes are then discarded).

    Args:
        db_path: Location of the SQLite database file.
        transcripts_dir: Directory containing one sub-directory per ticker.
    """
    insert_sql = """
        INSERT INTO segments (
            ticker, fiscal_quarter, call_datetime_et,
            segment_type, text_content, source_file
        ) VALUES (?, ?, ?, ?, ?, ?)
    """

    count_files = 0
    count_segments = 0
    failed_files = []

    conn = sqlite3.connect(db_path)
    try:
        cur = conn.cursor()

        # Start from a clean slate so a re-import does not duplicate rows.
        cur.execute("DELETE FROM segments")
        cur.execute("DELETE FROM segment_features")

        for ticker_dir in sorted(transcripts_dir.glob("*")):
            if not ticker_dir.is_dir():
                continue

            # The ticker comes straight from the folder name (e.g. "BAC", "JPM").
            ticker = ticker_dir.name.upper()

            for txt_path in sorted(ticker_dir.glob("*.txt")):
                count_files += 1

                try:
                    text = txt_path.read_text(encoding="utf-8")

                    # Both values come from the first lines of the text.
                    fiscal_quarter = parse_fiscal_quarter_from_text(text)
                    call_datetime_et = parse_call_datetime_et(text)

                    if not fiscal_quarter:
                        print(f"⚠️ 无法解析 Fiscal Quarter: {txt_path.name}")
                        failed_files.append(txt_path.name)
                        continue

                    source_file = str(txt_path.relative_to(transcripts_dir.parent))
                    rows = [
                        (
                            ticker,
                            fiscal_quarter,
                            call_datetime_et,
                            seg["segment_type"],
                            seg["text"],
                            source_file,
                        )
                        for seg in split_transcript_into_segments(text)
                    ]
                    cur.executemany(insert_sql, rows)
                    count_segments += len(rows)

                except Exception as e:
                    # Deliberate best-effort boundary: one bad file must not
                    # abort the whole import.
                    print(f"❌ 处理文件失败 {txt_path.name}: {e}")
                    failed_files.append(txt_path.name)

        conn.commit()
    finally:
        conn.close()

    print(f"共处理 transcript 文件数: {count_files}")
    print(f"写入 segment 记录数: {count_segments}")
    if failed_files:
        print(f"⚠️ 失败文件数: {len(failed_files)}")
        print("前5个失败文件:", failed_files[:5])


# Re-import every transcript; this empties `segments` and `segment_features` first.
insert_segments_from_transcripts(DB_PATH, TRANSCRIPTS_DIR)

⚠️ 无法解析 Fiscal Quarter: Morgan Stanley. (MS) Management on Bank of America Future of Financials Virtual Conference 2020 - Earnings Call Transcript.txt
⚠️ 无法解析 Fiscal Quarter: The PNC Financial Services Group, Inc. (PNC) Q2 2024 Earnings Call Transcript.txt
共处理 transcript 文件数: 612
写入 segment 记录数: 1218
⚠️ 失败文件数: 2
前5个失败文件: ['Morgan Stanley. (MS) Management on Bank of America Future of Financials Virtual Conference 2020 - Earnings Call Transcript.txt', 'The PNC Financial Services Group, Inc. (PNC) Q2 2024 Earnings Call Transcript.txt']


In [43]:
# 4. 基本文本特征（长度、pronoun、adverb）

# Plural personal pronouns (first and third person); numerator of the
# plural-pronoun ratio.
PRONOUNS_PLURAL = set("we us our ours they them their theirs".split())

# Every personal pronoun that is counted; denominator of the ratio.
PRONOUNS_ALL = PRONOUNS_PLURAL | set(
    "i me my mine you your yours he him his she her hers it its".split()
)

# Adverbs counted explicitly, in addition to the "-ly" suffix heuristic.
COMMON_ADVERBS = set(
    (
        "very quite rather extremely highly significantly "
        "substantially slightly barely rarely frequently "
        "usually typically generally probably possibly "
        "certainly clearly obviously"
    ).split()
)

# A word is a run of ASCII letters, optionally with internal apostrophes
# ("we're", "don't"). Apostrophes at the edges are not part of the word, so
# quoted words ('strong') and stray quote marks do not produce junk tokens.
WORD_PATTERN = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*")


def tokenize_words(text: str):
    """Return the lower-cased word tokens of ``text`` in order of appearance.

    Digits, punctuation and non-ASCII characters act as separators.
    """
    return WORD_PATTERN.findall(text.lower())


def count_sentences(text: str):
    """Count the non-blank pieces left after splitting on runs of ., ! or ?.

    This is a plain punctuation split, so a decimal point or an abbreviation
    dot also starts a new piece.
    """
    pieces = re.split(r"[.!?]+", text)
    non_blank = [piece for piece in pieces if piece.strip()]
    return len(non_blank)


def basic_features(text: str):
    """Compute length, pronoun and adverb features for one segment.

    Returns a dict with:
        n_words: number of word tokens.
        n_sentences: sentence count from ``count_sentences``.
        avg_words_per_sentence: n_words / n_sentences (0.0 without sentences).
        pronoun_plural_ratio: share of counted pronouns that are plural
            (0.0 when the text has no counted pronoun).
        adverb_ratio: share of tokens that are in ``COMMON_ADVERBS`` or are
            longer than three characters and end in "ly" (0.0 for no tokens).
    """
    words = tokenize_words(text)
    word_total = len(words)
    sentence_total = count_sentences(text)

    pronoun_total = 0
    plural_total = 0
    adverb_total = 0
    for word in words:
        if word in PRONOUNS_ALL:
            pronoun_total += 1
            if word in PRONOUNS_PLURAL:
                plural_total += 1
        # Heuristic: listed adverbs plus anything with an "-ly" suffix.
        if word in COMMON_ADVERBS or (len(word) > 3 and word.endswith("ly")):
            adverb_total += 1

    if sentence_total > 0:
        words_per_sentence = word_total / sentence_total
    else:
        words_per_sentence = 0.0

    if pronoun_total > 0:
        plural_share = plural_total / pronoun_total
    else:
        plural_share = 0.0

    if word_total > 0:
        adverb_share = adverb_total / word_total
    else:
        adverb_share = 0.0

    return {
        "n_words": word_total,
        "n_sentences": sentence_total,
        "avg_words_per_sentence": words_per_sentence,
        "pronoun_plural_ratio": plural_share,
        "adverb_ratio": adverb_share,
    }

In [44]:
# 5. textstat 可读性特征

def readability_features(text: str):
    """Compute the seven textstat readability scores for ``text``.

    Each score is computed independently; if a scorer raises, that score is
    stored as None and the remaining ones are still computed.

    Returns:
        Dict with keys ari, coleman_liau, dale_chall, flesch_ease,
        flesch_kincaid, gunning_fog and smog.
    """
    scorers = (
        ("ari", automated_readability_index),
        ("coleman_liau", coleman_liau_index),
        ("dale_chall", dale_chall_readability_score),
        ("flesch_ease", flesch_reading_ease),
        ("flesch_kincaid", flesch_kincaid_grade),
        ("gunning_fog", gunning_fog),
        ("smog", smog_index),
    )

    scores = {}
    for key, scorer in scorers:
        try:
            scores[key] = scorer(text)
        except Exception:
            # Best-effort: one failing metric must not lose the others.
            scores[key] = None
    return scores

In [45]:
# 6. LM 字典特征（需要 data/LM/LM_MasterDictionary.csv）

# Location of the LM master dictionary CSV.
LM_CSV_PATH = PROJECT_ROOT / "data" / "LM" / "LM_MasterDictionary.csv"

if LM_CSV_PATH.exists():
    lm_df = pd.read_csv(LM_CSV_PATH)
    # Tokens are lower-cased before lookup, so lower-case the dictionary too.
    lm_df["Word"] = lm_df["Word"].astype(str).str.lower()

    def build_lm_set(col):
        """Return the set of words whose value in column ``col`` is > 0.

        A missing column yields an empty set.
        """
        if col not in lm_df.columns:
            return set()
        # NOTE(review): assumes a positive entry marks membership in the
        # category — confirm against the CSV's column documentation.
        return set(lm_df.loc[lm_df[col] > 0, "Word"])

    LM_POSITIVE = build_lm_set("Positive")
    LM_NEGATIVE = build_lm_set("Negative")
    LM_UNCERTAINTY = build_lm_set("Uncertainty")
    LM_LITIGIOUS = build_lm_set("Litigious")
    LM_SUPERFLUOUS = build_lm_set("Superfluous")
    LM_INTERESTING = build_lm_set("Interesting")
    LM_MODAL_WEAK = build_lm_set("ModalWeak")
    LM_MODAL_MODERATE = build_lm_set("ModalModerate")
    LM_MODAL_STRONG = build_lm_set("ModalStrong")
    LM_CONSTRAINING = build_lm_set("Constraining")
    LM_COMPLEXITY = build_lm_set("Complexity")

    print("LM 词典加载完成")
else:
    print("⚠️ 未找到 LM_MasterDictionary.csv，LM 特征全部为 0")
    # Give every category its own empty set; a chained assignment would make
    # all names refer to one shared object.
    LM_POSITIVE = set()
    LM_NEGATIVE = set()
    LM_UNCERTAINTY = set()
    LM_LITIGIOUS = set()
    LM_SUPERFLUOUS = set()
    LM_INTERESTING = set()
    LM_MODAL_WEAK = set()
    LM_MODAL_MODERATE = set()
    LM_MODAL_STRONG = set()
    LM_CONSTRAINING = set()
    LM_COMPLEXITY = set()


def lm_features(text: str):
    """Compute LM-dictionary word-share features for one segment.

    Each category feature is the number of tokens found in that category's
    word set divided by the token count (the divisor is 1 for empty text, so
    every share is then 0.0). Three combinations are added:

        lm_net_sentiment = positive - negative
        lm_polarity      = positive + negative
        lm_subjectivity  = positive + negative + uncertainty + litigious
                           + superfluous + interesting

    The key "lm_litigous" (sic) matches the database column name.
    """
    words = tokenize_words(text)
    denominator = len(words) or 1

    def share(vocabulary):
        hits = 0
        for word in words:
            if word in vocabulary:
                hits += 1
        return hits / denominator

    categories = (
        ("lm_positive", LM_POSITIVE),
        ("lm_negative", LM_NEGATIVE),
        ("lm_uncertainty", LM_UNCERTAINTY),
        ("lm_litigous", LM_LITIGIOUS),
        ("lm_superfluous", LM_SUPERFLUOUS),
        ("lm_interesting", LM_INTERESTING),
        ("lm_modal_weak", LM_MODAL_WEAK),
        ("lm_modal_moderate", LM_MODAL_MODERATE),
        ("lm_modal_strong", LM_MODAL_STRONG),
        ("lm_constraining", LM_CONSTRAINING),
        ("lm_complexity", LM_COMPLEXITY),
    )
    features = {key: share(vocabulary) for key, vocabulary in categories}

    positive = features["lm_positive"]
    negative = features["lm_negative"]
    features["lm_net_sentiment"] = positive - negative
    features["lm_polarity"] = positive + negative
    features["lm_subjectivity"] = (
        positive
        + negative
        + features["lm_uncertainty"]
        + features["lm_litigous"]
        + features["lm_superfluous"]
        + features["lm_interesting"]
    )
    return features

LM 词典加载完成


In [46]:
# 7. 计算并写入所有 segment 的特征

def compute_and_store_features(db_path: Path, batch_size: int = 200):
    """Compute text features for every segment that has none yet and store them.

    Segments without a matching ``segment_features`` row are selected, the
    basic, readability and LM features are computed for each, and the rows
    are appended to ``segment_features`` in batches. Progress is printed
    after each full batch. The connection is closed even if feature
    computation or a write fails.

    Args:
        db_path: Location of the SQLite database file.
        batch_size: Number of feature rows buffered before each write.
    """
    conn = sqlite3.connect(db_path)
    try:
        rows = conn.execute(
            """
            SELECT s.id, s.text_content
            FROM segments s
            LEFT JOIN segment_features f ON s.id = f.segment_id
            WHERE f.segment_id IS NULL
            """
        ).fetchall()
        total = len(rows)

        print("需要计算特征的 segment 数量:", total)

        def flush(records):
            # Append one batch; column names come from the dict keys.
            pd.DataFrame(records).to_sql(
                "segment_features", conn, if_exists="append", index=False
            )

        pending = []
        for idx, (seg_id, text) in enumerate(rows, 1):
            text = text or ""  # a NULL text_content is treated as empty text
            pending.append(
                {
                    "segment_id": seg_id,
                    **basic_features(text),
                    **readability_features(text),
                    **lm_features(text),
                }
            )

            if len(pending) >= batch_size:
                flush(pending)
                print(f"已写入 {idx} / {total}")
                pending = []

        if pending:
            flush(pending)

        # Make persistence explicit instead of relying on to_sql's own commit.
        conn.commit()
    finally:
        conn.close()

    print("全部特征写入完成")


# Compute features for all segments that do not have a feature row yet.
compute_and_store_features(DB_PATH)

需要计算特征的 segment 数量: 1218
已写入 200 / 1218
已写入 400 / 1218
已写入 600 / 1218
已写入 800 / 1218
已写入 1000 / 1218
已写入 1200 / 1218
全部特征写入完成
