# 3.3 Document Attributes

从 `earnings_calls.db` 读取 segment 的 content，计算文档属性特征，追加到 `earnings_calls_features.db` 的 `segments_features` 表。

**依赖**：需先运行 3.2 生成 `earnings_calls_features.db`。

**实现**：正则分词、句子切分（按 .!? 分割），副词用 -ly 后缀启发式。

**特征**：
- n_words：词数
- n_sentences：句子数
- words_per_sentence：句均词数 (words/sentences)
- pronoun_plural_ratio：复数代词占所有代词比例
- adverb_ratio：副词比例（以 -ly 结尾的词）

In [1]:
# ========== 配置 ==========

import re
import sqlite3
from pathlib import Path

import pandas as pd

PROJECT_ROOT = Path("..").resolve()
SOURCE_DB = PROJECT_ROOT / "data" / "earnings_calls.db"
OUTPUT_DB = PROJECT_ROOT / "data" / "earnings_calls_features.db"

print("SOURCE_DB:", SOURCE_DB)
print("OUTPUT_DB:", OUTPUT_DB)

SOURCE_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls.db
OUTPUT_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db


In [2]:
# ========== 1. 文档属性计算函数（正则分词、按 .!? 切句、副词 -ly 启发式）==========

WORD_PATTERN = re.compile(r"[A-Za-z']+")
SENTENCE_PATTERN = re.compile(r"[.!?]+")


def tokenize_words(text: str):
    return [m.group(0).lower() for m in WORD_PATTERN.finditer(text or "")]


def tokenize_sentences(text: str):
    parts = SENTENCE_PATTERN.split(text or "")
    return [s.strip() for s in parts if s.strip()]


PRONOUNS_ALL = {
    "i", "me", "my", "mine",
    "you", "your", "yours",
    "he", "him", "his",
    "she", "her", "hers",
    "it", "its",
    "we", "us", "our", "ours",
    "they", "them", "their", "theirs",
}

PRONOUNS_PLURAL = {
    "we", "us", "our", "ours",
    "they", "them", "their", "theirs",
}


def compute_doc_attributes(text: str) -> dict:
    """从文本计算 n_words, n_sentences, words_per_sentence, pronoun_plural_ratio, adverb_ratio"""
    tokens = tokenize_words(text)
    n_words = len(tokens)
    sentences = tokenize_sentences(text)
    n_sent = len(sentences)
    words_per_sentence = n_words / n_sent if n_sent > 0 else 0.0

    pronouns = [w for w in tokens if w in PRONOUNS_ALL]
    pronouns_plural = [w for w in pronouns if w in PRONOUNS_PLURAL]
    pronoun_plural_ratio = len(pronouns_plural) / len(pronouns) if len(pronouns) > 0 else 0.0

    adverb_count = sum(1 for w in tokens if len(w) > 2 and w.endswith("ly"))
    adverb_ratio = adverb_count / n_words if n_words > 0 else 0.0

    return {
        "n_words": n_words,
        "n_sentences": n_sent,
        "words_per_sentence": words_per_sentence,
        "pronoun_plural_ratio": pronoun_plural_ratio,
        "adverb_ratio": adverb_ratio,
    }

In [3]:
# ========== 2. 读取数据：若有 segments_features 则追加，否则从 segments 创建 ==========

conn_src = sqlite3.connect(SOURCE_DB)
df_segments_full = pd.read_sql_query(
    "SELECT id, ticker, quarter, section, timestamp, url, source_file, content FROM segments",
    conn_src
)
conn_src.close()

conn_out = sqlite3.connect(OUTPUT_DB)
cur = conn_out.cursor()
cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='segments_features'")
table_exists = cur.fetchone() is not None
conn_out.close()

if table_exists:
    conn_out = sqlite3.connect(OUTPUT_DB)
    df_features = pd.read_sql_query("SELECT * FROM segments_features", conn_out)
    conn_out.close()
    df_merged = df_features.merge(df_segments_full[["id", "content"]], on="id", how="left")
    print(f"segments_features 已存在，读取 {len(df_features)} 行，追加文档属性")
else:
    df_merged = df_segments_full.copy()
    print(f"segments_features 不存在，从 segments 创建基础表 + 文档属性（{len(df_merged)} 行）")

print(f"合并 content 后: {len(df_merged)} 行")

segments_features 已存在，读取 2374 行，追加文档属性
合并 content 后: 2374 行


In [4]:
# ========== 3. 计算文档属性并追加列 ==========

doc_attr_rows = []
for idx, row in df_merged.iterrows():
    feat = compute_doc_attributes(row["content"] or "")
    doc_attr_rows.append(feat)
    if (idx + 1) % 200 == 0:
        print(f"已计算 {idx + 1} / {len(df_merged)}")

df_doc = pd.DataFrame(doc_attr_rows)
df_merged = df_merged.drop(columns=["content"])
df_out = pd.concat([df_merged, df_doc], axis=1)
print(f"合并完成，共 {len(df_out)} 行")

已计算 200 / 2374
已计算 400 / 2374
已计算 600 / 2374
已计算 800 / 2374
已计算 1000 / 2374
已计算 1200 / 2374
已计算 1400 / 2374
已计算 1600 / 2374
已计算 1800 / 2374
已计算 2000 / 2374
已计算 2200 / 2374
合并完成，共 2374 行


In [5]:
# ========== 4. 写回 earnings_calls_features.db ==========

conn_out = sqlite3.connect(OUTPUT_DB)
df_out.to_sql("segments_features", conn_out, if_exists="replace", index=False)
conn_out.close()

print(f"已更新 {OUTPUT_DB}")
print(f"表 segments_features: {len(df_out)} 行，{len(df_out.columns)} 列")

已更新 /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db
表 segments_features: 2374 行，20 列


In [6]:
# ========== 5. 预览 ==========

conn = sqlite3.connect(OUTPUT_DB)
preview = pd.read_sql_query(
    "SELECT id, ticker, quarter, section, n_words, n_sentences, words_per_sentence, pronoun_plural_ratio, adverb_ratio FROM segments_features LIMIT 5",
    conn
)
conn.close()
preview

Unnamed: 0,id,ticker,quarter,section,n_words,n_sentences,words_per_sentence,pronoun_plural_ratio,adverb_ratio
0,1,AAPL,2017-Q1,Prepared Remarks,3087,219,14.09589,0.748571,0.01231
1,2,AAPL,2017-Q1,Q&A,4769,337,14.151335,0.426859,0.016775
2,3,AAPL,2017-Q2,Prepared Remarks,3400,239,14.225941,0.905759,0.01
3,4,AAPL,2017-Q2,Q&A,4375,307,14.250814,0.446281,0.019429
4,5,AAPL,2018-Q2,Prepared Remarks,3421,215,15.911628,0.884793,0.0114
