# 3.4 Sentiment Scores

从 `earnings_calls.db` 读取 segment 的 content，使用 **Loughran-McDonald (LM) 词典** 计算情感特征，追加到 `earnings_calls_features.db` 的 `segments_features` 表。

**实现**：正则分词，LM 词典 `data/LM/LM_MasterDictionary.csv`。

**依赖**：若 `segments_features` 已存在（3.2/3.3）则追加 LM；否则从 segments 创建。

**特征**：
- lm_positive, lm_negative, lm_uncertainty, lm_litigious
- lm_modal_weak, lm_modal_strong, lm_constraining, lm_complexity
- lm_net_sentiment = pos − neg
- lm_polarity = (pos − neg) / (pos + neg + ε)
- lm_subjectivity = pos + neg + unc + lit
- bert_sentiment_mean（第 7 节写入的列：DistilBERT 句子级情感分数的均值，取值范围 [-1, 1]，POS 为正，NEG 为负；此前文档中记为 distilbert_sentiment_score）

In [1]:
# ========== 配置 ==========

import re
import sqlite3
from pathlib import Path

import pandas as pd

# All paths are anchored at the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
SOURCE_DB = PROJECT_ROOT.joinpath("data", "earnings_calls.db")
OUTPUT_DB = PROJECT_ROOT.joinpath("data", "earnings_calls_features.db")
LM_CSV_PATH = PROJECT_ROOT.joinpath("data", "LM", "LM_MasterDictionary.csv")

# Added to the polarity denominator so that pos + neg == 0 does not divide by zero.
EPSILON = 1e-10

# Echo the resolved locations so a wrong working directory is spotted early.
print(f"SOURCE_DB: {SOURCE_DB}")
print(f"OUTPUT_DB: {OUTPUT_DB}")
print(f"LM_CSV_PATH: {LM_CSV_PATH}")

SOURCE_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls.db
OUTPUT_DB: /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db
LM_CSV_PATH: /Users/xinyuewang/Desktop/1.27/data/LM/LM_MasterDictionary.csv


In [2]:
# ========== 1. Load the LM dictionary & regex tokenization ==========

# A token is a maximal run of ASCII letters and apostrophes (e.g. "don't").
# Digits and every other character act as separators.
WORD_PATTERN = re.compile(r"[A-Za-z']+")


def tokenize_words(text: str):
    """Split *text* into lowercase word tokens.

    Args:
        text: Raw text. Anything that is not a ``str`` (``None``, or the
            float NaN that pandas uses for a missing value after a left
            merge) is treated as empty text.

    Returns:
        A list of lowercase tokens in order of appearance; ``[]`` when there
        is nothing to tokenize.
    """
    # NaN is truthy, so the previous `text or ""` guard let it through and the
    # regex call raised TypeError. Check the type explicitly instead.
    if not isinstance(text, str):
        return []
    return [m.group(0).lower() for m in WORD_PATTERN.finditer(text)]


def build_lm_set(lm_df, col: str):
    """Collect the lowercase words flagged for one dictionary category.

    Args:
        lm_df: DataFrame with a ``"Word"`` column and one column per category.
        col: Name of the category column to read.

    Returns:
        The set of lowercased ``"Word"`` values whose entry in *col* is
        greater than zero, or an empty set when *col* is not a column of
        *lm_df*.
    """
    if col in lm_df.columns:
        flagged = lm_df[col] > 0
        words = lm_df.loc[flagged, "Word"]
        return set(words.astype(str).str.lower().tolist())
    return set()


# Only empty cells are treated as missing. pandas' default NA markers include
# ordinary-looking strings such as "NULL" and "NA"; a dictionary word spelled
# that way would be parsed as NaN and then become the bogus token "nan" in the
# string conversion below.
# NOTE(review): assumes the category columns hold plain numbers - confirm
# against the CSV.
lm_df = pd.read_csv(LM_CSV_PATH, keep_default_na=False, na_values=[""])
lm_df["Word"] = lm_df["Word"].astype(str).str.lower()

# One lowercase word set per LM category; a category column that is absent
# from the CSV yields an empty set (see build_lm_set).
LM_POSITIVE = build_lm_set(lm_df, "Positive")
LM_NEGATIVE = build_lm_set(lm_df, "Negative")
LM_UNCERTAINTY = build_lm_set(lm_df, "Uncertainty")
LM_LITIGIOUS = build_lm_set(lm_df, "Litigious")
LM_MODAL_WEAK = build_lm_set(lm_df, "Weak_Modal")
LM_MODAL_STRONG = build_lm_set(lm_df, "Strong_Modal")
LM_CONSTRAINING = build_lm_set(lm_df, "Constraining")
LM_COMPLEXITY = build_lm_set(lm_df, "Complexity")

print(f"LM 词典加载完成: Positive={len(LM_POSITIVE)}, Negative={len(LM_NEGATIVE)}, ...")

LM 词典加载完成: Positive=347, Negative=2345, ...


In [3]:
# ========== 2. LM sentiment feature computation ==========

def compute_lm_features(text: str) -> dict:
    """Compute Loughran-McDonald word-list features for one text.

    Each category feature is the fraction of tokens found in that category's
    word set. When the text has no tokens the denominator is 1, so every
    feature is 0.

    Args:
        text: Raw text, tokenized with ``tokenize_words``.

    Returns:
        A dict with the eight category ratios plus ``lm_net_sentiment``
        (pos - neg), ``lm_polarity`` ((pos - neg) / (pos + neg + EPSILON))
        and ``lm_subjectivity`` (pos + neg + uncertainty + litigious).
    """
    words = tokenize_words(text)
    denom = len(words) or 1

    # Output key -> word set, listed in the order the keys must appear.
    categories = (
        ("lm_positive", LM_POSITIVE),
        ("lm_negative", LM_NEGATIVE),
        ("lm_uncertainty", LM_UNCERTAINTY),
        ("lm_litigious", LM_LITIGIOUS),
        ("lm_modal_weak", LM_MODAL_WEAK),
        ("lm_modal_strong", LM_MODAL_STRONG),
        ("lm_constraining", LM_CONSTRAINING),
        ("lm_complexity", LM_COMPLEXITY),
    )

    features = {}
    for key, lexicon in categories:
        hits = 0
        for word in words:
            if word in lexicon:
                hits += 1
        features[key] = hits / denom

    pos = features["lm_positive"]
    neg = features["lm_negative"]
    balance = pos - neg

    features["lm_net_sentiment"] = balance
    features["lm_polarity"] = balance / (pos + neg + EPSILON)
    features["lm_subjectivity"] = (
        pos + neg + features["lm_uncertainty"] + features["lm_litigious"]
    )
    return features

In [4]:
# ========== 3. Load data: append LM to segments_features if it exists, otherwise build from segments ==========

# try/finally releases each SQLite handle even when a query raises;
# previously a failed read left the connection open.
conn_src = sqlite3.connect(SOURCE_DB)
try:
    df_segments_full = pd.read_sql_query(
        "SELECT id, ticker, quarter, section, timestamp, url, source_file, content FROM segments",
        conn_src,
    )
finally:
    conn_src.close()

# One connection serves both the existence check and the table read.
conn_out = sqlite3.connect(OUTPUT_DB)
try:
    cur = conn_out.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='segments_features'")
    table_exists = cur.fetchone() is not None
    if table_exists:
        df_features = pd.read_sql_query("SELECT * FROM segments_features", conn_out)
finally:
    conn_out.close()

if table_exists:
    # Left merge keeps every feature row; a row whose id is missing from
    # segments gets NaN content.
    df_merged = df_features.merge(df_segments_full[["id", "content"]], on="id", how="left")
    print(f"segments_features 已存在，读取 {len(df_features)} 行，追加 LM 特征")
else:
    df_merged = df_segments_full.copy()
    print(f"segments_features 不存在，从 segments 创建基础表 + LM 特征（{len(df_merged)} 行）")

print(f"合并 content 后: {len(df_merged)} 行")

segments_features 已存在，读取 2374 行，追加 LM 特征
合并 content 后: 2374 行


In [5]:
# ========== 4. Compute LM features and append them as columns ==========

lm_rows = []
total_rows = len(df_merged)
# Count by position rather than by index label so the progress output stays
# correct even if df_merged does not carry a 0..n-1 index.
for done, content in enumerate(df_merged["content"], start=1):
    lm_rows.append(compute_lm_features(content))
    if done % 200 == 0:
        print(f"已计算 {done} / {total_rows}")

df_lm = pd.DataFrame(lm_rows)
# Besides the raw text, drop any LM columns already present (the table was
# produced by an earlier run). Otherwise the concat below creates duplicate
# column names and the subsequent to_sql call fails.
df_merged = df_merged.drop(columns=["content", *df_lm.columns], errors="ignore")
# Both frames are aligned on a fresh positional index before the column-wise concat.
df_out = pd.concat([df_merged.reset_index(drop=True), df_lm], axis=1)
print(f"合并完成，共 {len(df_out)} 行")

已计算 200 / 2374
已计算 400 / 2374
已计算 600 / 2374
已计算 800 / 2374
已计算 1000 / 2374
已计算 1200 / 2374
已计算 1400 / 2374
已计算 1600 / 2374
已计算 1800 / 2374
已计算 2000 / 2374
已计算 2200 / 2374
合并完成，共 2374 行


In [6]:
# ========== 5. Write back to earnings_calls_features.db ==========

conn_out = sqlite3.connect(OUTPUT_DB)
try:
    # if_exists="replace" drops and recreates the table with the new column set.
    df_out.to_sql("segments_features", conn_out, if_exists="replace", index=False)
finally:
    # Close the handle even if the write fails.
    conn_out.close()

print(f"已更新 {OUTPUT_DB}")
print(f"表 segments_features: {len(df_out)} 行，{len(df_out.columns)} 列")

已更新 /Users/xinyuewang/Desktop/1.27/data/earnings_calls_features.db
表 segments_features: 2374 行，31 列


In [7]:
# ========== 6. Preview ==========

# Column names come from df_out (our own feature keys), not from user input,
# so interpolating them into the query text is safe here.
lm_cols = [c for c in df_out.columns if c.startswith("lm_")]
conn = sqlite3.connect(OUTPUT_DB)
try:
    preview = pd.read_sql_query(
        f"SELECT id, ticker, quarter, section, {', '.join(lm_cols)} FROM segments_features LIMIT 5",
        conn,
    )
finally:
    # Close the handle even if the query fails.
    conn.close()
preview

Unnamed: 0,id,ticker,quarter,section,lm_positive,lm_negative,lm_uncertainty,lm_litigious,lm_modal_weak,lm_modal_strong,lm_constraining,lm_complexity,lm_net_sentiment,lm_polarity,lm_subjectivity
0,1,AAPL,2017-Q1,Prepared Remarks,0.025591,0.003563,0.003563,0.0,0.002592,0.004535,0.00162,0.001944,0.022028,0.755556,0.032718
1,2,AAPL,2017-Q1,Q&A,0.016356,0.009226,0.007968,0.001887,0.005452,0.004403,0.000839,0.001887,0.007129,0.278689,0.035437
2,3,AAPL,2017-Q2,Prepared Remarks,0.026176,0.002353,0.004412,0.0,0.003235,0.003235,0.001471,0.003529,0.023824,0.835052,0.032941
3,4,AAPL,2017-Q2,Q&A,0.010514,0.008457,0.011657,0.001829,0.006171,0.003657,0.000914,0.000686,0.002057,0.108434,0.032457
4,5,AAPL,2018-Q2,Prepared Remarks,0.021046,0.002631,0.0076,0.000292,0.004385,0.006139,0.000877,0.002923,0.018416,0.777778,0.03157


In [10]:
# ========== 7. 覆盖版：新增 bert_sentiment_mean（句子级 POS-NEG 均值） ==========
# 从这里直接运行即可（不依赖上面旧的 DistilBERT cell）

import re
import sqlite3
from pathlib import Path

import pandas as pd

# Paths are redefined here so this cell can be run on its own.
PROJECT_ROOT = Path("..").resolve()
SOURCE_DB = PROJECT_ROOT.joinpath("data", "earnings_calls.db")
OUTPUT_DB = PROJECT_ROOT.joinpath("data", "earnings_calls_features.db")

# Model id and inference settings handed to the transformers pipeline.
DISTILBERT_MODEL = "distilbert-base-uncased-finetuned-sst-2-english"
DISTILBERT_BATCH_SIZE = 32
DISTILBERT_MAX_LENGTH = 512
# Split on whitespace that directly follows a '.', '!' or '?'.
SENT_SPLIT_PATTERN = re.compile(r"(?<=[.!?])\s+")


def split_sentences(text: str):
    """Break *text* into sentences at whitespace following '.', '!' or '?'.

    Args:
        text: Raw text; ``None`` or a blank string yields no sentences.

    Returns:
        A list of stripped, non-empty sentence strings. If splitting leaves
        nothing, the stripped text itself is returned as the only element.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return []

    pieces = []
    for chunk in SENT_SPLIT_PATTERN.split(cleaned):
        chunk = chunk.strip()
        if chunk:
            pieces.append(chunk)
    return pieces or [cleaned]


def compute_bert_sentiment_mean(texts):
    """Return one sentence-averaged sentiment score per input text.

    Each text is split with ``split_sentences``, every sentence is classified
    by the ``DISTILBERT_MODEL`` sentiment pipeline, and the signed sentence
    scores are averaged. A sentence labelled positive with probability ``p``
    scores ``2p - 1``; any other label scores ``1 - 2p``.

    Args:
        texts: Sized sequence of strings (``len(texts)`` is used for progress
            reporting).

    Returns:
        A list of floats, one per text and in the same order. A text with no
        sentences gets ``0.0``.

    Raises:
        ImportError: If the ``transformers`` pipeline cannot be imported.
    """
    try:
        from transformers import pipeline
    except Exception as e:
        # Deliberately broad: any import-time failure is reported to the
        # caller as ImportError together with the install hint.
        raise ImportError("请先安装依赖: pip install transformers torch") from e

    # The pipeline is built once per call and reused for every text.
    clf = pipeline(
        "sentiment-analysis",
        model=DISTILBERT_MODEL,
        tokenizer=DISTILBERT_MODEL,
    )

    out_scores = []
    total = len(texts)

    for done, txt in enumerate(texts, start=1):
        sents = split_sentences(txt)
        if sents:
            preds = clf(
                sents,
                batch_size=DISTILBERT_BATCH_SIZE,
                truncation=True,
                max_length=DISTILBERT_MAX_LENGTH,
            )

            sent_scores = []
            for p in preds:
                label = str(p.get("label", "")).upper()
                prob = float(p.get("score", 0.0))
                # POS-NEG, in [-1, 1]
                if "POS" in label:
                    score = (2.0 * prob) - 1.0
                else:
                    score = 1.0 - (2.0 * prob)
                sent_scores.append(score)

            out_scores.append(float(sum(sent_scores) / len(sent_scores)))
        else:
            # Nothing to classify: record a neutral score.
            out_scores.append(0.0)

        # Progress is reported for every text, including empty ones; the old
        # `continue` skipped this and could lose the final "N / N" line.
        if done % 100 == 0 or done == total:
            print(f"bert_sentiment_mean 已计算 {done} / {total}")

    return out_scores


# 1) Read the segment contents (id + content)
conn_src = sqlite3.connect(SOURCE_DB)
try:
    df_segments = pd.read_sql_query("SELECT id, content FROM segments", conn_src)
finally:
    # Release the handle even if the query fails.
    conn_src.close()

# 2) Read the current features table
conn_out = sqlite3.connect(OUTPUT_DB)
try:
    df_feat = pd.read_sql_query("SELECT * FROM segments_features", conn_out)
finally:
    conn_out.close()

# 3) Align content with the feature rows and compute bert_sentiment_mean
df_work = df_feat.merge(df_segments, on="id", how="left")
# Feature rows with no matching segment get NaN content; score them as empty text.
texts = df_work["content"].fillna("").astype(str).tolist()
bert_mean = compute_bert_sentiment_mean(texts)

if len(bert_mean) != len(df_work):
    raise ValueError("bert_sentiment_mean 输出长度与样本数不一致")

# 4) Add the new column and persist it (the table is overwritten)
df_work["bert_sentiment_mean"] = bert_mean
# The raw text was only needed for scoring; it is not stored in the features table.
df_write = df_work.drop(columns=["content"], errors="ignore")

conn_out = sqlite3.connect(OUTPUT_DB)
try:
    df_write.to_sql("segments_features", conn_out, if_exists="replace", index=False)
finally:
    # Close the handle even if the write fails.
    conn_out.close()

print("已写入列: bert_sentiment_mean")
print("行列:", df_write.shape)

# 5) Preview
conn_out = sqlite3.connect(OUTPUT_DB)
try:
    preview = pd.read_sql_query(
        "SELECT id, ticker, quarter, section, bert_sentiment_mean FROM segments_features LIMIT 5",
        conn_out,
    )
finally:
    # Close the handle even if the query fails.
    conn_out.close()
preview

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



: 

In [9]:
pip install transformers torch

Collecting transformers
  Downloading transformers-5.2.0-py3-none-any.whl.metadata (32 kB)
Collecting torch
  Downloading torch-2.10.0-2-cp312-none-macosx_11_0_arm64.whl.metadata (31 kB)
Collecting huggingface-hub<2.0,>=1.3.0 (from transformers)
  Downloading huggingface_hub-1.4.1-py3-none-any.whl.metadata (13 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.2-cp39-abi3-macosx_11_0_arm64.whl.metadata (7.3 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.7.0-cp38-abi3-macosx_11_0_arm64.whl.metadata (4.1 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=1.3.0->transformers)
  Downloading hf_xet-1.2.0-cp37-abi3-macosx_11_0_arm64.whl.metadata (4.9 kB)
Downloading transformers-5.2.0-py3-none-any.whl (10.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.4/10.4 MB[0m [31m11.4 MB/s[0