In [1]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import html

FILES_DIR = "Files"

# Example filename formats:
# 1) 2003-Dec-08-Acxiom LLC-140477246411-Brief.txt   (company name)
# 2) 2011-Jun-22-XRAY.OQ-138268919384-Brief.txt     (RIC)

ric_pattern = re.compile(r"^\d{4}-[A-Za-z]{3}-\d{2}-([A-Z0-9\.^]+)-\d+-Brief\.txt$")
date_pattern = re.compile(r"^(\d{4}-[A-Za-z]{3}-\d{2})-")

def parse_file_date(fname):
    m = date_pattern.match(fname)
    if not m:
        return None
    return pd.to_datetime(m.group(1), format="%Y-%b-%d", errors="coerce")

def parse_ric(fname):
    m = ric_pattern.match(fname)
    return m.group(1) if m else None

rows = []
for fname in os.listdir(FILES_DIR):
    if not fname.endswith(".txt"):
        continue
    fpath = os.path.join(FILES_DIR, fname)
    with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
        txt = f.read()

    rows.append({
        "file_name": fname,
        "file_date": parse_file_date(fname),
        "ric_from_fname": parse_ric(fname),
        "raw_text": txt
    })

texts = pd.DataFrame(rows)
print("Loaded files:", len(texts))
texts.head()

Loaded files: 2549


Unnamed: 0,file_name,file_date,ric_from_fname,raw_text
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,\n\nRefinitiv StreetEvents Event Brief\nE D I ...


In [2]:
def extract_qa_raw(t: str) -> str:
    """
    Extract everything after 'QUESTIONS AND ANSWERS' (Q&A section).
    Returns empty string if Q&A header not found.
    """
    if not isinstance(t, str):
        return ""
    # decode HTML entities (&amp; -> &)
    t = html.unescape(t)

    # try to extract Q&A section
    m = re.search(r"QUESTIONS AND ANSWERS(.*)$", t, flags=re.IGNORECASE | re.DOTALL)
    return m.group(1) if m else ""


def clean_qa(t: str) -> str:
    if not isinstance(t, str) or not t:
        return ""

    # remove Sync markers like <Sync id="L91"/>
    t = re.sub(r"<Sync id=\"L\d+\"/>\s*", " ", t)

    # remove disclaimer if it appears inside Q&A text
    t = re.split(r"Disclaimer\s*-{5,}", t, flags=re.IGNORECASE)[0]

    # remove repetitive separators and extra whitespace
    t = re.sub(r"={5,}", " ", t)
    t = re.sub(r"-{5,}", " ", t)
    t = re.sub(r"\s+", " ", t).strip()

    return t


# build QA only text column
texts["qa_text"] = texts["raw_text"].apply(extract_qa_raw).apply(clean_qa)

# basic sanity checks
texts["n_chars"] = texts["qa_text"].str.len()
texts["n_words"] = texts["qa_text"].str.split().str.len()

print("Docs with Q&A found:", (texts["n_words"] > 0).sum(), "/", len(texts))
texts[["file_name","file_date","ric_from_fname","n_words"]].head(10)

Docs with Q&A found: 2494 / 2549


Unnamed: 0,file_name,file_date,ric_from_fname,n_words
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,5136
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,1888
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,6565
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,9704
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,6220
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,2003-12-19,SSP.OQ,5075
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,2003-12-22,EXE.OQ,6566
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,2003-12-23,KTOS.OQ,6446
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,2003-12-24,,7756
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,2003-12-30,FDX.N,6396


In [3]:
texts["n_words"].describe()

count     2549.000000
mean      4821.017654
std       2184.905895
min          0.000000
25%       3344.000000
50%       4707.000000
75%       6187.000000
max      17678.000000
Name: n_words, dtype: float64

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=2,
    max_df=0.9,
    ngram_range=(1,2)
)

X_tfidf = tfidf.fit_transform(texts["qa_text"].fillna(""))
print("TF-IDF shape:", X_tfidf.shape)

TF-IDF shape: (2549, 512110)


In [5]:
# simple indicators
texts["num_count"] = texts["qa_text"].str.count(r"\b\d+(\.\d+)?\b")
texts["pct_count"] = texts["qa_text"].str.count(r"\b\d+(\.\d+)?\s*%")
texts["money_count"] = texts["qa_text"].str.count(r"(\$|USD|EUR|€)\s*\d")

# keywords — more "skeptical / risk" focused
keywords = {
    "synergy": r"\bsynerg(y|ies)\b",
    "accretive": r"\baccretive\b",
    "dilutive": r"\bdilutive\b",
    "debt": r"\bdebt\b|\bleverage\b",
    "integration": r"\bintegration\b",
    "guidance": r"\bguidance\b|\boutlook\b",
    "growth": r"\bgrowth\b",
    "risk": r"\brisk(s)?\b",
    "concern": r"\bconcern(s)?\b",
    "uncertain": r"\buncertain(ty)?\b",
    "delay": r"\bdelay(s|ed)?\b",
    "margin": r"\bmargin(s)?\b",
    "cost": r"\bcost(s)?\b",
}

for k, pat in keywords.items():
    texts[f"kw_{k}"] = texts["qa_text"].str.contains(pat, case=False, regex=True).astype(int)

texts[["file_name","n_words","num_count","kw_risk","kw_concern","kw_debt","kw_dilutive"]].head(10)

Unnamed: 0,file_name,n_words,num_count,kw_risk,kw_concern,kw_debt,kw_dilutive
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,5136,137,0,0,1,1
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,1888,71,0,1,0,0
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,6565,251,0,0,1,0
3,2003-Dec-18-CL.N-139888600104-Brief.txt,9704,287,0,1,1,1
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,6220,808,1,0,0,0
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,5075,863,1,0,1,0
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,6566,970,0,0,1,0
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,6446,192,0,0,0,0
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,7756,148,1,0,1,1
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,6396,169,0,0,1,0


In [6]:
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import time

MODEL_NAME = "distilbert-base-uncased-finetuned-sst-2-english"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.model_max_length = 10**9  #prevents long seq warning (we chunk manually)

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)
model.eval()

MAX_TOKENS = 510  # safe (510 + special tokens <= 512)

@torch.no_grad()
def bert_compound_chunked(t: str) -> float:
    if not isinstance(t, str) or not t.strip():
        return np.nan

    ids = tokenizer.encode(t, add_special_tokens=False)
    if len(ids) == 0:
        return np.nan

    scores = []

    for i in range(0, len(ids), MAX_TOKENS):
        chunk_ids = ids[i:i + MAX_TOKENS]

        input_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids = torch.tensor([input_ids], dtype=torch.long)

        logits = model(input_ids=input_ids).logits.squeeze(0)
        probs = torch.softmax(logits, dim=-1).cpu().numpy()

        # SST-2 label order: [NEGATIVE, POSITIVE]
        scores.append(float(probs[1] - probs[0]))  # [-1, +1]

    return float(np.mean(scores))


t0 = time.time()
sentiments = []
n = len(texts)

for i, t in enumerate(texts["qa_text"].values, start=1):
    sentiments.append(bert_compound_chunked(t))

    if i % 25 == 0 or i == n:
        elapsed = time.time() - t0
        rate = i / elapsed if elapsed > 0 else 0
        eta = (n - i) / rate if rate > 0 else float("inf")
        print(f"[{i}/{n}] elapsed={elapsed:.1f}s | rate={rate:.2f} files/s | ETA~{eta/60:.1f} min")

texts["sent_bert"] = sentiments
texts[["file_name", "sent_bert"]].head(10)

[25/2549] elapsed=56.9s | rate=0.44 files/s | ETA~95.8 min
[50/2549] elapsed=102.1s | rate=0.49 files/s | ETA~85.0 min
[75/2549] elapsed=151.7s | rate=0.49 files/s | ETA~83.4 min
[100/2549] elapsed=201.1s | rate=0.50 files/s | ETA~82.1 min
[125/2549] elapsed=258.7s | rate=0.48 files/s | ETA~83.6 min
[150/2549] elapsed=322.6s | rate=0.46 files/s | ETA~86.0 min
[175/2549] elapsed=371.5s | rate=0.47 files/s | ETA~84.0 min
[200/2549] elapsed=423.7s | rate=0.47 files/s | ETA~82.9 min
[225/2549] elapsed=477.3s | rate=0.47 files/s | ETA~82.2 min
[250/2549] elapsed=520.8s | rate=0.48 files/s | ETA~79.8 min
[275/2549] elapsed=558.7s | rate=0.49 files/s | ETA~77.0 min
[300/2549] elapsed=612.7s | rate=0.49 files/s | ETA~76.6 min
[325/2549] elapsed=665.9s | rate=0.49 files/s | ETA~75.9 min
[350/2549] elapsed=719.1s | rate=0.49 files/s | ETA~75.3 min
[375/2549] elapsed=768.6s | rate=0.49 files/s | ETA~74.3 min
[400/2549] elapsed=826.7s | rate=0.48 files/s | ETA~74.0 min
[425/2549] elapsed=895.8s | 

Unnamed: 0,file_name,sent_bert
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,0.201596
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,0.443032
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,-0.724407
3,2003-Dec-18-CL.N-139888600104-Brief.txt,-0.437547
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,-0.682637
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,-0.346998
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,-0.195264
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,-0.205215
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,-0.634752
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,0.303716


In [7]:
from sklearn.decomposition import NMF

n_topics = 8
nmf = NMF(n_components=n_topics, random_state=42)
W = nmf.fit_transform(X_tfidf)  # document-topic matrix

for i in range(n_topics):
    texts[f"topic_{i}"] = W[:, i]

# inspect top words per topic
feature_names = np.array(tfidf.get_feature_names_out())
for i, comp in enumerate(nmf.components_):
    top = feature_names[np.argsort(comp)[-10:]]
    print(f"Topic {i}: {', '.join(top)}")

Topic 0: 00 23, 00 24, 00 26, 00 27, 00 25, id, sync, sync id, 00, time 00
Topic 1: 01 06, 01 11, 01 12, 01 04, 01 10, id, sync, sync id, 01, time 01
Topic 2: customers, growth, capital, cfo, market, ve, chairman ceo, chairman, president ceo, president
Topic 3: executive, division md, ve, senior, md, ceo director, director, division, research division, research
Topic 4: corporation cfo, chairman president, west corporation, corporation evp, president, corporation ceo, president ceo, corporation chairman, corporation president, corporation
Topic 5: 01 59, 02 00, 02 01, 02 02, 01 57, id, sync, sync id, 02, time 02
Topic 6: vodafone, executive, group chief, chief, plc ceo, plc group, group, chief executive, group plc, plc
Topic 7: president, president ceo, corp ceo, corp evp, apache, microsemi, corp cfo, corp chairman, corp president, corp


In [8]:
keep_cols = [
    "file_name","file_date","ric_from_fname",
    "n_words","n_chars","num_count","pct_count","money_count",
    "sent_bert"
] + [c for c in texts.columns if c.startswith("kw_") or c.startswith("topic_")]

text_features = texts[keep_cols].copy()
text_features.to_csv("text_features_qa_only.csv", index=False)

print("Saved: text_features_qa_only.csv")
text_features.head()

Saved: text_features_qa_only.csv


Unnamed: 0,file_name,file_date,ric_from_fname,n_words,n_chars,num_count,pct_count,money_count,sent_bert,kw_synergy,...,kw_margin,kw_cost,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,5136,29354,137,0,2,0.201596,1,...,1,1,0.0,0.0,0.060209,0.005751,0.00329,7.9e-05,0.006208,0.0
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,1888,10775,71,0,0,0.443032,0,...,1,0,0.0,0.0,0.043835,0.0,0.0,0.003539,0.0,0.011314
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,6565,37640,251,0,18,-0.724407,0,...,1,1,0.0,0.0,0.04093,0.001415,0.006005,0.012583,0.0,0.004564
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,9704,55665,287,26,12,-0.437547,0,...,1,1,0.0,0.000384,0.072337,0.0,0.000818,0.0,0.0,0.018509
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,6220,39829,808,8,3,-0.682637,0,...,0,1,0.0,0.235314,0.006838,0.001781,0.0,0.000388,0.002733,0.0
