In [9]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime
import html

FILES_DIR = "Files"

# Example filename formats:
# 1) 2003-Dec-08-Acxiom LLC-140477246411-Brief.txt   (company name)
# 2) 2011-Jun-22-XRAY.OQ-138268919384-Brief.txt     (RIC)
# We'll handle both.

ric_pattern = re.compile(r"^\d{4}-[A-Za-z]{3}-\d{2}-([A-Z0-9\.^]+)-\d+-Brief\.txt$")
date_pattern = re.compile(r"^(\d{4}-[A-Za-z]{3}-\d{2})-")

def parse_file_date(fname):
    m = date_pattern.match(fname)
    if not m:
        return None
    return pd.to_datetime(m.group(1), format="%Y-%b-%d", errors="coerce")

def parse_ric(fname):
    m = ric_pattern.match(fname)
    return m.group(1) if m else None

rows = []
for fname in os.listdir(FILES_DIR):
    if not fname.endswith(".txt"):
        continue
    fpath = os.path.join(FILES_DIR, fname)
    with open(fpath, "r", encoding="utf-8", errors="ignore") as f:
        txt = f.read()

    rows.append({
        "file_name": fname,
        "file_date": parse_file_date(fname),
        "ric_from_fname": parse_ric(fname),
        "raw_text": txt
    })

texts = pd.DataFrame(rows)
print("Loaded files:", len(texts))
texts.head()

Loaded files: 2549


Unnamed: 0,file_name,file_date,ric_from_fname,raw_text
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,\n\nRefinitiv StreetEvents Event Brief\nE D I ...
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,\n\nRefinitiv StreetEvents Event Brief\nE D I ...


In [10]:
def extract_qa_raw(t: str) -> str:
    """
    Extract everything after 'QUESTIONS AND ANSWERS' (Q&A section).
    Returns empty string if Q&A header not found.
    """
    if not isinstance(t, str):
        return ""
    # Decode HTML entities (&amp; -> &)
    t = html.unescape(t)

    # Try to extract Q&A section
    m = re.search(r"QUESTIONS AND ANSWERS(.*)$", t, flags=re.IGNORECASE | re.DOTALL)
    return m.group(1) if m else ""


def clean_qa(t: str) -> str:
    if not isinstance(t, str) or not t:
        return ""

    # Remove Sync markers like <Sync id="L91"/>
    t = re.sub(r"<Sync id=\"L\d+\"/>\s*", " ", t)

    # Remove disclaimer if it appears inside Q&A text
    t = re.split(r"Disclaimer\s*-{5,}", t, flags=re.IGNORECASE)[0]

    # Remove repetitive separators and extra whitespace
    t = re.sub(r"={5,}", " ", t)
    t = re.sub(r"-{5,}", " ", t)
    t = re.sub(r"\s+", " ", t).strip()

    return t


# Build QA-only text column
texts["qa_text"] = texts["raw_text"].apply(extract_qa_raw).apply(clean_qa)

# Basic sanity checks
texts["n_chars"] = texts["qa_text"].str.len()
texts["n_words"] = texts["qa_text"].str.split().str.len()

print("Docs with Q&A found:", (texts["n_words"] > 0).sum(), "/", len(texts))
texts[["file_name","file_date","ric_from_fname","n_words"]].head(10)

Docs with Q&A found: 2494 / 2549


Unnamed: 0,file_name,file_date,ric_from_fname,n_words
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,5136
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,1888
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,6565
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,9704
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,6220
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,2003-12-19,SSP.OQ,5075
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,2003-12-22,EXE.OQ,6566
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,2003-12-23,KTOS.OQ,6446
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,2003-12-24,,7756
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,2003-12-30,FDX.N,6396


In [11]:
texts["n_words"].describe()

count     2549.000000
mean      4821.017654
std       2184.905895
min          0.000000
25%       3344.000000
50%       4707.000000
75%       6187.000000
max      17678.000000
Name: n_words, dtype: float64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    lowercase=True,
    stop_words="english",
    min_df=2,
    max_df=0.9,
    ngram_range=(1,2)
)

X_tfidf = tfidf.fit_transform(texts["qa_text"].fillna(""))
print("TF-IDF shape:", X_tfidf.shape)

TF-IDF shape: (2549, 512110)


In [13]:
# Simple indicators (Q&A only)
texts["num_count"] = texts["qa_text"].str.count(r"\b\d+(\.\d+)?\b")
texts["pct_count"] = texts["qa_text"].str.count(r"\b\d+(\.\d+)?\s*%")
texts["money_count"] = texts["qa_text"].str.count(r"(\$|USD|EUR|€)\s*\d")

# Keywords (Q&A only) — more "skeptical / risk" focused
keywords = {
    "synergy": r"\bsynerg(y|ies)\b",
    "accretive": r"\baccretive\b",
    "dilutive": r"\bdilutive\b",
    "debt": r"\bdebt\b|\bleverage\b",
    "integration": r"\bintegration\b",
    "guidance": r"\bguidance\b|\boutlook\b",
    "growth": r"\bgrowth\b",
    "risk": r"\brisk(s)?\b",
    "concern": r"\bconcern(s)?\b",
    "uncertain": r"\buncertain(ty)?\b",
    "delay": r"\bdelay(s|ed)?\b",
    "margin": r"\bmargin(s)?\b",
    "cost": r"\bcost(s)?\b",
}

for k, pat in keywords.items():
    texts[f"kw_{k}"] = texts["qa_text"].str.contains(pat, case=False, regex=True).astype(int)

texts[["file_name","n_words","num_count","kw_risk","kw_concern","kw_debt","kw_dilutive"]].head(10)

Unnamed: 0,file_name,n_words,num_count,kw_risk,kw_concern,kw_debt,kw_dilutive
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,5136,137,0,0,1,1
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,1888,71,0,1,0,0
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,6565,251,0,0,1,0
3,2003-Dec-18-CL.N-139888600104-Brief.txt,9704,287,0,1,1,1
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,6220,808,1,0,0,0
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,5075,863,1,0,1,0
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,6566,970,0,0,1,0
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,6446,192,0,0,0,0
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,7756,148,1,0,1,1
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,6396,169,0,0,1,0


In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import time
import numpy as np

analyzer = SentimentIntensityAnalyzer()

def vader_compound(t):
    if not isinstance(t, str) or not t.strip():
        return np.nan
    return analyzer.polarity_scores(t)["compound"]

t0 = time.time()
sentiments = []
n = len(texts)

for i, t in enumerate(texts["qa_text"].values, start=1):
    sentiments.append(vader_compound(t))

    if i % 50 == 0 or i == n:
        elapsed = time.time() - t0
        rate = i / elapsed if elapsed > 0 else 0
        eta = (n - i) / rate if rate > 0 else float("inf")
        print(f"[{i}/{n}] elapsed={elapsed:.1f}s | rate={rate:.2f} files/s | ETA~{eta/60:.1f} min")

texts["sent_vader"] = sentiments
texts[["file_name", "sent_vader"]].head(10)

[50/2549] elapsed=21.1s | rate=2.37 files/s | ETA~17.6 min
[100/2549] elapsed=44.6s | rate=2.24 files/s | ETA~18.2 min
[150/2549] elapsed=69.0s | rate=2.17 files/s | ETA~18.4 min
[200/2549] elapsed=88.5s | rate=2.26 files/s | ETA~17.3 min
[250/2549] elapsed=108.0s | rate=2.32 files/s | ETA~16.6 min
[300/2549] elapsed=126.7s | rate=2.37 files/s | ETA~15.8 min
[350/2549] elapsed=147.9s | rate=2.37 files/s | ETA~15.5 min
[400/2549] elapsed=168.1s | rate=2.38 files/s | ETA~15.0 min
[450/2549] elapsed=198.5s | rate=2.27 files/s | ETA~15.4 min
[500/2549] elapsed=221.6s | rate=2.26 files/s | ETA~15.1 min
[550/2549] elapsed=242.6s | rate=2.27 files/s | ETA~14.7 min
[600/2549] elapsed=268.8s | rate=2.23 files/s | ETA~14.6 min
[650/2549] elapsed=294.4s | rate=2.21 files/s | ETA~14.3 min
[700/2549] elapsed=314.1s | rate=2.23 files/s | ETA~13.8 min
[750/2549] elapsed=336.3s | rate=2.23 files/s | ETA~13.4 min
[800/2549] elapsed=359.8s | rate=2.22 files/s | ETA~13.1 min
[850/2549] elapsed=384.8s | r

Unnamed: 0,file_name,sent_vader
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,1.0
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,0.9998
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,1.0
3,2003-Dec-18-CL.N-139888600104-Brief.txt,1.0
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,1.0
5,2003-Dec-19-SSP.OQ-140915381200-Brief.txt,1.0
6,2003-Dec-22-EXE.OQ-137723937239-Brief.txt,1.0
7,2003-Dec-23-KTOS.OQ-138170059987-Brief.txt,0.9999
8,2003-Dec-24-Duane Reade-138984373011-Brief.txt,1.0
9,2003-Dec-30-FDX.N-137019634234-Brief.txt,1.0


In [15]:
from sklearn.decomposition import NMF

n_topics = 8
nmf = NMF(n_components=n_topics, random_state=42)
W = nmf.fit_transform(X_tfidf)  # document-topic matrix

for i in range(n_topics):
    texts[f"topic_{i}"] = W[:, i]

# (Optional) inspect top words per topic
feature_names = np.array(tfidf.get_feature_names_out())
for i, comp in enumerate(nmf.components_):
    top = feature_names[np.argsort(comp)[-10:]]
    print(f"Topic {i}: {', '.join(top)}")

Topic 0: 00 23, 00 24, 00 26, 00 27, 00 25, id, sync, sync id, 00, time 00
Topic 1: 01 06, 01 11, 01 12, 01 04, 01 10, id, sync, sync id, 01, time 01
Topic 2: customers, growth, capital, cfo, market, ve, chairman ceo, chairman, president ceo, president
Topic 3: executive, division md, ve, senior, md, ceo director, director, division, research division, research
Topic 4: corporation cfo, chairman president, west corporation, corporation evp, president, corporation ceo, president ceo, corporation chairman, corporation president, corporation
Topic 5: 01 59, 02 00, 02 01, 02 02, 01 57, id, sync, sync id, 02, time 02
Topic 6: vodafone, executive, group chief, chief, plc ceo, plc group, group, chief executive, group plc, plc
Topic 7: president, president ceo, corp ceo, corp evp, apache, microsemi, corp cfo, corp chairman, corp president, corp


In [16]:
keep_cols = [
    "file_name","file_date","ric_from_fname",
    "n_words","n_chars","num_count","pct_count","money_count",
    "sent_vader"
] + [c for c in texts.columns if c.startswith("kw_") or c.startswith("topic_")]

text_features = texts[keep_cols].copy()
text_features.to_csv("text_features_qa_only.csv", index=False)

print("Saved: text_features_qa_only.csv")
text_features.head()

Saved: text_features_qa_only.csv


Unnamed: 0,file_name,file_date,ric_from_fname,n_words,n_chars,num_count,pct_count,money_count,sent_vader,kw_synergy,...,kw_margin,kw_cost,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7
0,2003-Dec-08-Acxiom LLC-140477246411-Brief.txt,2003-12-08,,5136,29354,137,0,2,1.0,1,...,1,1,0.0,0.0,0.060209,0.005751,0.00329,7.9e-05,0.006208,0.0
1,2003-Dec-15-MFE.N^C11-138286857648-Brief.txt,2003-12-15,MFE.N^C11,1888,10775,71,0,0,0.9998,0,...,1,0,0.0,0.0,0.043835,0.0,0.0,0.003539,0.0,0.011314
2,2003-Dec-16-NFB.N^L06-136907498398-Brief.txt,2003-12-16,NFB.N^L06,6565,37640,251,0,18,1.0,0,...,1,1,0.0,0.0,0.04093,0.001415,0.006005,0.012583,0.0,0.004564
3,2003-Dec-18-CL.N-139888600104-Brief.txt,2003-12-18,CL.N,9704,55665,287,26,12,1.0,0,...,1,1,0.0,0.000384,0.072337,0.0,0.000818,0.0,0.0,0.018509
4,2003-Dec-19-MDCO.OQ^A20-139635792615-Brief.txt,2003-12-19,MDCO.OQ^A20,6220,39829,808,8,3,1.0,0,...,0,1,0.0,0.235314,0.006838,0.001781,0.0,0.000388,0.002733,0.0
