# 03 - Qualitative Analysis

Term frequencies, lightweight NMF topic modeling


## Overview

- Load merged dataset
- Compute term frequencies with CountVectorizer
- NMF topic modeling (k=3–5; the code below uses k=4) and top terms per topic
- Join topic intensities by date with avg daily returns


## Imports and Data Load


In [28]:
import pandas as pd
from pathlib import Path

from src.utils import read_csv_safe, validate_columns

# Resolve project root so paths are correct when running from notebooks/
def _resolve_root() -> Path:
    """Return the nearest directory at or above the CWD that looks like the project root.

    A directory qualifies when it contains both ``pyproject.toml`` and
    ``data``. The search starts at the current working directory and walks up
    through every ancestor, nearest first, so the notebook resolves the same
    root no matter how deep inside the repository it is launched from.

    Returns:
        The first qualifying directory, or the current working directory when
        neither it nor any ancestor qualifies.
    """
    cwd = Path.cwd()
    for base in (cwd, *cwd.parents):
        if (base / "pyproject.toml").exists() and (base / "data").exists():
            return base
    # Best-effort fallback: callers guard on the existence of the data files,
    # so an unresolved root degrades to "no data" instead of raising here.
    return cwd

# Anchor every data path on the detected project root.
PROJECT_ROOT = _resolve_root()
MERGED_PATH = PROJECT_ROOT / "data/processed/merged.csv"
RAW_HEADLINES = PROJECT_ROOT / "data/raw/headlines.csv"

# Merged dataset: fall back to an empty frame when the file is absent so the
# later cells can skip their work instead of failing.
if MERGED_PATH.exists():
    df = read_csv_safe(MERGED_PATH, parse_dates=["date"])
else:
    df = pd.DataFrame()

if not df.empty:
    # Columns the merged dataset is expected to carry.
    # NOTE(review): validate_columns presumably raises on a missing column;
    # confirm against src.utils.
    validate_columns(
        df,
        [
            "date",
            "ticker",
            "sector",
            "close",
            "volume",
            "volatility",
            "return",
            "sentiment_score",
            "n_headlines",
        ],
    )

# Raw headlines feed the text analysis; same empty-frame fallback as above.
if RAW_HEADLINES.exists():
    corpus_df = read_csv_safe(RAW_HEADLINES, parse_dates=["date"])
else:
    corpus_df = pd.DataFrame()

if not corpus_df.empty:
    validate_columns(corpus_df, ["date", "symbol", "headline"])


## Term Frequencies (CountVectorizer)


In [29]:
# Prepare text collection: one document per headline row
texts = corpus_df["headline"].astype(str).tolist() if not corpus_df.empty else []

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = None
X_counts = None
feature_names = []
# Corpus-wide count per term, sorted from most to least frequent
term_frequencies = pd.Series(dtype="int64", name="count")
if texts:
    vectorizer = CountVectorizer(lowercase=True, stop_words="english", max_features=5000)
    # Learn the vocabulary and build the sparse document-term count matrix
    X_counts = vectorizer.fit_transform(texts)
    feature_names = vectorizer.get_feature_names_out().tolist()
    # Summing over documents (axis 0) yields the total occurrences of each term;
    # np.asarray(...).ravel() flattens the 1 x n_terms result to a plain vector.
    term_totals = np.asarray(X_counts.sum(axis=0)).ravel()
    term_frequencies = (
        pd.Series(term_totals, index=feature_names, name="count")
          .sort_values(ascending=False)
    )
    print({
        "docs": len(texts),
        "vocab_size": len(feature_names),
        "top_terms": term_frequencies.head(20).to_dict(),
    })
else:
    print("No texts available for term frequencies")


### What this does
The NMF cell below fits a TF-IDF + NMF topic model (k=4), extracts the top 10 terms per topic, and prints a short summary.


## NMF Topics (k=3–5) and Top Terms


### What this does
The final cell (Topic Intensities by Date) averages document-topic weights by date, left-joins them with average daily returns, and saves `data/processed/topics_returns.csv` for later use.


In [30]:
# NMF topics (k=4) and top terms per topic
from sklearn.decomposition import NMF
from sklearn.feature_extraction.text import TfidfVectorizer

k = 4  # requested number of topics
W = None  # document-topic weights, one row per headline
H = None  # topic-term weights, one row per topic
terms_by_topic = []
feature_names = []

texts = corpus_df["headline"].astype(str).tolist() if not corpus_df.empty else []
if texts:
    tfidf = TfidfVectorizer(lowercase=True, stop_words="english", max_features=5000)
    X_tfidf = tfidf.fit_transform(texts)
    # NNDSVD-based initialisation raises ValueError when n_components exceeds
    # min(n_samples, n_features), so cap the topic count for very small corpora.
    n_topics = min(k, *X_tfidf.shape)
    nmf = NMF(n_components=n_topics, init="nndsvda", random_state=42)
    W = nmf.fit_transform(X_tfidf)
    H = nmf.components_
    feature_names = tfidf.get_feature_names_out().tolist()
    topn = 10
    # For each topic, keep the topn highest-weighted terms in descending order
    terms_by_topic = [
        [feature_names[i] for i in topic_weights.argsort()[::-1][:topn]]
        for topic_weights in H
    ]
    print({"nmf_components": n_topics, "docs": len(texts), "top_terms_first_topic": terms_by_topic[0] if terms_by_topic else []})
else:
    print("No texts available for topic modeling")



{'nmf_components': 4, 'docs': 141, 'top_terms_first_topic': ['misses', 'costs', 'guidance', 'rise', 'nvda', 'amzn', 'msft', 'meta', 'aapl', 'goog']}


## Topic Intensities by Date and Avg Daily Returns


In [31]:
# Daily mean topic weights joined with the daily mean return
import pandas as pd

topics_returns = pd.DataFrame()
if 'W' not in globals() or W is None or corpus_df.empty:
    # The topic-modeling cell did not run or produced nothing to aggregate.
    print("No topic weights available for aggregation")
else:
    # One row per headline: topic_0..topic_{k-1} plus that headline's date.
    # Rows of W are matched to corpus_df positionally.
    doc_topics = (
        pd.DataFrame(W)
          .add_prefix("topic_")
          .assign(date=corpus_df["date"].values)
    )
    daily_topics = doc_topics.groupby("date", as_index=False).mean()
    if df.empty:
        print("No merged returns available for join")
    else:
        # Mean of the "return" column across all rows sharing a date.
        mean_by_date = df.groupby("date", as_index=False)["return"].mean()
        daily_returns = mean_by_date.rename(columns={"return": "avg_return"})
        # Left join keeps every topic date, even when no return is available.
        topics_returns = daily_topics.merge(daily_returns, on="date", how="left")
        out_path = PROJECT_ROOT / "data/processed/topics_returns.csv"
        out_path.parent.mkdir(parents=True, exist_ok=True)
        topics_returns.to_csv(out_path, index=False)
        print({"topics_returns_rows": int(len(topics_returns)), "saved": str(out_path)})


{'topics_returns_rows': 65, 'saved': '/Users/christopherdefazio/Code/mixed-methods-market-analysis/data/processed/topics_returns.csv'}
