In [None]:
# Seed and Versions
import sys
import platform
from importlib.metadata import version

from src.utils import set_seed

# Seed through the project helper before anything else in the notebook runs.
set_seed(42)

# Report the interpreter version, the OS platform string and the installed
# scikit-learn distribution version for this run.
print(
    dict(
        python=sys.version.split()[0],
        platform=platform.platform(),
        sklearn=version("scikit-learn"),
    )
)


# 03 - Qualitative Analysis

This notebook covers term frequencies, lightweight NMF topic modeling, and a simple hand-coding rubric for headlines.


## Overview

- Load merged dataset
- Compute term frequencies with CountVectorizer
- NMF topic modeling (k=3–5) and top terms per topic
- Optionally join per-date topic intensities with average daily returns
- Include a short hand-coding rubric


## Imports and Data Load


In [None]:
import pandas as pd
from pathlib import Path

from src.utils import read_csv_safe, validate_columns

# --- Merged dataset -------------------------------------------------------
MERGED_PATH = Path("data/processed/merged.csv")

# A missing file yields an empty frame instead of an error; the column check
# only runs when the file exists and actually contains rows.
if MERGED_PATH.exists():
    df = read_csv_safe(MERGED_PATH, parse_dates=["date"])
    if not df.empty:
        validate_columns(
            df,
            [
                "date",
                "ticker",
                "sector",
                "close",
                "volume",
                "volatility",
                "return",
                "sentiment_score",
                "n_headlines",
            ],
        )
else:
    df = pd.DataFrame()

# --- Raw headlines corpus -------------------------------------------------
RAW_HEADLINES = Path("data/raw/headlines.csv")

# Same fallback policy as above: absent file -> empty frame, no validation.
if RAW_HEADLINES.exists():
    corpus_df = read_csv_safe(RAW_HEADLINES, parse_dates=["date"])
    if not corpus_df.empty:
        validate_columns(corpus_df, ["date", "symbol", "headline"])
else:
    corpus_df = pd.DataFrame()


## Term Frequencies (CountVectorizer)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Headline strings to vectorize; stays empty when no corpus was loaded.
if corpus_df.empty:
    texts = []
else:
    texts = corpus_df["headline"].astype(str).tolist()

# Placeholders so later cells can test for "not computed yet".
vectorizer = None
X_counts = None
feature_names = []

if texts:
    # Only the vectorizer object is built here; it is not fitted.
    vectorizer = CountVectorizer(
        lowercase=True,
        stop_words="english",
        max_features=5000,
    )
    # Fitting is intentionally deferred (structure only). Enable when ready:
    # X_counts = vectorizer.fit_transform(texts)
    # feature_names = vectorizer.get_feature_names_out().tolist()


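## NMF Topics (k=3–5) and Top Terms

**TODO:** This section has no code cell yet. The NMF fit that produces the matrix `W` referenced in the next section still needs to be added here.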


## Optional: Topic Intensities by Date and Avg Daily Returns


In [None]:
# Structure only: map document indices to dates/symbols to aggregate
#
# Every statement in this cell is commented out, so running it does nothing.
# NOTE(review): `W` is not assigned in any cell visible in this notebook.
# Presumably it is the document-topic matrix from the NMF step, with one row
# per row of `corpus_df` in the same order — TODO confirm and define it
# before enabling the code below.
#
# Intended steps once enabled:
#   1. Wrap W in a DataFrame with "topic_"-prefixed columns and attach the
#      `date` values from `corpus_df`.
#   2. Average the topic columns per date.
#   3. If the merged dataset is loaded, compute the per-date mean of `return`
#      (renamed to "avg_return") and left-join it onto the daily topic means.
#
# if W is not None and not corpus_df.empty:
#     doc_topics = pd.DataFrame(W).add_prefix("topic_")
#     doc_topics["date"] = corpus_df["date"].values
#     daily_topics = doc_topics.groupby("date", as_index=False).mean()
#     if not df.empty:
#         daily_returns = df.groupby("date", as_index=False)["return"].mean().rename(columns={"return": "avg_return"})
#         topics_returns = daily_topics.merge(daily_returns, on="date", how="left")


## Hand-Coding Rubric (Short)

- Relevance: Is the headline directly related to firm fundamentals? (0/1)
- Sentiment: Negative / Neutral / Positive (choose one)
- Actionability: Does the headline suggest an actionable event? (0/1)
- Uncertainty: Does the headline introduce uncertainty/ambiguity? (0/1)

Annotators should read the full headline context when available and apply consistent criteria across days.


## Completed

The topic modeling steps are prepared but have not been executed yet. Outputs will be saved once the steps are run.
