In [11]:
# Seed and Versions
import sys
import platform
from importlib.metadata import version
from pathlib import Path

# Ensure project root is on sys.path (handles running from repo root or notebooks/)
def _add_project_root_to_sys_path() -> None:
    candidates = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]
    for base in candidates:
        if (base / "pyproject.toml").exists() or (base / "src").exists():
            if str(base) not in sys.path:
                sys.path.insert(0, str(base))
            return

# Must run before the `src` import below: it puts the project root on sys.path.
_add_project_root_to_sys_path()

from src.utils import set_seed

# Presumably seeds the random number generators for reproducibility —
# TODO confirm against src.utils.set_seed.
set_seed(42)
# Record the interpreter version, OS platform string, and pandas version for this run.
print({
    "python": sys.version.split()[0],  # first whitespace-separated token of sys.version
    "platform": platform.platform(),
    "pandas": version("pandas"),  # looked up via importlib.metadata
})


{'python': '3.11.3', 'platform': 'macOS-15.6-arm64-arm-64bit', 'pandas': '2.2.2'}


# 02 - Quantitative Analysis

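Descriptives, one-way ANOVA, OLS regression with HC3 robust standard errors, and PCA. The descriptives, ANOVA, OLS, and PCA variance-explained cells are executed below; the final PCA cell is structure only (the PCA object is created but not fitted).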

## Overview

- Load `data/processed/merged.csv`
- Descriptives: summary stats and simple distributions
- One-way ANOVA across sectors (and optionally by sentiment bin)
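- OLS: `ret ~ sentiment_score + volume + volatility + C(sector)` with HC3 SEs (the `return` column is renamed to `ret` before fitting, since `return` is a reserved word in Python and not a safe name in the model formula)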
- PCA: standardize features, variance explained, 2D scatter


## Imports and Data Load


In [12]:
import pandas as pd
import numpy as np

from pathlib import Path

from src.utils import read_csv_safe, validate_columns

# Resolve project root so paths are correct when running from notebooks/
def _resolve_root() -> Path:
    candidates = [Path.cwd(), Path.cwd().parent, Path.cwd().parent.parent]
    for base in candidates:
        if (base / "pyproject.toml").exists() and (base / "data").exists():
            return base
    return Path.cwd()

PROJECT_ROOT = _resolve_root()
MERGED_PATH = PROJECT_ROOT / "data/processed/merged.csv"

# When the merged file is absent, fall back to an empty frame; the later
# cells all test `df.empty` before doing any work.
if MERGED_PATH.exists():
    df = read_csv_safe(MERGED_PATH, parse_dates=["date"])
else:
    df = pd.DataFrame()

# Column check only applies to a non-empty frame.
# NOTE(review): presumably validate_columns raises on a missing column —
# confirm against src.utils.
if not df.empty:
    validate_columns(
        df,
        [
            "date",
            "ticker",
            "sector",
            "close",
            "volume",
            "volatility",
            "return",
            "sentiment_score",
            "n_headlines",
        ],
    )


## Descriptives


In [13]:
# Print key descriptives for quick reporting
if not df.empty:
    # One "label value" line each for the row count and the number of
    # distinct tickers and sectors.
    print(
        "\n".join(
            f"{label} {count}"
            for label, count in (
                ("N rows:", len(df)),
                ("Tickers:", df["ticker"].nunique()),
                ("Sectors:", df["sector"].nunique()),
            )
        )
    )


N rows: 414
Tickers: 6
Sectors: 3


In [14]:
# Summary statistics for the numeric columns, plus row counts per sector
if not df.empty:
    numeric_cols = ["close", "volume", "volatility", "return", "sentiment_score", "n_headlines"]
    desc = df.loc[:, numeric_cols].describe()
    display(desc)

    # Two-column frame (sector, count) for later distribution plots
    sector_counts = (
        df["sector"]
        .value_counts()
        .rename_axis("sector")
        .reset_index(name="count")
    )


Unnamed: 0,close,volume,volatility,return,sentiment_score,n_headlines
count,414.0,414.0,414.0,414.0,140.0,140.0
mean,99.664377,190287.6,0.306846,0.00122,0.046825,1.0
std,33.776359,133799.6,0.055287,0.020344,0.098904,0.0
min,44.7643,23277.0,0.041806,-0.051751,-0.111111,1.0
25%,75.123075,106081.5,0.276396,-0.012992,0.0,1.0
50%,94.23705,159169.5,0.305362,0.001818,0.0,1.0
75%,129.554225,233598.8,0.34333,0.015848,0.111111,1.0
max,173.885,1642350.0,0.539746,0.067946,0.222222,1.0


In [15]:
# ANOVA across sectors (compute + print)
from scipy import stats

if df.empty:
    print("No data")
else:
    # One array of non-missing returns per sector.
    groups = [
        sector_frame["return"].dropna().values
        for _, sector_frame in df.groupby("sector")
    ]
    # The test is run only with at least two sectors, each holding more than
    # one observation.
    if len(groups) < 2 or any(len(sample) <= 1 for sample in groups):
        print("ANOVA not computed (insufficient groups)")
    else:
        res = stats.f_oneway(*groups)
        print({"anova_F": float(res.statistic), "anova_p": float(res.pvalue)})


{'anova_F': 1.5629731795630133, 'anova_p': 0.21075490953220516}


## One-way ANOVA


In [16]:
# OLS with HC3: define, fit, and print (safe target name)
import numpy as np
import statsmodels.formula.api as smf

if df.empty:
    print("No data")
else:
    # `return` is renamed to `ret` so the target has a safe name in the
    # formula; rows missing any model variable are dropped.
    data = df.rename(columns={"return": "ret"}).dropna(
        subset=["ret", "sentiment_score", "volume", "volatility", "sector"]
    )
    if data.empty:
        print("OLS: no rows after dropna")
    else:
        model = smf.ols(
            formula="ret ~ sentiment_score + volume + volatility + C(sector)",
            data=data,
        )
        # HC3 heteroskedasticity-robust covariance for the standard errors.
        ols_results = model.fit(cov_type="HC3")
        params = ols_results.params.to_dict()
        pvals = ols_results.pvalues.to_dict()
        print({
            "coef_sentiment": float(params.get("sentiment_score", np.nan)),
            "p_sentiment": float(pvals.get("sentiment_score", np.nan)),
            "r2": float(ols_results.rsquared),
        })


{'coef_sentiment': 0.0043829529421939226, 'p_sentiment': 0.8300988406713853, 'r2': 0.03285253560594781}


In [17]:
# Optional: sentiment bins (for stratified analyses)
def bin_sentiment(
    x: float,
    neg_threshold: float = -0.2,
    pos_threshold: float = 0.2,
) -> "str | None":
    """Map a sentiment score to a coarse label.

    Args:
        x: Sentiment score. ``None`` and NaN are treated as missing.
        neg_threshold: Scores at or below this value are labelled ``"neg"``.
        pos_threshold: Scores at or above this value are labelled ``"pos"``.

    Returns:
        ``"neg"``, ``"pos"`` or ``"neu"`` for a real score, and ``None`` for a
        missing one. Missing scores are kept missing rather than labelled
        ``"neu"``: NaN fails both threshold comparisons, so without the
        explicit check rows with no sentiment data would be counted as
        neutral.
    """
    # NaN is the only float value that is not equal to itself.
    if x is None or x != x:
        return None
    if x <= neg_threshold:
        return "neg"
    if x >= pos_threshold:
        return "pos"
    return "neu"

if not df.empty and "sentiment_score" in df.columns:
    # na_action="ignore" leaves missing scores as NaN instead of passing them
    # to bin_sentiment, so rows without sentiment data stay out of every bin
    # (groupby drops NaN keys by default).
    df["sentiment_bin"] = df["sentiment_score"].map(bin_sentiment, na_action="ignore")


In [18]:
# PCA variance explained (self-contained)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

if df.empty:
    print("No data")
else:
    feat_cols = ["return", "volume", "volatility", "sentiment_score", "n_headlines"]
    # Keep only rows where every feature is present.
    X = df[feat_cols].dropna()
    if X.empty:
        print("PCA: no rows after dropna")
    else:
        # Standardize the features, then fit a two-component PCA.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X.values)
        pca = PCA(n_components=2)
        pca_fit = pca.fit(X_scaled)
        explained_var = pca_fit.explained_variance_ratio_
        print({
            "pca_var_pc1": float(explained_var[0]),
            "pca_var_pc2": float(explained_var[1]),
        })


{'pca_var_pc1': 0.31230771398881724, 'pca_var_pc2': 0.25425454707082723}


## OLS Regression (HC3)


In [19]:
# This cell was superseded by the self-contained OLS cell above.
# Left intentionally blank to avoid duplicate execution.


## PCA


In [20]:
# Standardize features and construct the PCA object (the PCA itself is not fitted here)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Placeholders stay None when there is no usable data.
pca, scaler, pca_input = None, None, None
if not df.empty:
    feat_cols = ["return", "volume", "volatility", "sentiment_score", "n_headlines"]
    # Keep only rows where every feature is present.
    X = df[feat_cols].dropna()
    if not X.empty:
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X.values)
        # Expose the standardized matrix under the placeholder name declared above.
        # NOTE(review): assumes pca_input was meant to hold the scaled features — confirm.
        pca_input = X_scaled
        pca = PCA(n_components=2)
        # Structure only: the PCA object is created but deliberately not fitted.
        # To fit and project later:
        #   pca_components = pca.fit_transform(pca_input)
        #   explained_var = pca.explained_variance_ratio_
    else:
        # Without this guard the scaler would raise on a zero-row array.
        print("PCA: no rows after dropna")


## Completed

Figures and model specifications prepared. See `reports/figures/` after execution.
