# 02 - Quantitative Analysis

Descriptive statistics, one-way ANOVA, OLS regression with HC3 robust standard errors, and PCA (structured, not executed).

## Overview

- Load `data/processed/merged.csv`
- Descriptives: summary stats and simple distributions
- One-way ANOVA across sectors (and optionally by sentiment bin)
- OLS: `return ~ sentiment_score + volume + volatility + C(sector)` with HC3 SEs
- PCA: standardize features, variance explained, 2D scatter


## Imports and Data Load


In [None]:
import pandas as pd
import numpy as np

from pathlib import Path

from src.utils import read_csv_safe, validate_columns

# Path of the merged CSV that every cell below works from.
MERGED_PATH = Path("data/processed/merged.csv")

if MERGED_PATH.exists():
    df = read_csv_safe(MERGED_PATH, parse_dates=["date"])
else:
    # File not there yet: use an empty frame so the `if not df.empty` guards
    # in the later cells skip their work.
    df = pd.DataFrame()

if not df.empty:
    # Check the columns the analysis cells rely on.
    # NOTE(review): presumably validate_columns raises on a missing column --
    # confirm in src.utils.
    validate_columns(
        df,
        [
            "date",
            "ticker",
            "sector",
            "close",
            "volume",
            "volatility",
            "return",
            "sentiment_score",
            "n_headlines",
        ],
    )


## Descriptives


In [None]:
# Descriptive statistics for the numeric columns, plus row counts per sector.
# The whole cell is skipped when no data was loaded.
if not df.empty:
    numeric_cols = [
        "close",
        "volume",
        "volatility",
        "return",
        "sentiment_score",
        "n_headlines",
    ]
    desc = df[numeric_cols].describe()
    display(desc)

    # Two-column frame ("sector", "count") built from the sector frequencies.
    sector_counts = (
        df["sector"]
        .value_counts()
        .to_frame(name="count")
        .reset_index(names="sector")
    )


## One-way ANOVA


In [None]:
# One-way ANOVA of `return` across sectors.
from scipy import stats

# Stays None whenever the test is skipped (no data, or too few observations).
anova_result = None
if not df.empty:
    # One array of non-missing returns per sector.
    groups = [
        sector_rows["return"].dropna().values
        for _, sector_rows in df.groupby("sector")
    ]
    # Run only with at least two sectors, each holding two or more observations.
    has_small_group = any(len(values) <= 1 for values in groups)
    if len(groups) >= 2 and not has_small_group:
        anova_result = stats.f_oneway(*groups)

# Optional: by sentiment bin (prepare binning; execution later)
def bin_sentiment(x: float, *, threshold: float = 0.2) -> str:
    """Classify a sentiment score as ``"neg"``, ``"neu"`` or ``"pos"``.

    Args:
        x: Sentiment score to classify.
        threshold: Non-negative cut-off. Scores at or below ``-threshold``
            are negative and scores at or above ``threshold`` are positive.
            Defaults to 0.2.

    Returns:
        ``"neg"``, ``"pos"``, or ``"neu"`` for anything in between.

    Raises:
        ValueError: If ``threshold`` is negative, since the negative and
            positive bands would then overlap.

    Note:
        NaN compares false against both cut-offs and therefore comes back as
        ``"neu"``; callers that need missing scores to stay missing should
        skip them before calling this function.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")
    if x <= -threshold:
        return "neg"
    if x >= threshold:
        return "pos"
    return "neu"

# Label every row with its sentiment bin. A missing score fails both of
# bin_sentiment's comparisons and would be labelled "neu"; na_action="ignore"
# skips those rows so a missing score yields a missing bin instead.
if not df.empty and "sentiment_score" in df.columns:
    df["sentiment_bin"] = df["sentiment_score"].map(bin_sentiment, na_action="ignore")


## OLS Regression (HC3)


In [None]:
# Model: return ~ sentiment_score + volume + volatility + C(sector)
# "return" is a Python keyword and patsy evaluates each formula term as a
# Python expression, so the bare column name cannot appear in the formula.
# Q("return") looks the column up by its string name instead.
import statsmodels.formula.api as smf

# Both stay None when there is no data, so later cells can test for that.
model, ols_results = None, None
if not df.empty:
    model = smf.ols(
        formula='Q("return") ~ sentiment_score + volume + volatility + C(sector)',
        # Keep only rows where every model variable is present.
        data=df.dropna(subset=["return", "sentiment_score", "volume", "volatility", "sector"]),
    )
    # Do not fit now; structure for HC3 robust SEs:
    # ols_results = model.fit(cov_type="HC3")


## PCA


In [None]:
# Standardize features and prepare PCA (no execution here)
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# pca / scaler / pca_input stay None unless there are complete rows to work with.
pca, scaler, pca_input = None, None, None
if not df.empty:
    feat_cols = ["return", "volume", "volatility", "sentiment_score", "n_headlines"]
    # Keep only rows where every feature is present.
    X = df[feat_cols].dropna()
    # dropna() can remove every row; fitting the scaler on zero rows would
    # raise, so skip the setup in that case.
    if not X.empty:
        scaler = StandardScaler()
        # Standardized feature matrix that the PCA will be fitted on.
        pca_input = scaler.fit_transform(X.values)
        X_scaled = pca_input  # earlier name, kept for code that still refers to it
        pca = PCA(n_components=2)
        # Do not fit now; structure only
        # pca_components = pca.fit_transform(pca_input)
        # explained_var = pca.explained_variance_ratio_
