In [1]:
# =========================
# IMPORTS Y CONFIGURACIÓN
# =========================
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Directorios
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.dirname(NOTEBOOK_DIR)
PROCESSED_DIR = os.path.join(PROJECT_ROOT, "data", "processed")
REG_DIR = os.path.join(PROCESSED_DIR, "regression")
PLOTS_DIR = os.path.join(REG_DIR, "plots")

os.makedirs(REG_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)

print("Resultados de regresión se guardarán en:", REG_DIR)


Resultados de regresión se guardarán en: d:\Projects\University-Projects\AlphaTech-Analyzer\data\processed\regression


In [2]:
# =========================
# CARGAR DATOS PROCESADOS
# =========================

panel_df = pd.read_csv(
    os.path.join(PROCESSED_DIR, "tech30_panel_monthly_2018_2024.csv"),
    parse_dates=['Date']
)
agg_df = pd.read_csv(
    os.path.join(PROCESSED_DIR, "tech30_aggregated_stats_2018_2024.csv")
)

print("Panel mensual:", panel_df.shape)
print("Dataset agregado:", agg_df.shape)


Panel mensual: (2403, 6)
Dataset agregado: (30, 7)


In [3]:
# =========================
# RETORNOS DEL MERCADO
# =========================

import yfinance as yf

# Descargar QQQ (fallback si no existe variable)
market = yf.download("QQQ", start="2018-01-01", end="2024-12-31", progress=False)
price_col = "Adj Close" if "Adj Close" in market.columns else "Close"

market["MarketReturn"] = np.log(market[price_col] / market[price_col].shift(1))
market = market.resample("ME").last()[["MarketReturn"]].dropna()
market.index = market.index.strftime("%Y-%m-%d")

print("MarketReturn listo:", market.shape)


MarketReturn listo: (84, 1)


In [4]:
# =========================
# FUNCIÓN REGRESIÓN POR EMPRESA
# =========================
def estimate_company_regression(company, panel_df, market_returns):
    df_i = panel_df[panel_df["Company"] == company].copy()
    df_i["Date"] = df_i["Date"].astype(str)
    df = df_i.merge(
        market_returns,
        left_on="Date",
        right_index=True,
        how="inner"
    ).dropna(subset=["Return","MarketReturn","Volume"])

    if len(df) < 12:
        return None

    df["logVol"] = np.log(df["Volume"].replace(0,np.nan)).fillna(method='ffill').fillna(method='bfill')
    X = sm.add_constant(df[["MarketReturn","logVol"]])
    y = df["Return"]

    model = sm.OLS(y, X).fit(cov_type="HC1")
    resid = model.resid
    dw = durbin_watson(resid)
    bp = het_breuschpagan(resid, model.model.exog)

    return {
        "Company": company,
        "n_obs": len(df),
        "Intercept": float(model.params.get("const", np.nan)),
        "Coef_Market": float(model.params.get("MarketReturn", np.nan)),
        "SE_Market": float(model.bse.get("MarketReturn", np.nan)),
        "p_Market": float(model.pvalues.get("MarketReturn", np.nan)),
        "Coef_logVol": float(model.params.get("logVol", np.nan)),
        "SE_logVol": float(model.bse.get("logVol", np.nan)),
        "p_logVol": float(model.pvalues.get("logVol", np.nan)),
        "R2": float(model.rsquared),
        "DW": float(dw),
        "BP_stat": float(bp[0]),
        "BP_p": float(bp[1])
    }


In [7]:
# =========================
# PREPARAR RETORNOS DEL MERCADO (QQQ)
# =========================
import yfinance as yf
import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson

# Descargar mercado
market = yf.download("QQQ", start="2018-01-01", end="2024-12-31", progress=False)

# Determinar columna de precio ajustado
price_col = None
for col in ["Adj Close", "AdjClose", "Close"]:
    if col in market.columns:
        price_col = col
        break
if price_col is None:
    raise RuntimeError("No se encontró columna de precio válido en QQQ")

# Calcular retornos logarítmicos mensuales
market["MarketReturn"] = np.log(market[price_col] / market[price_col].shift(1))
market = market.resample("ME").last()[["MarketReturn"]].dropna()

# Convertir index a strings YYYY-MM-DD para merge
market.index = market.index.strftime("%Y-%m-%d")

# =========================
# FUNCION CAPM ROBUSTA
# =========================
def estimate_company_regression(company, panel_df, market_returns):
    df_i = panel_df[panel_df["Company"] == company].copy()
    df_i["Date"] = df_i["Date"].astype(str)

    # Merge con market_returns
    df = df_i.merge(
        market_returns,
        left_on="Date",
        right_index=True,
        how="inner"
    ).dropna(subset=["Return","MarketReturn","Volume"])

    if len(df) < 12:  # mínimo 1 año
        return None

    df["logVol"] = np.log(df["Volume"].replace(0,np.nan)).fillna(method='ffill').fillna(method='bfill')

    X = sm.add_constant(df[["MarketReturn","logVol"]])
    y = df["Return"]

    model = sm.OLS(y, X).fit(cov_type="HC1")
    resid = model.resid
    dw = durbin_watson(resid)
    bp = het_breuschpagan(resid, model.model.exog)

    return {
        "Company": company,
        "n_obs": len(df),
        "Intercept": float(model.params.get("const", np.nan)),
        "Coef_Market": float(model.params.get("MarketReturn", np.nan)),
        "SE_Market": float(model.bse.get("MarketReturn", np.nan)),
        "p_Market": float(model.pvalues.get("MarketReturn", np.nan)),
        "Coef_logVol": float(model.params.get("logVol", np.nan)),
        "SE_logVol": float(model.bse.get("logVol", np.nan)),
        "p_logVol": float(model.pvalues.get("logVol", np.nan)),
        "R2": float(model.rsquared),
        "DW": float(dw),
        "BP_stat": float(bp[0]),
        "BP_p": float(bp[1])
    }


In [8]:
# =========================
# REGRESIÓN CROSS-SECTION
# =========================
cross = agg_df.copy().dropna(subset=["MeanReturn","Beta","Volatility","AvgVolume"])
cross["logAvgVol"] = np.log(cross["AvgVolume"].replace(0,np.nan)).fillna(method='ffill').fillna(method='bfill')

X = sm.add_constant(cross[["Beta","Volatility","logAvgVol"]])
y = cross["MeanReturn"].astype(float)
model_cs = sm.OLS(y, X).fit(cov_type="HC1")

# Guardar resultados
cs_summary = {
    "params": model_cs.params.to_dict(),
    "bse": model_cs.bse.to_dict(),
    "pvalues": model_cs.pvalues.to_dict(),
    "R2": float(model_cs.rsquared),
    "n": int(model_cs.nobs)
}

# CSV coeficientes
pd.DataFrame({
    "param": model_cs.params.index,
    "coef": model_cs.params.values,
    "se": model_cs.bse.values,
    "pvalue": model_cs.pvalues.values
}).to_csv(os.path.join(REG_DIR,"cross_section_regression_coefficients.csv"), index=False)

# JSON summary
with open(os.path.join(REG_DIR,"cross_section_regression_summary.json"), "w") as f:
    import json
    json.dump(cs_summary,f,indent=4)

print("✅ Regresión cross-section guardada. R2:", model_cs.rsquared)
print(model_cs.summary())


✅ Regresión cross-section guardada. R2: 0.4895026269349185
                            OLS Regression Results                            
Dep. Variable:             MeanReturn   R-squared:                       0.490
Model:                            OLS   Adj. R-squared:                  0.431
Method:                 Least Squares   F-statistic:                     26.91
Date:                Tue, 30 Dec 2025   Prob (F-statistic):           3.91e-08
Time:                        15:37:04   Log-Likelihood:                 101.08
No. Observations:                  30   AIC:                            -194.2
Df Residuals:                      26   BIC:                            -188.5
Df Model:                           3                                         
Covariance Type:                  HC1                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------

  cross["logAvgVol"] = np.log(cross["AvgVolume"].replace(0,np.nan)).fillna(method='ffill').fillna(method='bfill')
