In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tools.tools import add_constant
from scipy.stats import t

In [2]:
# Input: master team table that the analysis cells read.
MASTER = Path("../data/processed/master_teams_2000_2019.csv")

# Output folders: tabular exports (CSV / LaTeX) and saved figures.
TAB_OUT = Path("../data/processed")
FIG_OUT = Path("../plots")

# Create both output folders up front so later cells can write unconditionally.
for _out_dir in (TAB_OUT, FIG_OUT):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [3]:
def _save_ols_figure(fig, filename: str) -> None:
    """Save ``fig`` as a PNG named ``filename`` under ``FIG_OUT`` and close it."""
    try:
        fig.tight_layout()
        fig.savefig(
            FIG_OUT / filename,
            dpi=150,
            bbox_inches="tight",
            transparent=True,
        )
    finally:
        # Always release the figure, even if saving fails, so repeated calls
        # in a loop do not accumulate open figures.
        plt.close(fig)


def run_simple_ols(df: pd.DataFrame, y_col: str, x_col: str, nice_name: str) -> dict:
    """Fit the simple linear regression ``y_col ~ const + x_col`` and report it.

    Rows with a missing value in either column are dropped before fitting.
    The function prints the coefficient table and fit statistics, saves two
    PNG figures under ``FIG_OUT`` (scatter with the fitted line, and residuals
    against fitted values), and returns the key numbers as a flat dict.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing both ``y_col`` and ``x_col``.
    y_col : str
        Name of the response column.
    x_col : str
        Name of the single predictor column.
    nice_name : str
        Human-readable label for the predictor, used in the plot titles and
        the x-axis label. Falls back to ``x_col`` when empty. File names and
        the returned ``"modelo"`` string always use ``x_col``.

    Returns
    -------
    dict
        Keys: ``modelo``, ``beta0_const``, ``beta1``, ``p_beta1``,
        ``beta1_ci_low``, ``beta1_ci_high``, ``R2``, ``R2_adj``, ``F``,
        ``p_F``, ``AIC``, ``BIC``, ``N``, ``RMSE``, ``MAE``.

    Raises
    ------
    KeyError
        If ``y_col`` or ``x_col`` is not a column of ``df``.
    ValueError
        If, after dropping missing rows, ``x_col`` has fewer than two distinct
        values (no slope can be estimated).
    """
    sub = df[[y_col, x_col]].dropna().copy()

    # add_constant() skips the intercept when the predictor is itself constant,
    # which would surface later as an opaque KeyError on params["const"];
    # an empty frame fails inside statsmodels. Fail early with a clear message.
    if sub[x_col].nunique() < 2:
        raise ValueError(
            f"No se puede ajustar {y_col} ~ {x_col}: se requieren al menos dos "
            f"valores distintos de {x_col} tras eliminar NaN (N={len(sub)})"
        )

    X = add_constant(sub[x_col])
    y = sub[y_col]

    model = sm.OLS(y, X).fit()

    sub["yhat"] = model.predict(X)
    sub["resid"] = y - sub["yhat"]

    # In-sample error metrics computed from the raw residuals.
    rmse = np.sqrt(np.mean(sub["resid"] ** 2))
    mae = np.mean(np.abs(sub["resid"]))

    beta1 = model.params[x_col]
    beta0 = model.params["const"]
    p_beta1 = model.pvalues[x_col]
    conf_beta1 = model.conf_int().loc[x_col].tolist()

    print(f"\n=== Modelo: {y_col} ~ {x_col} ===")
    print(model.summary().tables[1])
    print(f"R^2 = {model.rsquared:.4f} | R^2 adj = {model.rsquared_adj:.4f}")
    print(
        f"F({int(model.df_model)}, {int(model.df_resid)}) = {model.fvalue:.2f}  p(F) = {model.f_pvalue:.3e}"
    )
    print(f"RMSE = {rmse:.3f} | MAE = {mae:.3f}")
    print(
        f"Coef {x_col} = {beta1:.4f}  p={p_beta1:.3e}  95%CI=({conf_beta1[0]:.4f}, {conf_beta1[1]:.4f})"
    )
    if p_beta1 < 0.05:
        print(f"→ {x_col} es SIGNIFICATIVA al 5% ✅")
    else:
        print(f"→ {x_col} NO es significativa al 5% ❌")

    # Display label for the predictor; column name is the fallback.
    x_label = nice_name or x_col

    # Figure 1: observations with the fitted OLS line on top.
    fig, ax = plt.subplots(figsize=(6.8, 4.8))
    ax.scatter(sub[x_col], sub[y_col], alpha=0.6)
    xs = np.linspace(sub[x_col].min(), sub[x_col].max(), 100)
    ys = beta0 + beta1 * xs
    ax.plot(xs, ys, lw=2)
    ax.set_title(f"{y_col} vs {x_label} (recta OLS)")
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_col)
    _save_ols_figure(fig, f"ols_scatter_{y_col}_vs_{x_col}.png")

    # Figure 2: residuals against fitted values, with a zero reference line.
    fig, ax = plt.subplots(figsize=(6.8, 4.8))
    ax.scatter(sub["yhat"], sub["resid"], alpha=0.6)
    ax.axhline(0, color="black", lw=1)
    ax.set_title(f"Residuales vs Ajustados: {y_col} ~ {x_label}")
    ax.set_xlabel("Valores ajustados (ŷ)")
    ax.set_ylabel("Residuales (y - ŷ)")
    _save_ols_figure(fig, f"ols_resid_vs_fit_{y_col}_vs_{x_col}.png")

    return {
        "modelo": f"{y_col} ~ {x_col}",
        "beta0_const": beta0,
        "beta1": beta1,
        "p_beta1": p_beta1,
        "beta1_ci_low": conf_beta1[0],
        "beta1_ci_high": conf_beta1[1],
        "R2": model.rsquared,
        "R2_adj": model.rsquared_adj,
        "F": model.fvalue,
        "p_F": model.f_pvalue,
        "AIC": model.aic,
        "BIC": model.bic,
        "N": int(model.nobs),
        "RMSE": rmse,
        "MAE": mae,
    }

In [4]:
# Load the master table; season_date is parsed as a date column on read.
df = pd.read_csv(filepath_or_buffer=MASTER, parse_dates=["season_date"])

In [5]:
# Response column shared by every model (a single name, despite the plural).
targets = "W"

# Candidate regressors; each one is fitted on its own against `targets`.
predictors = [
    "RunDiff",
    "ERA",
    "HR",
    "logHR1",
]

In [9]:
# One simple regression per predictor; each call prints its report, saves its
# figures and yields a summary dict, collected here in predictor order.
resultados = [
    run_simple_ols(df, targets, predictor, predictor) for predictor in predictors
]


=== Modelo: W ~ RunDiff ===
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         80.9700      0.165    490.838      0.000      80.646      81.294
RunDiff        0.0997      0.001     67.071      0.000       0.097       0.103
R^2 = 0.8827 | R^2 adj = 0.8825
F(1, 598) = 4498.48  p(F) = 1.996e-280
RMSE = 4.034 | MAE = 3.212
Coef RunDiff = 0.0997  p=1.996e-280  95%CI=(0.0967, 0.1026)
→ RunDiff es SIGNIFICATIVA al 5% ✅

=== Modelo: W ~ ERA ===
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        142.3221      2.914     48.845      0.000     136.600     148.045
ERA          -14.4426      0.681    -21.222      0.000     -15.779     -13.106
R^2 = 0.4296 | R^2 adj = 0.4286
F(1, 598) = 450.37  p(F) = 6.251e-75
RMSE = 8.894 | MAE = 7.156
Coef ERA = -14.4426  p=6.

In [7]:
# Column layout of the summary table, grouped by meaning.
cols_orden = [
    # model identifier
    "modelo",
    # coefficients and inference on the slope
    "beta0_const", "beta1", "p_beta1", "beta1_ci_low", "beta1_ci_high",
    # goodness of fit and overall F test
    "R2", "R2_adj", "F", "p_F",
    # information criteria and in-sample error
    "AIC", "BIC", "RMSE", "MAE",
    # sample size
    "N",
]

# One row per fitted model, with columns in the order declared above.
res_df = pd.DataFrame(resultados)[cols_orden]

# Persist the table, then echo it in the cell output.
res_df.to_csv(
    TAB_OUT / "ols_simple_summary.csv",
    index=False,
    float_format="%.6f",
)
print("\nResumen guardado en data/processed/ols_simple_summary.csv")
print(res_df)


Resumen guardado en data/processed/ols_simple_summary.csv
        modelo  beta0_const      beta1        p_beta1  beta1_ci_low  \
0  W ~ RunDiff    80.970000   0.099664  1.996439e-280      0.096746   
1      W ~ ERA   142.322107 -14.442586   6.250593e-75    -15.779148   
2       W ~ HR    59.230312   0.125326   1.784321e-23      0.101705   
3   W ~ logHR1   -29.323832  21.461450   2.995635e-23     17.393651   

   beta1_ci_high        R2    R2_adj            F            p_F          AIC  \
0       0.102583  0.882664  0.882468  4498.479416  1.996439e-280  3380.438373   
1     -13.106025  0.429590  0.428636   450.368361   6.250593e-75  4329.227359   
2       0.148947  0.153664  0.152249   108.575442   1.784321e-23  4565.963548   
3      25.529250  0.152210  0.150792   107.363107   2.995635e-23  4566.993906   

           BIC       RMSE       MAE    N  
0  3389.232232   4.034007  3.212020  600  
1  4338.021218   8.894362  7.155736  600  
2  4574.757407  10.834091  8.911703  600  
3  4575

In [8]:
# Display headers for the LaTeX table. Backslashes are doubled so that Python
# does not treat "\%" or "\b" as escape sequences; the resulting strings hold
# a single backslash, exactly what LaTeX needs.
latex_headers = {
    "modelo": "Modelo",
    "beta0_const": "$\\beta_0$",
    "beta1": "$\\beta_1$",
    "p_beta1": "$p(\\beta_1)$",
    "beta1_ci_low": "CI95\\% Inf",
    "beta1_ci_high": "CI95\\% Sup",
    "R2": "$R^2$",
    "R2_adj": "$R^2_{adj}$",
    "F": "F",
    "p_F": "$p(F)$",
    "AIC": "AIC",
    "BIC": "BIC",
    "RMSE": "RMSE",
    "MAE": "MAE",
    "N": "N",
}

# Optional export: any failure is reported and the notebook keeps running.
try:
    # Render the table BEFORE opening the file, so a rendering error cannot
    # leave an empty / truncated .tex file behind.
    latex_table = res_df.rename(columns=latex_headers).to_latex(
        index=False,
        float_format="%.4f",
        caption="Modelos de regresión lineal simple: coeficientes y bondad de ajuste",
        label="tab:ols_simples",
        # The headers above are already valid LaTeX (math mode, "\%"); keep
        # pandas from escaping them again, regardless of its version default.
        # NOTE(review): with escaping off, the "~" inside the Modelo values is
        # a LaTeX non-breaking space — confirm that rendering is acceptable.
        escape=False,
    )
    # Explicit UTF-8: the caption contains non-ASCII characters and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(TAB_OUT / "ols_simple_summary.tex", "w", encoding="utf-8") as f:
        f.write(latex_table)
    print("Tabla LaTeX exportada: data/processed/ols_simple_summary.tex")
except Exception as e:
    print("No se exportó LaTeX (opcional). Razón:", e)

Tabla LaTeX exportada: data/processed/ols_simple_summary.tex
