In [13]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

In [14]:
# Input CSV read by this notebook, plus the folders that receive tables and figures.
MASTER = Path("../data/processed/master_teams_2000_2019.csv")
TAB_OUT, FIG_OUT = Path("../data/processed"), Path("../plots")

# Make sure both output folders exist; already-existing folders are left as they are.
for _out_dir in (TAB_OUT, FIG_OUT):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [15]:
def run_simple_ols(df: pd.DataFrame, y_col: str, x_col: str) -> dict:
    """Fit ``y_col ~ const + x_col`` by OLS, print a short report and return the results.

    Rows with a missing value in either column are dropped before fitting.

    Parameters
    ----------
    df : pd.DataFrame
        Table containing both ``y_col`` and ``x_col``.
    y_col : str
        Name of the response column.
    x_col : str
        Name of the single predictor column.

    Returns
    -------
    dict
        Scalar entries ``modelo``, ``x_name``, ``y_name``, ``beta0``, ``beta1``,
        ``p_beta1``, ``ci_low``, ``ci_high``, ``R2``, ``R2_adj``, ``F``, ``p_F``,
        ``AIC``, ``BIC``, ``N``, ``RMSE``, ``MAE`` and the NumPy arrays ``x``,
        ``y``, ``yhat``, ``resid`` for the rows actually used in the fit.

    Raises
    ------
    KeyError
        If either column is not present in ``df``.
    ValueError
        If no row has both values present, so there is nothing to fit.
    """
    sub = df[[y_col, x_col]].dropna().copy()
    if sub.empty:
        # Fail here with a readable message instead of deep inside the fitting code.
        raise ValueError(
            f"No hay observaciones completas para ajustar {y_col} ~ {x_col}"
        )

    X = add_constant(sub[x_col])
    y = sub[y_col]
    model = sm.OLS(y, X).fit()

    # In-sample predictions and residuals, kept next to the data that produced them.
    sub["yhat"] = model.predict(X)
    sub["resid"] = y - sub["yhat"]

    # Error metrics computed directly from the residuals (no degrees-of-freedom correction).
    rmse = float(np.sqrt(np.mean(sub["resid"] ** 2)))
    mae = float(np.mean(np.abs(sub["resid"])))

    beta0 = float(model.params["const"])
    beta1 = float(model.params[x_col])
    p_beta1 = float(model.pvalues[x_col])
    # conf_int() is called with its default level; the report below labels it as 95%.
    ci_low, ci_high = model.conf_int().loc[x_col].tolist()

    print(f"\n=== Modelo: {y_col} ~ {x_col} ===")
    print(model.summary().tables[1])
    print(f"R^2 = {model.rsquared:.4f} | R^2 adj = {model.rsquared_adj:.4f}")
    print(
        f"F({int(model.df_model)}, {int(model.df_resid)}) = {model.fvalue:.2f}  p(F) = {model.f_pvalue:.3e}"
    )
    print(f"RMSE = {rmse:.3f} | MAE = {mae:.3f}")
    print(
        f"Coef {x_col} = {beta1:.4f}  p={p_beta1:.3e}  95%CI=({ci_low:.4f}, {ci_high:.4f})"
    )
    print(
        "→ SIGNIFICATIVA al 5% ✅" if p_beta1 < 0.05 else "→ NO significativa al 5% ❌"
    )

    return {
        "modelo": f"{y_col} ~ {x_col}",
        "x_name": x_col,
        "y_name": y_col,
        "beta0": beta0,
        "beta1": beta1,
        "p_beta1": p_beta1,
        "ci_low": ci_low,
        "ci_high": ci_high,
        "R2": float(model.rsquared),
        "R2_adj": float(model.rsquared_adj),
        "F": float(model.fvalue),
        "p_F": float(model.f_pvalue),
        "AIC": float(model.aic),
        "BIC": float(model.bic),
        "N": int(model.nobs),
        "RMSE": rmse,
        "MAE": mae,
        "x": sub[x_col].to_numpy(),
        "y": sub[y_col].to_numpy(),
        "yhat": sub["yhat"].to_numpy(),
        "resid": sub["resid"].to_numpy(),
    }

In [16]:
# Read the master CSV; the season_date column is parsed as dates on load.
df = pd.read_csv(
    filepath_or_buffer=MASTER,
    parse_dates=["season_date"],
)

In [17]:
# Response column and the candidate predictor columns used for the simple regressions.
target = "W"
predictors_all = [
    "RunDiff",
    "ERA",
    "HR",
    "logHR1",
]

In [18]:
# One simple OLS fit per candidate predictor, kept in the order of predictors_all.
resultados = [run_simple_ols(df, target, predictor) for predictor in predictors_all]


=== Modelo: W ~ RunDiff ===
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         80.9700      0.165    490.838      0.000      80.646      81.294
RunDiff        0.0997      0.001     67.071      0.000       0.097       0.103
R^2 = 0.8827 | R^2 adj = 0.8825
F(1, 598) = 4498.48  p(F) = 1.996e-280
RMSE = 4.034 | MAE = 3.212
Coef RunDiff = 0.0997  p=1.996e-280  95%CI=(0.0967, 0.1026)
→ SIGNIFICATIVA al 5% ✅

=== Modelo: W ~ ERA ===
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        142.3221      2.914     48.845      0.000     136.600     148.045
ERA          -14.4426      0.681    -21.222      0.000     -15.779     -13.106
R^2 = 0.4296 | R^2 adj = 0.4286
F(1, 598) = 450.37  p(F) = 6.251e-75
RMSE = 8.894 | MAE = 7.156
Coef ERA = -14.4426  p=6.251e-75  95

In [19]:
# Summary-table column -> key in each result dict. Only scalar entries are kept;
# the per-observation arrays (x, y, yhat, resid) stay out of the table.
_summary_fields = {
    "modelo": "modelo",
    "beta0_const": "beta0",
    "beta1": "beta1",
    "p_beta1": "p_beta1",
    "beta1_ci_low": "ci_low",
    "beta1_ci_high": "ci_high",
    "R2": "R2",
    "R2_adj": "R2_adj",
    "F": "F",
    "p_F": "p_F",
    "AIC": "AIC",
    "BIC": "BIC",
    "RMSE": "RMSE",
    "MAE": "MAE",
    "N": "N",
}

# One row per fitted model.
_summary_rows = [
    {column: fit[key] for column, key in _summary_fields.items()}
    for fit in resultados
]
res_df = pd.DataFrame(_summary_rows)

In [20]:
# Column order used for the saved and printed summary table.
cols_orden = [
    "modelo",
    "beta0_const",
    "beta1",
    "p_beta1",
    "beta1_ci_low",
    "beta1_ci_high",
    "R2",
    "R2_adj",
    "F",
    "p_F",
    "AIC",
    "BIC",
    "RMSE",
    "MAE",
    "N",
]
res_df = res_df[cols_orden]

# With "%.6f" every p-value smaller than 5e-7 would be written as 0.000000, losing
# the value entirely. The p-value columns are therefore pre-formatted in scientific
# notation (still parsed as floats when the CSV is read back); the remaining float
# columns keep six fixed decimals. res_df itself stays numeric for later cells.
p_value_cols = ["p_beta1", "p_F"]
res_csv = res_df.assign(
    **{col: res_df[col].map("{:.6e}".format) for col in p_value_cols}
)
res_csv.to_csv(TAB_OUT / "ols_simple_summary.csv", index=False, float_format="%.6f")
print("\nResumen guardado en data/processed/ols_simple_summary.csv")
print(res_df)


Resumen guardado en data/processed/ols_simple_summary.csv
        modelo  beta0_const      beta1        p_beta1  beta1_ci_low  \
0  W ~ RunDiff    80.970000   0.099664  1.996439e-280      0.096746   
1      W ~ ERA   142.322107 -14.442586   6.250593e-75    -15.779148   
2       W ~ HR    59.230312   0.125326   1.784321e-23      0.101705   
3   W ~ logHR1   -29.323832  21.461450   2.995635e-23     17.393651   

   beta1_ci_high        R2    R2_adj            F            p_F          AIC  \
0       0.102583  0.882664  0.882468  4498.479416  1.996439e-280  3380.438373   
1     -13.106025  0.429590  0.428636   450.368361   6.250593e-75  4329.227359   
2       0.148947  0.153664  0.152249   108.575442   1.784321e-23  4565.963548   
3      25.529250  0.152210  0.150792   107.363107   2.995635e-23  4566.993906   

           BIC       RMSE       MAE    N  
0  3389.232232   4.034007  3.212020  600  
1  4338.021218   8.894362  7.155736  600  
2  4574.757407  10.834091  8.911703  600  
3  4575

In [21]:
# Optional LaTeX export of the summary table: a failure is reported, never raised.
try:
    # Summary column -> LaTeX header (math mode / escaped percent signs).
    latex_headers = {
        "modelo": "Modelo",
        "beta0_const": "$\\beta_0$",
        "beta1": "$\\beta_1$",
        "p_beta1": "$p(\\beta_1)$",
        "beta1_ci_low": "CI95\\% Inf",
        "beta1_ci_high": "CI95\\% Sup",
        "R2": "$R^2$",
        "R2_adj": "$R^2_{adj}$",
        "F": "F",
        "p_F": "$p(F)$",
        "AIC": "AIC",
        "BIC": "BIC",
        "RMSE": "RMSE",
        "MAE": "MAE",
        "N": "N",
    }

    # "%.4f" would print every very small p-value as 0.0000, so those two columns
    # get scientific notation; all other float columns keep four decimals.
    p_value_formatters = {
        latex_headers["p_beta1"]: "{:.2e}".format,
        latex_headers["p_F"]: "{:.2e}".format,
    }

    # Render the table before touching the file, so a rendering error cannot
    # leave an empty or truncated .tex behind.
    latex_table = res_df.rename(columns=latex_headers).to_latex(
        index=False,
        float_format="%.4f",
        formatters=p_value_formatters,
        caption="Modelos de regresión lineal simple: coeficientes y bondad de ajuste",
        label="tab:ols_simples",
    )

    # Explicit UTF-8: the caption contains non-ASCII characters and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(TAB_OUT / "ols_simple_summary.tex", "w", encoding="utf-8") as f:
        f.write(latex_table)
    print("Tabla LaTeX exportada: data/processed/ols_simple_summary.tex")
except Exception as e:
    print("No se exportó LaTeX (opcional). Razón:", e)

Tabla LaTeX exportada: data/processed/ols_simple_summary.tex


In [22]:
# Predictors shown in the 1x3 figure grids below. The list has exactly one entry
# per panel and matches the "RunDiff_ERA_HR" suffix of the saved figure names;
# a fourth entry would be dropped silently when paired with the three axes.
plot_predictors = ["RunDiff", "ERA", "HR"]
to_plot = [r for r in resultados if r["x_name"] in plot_predictors]

In [23]:
# Scatter of the response against each predictor with the fitted OLS line on top.
# zip pairs panels with fits and stops at the shorter of the two sequences.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4.8), sharey=True)
for panel, fit in zip(axes, to_plot):
    x_obs, y_obs = fit["x"], fit["y"]
    panel.scatter(x_obs, y_obs, alpha=0.6)

    # Fitted line evaluated on an even grid spanning the observed predictor range.
    x_grid = np.linspace(x_obs.min(), x_obs.max(), 200)
    panel.plot(x_grid, fit["beta0"] + fit["beta1"] * x_grid, lw=2)

    panel.set_title(f"{fit['y_name']} vs {fit['x_name']} (OLS)")
    panel.set_xlabel(fit["x_name"])

# The y axis is shared, so only the first panel carries the label.
axes[0].set_ylabel(target)
fig.tight_layout()

scatter_path = FIG_OUT / "ols_scatter_grid_RunDiff_ERA_HR.png"
fig.savefig(scatter_path, dpi=150, bbox_inches="tight", transparent=True)
plt.close(fig)

In [24]:
# Residuals against fitted values for each model, with a zero reference line.
# zip pairs panels with fits and stops at the shorter of the two sequences.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4.8), sharey=True)
for panel, fit in zip(axes, to_plot):
    fitted, residuals = fit["yhat"], fit["resid"]

    panel.scatter(fitted, residuals, alpha=0.6)
    panel.axhline(0, color="black", lw=1)
    panel.set_title(f"Residuales vs ŷ: {fit['y_name']} ~ {fit['x_name']}")
    panel.set_xlabel("Valores ajustados (ŷ)")

# The y axis is shared, so only the first panel carries the label.
axes[0].set_ylabel("Residuales (y - ŷ)")
fig.tight_layout()

resid_path = FIG_OUT / "ols_resid_grid_RunDiff_ERA_HR.png"
fig.savefig(resid_path, dpi=150, bbox_inches="tight", transparent=True)
plt.close(fig)