In [8]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

import statsmodels.api as sm
from statsmodels.tools.tools import add_constant

In [9]:
# Input dataset and output locations (relative paths, resolved against the
# current working directory).
MASTER = Path("../data/processed/master_teams_2000_2019.csv")
TAB_OUT = Path("../data/processed/")
FIG_OUT = Path("../plots/final_version")
LATEX_OUT = Path("../latex/tables/final_version")

# Create every output directory up front; directories that already exist are
# left untouched and missing parents are created as needed.
for _out_dir in (TAB_OUT, FIG_OUT, LATEX_OUT):
    _out_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def run_multiple_ols(df: pd.DataFrame, y_col: str, x_cols: list):
    """Fit an OLS regression of ``y_col`` on ``x_cols`` and print a report.

    Rows with a missing value in any of the used columns are dropped before
    fitting. The predictors are passed through statsmodels' ``add_constant``
    to build the design matrix. The statsmodels summary plus R^2, the F-test,
    RMSE and MAE are printed to stdout.

    Parameters
    ----------
    df : pd.DataFrame
        Data containing ``y_col`` and every column listed in ``x_cols``.
    y_col : str
        Name of the response column.
    x_cols : list
        Names of the predictor columns.

    Returns
    -------
    dict
        ``modelo`` (text description of the model), ``coef`` and ``pval``
        (dicts keyed by term name), ``ci`` (dict keyed by term name holding
        the ``conf_int()`` row, with keys ``0`` and ``1``), the scalar fit
        statistics ``R2``, ``R2_adj``, ``F``, ``p_F``, ``AIC``, ``BIC``,
        ``N``, ``RMSE``, ``MAE``, the arrays ``yhat`` and ``resid`` (one entry
        per row kept after dropping missing values), and ``model`` (the
        fitted statsmodels results object).
    """
    # Work on a private copy restricted to the columns the model needs.
    data = df[[y_col] + x_cols].dropna().copy()
    design = add_constant(data[x_cols])
    response = data[y_col]
    model = sm.OLS(response, design).fit()

    # In-sample predictions and residuals, stored alongside the data.
    data["yhat"] = model.predict(design)
    data["resid"] = response - data["yhat"]
    residuals = data["resid"]

    rmse = float(np.sqrt(np.mean(residuals ** 2)))
    mae = float(np.mean(np.abs(residuals)))

    # Text report.
    predictors_text = ", ".join(x_cols)
    f_line = (
        f"F({int(model.df_model)}, {int(model.df_resid)}) = {model.fvalue:.2f}"
        f"  p(F) = {model.f_pvalue:.3e}"
    )
    print(f"\n=== Modelo múltiple: {y_col} ~ {predictors_text} ===")
    print(model.summary())
    print(f"R^2 = {model.rsquared:.4f} | R^2 adj = {model.rsquared_adj:.4f}")
    print(f_line)
    print(f"RMSE = {rmse:.3f} | MAE = {mae:.3f}")

    return {
        "modelo": f"{y_col} ~ {predictors_text}",
        "coef": model.params.to_dict(),
        "pval": model.pvalues.to_dict(),
        "ci": model.conf_int().to_dict("index"),
        "R2": float(model.rsquared),
        "R2_adj": float(model.rsquared_adj),
        "F": float(model.fvalue),
        "p_F": float(model.f_pvalue),
        "AIC": float(model.aic),
        "BIC": float(model.bic),
        "N": int(model.nobs),
        "RMSE": rmse,
        "MAE": mae,
        "yhat": data["yhat"].to_numpy(),
        "resid": residuals.to_numpy(),
        "model": model,
    }

In [11]:
# Load the dataset at MASTER; the `season_date` column is parsed as dates.
df = pd.read_csv(filepath_or_buffer=MASTER, parse_dates=["season_date"])

In [12]:
# Response and predictors of the multiple regression fitted below.
target = "W"
predictors_all = ["RunDiff", "ERA", "HR"]

# Fit the model; the returned dict feeds the tables and figures that follow.
res_multi = run_multiple_ols(df=df, y_col=target, x_cols=predictors_all)


=== Modelo múltiple: W ~ RunDiff, ERA, HR ===
                            OLS Regression Results                            
Dep. Variable:                      W   R-squared:                       0.885
Model:                            OLS   Adj. R-squared:                  0.885
Method:                 Least Squares   F-statistic:                     1534.
Date:                Wed, 01 Oct 2025   Prob (F-statistic):          9.94e-280
Time:                        00:25:13   Log-Likelihood:                -1681.4
No. Observations:                 600   AIC:                             3371.
Df Residuals:                     596   BIC:                             3388.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const

In [13]:
# Row order of the coefficient table: intercept first, then the predictors in
# the order they were given to the model. The list is derived from
# `predictors_all` (instead of repeating the names by hand) so that changing
# the predictor set cannot silently drop coefficients from the table.
coef_order = [
    name for name in ["const", *predictors_all] if name in res_multi["coef"]
]

# One record per model term: point estimate, p-value and the two bounds of
# the interval stored under res_multi["ci"] (keys 0 and 1).
coef_rows = [
    {
        "variable": name,
        "beta_hat": float(res_multi["coef"][name]),
        "p_value": float(res_multi["pval"][name]),
        "ci95_low": float(res_multi["ci"][name][0]),
        "ci95_high": float(res_multi["ci"][name][1]),
    }
    for name in coef_order
]

# Explicit columns keep the table schema stable even if no term matched.
coef_df = pd.DataFrame(
    coef_rows,
    columns=["variable", "beta_hat", "p_value", "ci95_low", "ci95_high"],
)

In [14]:
# (row label, key in res_multi holding the value, key holding its p-value or
# None when the metric has no p-value). Order defines the row order.
_metric_specs = [
    ("R2", "R2", None),
    ("R2_adj", "R2_adj", None),
    ("F_stat", "F", "p_F"),
    ("AIC", "AIC", None),
    ("BIC", "BIC", None),
    ("N", "N", None),
    ("RMSE", "RMSE", None),
    ("MAE", "MAE", None),
]

# Goodness-of-fit table laid out with the same columns as the coefficient
# table; fields that do not apply to a metric are filled with NaN.
metrics = pd.DataFrame(
    [
        {
            "variable": label,
            "beta_hat": res_multi[value_key],
            "p_value": np.nan if p_key is None else res_multi[p_key],
            "ci95_low": np.nan,
            "ci95_high": np.nan,
        }
        for label, value_key, p_key in _metric_specs
    ]
)

In [15]:
# Write both tables as CSV without the index, using six fixed decimals.
# NOTE(review): "%.6f" renders very small p-values as 0.000000 — confirm this
# loss of precision is acceptable for downstream consumers of the CSV files.
_csv_exports = (
    (coef_df, TAB_OUT / "ols_multiple_coefficients.csv"),
    (metrics, TAB_OUT / "ols_multiple_metrics.csv"),
)
for _frame, _dest in _csv_exports:
    _frame.to_csv(_dest, index=False, float_format="%.6f")

print(
    "\nTablas guardadas en data/processed/ols_multiple_coefficients.csv y ols_multiple_metrics.csv"
)


Tablas guardadas en data/processed/ols_multiple_coefficients.csv y ols_multiple_metrics.csv


In [16]:
# Render the coefficient table as LaTeX. The column headers are already
# written as LaTeX (math mode, pre-escaped "%"), so escaping is disabled
# explicitly instead of relying on the pandas-version-dependent default,
# which would otherwise mangle them.
latex_coef = coef_df.rename(
    columns={
        "variable": "Variable",
        "beta_hat": "$\\hat{\\beta}$",
        "p_value": "$p$",
        "ci95_low": "CI95\\% Inf",
        "ci95_high": "CI95\\% Sup",
    }
).to_latex(
    index=False,
    escape=False,
    float_format="%.4f",
    caption="Modelo de regresión lineal múltiple: $W \\sim RunDiff + ERA + HR$ (coeficientes)",
    label="tab:ols_multiple_coef",
)

# The caption contains non-ASCII characters, so the file is written as UTF-8
# explicitly rather than with the platform's default encoding.
(LATEX_OUT / "ols_multiple_coefficients.tex").write_text(
    latex_coef, encoding="utf-8"
)
print("Tabla LaTeX exportada: latex/tables/final_version/ols_multiple_coefficients.tex")

Tabla LaTeX exportada: latex/tables/final_version/ols_multiple_coefficients.tex


In [17]:
# Render the metrics table as LaTeX. Escaping is disabled explicitly because
# the column headers are already written as LaTeX (math mode, pre-escaped
# "%"). With escaping off, the metric labels that contain "_" ("R2_adj",
# "F_stat") would be emitted raw and break LaTeX compilation, so underscores
# are escaped by hand in that column only. `metrics` itself is not modified.
latex_metrics = (
    metrics.assign(
        variable=metrics["variable"].str.replace("_", "\\_", regex=False)
    )
    .rename(
        columns={
            "variable": "Métrica",
            "beta_hat": "Valor",
            "p_value": "$p$",
            "ci95_low": "CI95\\% Inf",
            "ci95_high": "CI95\\% Sup",
        }
    )
    .to_latex(
        index=False,
        escape=False,
        float_format="%.4f",
        caption="Bondad de ajuste y métricas del modelo múltiple",
        label="tab:ols_multiple_metrics",
    )
)

# Header and caption contain non-ASCII characters, so the file is written as
# UTF-8 explicitly rather than with the platform's default encoding.
(LATEX_OUT / "ols_multiple_metrics.tex").write_text(
    latex_metrics, encoding="utf-8"
)
print("Tabla LaTeX exportada: latex/tables/final_version/ols_multiple_metrics.tex")

Tabla LaTeX exportada: latex/tables/final_version/ols_multiple_metrics.tex


In [None]:
# Rows with complete data for the response and all predictors.
sub_plot = df[[target, *predictors_all]].dropna().copy()
params = res_multi["coef"]

# One panel per predictor, sharing the y axis.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4.8), sharey=True)
for ax, var in zip(axes, predictors_all):
    xj = sub_plot[var].to_numpy()

    # Response with the intercept and the fitted contribution of every other
    # predictor subtracted, so only the part associated with `var` remains.
    y_adj = sub_plot[target].to_numpy() - float(params.get("const", 0.0))
    for other in (name for name in predictors_all if name != var):
        y_adj = y_adj - float(params[other]) * sub_plot[other].to_numpy()

    ax.scatter(xj, y_adj, alpha=0.6)

    # Overlay the line with the fitted slope of `var` over its observed range.
    bj = float(params[var])
    xs = np.linspace(xj.min(), xj.max(), 200)
    ax.plot(xs, bj * xs, lw=2)

    ax.set_title(f"Añadida: {target} vs {var} | resto controlado")
    ax.set_xlabel(var)

axes[0].set_ylabel(f"{target} ajustada (y*)")
fig.tight_layout()
fig.savefig(
    FIG_OUT / "ols_scatter_grid_RunDiff_ERA_HR.png",
    dpi=150,
    bbox_inches="tight",
    transparent=True,
)
# Release the figure once it has been written to disk.
plt.close(fig)

In [19]:
# Fitted values and residuals of the multiple regression.
yhat, resid = res_multi["yhat"], res_multi["resid"]

# NOTE(review): every panel draws the same (yhat, resid) cloud; `var` only
# changes the title. Confirm whether residuals against each predictor were
# intended instead of three identical panels.
fig, axes = plt.subplots(
    nrows=1, ncols=3, figsize=(15, 4.8), sharey=True, sharex=True
)
for ax, var in zip(axes, predictors_all):
    ax.scatter(yhat, resid, alpha=0.6)
    # Zero line as the reference for the residuals.
    ax.axhline(0, color="black", lw=1)
    ax.set_title(f"Residuales vs ŷ (múltiple) | {var}")
    ax.set_xlabel("Valores ajustados (ŷ)")

axes[0].set_ylabel("Residuales (y - ŷ)")
fig.tight_layout()
fig.savefig(
    FIG_OUT / "ols_resid_grid_RunDiff_ERA_HR.png",
    dpi=150,
    bbox_inches="tight",
    transparent=True,
)
# Release the figure once it has been written to disk.
plt.close(fig)