## **2 Overfitting (8 points)**

In [1]:
import numpy as np
import pandas as pd
from numpy.polynomial.legendre import legvander
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

### (1) Generación de variables y utilidades

In [2]:
rng = np.random.default_rng(2025)
n = 1000
X = rng.uniform(-1, 1, size=n).astype(np.float32)
eps = rng.standard_normal(n).astype(np.float32)
Y = (X + eps).astype(np.float32)  # intercepto = 0

# grados a probar (número de features)
p_list = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]



def make_leg_features(x, p):
    """
    Devuelve [P1(x), P2(x), ..., Pp(x)], polinomios de Legendre (sin P0).
    Mucho más estable que potencias x**k.
    """
    V = legvander(x.astype(np.float64), p)  # columnas P0..Pp
    return V[:, 1:].astype(np.float32)      # quitar P0 (intercepto)

def adj_r2(r2, n_obs, p):
    df = n_obs - p - 1
    if df <= 0:
        return np.nan
    return 1.0 - (1.0 - r2) * (n_obs - 1) / df

def fit_no_intercept(Xm, y):
    """
    OLS sin intercepto con lstsq (SVD) y rcond explícito para estabilidad.
    Evita problemas de inversión con matrices mal condicionadas.
    """
    beta, *_ = np.linalg.lstsq(Xm, y, rcond=1e-7)
    return Xm @ beta




In [None]:
rows = []
for p in p_list:
    Xp = make_leg_features(X, p)

    # R2 en toda la muestra
    yhat_full = fit_no_intercept(Xp, Y)
    r2_full = r2_score(Y, yhat_full)
    ar2_full = adj_r2(r2_full, n, p)

    # R2 out-of-sample (75/25)
    Xtr, Xte, ytr, yte = train_test_split(Xp, Y, test_size=0.25, random_state=2025)
    yhat_tr = fit_no_intercept(Xtr, ytr)             # entrena en train
    # obtener beta explícito para predecir en test
    beta_tr, *_ = np.linalg.lstsq(Xtr, ytr, rcond=1e-7)
    yhat_te = Xte @ beta_tr
    r2_oos = r2_score(yte, yhat_te)

    rows.append({
        "p_features": p,
        "R2_full": float(r2_full),
        "Adj_R2_full": float(ar2_full) if np.isfinite(ar2_full) else np.nan,
        "R2_out_of_sample": float(r2_oos)
    })

df = pd.DataFrame(rows)
print(df)


### (5) Gráficos


In [None]:

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# (1) Generación de variables y utilidades
rng = np.random.default_rng(3324)
n = 1000
sigma = 1.0
X = rng.standard_normal(n)
eps = sigma * rng.standard_normal(n)
Y = X + eps

p_list = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]

def datasplitting(df: pd.DataFrame, proportion: float, seed: int = 2025):
    rng_local = np.random.default_rng(seed)
    idx = np.arange(len(df))
    k = int(proportion * len(df))
    tr_idx = np.sort(rng_local.choice(idx, size=k, replace=False))
    te_mask = np.ones(len(df), dtype=bool)
    te_mask[tr_idx] = False
    return df.iloc[tr_idx].copy(), df.iloc[te_mask].copy()

def r2_in_sample(y, yhat):
    y = np.asarray(y); yhat = np.asarray(yhat)
    return 1.0 - np.mean((y - yhat)**2) / np.var(y, ddof=0)

def r2_adj(y, yhat, p):
    y = np.asarray(y); yhat = np.asarray(yhat)
    n = y.size
    if p >= n: 
        return np.nan
    return 1.0 - ((n/(n - p)) * np.mean((y - yhat)**2)) / np.var(y, ddof=0)

def r2_oos(ytest, yhat_test):
    ytest = np.asarray(ytest); yhat_test = np.asarray(yhat_test)
    return 1.0 - np.mean((ytest - yhat_test)**2) / np.var(ytest, ddof=0)

def make_poly_df(x: np.ndarray, y: np.ndarray, p: int) -> pd.DataFrame:
    df = pd.DataFrame({"y": y})
    for k in range(1, p+1):
        df[f"x{k}"] = x**k
    return df

# (3) Contenedores
R2_full, R2_adj, R2_oos = [], [], []

# (4) Estimación con regresión lineal sin intercepto y split 75/25
for p in p_list:
    df = make_poly_df(X, Y, p)
    covs = [f"x{k}" for k in range(1, p+1)]
    Xfull = df[covs].to_numpy()
    yfull = df["y"].to_numpy()

    # full sample
    m_full = LinearRegression(fit_intercept=False).fit(Xfull, yfull)
    yhat_full = m_full.predict(Xfull)
    R2_full.append(r2_in_sample(yfull, yhat_full))
    R2_adj.append(r2_adj(yfull, yhat_full, p))

    # train/test split 75/25
    train, test = datasplitting(df, 0.75, seed=2025)
    Xtr, ytr = train[covs].to_numpy(), train["y"].to_numpy()
    Xte, yte = test[covs].to_numpy(), test["y"].to_numpy()

    m_tr = LinearRegression(fit_intercept=False).fit(Xtr, ytr)
    yhat_te = m_tr.predict(Xte)
    R2_oos.append(r2_oos(yte, yhat_te))

# (5) Gráficos
plt.figure()
plt.plot(p_list, R2_full, marker="o")
plt.xscale("log")
plt.ylim(-0.5, 1.0)
plt.xlabel("Número de features (log)")
plt.ylabel("R²")
plt.title("R² en muestra completa vs complejidad")
plt.show()

plt.figure()
plt.plot(p_list, R2_adj, marker="o")
plt.xscale("log")
plt.ylim(-0.5, 1.0)
plt.xlabel("Número de features (log)")
plt.ylabel("R² ajustado")
plt.title("R² ajustado vs complejidad")
plt.show()

plt.figure()
plt.plot(p_list, R2_oos, marker="o")
plt.xscale("log")
plt.ylim(-1.0, 1.0)
plt.xlabel("Número de features (log)")
plt.ylabel("R² out-of-sample")
plt.title("R² out-of-sample (split 75/25) vs complejidad")
plt.show()

In [None]:
# Ejercicio estable SIN tocar hilos ni librerías raras.
# Idea: usar base de Legendre y OLS vía SVD (np.linalg.lstsq).
# Para p >= n-1 no hay grados de libertad: reportamos NaN y seguimos.
import numpy as np
import pandas as pd
from numpy.polynomial.legendre import legvander
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# ---------- DGP ----------
rng = np.random.default_rng(2025)
n = 1000
X = rng.uniform(-1, 1, size=n).astype(np.float64)
Y = (X + rng.standard_normal(n)).astype(np.float64)  # intercepto = 0

p_list = [1, 2, 5, 10, 20, 50, 100, 200, 500, 1000]

# ---------- utilidades ----------
def make_leg_features(x, p):
    # Matriz de Legendre P0..Pp; quitamos P0 (intercepto)
    V = legvander(x, p)[:, 1:]
    # normalizamos cada columna para evitar escalas extremas
    norms = np.linalg.norm(V, axis=0, keepdims=True)
    norms[norms == 0.0] = 1.0
    return V / norms

def adj_r2(r2, n_obs, p):
    df = n_obs - p - 1
    return np.nan if df <= 0 else 1.0 - (1.0 - r2) * (n_obs - 1) / df

def ols_no_intercept(Xm, y):
    # OLS estable por SVD; evita invertir X'X
    beta, *_ = np.linalg.lstsq(Xm, y, rcond=None)
    return Xm @ beta, beta

# ---------- loop ----------
rows = []
for p in p_list:
    if p >= n - 1:
        rows.append({"p_features": p, "R2_full": np.nan, "Adj_R2_full": np.nan, "R2_out_of_sample": np.nan})
        continue

    Xp = make_leg_features(X, p)

    # full sample
    yhat_full, _ = ols_no_intercept(Xp, Y)
    r2_full = r2_score(Y, yhat_full)
    ar2_full = adj_r2(r2_full, n, p)

    # split 75/25
    Xtr, Xte, ytr, yte = train_test_split(Xp, Y, test_size=0.25, random_state=2025)
    _, beta_tr = ols_no_intercept(Xtr, ytr)
    r2_oos = r2_score(yte, Xte @ beta_tr)

    rows.append({"p_features": p, "R2_full": r2_full, "Adj_R2_full": ar2_full, "R2_out_of_sample": r2_oos})

df = pd.DataFrame(rows)
print(df)

# ---------- gráficos ----------
def plot_col(col, title, ylabel):
    plt.figure()
    plt.plot(df["p_features"], df[col], marker="o")
    plt.xscale("log")
    plt.ylim(-1, 1)
    plt.xlabel("Número de características (p)")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True, which="both")
    plt.show()

plot_col("R2_full", "R² vs p (muestra completa)", "R² (full)")
plot_col("Adj_R2_full", "R² ajustado vs p", "R² ajustado")
plot_col("R2_out_of_sample", "R² out-of-sample vs p", "R² OOS")

In [None]:
# Estable: calcula R² hasta p<=300 y salta 500/1000
import numpy as np, pandas as pd
from numpy.polynomial.legendre import legvander
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# --- DGP ---
rng = np.random.default_rng(2025)
n = 1000
X = rng.uniform(-1, 1, n).astype(np.float64)
Y = (X + rng.standard_normal(n)).astype(np.float64)   # intercepto = 0

p_targets = [1,2,5,10,20]
p_safe_max = 300                                     # evita cuelgues

def make_leg(X, p):
    V = legvander(X, p)[:,1:]                        # P1..Pp
    # normaliza columnas
    s = np.linalg.norm(V, axis=0, keepdims=True); s[s==0]=1
    return V/s

def adj_r2(r2, n, p):
    df = n - p - 1
    return np.nan if df<=0 else 1 - (1-r2)*(n-1)/df

def ols_no_intercept(Xm, y):
    beta, *_ = np.linalg.lstsq(Xm, y, rcond=None)    # SVD estable
    return Xm@beta, beta

rows=[]
for p in p_targets:
    if p>p_safe_max or p>=n-1:
        # evitar cómputos que matan el kernel
        rows.append({"p_features": p, "R2_full": np.nan,
                     "Adj_R2_full": np.nan, "R2_out_of_sample": np.nan})
        continue

    Xp = make_leg(X, p)

    # full sample
    yhat, _ = ols_no_intercept(Xp, Y)
    r2f = r2_score(Y, yhat)
    ar2 = adj_r2(r2f, n, p)

    # 75/25
    Xtr,Xte,ytr,yte = train_test_split(Xp, Y, test_size=0.25, random_state=2025)
    _, btr = ols_no_intercept(Xtr, ytr)
    r2o = r2_score(yte, Xte@btr)

    rows.append({"p_features": p, "R2_full": r2f, "Adj_R2_full": ar2, "R2_out_of_sample": r2o})

df = pd.DataFrame(rows)
print(df)

# gráficos
def plot_col(col,tit,ylabel):
    plt.figure(); plt.plot(df["p_features"], df[col], marker="o")
    plt.xscale("log"); plt.ylim(-1,1); plt.xlabel("Número de características (p)")
    plt.ylabel(ylabel); plt.title(tit); plt.grid(True, which="both"); plt.show()

plot_col("R2_full","R² vs p (muestra completa)","R² (full)")
plot_col("Adj_R2_full","R² ajustado vs p","R² ajustado")
plot_col("R2_out_of_sample","R² out-of-sample vs p","R² OOS")


In [1]:
# Celda 0: obligatorio antes de cualquier import
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"


In [None]:
# Celda 1: solución estable, sin regularización
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

rng = np.random.default_rng(2025)
n = 1000
X = rng.uniform(-1, 1, size=n).astype(np.float64)
Y = (X + rng.standard_normal(n)).astype(np.float64)  # intercepto = 0

p_list = [1, 2, 5, 10, 20, 50, 100, 200]

def make_poly_features(x, p):
    # polinomios de Legendre P1..Pp; más estables que x**k
    from numpy.polynomial.legendre import legvander
    V = legvander(x, p)[:, 1:]
    # normalizar columnas para mejorar condición
    s = np.linalg.norm(V, axis=0, keepdims=True); s[s==0]=1.0
    return V / s

def adj_r2(r2, n_obs, p):
    df = n_obs - p - 1
    return np.nan if df <= 0 else 1 - (1 - r2) * (n_obs - 1) / df

def ols_no_intercept(Xm, y):
    # OLS por SVD (estable). Evita invertir X'X.
    beta, *_ = np.linalg.lstsq(Xm, y, rcond=1e-7)
    return Xm @ beta, beta

rows = []
for p in p_list:
    if p >= n - 1:
        rows.append({"p_features": p, "R2_full": np.nan, "Adj_R2_full": np.nan, "R2_out_of_sample": np.nan})
        continue

    Xp = make_poly_features(X, p)

    # full sample
    yhat_full, _ = ols_no_intercept(Xp, Y)
    r2_full = r2_score(Y, yhat_full)
    ar2_full = adj_r2(r2_full, n, p)

    # 75/25
    Xtr, Xte, ytr, yte = train_test_split(Xp, Y, test_size=0.25, random_state=2025)
    _, beta_tr = ols_no_intercept(Xtr, ytr)
    r2_oos = r2_score(yte, Xte @ beta_tr)

    rows.append({"p_features": p, "R2_full": r2_full, "Adj_R2_full": ar2_full, "R2_out_of_sample": r2_oos})

df = pd.DataFrame(rows)
print(df)

# gráficos
for col, title, ylabel in [
    ("R2_full", "R² vs p (muestra completa)", "R² (full)"),
    ("Adj_R2_full", "R² ajustado vs p", "R² ajustado"),
    ("R2_out_of_sample", "R² out-of-sample vs p", "R² OOS"),
]:
    plt.figure()
    plt.plot(df["p_features"], df[col], marker="o")
    plt.xscale("log"); plt.ylim(-1, 1)
    plt.xlabel("Número de características (p)"); plt.ylabel(ylabel)
    plt.title(title); plt.grid(True, which="both"); plt.show()


In [None]:
# Ejercicio estable sin invertir matrices ni usar SVD: OLS por descenso estocástico (SGD)
# Cumple: p ∈ {1,2,5,10,20,50,100,200,500,1000}, R2, R2 ajustado, R2 out-of-sample y 3 gráficos.
# Evita cuelgues porque no usa lstsq/inversión ni descomposiciones.

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor
import matplotlib.pyplot as plt

# ---------- DGP ----------
rng = np.random.default_rng(2025)
n = 10
X = rng.uniform(-1, 1, size=n).astype(np.float64)
Y = (X + rng.standard_normal(n)).astype(np.float64)  # intercepto = 0

p_list = [1]

# ---------- utils ----------
def make_poly_features(x, p):
    # monomios x, x^2, ..., x^p (con escalado posterior para estabilidad)
    cols = [x]
    cur = x.copy()
    for _ in range(2, p + 1):
        cur = cur * x
        cols.append(cur)
    return np.column_stack(cols)

def adj_r2(r2, n_obs, p):
    df = n_obs - p - 1
    return np.nan if df <= 0 else 1.0 - (1.0 - r2) * (n_obs - 1) / df

def fit_predict_sgd(Xm, y):
    """
    OLS sin intercepto aproximado con SGD sin penalización.
    Estable y rápido; evita llamadas a LAPACK/BLAS de alto costo.
    """
    scaler = StandardScaler(with_mean=True, with_std=True)
    Xs = scaler.fit_transform(Xm)

    sgd = SGDRegressor(
        loss="squared_error",
        penalty=None,          # sin regularización (OLS)
        fit_intercept=False,   # intercepto = 0 como pide el enunciado
        learning_rate="invscaling",
        eta0=0.05, power_t=0.25,
        max_iter=4000, tol=1e-6, random_state=2025,
        average=True
    )
    sgd.fit(Xs, y)
    yhat = sgd.predict(Xs)
    return yhat, (sgd, scaler)

def predict_with(model_scaler, Xnew):
    sgd, scaler = model_scaler
    Xs = scaler.transform(Xnew)
    return sgd.predict(Xs)

# ---------- loop ----------
rows = []
for p in p_list:
    Xp = make_poly_features(X, p)

    # full sample
    yhat_full, ms = fit_predict_sgd(Xp, Y)
    r2_full = r2_score(Y, yhat_full)
    ar2_full = adj_r2(r2_full, n, p)

    # 75/25
    Xtr, Xte, ytr, yte = train_test_split(Xp, Y, test_size=0.25, random_state=2025)
    yhat_tr, ms_tr = fit_predict_sgd(Xtr, ytr)
    yhat_te = predict_with(ms_tr, Xte)
    r2_oos = r2_score(yte, yhat_te)

    rows.append({"p_features": p, "R2_full": r2_full, "Adj_R2_full": ar2_full, "R2_out_of_sample": r2_oos})

df = pd.DataFrame(rows)
print(df)

# ---------- gráficos ----------
def plot_col(col, title, ylabel):
    plt.figure()
    plt.plot(df["p_features"], df[col], marker="o")
    plt.xscale("log")
    plt.ylim(-1, 1)
    plt.xlabel("Número de características (p)")
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True, which="both")
    plt.show()

plot_col("R2_full", "R² vs p (muestra completa)", "R² (full)")
plot_col("Adj_R2_full", "R² ajustado vs p", "R² ajustado")
plot_col("R2_out_of_sample", "R² out-of-sample vs p", "R² OOS")


   p_features   R2_full  Adj_R2_full  R2_out_of_sample
0           1  0.129855     0.021087          0.042403
