In [23]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import r2_score
from sklearn.linear_model import LogisticRegression

# ADVERTISING

In [85]:
df = pd.read_csv("Advertising.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper,sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [86]:
X = df[["TV", "radio", "newspaper"]].values
y = df["sales"].values

In [87]:
modelo = LinearRegression()
modelo.fit(X, y)

beta0_hat = modelo.intercept_
betas = modelo.coef_
print("β0:", beta0_hat)
print("β1 (TV):", betas[0])
print("β2 (radio):", betas[1])
print("β3 (newspaper):", betas[2])

y_pred = modelo.predict(X)
resid = y - y_pred

n = len(y)
p = X.shape[1] + 1

RSS = np.sum(resid**2)
sigma2 = RSS / (n - p)

X_with_intercept = np.column_stack([np.ones(n), X])

cov_matrix = sigma2 * np.linalg.inv(X_with_intercept.T @ X_with_intercept)

SE_betas = np.sqrt(np.diag(cov_matrix))
print()
print("SE(β0):", SE_betas[0])
print("SE(β1 TV):", SE_betas[1])
print("SE(β2 radio):", SE_betas[2])
print("SE(β3 newspaper):", SE_betas[3])

β0: 2.938889369459412
β1 (TV): 0.0457646454553976
β2 (radio): 0.18853001691820448
β3 (newspaper): -0.0010374930424763285

SE(β0): 0.31190823632179143
SE(β1 TV): 0.0013948968069749735
SE(β2 radio): 0.008611233967301958
SE(β3 newspaper): 0.005871009647086364


# DEFAULT

In [88]:
df=pd.read_csv("Default.csv")
df["default"]=df["default"].astype("category")
df["student"]=df["student"].astype("category")
y=df["default"]== "Yes"
x=df["balance"].values.reshape(-1,1)
rl=LogisticRegression()
rl.fit(x,y)
y_pred=rl.predict(x)
y_prob =rl.predict_proba(x)[:,1]

In [89]:
incertidumbre = y_prob * (1 - y_prob)
V = np.diagflat(incertidumbre)
X = df[["balance"]].values
# Se agrega una columna de 1s
X = np.column_stack((np.ones(len(X)), X))
cov = np.linalg.inv(X.T @ V @ X)
se = np.sqrt(np.diag(cov))
z0=rl.intercept_/se[0]
z1=rl.coef_/se[1]


In [91]:
from scipy.stats import norm
p_value0 = 2 * (1 - norm.cdf(abs(z0)))
p_value1 = 2 * (1 - norm.cdf(abs(z1)))
X_f = (df["student"] == "Yes").astype(int).values.reshape(-1,1)
X1 = np.column_stack([np.ones(len(X_f)), X_f])
rl=LogisticRegression()
rl.fit(X_f,y)
y_pred=rl.predict(X_f)
y_prob =rl.predict_proba(X_f)[:,1]
print(rl.intercept_,rl.coef_)
incertidumbre = y_prob * (1 - y_prob)
V = np.diagflat(incertidumbre)
cov = np.linalg.inv(X1.T @ V @ X1)
se = np.sqrt(np.diag(cov))
z0=rl.intercept_/se[0]
z1=rl.coef_/se[1]
p_value0 = 2 * (1 - norm.cdf(abs(z0)))
p_value1 = 2 * (1 - norm.cdf(abs(z1)))

[-3.50257249] [[0.39620888]]


In [92]:
import pandas as pd

beta0_hat = rl.intercept_[0]
beta1_hat = rl.coef_[0][0]

SE_beta0 = se[0]
SE_beta1 = se[1]

print("Error estándar de β0:", SE_beta0)
print("Error estándar de β1:", SE_beta1)


Error estándar de β0: 0.07066142671069267
Error estándar de β1: 0.11522061447776961


# ADVERTISING CON BOOTSTRAPPING

In [77]:
df = pd.read_csv("Advertising.csv")
X = df[["TV", "radio", "newspaper"]].values
y = df["sales"].values


In [78]:
B = 1000
boot_coefs = []

n = len(X)

for b in range(B):
    idx = np.random.choice(n, n, replace=True)
    X_b = X[idx]
    y_b = y[idx]

    model_boot = LinearRegression()
    model_boot.fit(X_b, y_b)

    boot_coefs.append([model_boot.intercept_, *model_boot.coef_])

boot_coefs = np.array(boot_coefs)

media_boot = np.mean(boot_coefs, axis=0)
std_boot = np.std(boot_coefs, axis=0)

print("Mean of bootstrap coefficients (Intercept, TV, radio, newspaper):", media_boot)
print("Standard deviation of bootstrap coefficients (Intercept, TV, radio, newspaper):", std_boot)

Mean of bootstrap coefficients (Intercept, TV, radio, newspaper): [ 2.94520397e+00  4.56931705e-02  1.88964305e-01 -9.49794878e-04]
Standard deviation of bootstrap coefficients (Intercept, TV, radio, newspaper): [0.33732486 0.00192199 0.010933   0.00644088]


# DEFAULT BOOTSTRAP

In [83]:
B = 1000
boot_coefs_logistic = []

df_default = pd.read_csv("Default.csv")
df_default["default"] = df_default["default"].astype("category")
y_default = df_default["default"] == "Yes"
x_default = df_default["balance"].values.reshape(-1, 1)

n_default = len(x_default)
for b in range(B):

    idx = np.random.choice(n_default, n_default, replace=True)
    x_b = x_default[idx]
    y_b = y_default[idx]

    model_boot_logistic = LogisticRegression()
    model_boot_logistic.fit(x_b, y_b)

    boot_coefs_logistic.append([model_boot_logistic.intercept_[0], model_boot_logistic.coef_[0][0]])

boot_coefs_logistic = np.array(boot_coefs_logistic)

media_boot_logistic = np.mean(boot_coefs_logistic, axis=0)
std_boot_logistic = np.std(boot_coefs_logistic, axis=0)

print("Mean of bootstrap coefficients (Intercept, balance):", media_boot_logistic)
print("Standard deviation of bootstrap coefficients (Intercept, balance):", std_boot_logistic)

Mean of bootstrap coefficients (Intercept, balance): [-1.06681537e+01  5.50696232e-03]
Standard deviation of bootstrap coefficients (Intercept, balance): [3.57712358e-01 2.17142038e-04]


# PUNTO 3
Compara los resultados obtenidos con el método visto en los laboratorios contra los resultados obtenidos con bootstrap. ¿Por qué podría haber diferencias en los resultados?

* Para Advertising que se usa regresión líneal la desviación estandar de los coeficientes con formulazo me salen más pequeños, mientras que con bootstrap me salen más grandes.

* Para default si me salen mucho más grande la desviación estandar con bootstrap.

Creo que esto se puede dar porque con formulazo se puede subestimar la variabilidad.

Y en cuanto la media de los coeficientes en adversiting me salen demasiado parecidas, no varia mucho ahí.

Y en default si me salen demasiado diferentes puede que sea porque no cumplia con los supuestos desde un inicio y con formulazo estuvo mal siempre.

# REGULARIZACIÓN L2

In [99]:
!pip install scikit-optimize

Collecting scikit-optimize
  Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl.metadata (9.7 kB)
Collecting pyaml>=16.9 (from scikit-optimize)
  Downloading pyaml-25.7.0-py3-none-any.whl.metadata (12 kB)
Downloading scikit_optimize-0.10.2-py2.py3-none-any.whl (107 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.8/107.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyaml-25.7.0-py3-none-any.whl (26 kB)
Installing collected packages: pyaml, scikit-optimize
Successfully installed pyaml-25.7.0 scikit-optimize-0.10.2


In [100]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from skopt import gp_minimize
from skopt.space import Real



df = pd.read_csv("Advertising.csv")
X = df[["TV", "radio", "newspaper"]].values
y = df["sales"].values



def objective(alpha):
    alpha = alpha[0]
    model = Ridge(alpha=alpha)
    score = -np.mean(cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error"))
    return score


space = [Real(1e-4, 1e4, prior="log-uniform", name="alpha")]

result = gp_minimize(
    func=objective,
    dimensions=space,
    n_calls=40,
    random_state=42
)

alpha_opt = result.x[0]
print("Mejor alpha encontrado (Optimización Bayesiana):", alpha_opt)


Mejor alpha encontrado (Optimización Bayesiana): 858.4058105243847


In [101]:
B = 1000
n = len(y)
coef_boot = []

for b in range(B):
    idx = np.random.choice(n, n, replace=True)
    X_b = X[idx]
    y_b = y[idx]

    model = Ridge(alpha=alpha_opt)
    model.fit(X_b, y_b)

    coef_boot.append([model.intercept_, *model.coef_])

coef_boot = np.array(coef_boot)


std_boot = coef_boot.std(axis=0)

print("\n===== DESVIACIÓN ESTÁNDAR BOOTSTRAP (RIDGE + Optimización Bayesiana) =====")
print("SE(β0):", std_boot[0])
print("SE(β1 TV):", std_boot[1])
print("SE(β2 radio):", std_boot[2])
print("SE(β3 newspaper):", std_boot[3])


===== DESVIACIÓN ESTÁNDAR BOOTSTRAP (RIDGE + Optimización Bayesiana) =====
SE(β0): 0.3277694068453404
SE(β1 TV): 0.001945280293994069
SE(β2 radio): 0.010827269737270186
SE(β3 newspaper): 0.006415871109022883
