|                |   |
:----------------|---|
| **Nombre**     |Emilio Navarro   |
| **Fecha**      |  08-02-2026 |
| **Expediente** |757955   |

# Carga de datos

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [3]:
# Read the road-test workbook (path is relative to the working directory) into `df`.
df = pd.read_excel(io="Motor Trend Car Road Tests.xlsx")

In [4]:
# Preview the first five rows (five is also head()'s default).
df.head(n=5)

Unnamed: 0,model,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
0,Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
1,Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
2,Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
3,Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
4,Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [5]:
# Summarise the frame: column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   model   32 non-null     object 
 1   mpg     32 non-null     float64
 2   cyl     32 non-null     int64  
 3   disp    32 non-null     float64
 4   hp      32 non-null     int64  
 5   drat    32 non-null     float64
 6   wt      32 non-null     float64
 7   qsec    32 non-null     float64
 8   vs      32 non-null     int64  
 9   am      32 non-null     int64  
 10  gear    32 non-null     int64  
 11  carb    32 non-null     int64  
dtypes: float64(5), int64(6), object(1)
memory usage: 3.1+ KB


# Carga de librerías, Excel, shape y columnas

In [12]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import r2_score

# Read the workbook through a named path, then report the frame's
# dimensions and its column labels.
path = "Motor Trend Car Road Tests.xlsx"
df = pd.read_excel(io=path)

print("Shape:", df.shape)
print("Columnas:", list(df.columns))

Shape: (32, 12)
Columnas: ['model', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs', 'am', 'gear', 'carb']


# mpg como salida (eliminando la columna model)

### R2 y signos de betas 

In [13]:
# Ordinary least squares for mpg fitted on every row (no hold-out set):
# report the in-sample R2 and the sign of each fitted coefficient.
target = "mpg"
y = df[target].astype(float)
# Predictors: every remaining column except the car-name text column "model".
X = df.drop(columns=[target, "model"]).astype(float)

lr_11 = LinearRegression().fit(X, y)  # fit() returns the estimator itself
yhat = lr_11.predict(X)
r2_11_full = r2_score(y, yhat)  # read again by the 3.1 comparison cell

# Coefficients labelled by column name, ordered from most negative to most positive.
betas_11 = pd.Series(lr_11.coef_, index=X.columns).sort_values()

print(f"1.1a R2 (entrenando con todo el dataset): {r2_11_full:.4f}\n")

print("1.1a Signo de betas (interpretación directa):")
for feature, coef in betas_11.items():
    if coef > 0:
        label = "positivo (+)"
    elif coef < 0:
        label = "negativo (-)"
    else:
        label = "cero (0)"
    print(f"  {feature:>6}: {coef:>10.4f}  -> {label}")

# Closing reminder on how to read the signs (ceteris paribus).
for note_line in (
    "\nNota: beta positivo = aumenta mpg (manteniendo lo demás constante).",
    "      beta negativo = disminuye mpg (manteniendo lo demás constante).",
):
    print(note_line)

1.1a R2 (entrenando con todo el dataset): 0.8690

1.1a Signo de betas (interpretación directa):
      wt:    -3.7153  -> negativo (-)
    carb:    -0.1994  -> negativo (-)
     cyl:    -0.1114  -> negativo (-)
      hp:    -0.0215  -> negativo (-)
    disp:     0.0133  -> positivo (+)
      vs:     0.3178  -> positivo (+)
    gear:     0.6554  -> positivo (+)
    drat:     0.7871  -> positivo (+)
    qsec:     0.8210  -> positivo (+)
      am:     2.5202  -> positivo (+)

Nota: beta positivo = aumenta mpg (manteniendo lo demás constante).
      beta negativo = disminuye mpg (manteniendo lo demás constante).


### Train-test split (40% train) + R2 train/test

In [14]:
# Same mpg regression, now fitted on a 40% training subset and scored on the
# held-out rows to check how well it generalises.
target = "mpg"
y = df[target].astype(float)
X = df.drop(columns=[target, "model"]).astype(float)

# train_size=0.40 keeps 40% of the rows for fitting; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

lr_11_split = LinearRegression().fit(X_train, y_train)

# score() on a regressor returns R2 for the data it is given.
r2_11_train, r2_11_test = (
    lr_11_split.score(X_train, y_train),
    lr_11_split.score(X_test, y_test),
)

print(f"1.1b R2 entrenamiento (40% train): {r2_11_train:.4f}")
print(f"1.1b R2 prueba:                  {r2_11_test:.4f}")

1.1b R2 entrenamiento (40% train): 0.9982
1.1b R2 prueba:                  -7.1071


### Ridge (L2) con varios lambdas y comparar R2 train/test

In [29]:
# Ridge (L2-penalised) regression for mpg over a grid of penalty strengths,
# evaluated on a 40%-train split with a fixed random_state.
# NOTE(review): the predictors are passed to Ridge unscaled, so the penalty acts
# unevenly on columns measured on different scales — consider standardising first.
target = "mpg"
y = df[target].astype(float)
X = df.drop(columns=[target, "model"]).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

# Penalty strengths to try (Ridge names this parameter `alpha`); edit the list freely.
lambdas = [0.0, 0.1, 1.0, 10.0, 100.0]

rows = []
for lam in lambdas:
    # A zero penalty is plain least squares, so LinearRegression covers that case.
    model = Ridge(alpha=lam) if lam != 0.0 else LinearRegression()
    model.fit(X_train, y_train)
    rows.append((lam, model.score(X_train, y_train), model.score(X_test, y_test)))

# Tab-separated table: one line per penalty strength.
print("1.1c Comparación Ridge (L2):")
print("lambda(alpha)\tR2_train\tR2_test")
for lam, r2_tr, r2_te in rows:
    print(f"{lam:>11}\t{r2_tr:>7.4f}\t{r2_te:>7.4f}")

print(" lambda más grande = más regularización.")

1.1c Comparación Ridge (L2):
lambda(alpha)	R2_train	R2_test
        0.0	 0.9982	-7.1071
        0.1	 0.9794	 0.2607
        1.0	 0.9279	 0.6311
       10.0	 0.8634	 0.6567
      100.0	 0.8073	 0.5990
 lambda más grande = más regularización.


# Usando qsec como salida

### R2 y signos de betas qsec

In [16]:
# Ordinary least squares for qsec fitted on every row (no hold-out set):
# report the in-sample R2 and the sign of each fitted coefficient.
target = "qsec"
y = df[target].astype(float)
# Predictors: every remaining column except the car-name text column "model".
X = df.drop(columns=[target, "model"]).astype(float)

lr_12 = LinearRegression().fit(X, y)  # fit() returns the estimator itself
yhat = lr_12.predict(X)
r2_12_full = r2_score(y, yhat)  # read again by the 3.2 comparison cell

# Coefficients labelled by column name, ordered from most negative to most positive.
betas_12 = pd.Series(lr_12.coef_, index=X.columns).sort_values()

print(f"1.2a R2 (entrenando con todo el dataset): {r2_12_full:.4f}\n")

print("1.2a Signo de betas:")
for feature, coef in betas_12.items():
    if coef > 0:
        label = "positivo (+)"
    elif coef < 0:
        label = "negativo (-)"
    else:
        label = "cero (0)"
    print(f"  {feature:>6}: {coef:>10.4f}  -> {label}")

1.2a R2 (entrenando con todo el dataset): 0.8747

1.2a Signo de betas:
      am:    -0.9012  -> negativo (-)
     cyl:    -0.3627  -> negativo (-)
    carb:    -0.2736  -> negativo (-)
    gear:    -0.2013  -> negativo (-)
    drat:    -0.1311  -> negativo (-)
    disp:    -0.0075  -> negativo (-)
      hp:    -0.0016  -> negativo (-)
     mpg:     0.0690  -> positivo (+)
      vs:     0.9700  -> positivo (+)
      wt:     1.4963  -> positivo (+)


### Train-test split (40% train) + R2 train/test qsec

In [17]:
# qsec regression fitted on a 40% training subset and scored on the held-out
# rows to check how well it generalises.
target = "qsec"
y = df[target].astype(float)
X = df.drop(columns=[target, "model"]).astype(float)

# train_size=0.40 keeps 40% of the rows for fitting; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

lr_12_split = LinearRegression().fit(X_train, y_train)

# score() on a regressor returns R2 for the data it is given.
r2_12_train, r2_12_test = (
    lr_12_split.score(X_train, y_train),
    lr_12_split.score(X_test, y_test),
)

print(f"1.2b R2 entrenamiento (40% train): {r2_12_train:.4f}")
print(f"1.2b R2 prueba:                  {r2_12_test:.4f}")

1.2b R2 entrenamiento (40% train): 0.9989
1.2b R2 prueba:                  -1.0013


### Ridge L2 con varios lambdas y comparar R2 qsec

In [18]:
# Ridge (L2-penalised) regression for qsec over a grid of penalty strengths,
# evaluated on a 40%-train split with a fixed random_state.
# NOTE(review): the predictors are passed to Ridge unscaled, so the penalty acts
# unevenly on columns measured on different scales — consider standardising first.
target = "qsec"
y = df[target].astype(float)
X = df.drop(columns=[target, "model"]).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

# Penalty strengths to try (Ridge names this parameter `alpha`).
lambdas = [0.0, 0.1, 1.0, 10.0, 100.0]

rows = []
for lam in lambdas:
    # A zero penalty is plain least squares, so LinearRegression covers that case.
    if lam == 0.0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=lam)
    model.fit(X_train, y_train)
    r2_tr = model.score(X_train, y_train)
    r2_te = model.score(X_test, y_test)
    rows.append((lam, r2_tr, r2_te))

# Tab-separated table: one line per penalty strength.
print("1.2c Comparación Ridge (L2):")
print("lambda(alpha)\tR2_train\tR2_test")
for lam, r2_tr, r2_te in rows:
    print(f"{lam:>11}\t{r2_tr:>7.4f}\t{r2_te:>7.4f}")

1.2c Comparación Ridge (L2):
lambda(alpha)	R2_train	R2_test
        0.0	 0.9989	-1.0013
        0.1	 0.9856	 0.6878
        1.0	 0.9347	 0.7141
       10.0	 0.8454	 0.5044
      100.0	 0.7837	 0.4232


# mpg como salida junto con los dummies en cyl, gear, carb

### R2 y signos de betas dummies

In [19]:
# mpg regression on the full dataset with cyl, gear and carb one-hot encoded
# instead of being treated as numeric columns.
target = "mpg"
y = df[target].astype(float)

# Predictor frame: everything except the target and the car-name column.
base = df.drop(columns=[target, "model"]).copy()

# drop_first removes the first level of each encoded column so the dummies are
# not perfectly collinear with the intercept; that level becomes the baseline.
base_dum = pd.get_dummies(base, columns=["cyl", "gear", "carb"], drop_first=True)
X = base_dum.astype(float)

lr_21 = LinearRegression().fit(X, y)  # fit() returns the estimator itself
yhat = lr_21.predict(X)
r2_21_full = r2_score(y, yhat)  # read again by the 3.1 comparison cell

# Coefficients labelled by column name, ordered from most negative to most positive.
betas_21 = pd.Series(lr_21.coef_, index=X.columns).sort_values()

print(f"2.1a R2 (entrenando con todo el dataset): {r2_21_full:.4f}\n")

print("2.1a Signo de betas:")
print("(En dummies: el beta se interpreta vs la categoría base que se eliminó con drop_first.)\n")
for feature, coef in betas_21.items():
    if coef > 0:
        label = "positivo (+)"
    elif coef < 0:
        label = "negativo (-)"
    else:
        label = "cero (0)"
    print(f"  {feature:>10}: {coef:>10.4f}  -> {label}")

2.1a R2 (entrenando con todo el dataset): 0.8931

2.1a Signo de betas:
(En dummies: el beta se interpreta vs la categoría base que se eliminó con drop_first.)

          wt:    -4.5298  -> negativo (-)
       cyl_6:    -2.6487  -> negativo (-)
      carb_2:    -0.9794  -> negativo (-)
       cyl_8:    -0.3362  -> negativo (-)
          hp:    -0.0705  -> negativo (-)
        disp:     0.0355  -> positivo (+)
        qsec:     0.3678  -> positivo (+)
      carb_4:     1.0914  -> positivo (+)
      gear_4:     1.1144  -> positivo (+)
        drat:     1.1828  -> positivo (+)
          am:     1.2121  -> positivo (+)
          vs:     1.9309  -> positivo (+)
      gear_5:     2.5284  -> positivo (+)
      carb_3:     2.9996  -> positivo (+)
      carb_6:     4.4776  -> positivo (+)
      carb_8:     7.2504  -> positivo (+)


### Train-test split 40% + R2 train/test con dummies

In [20]:
# mpg regression with dummy-encoded cyl/gear/carb, fitted on a 40% training
# subset and scored on the held-out rows.
# NOTE(review): after encoding there are more predictors than training rows,
# so a perfect training fit is expected and says nothing about generalisation.
target = "mpg"
y = df[target].astype(float)
base = df.drop(columns=[target, "model"]).copy()
X = pd.get_dummies(
    base,
    columns=["cyl", "gear", "carb"],
    drop_first=True,  # first level of each column becomes the baseline
).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

lr_21_split = LinearRegression().fit(X_train, y_train)

# score() on a regressor returns R2 for the data it is given.
r2_21_train, r2_21_test = (
    lr_21_split.score(X_train, y_train),
    lr_21_split.score(X_test, y_test),
)

print(f"2.1b R2 entrenamiento (40% train): {r2_21_train:.4f}")
print(f"2.1b R2 prueba:                  {r2_21_test:.4f}")

2.1b R2 entrenamiento (40% train): 1.0000
2.1b R2 prueba:                  -1.3253


### Ridge L2 con dummies y lambdas

In [21]:
# Ridge (L2-penalised) regression for mpg with dummy-encoded cyl/gear/carb,
# over a grid of penalty strengths on a 40%-train split.
# NOTE(review): the predictors are passed to Ridge unscaled, so the penalty acts
# unevenly on columns measured on different scales — consider standardising first.
target = "mpg"
y = df[target].astype(float)
base = df.drop(columns=[target, "model"]).copy()
X = pd.get_dummies(
    base,
    columns=["cyl", "gear", "carb"],
    drop_first=True,  # first level of each column becomes the baseline
).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

# Penalty strengths to try (Ridge names this parameter `alpha`).
lambdas = [0.0, 0.1, 1.0, 10.0, 100.0]

# Tab-separated table, printed row by row as each model is fitted.
print("2.1c Comparación Ridge (L2) con dummies:")
print("lambda(alpha)\tR2_train\tR2_test")
for lam in lambdas:
    # A zero penalty is plain least squares, so LinearRegression covers that case.
    if lam == 0.0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=lam)
    model.fit(X_train, y_train)
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)
    print(f"{lam:>11}\t{train_r2:>7.4f}\t{test_r2:>7.4f}")

2.1c Comparación Ridge (L2) con dummies:
lambda(alpha)	R2_train	R2_test
        0.0	 1.0000	-1.3253
        0.1	 0.9888	-0.0322
        1.0	 0.9398	 0.4823
       10.0	 0.8592	 0.6158
      100.0	 0.8058	 0.5907


# qsec como salida junto con los dummies en cyl, gear, carb

### R2 y signos de las betas qsec y dummies

In [22]:
# qsec regression on the full dataset with cyl, gear and carb one-hot encoded
# instead of being treated as numeric columns.
target = "qsec"
y = df[target].astype(float)
base = df.drop(columns=[target, "model"]).copy()
X = pd.get_dummies(
    base,
    columns=["cyl", "gear", "carb"],
    drop_first=True,  # first level of each column becomes the baseline
).astype(float)

lr_22 = LinearRegression().fit(X, y)  # fit() returns the estimator itself
yhat = lr_22.predict(X)
r2_22_full = r2_score(y, yhat)  # read again by the 3.2 comparison cell

# Coefficients labelled by column name, ordered from most negative to most positive.
betas_22 = pd.Series(lr_22.coef_, index=X.columns).sort_values()

print(f"2.2a R2 (entrenando con todo el dataset): {r2_22_full:.4f}\n")
print("2.2a Signo de betas (dummies vs categoría base):")
for feature, coef in betas_22.items():
    if coef > 0:
        label = "positivo (+)"
    elif coef < 0:
        label = "negativo (-)"
    else:
        label = "cero (0)"
    print(f"  {feature:>10}: {coef:>10.4f}  -> {label}")

2.2a R2 (entrenando con todo el dataset): 0.9083

2.2a Signo de betas (dummies vs categoría base):
       cyl_8:    -2.9669  -> negativo (-)
      carb_4:    -1.9470  -> negativo (-)
          am:    -1.6943  -> negativo (-)
      carb_6:    -1.5652  -> negativo (-)
      carb_8:    -1.3323  -> negativo (-)
       cyl_6:    -1.1043  -> negativo (-)
      carb_2:    -0.8344  -> negativo (-)
      carb_3:    -0.2293  -> negativo (-)
          hp:    -0.0021  -> negativo (-)
        disp:     0.0035  -> positivo (+)
         mpg:     0.0277  -> positivo (+)
        drat:     0.1079  -> positivo (+)
      gear_5:     0.1823  -> positivo (+)
          vs:     0.2657  -> positivo (+)
          wt:     0.8105  -> positivo (+)
      gear_4:     1.3323  -> positivo (+)


### Train-test split (40% train) + R2 train/test qsec y dummies

In [23]:
# qsec regression with dummy-encoded cyl/gear/carb, fitted on a 40% training
# subset and scored on the held-out rows.
# NOTE(review): after encoding there are more predictors than training rows,
# so a perfect training fit is expected and says nothing about generalisation.
target = "qsec"
y = df[target].astype(float)
base = df.drop(columns=[target, "model"]).copy()
X = pd.get_dummies(
    base,
    columns=["cyl", "gear", "carb"],
    drop_first=True,  # first level of each column becomes the baseline
).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

lr_22_split = LinearRegression().fit(X_train, y_train)

# score() on a regressor returns R2 for the data it is given.
r2_22_train, r2_22_test = (
    lr_22_split.score(X_train, y_train),
    lr_22_split.score(X_test, y_test),
)

print(f"2.2b R2 entrenamiento (40% train): {r2_22_train:.4f}")
print(f"2.2b R2 prueba:                  {r2_22_test:.4f}")

2.2b R2 entrenamiento (40% train): 1.0000
2.2b R2 prueba:                  -0.0600


### Ridge L2 con dummies y lambdas qsec

In [24]:
# Ridge (L2-penalised) regression for qsec with dummy-encoded cyl/gear/carb,
# over a grid of penalty strengths on a 40%-train split.
# NOTE(review): the predictors are passed to Ridge unscaled, so the penalty acts
# unevenly on columns measured on different scales — consider standardising first.
target = "qsec"
y = df[target].astype(float)
base = df.drop(columns=[target, "model"]).copy()
X = pd.get_dummies(
    base,
    columns=["cyl", "gear", "carb"],
    drop_first=True,  # first level of each column becomes the baseline
).astype(float)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, train_size=0.40
)

# Penalty strengths to try (Ridge names this parameter `alpha`).
lambdas = [0.0, 0.1, 1.0, 10.0, 100.0]

# Tab-separated table, printed row by row as each model is fitted.
print("2.2c Comparación Ridge (L2) con dummies:")
print("lambda(alpha)\tR2_train\tR2_test")
for lam in lambdas:
    # A zero penalty is plain least squares, so LinearRegression covers that case.
    if lam == 0.0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=lam)
    model.fit(X_train, y_train)
    train_r2 = model.score(X_train, y_train)
    test_r2 = model.score(X_test, y_test)
    print(f"{lam:>11}\t{train_r2:>7.4f}\t{test_r2:>7.4f}")

2.2c Comparación Ridge (L2) con dummies:
lambda(alpha)	R2_train	R2_test
        0.0	 1.0000	-0.0600
        0.1	 0.9955	 0.5833
        1.0	 0.9674	 0.5092
       10.0	 0.8638	 0.4301
      100.0	 0.7864	 0.4139


# Comparar R2 de 1.1 y 2.1

In [28]:
# Side-by-side in-sample R2 of the two mpg models fitted on the full dataset.
# r2_11_full and r2_21_full come from the 1.1a and 2.1a cells, which must run first.
# NOTE(review): the dummy-coded model can represent anything the numeric coding of
# cyl/gear/carb can, so its in-sample R2 should never be lower; a higher value
# here is not by itself evidence of better out-of-sample performance.
print("3.1 Comparación de R2 (mpg):")
print(f"R2 1.1 (sin dummies, todo numérico/ordinal): {r2_11_full:.4f}")
print(f"R2 2.1 (con dummies en cyl, gear, carb):       {r2_21_full:.4f}")

# One verdict line, naming whichever model has the larger R2.
print(
    " 2.1 explica mejor la variación de mpg (mayor R2)."
    if r2_21_full > r2_11_full
    else " 1.1 explica mejor la variación de mpg (mayor R2)."
    if r2_21_full < r2_11_full
    else " Ambos tienen el mismo R2."
)

3.1 Comparación de R2 (mpg):
R2 1.1 (sin dummies, todo numérico/ordinal): 0.8690
R2 2.1 (con dummies en cyl, gear, carb):       0.8931
 2.1 explica mejor la variación de mpg (mayor R2).


# Comparar R2 de 1.2 y 2.2

In [27]:
# Side-by-side in-sample R2 of the two qsec models fitted on the full dataset.
# r2_12_full and r2_22_full come from the 1.2a and 2.2a cells, which must run first.
# NOTE(review): the dummy-coded model can represent anything the numeric coding of
# cyl/gear/carb can, so its in-sample R2 should never be lower; a higher value
# here is not by itself evidence of better out-of-sample performance.
print("3.2 Comparación de R2 (qsec):")
print(f"R2 1.2 (sin dummies, todo numérico/ordinal): {r2_12_full:.4f}")
print(f"R2 2.2 (con dummies en cyl, gear, carb):       {r2_22_full:.4f}")

# One verdict line, naming whichever model has the larger R2.
print(
    " 2.2 explica mejor la variación de qsec (mayor R2)."
    if r2_22_full > r2_12_full
    else " 1.2 explica mejor la variación de qsec (mayor R2)."
    if r2_22_full < r2_12_full
    else "Ambos tienen el mismo R2."
)

3.2 Comparación de R2 (qsec):
R2 1.2 (sin dummies, todo numérico/ordinal): 0.8747
R2 2.2 (con dummies en cyl, gear, carb):       0.9083
 2.2 explica mejor la variación de qsec (mayor R2).
