# Automobile Dataset — Regularization Activity

This notebook uses the **Automobile dataset** from UCI ML Repository:  
[Automobile Dataset (imports-85.data)](https://archive.ics.uci.edu/ml/datasets/Automobile)

We will treat this as a **regression problem**, predicting **price** from technical and categorical features.

## Steps:
1. Load dataset with appropriate headers (since the original file has none).
2. Preprocess:
   - Treat `?` entries as missing values and drop rows that contain them.
   - One-hot encode categorical features.
   - Scale numerical features.
3. Train and compare:
   - **Linear Regression**
   - **Ridge Regression**
   - **Lasso Regression**
   - **ElasticNet** (bonus)
4. Evaluate using **R²** scores on train/test sets and plot results.

---

## Important
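- Download the dataset file (`imports-85.data`) from UCI and set the `CSV_PATH` variable below to its location. On Windows, write the path as a raw string (e.g. `r'C:\path\imports-85.data'`) so that backslashes are not interpreted as escape sequences.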


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import r2_score

# Show the table in ChatGPT (ignored elsewhere): start with no display helper
# and replace it only if the optional package can be imported.
display_dataframe_to_user = None
try:
    from caas_jupyter_tools import display_dataframe_to_user
except Exception:
    # Optional dependency is unavailable; keep the None fallback.
    pass

# --- CONFIG ---
# Location of the data file. It is written as a raw string so the Windows
# backslashes are taken literally: in a normal string literal '\U' begins a
# Unicode escape and the whole cell fails to parse with a SyntaxError.
CSV_PATH = r'C:\Users\emi_g\ML2025\mlct\data\external\car.data'  # <-- set this to the path of your 'imports-85.data', e.g. r'C:\path\imports-85.data'

# Column headers from UCI docs, in file order; the last one ("price") is the
# regression target.
headers = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight",
    "engine-type", "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm",
    "city-mpg", "highway-mpg", "price",
]

# Fail early with a clear message when the path was left unset (None or empty).
if not CSV_PATH:
    raise ValueError("Please set CSV_PATH to your local imports-85.data file.")

# Read the raw file, applying the column names and treating '?' as missing
df = pd.read_csv(CSV_PATH, names=headers, na_values='?')

# Rows without a price cannot serve as regression examples
df = df.dropna(subset=['price'])

# Columns expected to hold numbers; values that cannot be parsed become NaN
numeric_cols = [
    "symboling", "normalized-losses", "wheel-base", "length", "width",
    "height", "curb-weight", "engine-size", "bore", "stroke",
    "compression-ratio", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
    "price",
]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

# Discard every row that still has a missing value in any column
df = df.dropna()

# The target is the price column; every remaining column is a predictor
y = df["price"].values
X = df.drop(columns=["price"])

# Hold out 20% of the rows for evaluation, with a fixed seed for repeatability
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Categorical vs numeric split: every feature not declared in numeric_cols is
# treated as categorical and one-hot encoded.
categorical_cols = [c for c in X.columns if c not in numeric_cols]
numeric_features = [c for c in X.columns if c in numeric_cols]

# Shared preprocessing step: one-hot encode categoricals, standardize numerics.
# handle_unknown='ignore' makes a category that appears only in the test split
# encode as all zeros (the same code as the dropped first category) instead of
# raising a ValueError when the pipeline transforms or scores X_test.
preprocess = ColumnTransformer(
    transformers=[
        (
            'cat',
            OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
            categorical_cols,
        ),
        ('num', StandardScaler(), numeric_features),
    ],
    remainder='drop'
)

# Regularization strengths tried for every penalized model
alphas = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0]
# One dict per fitted model: name, alpha, and R^2 on the train and test splits
results = []


def _fit_and_record(label, alpha, estimator):
    """Fit preprocess + estimator on the training split and log its R^2 scores.

    Appends one row to ``results`` and returns the fitted pipeline.
    """
    pipe = Pipeline(steps=[('prep', preprocess), ('model', estimator)])
    pipe.fit(X_train, y_train)
    results.append({
        "model": label,
        "alpha": alpha,
        "R2_train": pipe.score(X_train, y_train),
        "R2_test": pipe.score(X_test, y_test),
    })
    return pipe


# Linear (unregularized baseline, recorded with alpha 0.0)
lin = _fit_and_record("Linear", 0.0, LinearRegression())

# Ridge
for a in alphas:
    ridge = _fit_and_record("Ridge", a, Ridge(alpha=a, random_state=42))

# Lasso
for a in alphas:
    lasso = _fit_and_record(
        "Lasso", a, Lasso(alpha=a, random_state=42, max_iter=20000)
    )

# ElasticNet
for a in alphas:
    en = _fit_and_record(
        "ElasticNet",
        a,
        ElasticNet(alpha=a, l1_ratio=0.5, random_state=42, max_iter=20000),
    )

# Collect the scores into a table ordered by model name, then by alpha
res_df = pd.DataFrame(results)
res_df = res_df.sort_values(["model", "alpha"])
res_df = res_df.reset_index(drop=True)

# Show the table (use the ChatGPT helper when available, otherwise print)
if not display_dataframe_to_user:
    print(res_df)
else:
    display_dataframe_to_user("Automobile — R2 scores", res_df)

# Plots: one figure per regularized model, test R^2 against alpha (log x-axis)
for mdl in ["Ridge", "Lasso", "ElasticNet"]:
    curve = res_df.loc[res_df["model"] == mdl].sort_values("alpha")
    fig, ax = plt.subplots()
    ax.semilogx(curve["alpha"].values, curve["R2_test"].values, marker="o")
    ax.set_title(f"{mdl}: R^2 (test) vs alpha — Automobile")
    ax.set_xlabel("alpha (λ)")
    ax.set_ylabel("R^2 (test)")
    plt.show()

# Baseline for comparison: test R^2 of the unregularized linear model
linear_test_r2 = res_df.loc[res_df["model"] == "Linear", "R2_test"].iloc[0]
print("Linear Regression R^2 (test):", linear_test_r2)


ValueError: Please set CSV_PATH to your local imports-85.data file.