In [None]:
# ============================================================
# 📌 Cell 1: Install and load libraries
# ============================================================
# Optional: uncomment the line below to install the dependencies used by
# this notebook. NOTE(review): inside Jupyter, `%pip install ...` is the
# preferred form because it targets the running kernel's environment.
#!pip install scikit-learn matplotlib seaborn pandas


In [None]:
# ============================================================
# 📌 Celda 2: Importar librerías
# ============================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [None]:
# ============================================================
# 📌 Cell 3: Load the California Housing dataset
# ============================================================
# as_frame=True makes scikit-learn return pandas objects: `data.data` holds
# the feature columns and `data.target` the regression target (per the
# scikit-learn dataset description, the median house value of each district).
data = fetch_california_housing(as_frame=True)
X, y = data.data, data.target

# Quick sanity check: print the feature-matrix shape, then display the
# first rows as the cell's rich output.
print("Dimensiones:", X.shape)
X.head()


In [None]:
# ============================================================
# 📌 Cell 4: Split into training and test sets
# ============================================================
# 30% of the rows are held out for testing; the fixed random_state makes the
# split reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Report the size of each partition.
print("Tamaño entrenamiento:", X_train.shape)
print("Tamaño prueba:", X_test.shape)


In [None]:
# ============================================================
# 📌 Cell 5: Unpruned regression tree (overfitting)
# ============================================================
# No depth or leaf-size limit is given, so the tree is free to grow until it
# fits the training data very closely. The fixed random_state keeps the fit
# reproducible.
reg_big = DecisionTreeRegressor(random_state=42)
reg_big.fit(X_train, y_train)

# Predictions on both partitions, so train and test scores can be compared.
y_train_pred = reg_big.predict(X_train)
y_test_pred = reg_big.predict(X_test)

# Metrics: a large train/test gap is the signature of overfitting.
print("🔹 Árbol grande (sin poda)")
print("MAE train:", mean_absolute_error(y_train, y_train_pred))
print("MAE test:", mean_absolute_error(y_test, y_test_pred))
print("MSE test:", mean_squared_error(y_test, y_test_pred))
print("R² train:", r2_score(y_train, y_train_pred))
print("R² test:", r2_score(y_test, y_test_pred))

# Tree visualisation (only the top 3 levels are drawn, for legibility).
# feature_names is passed as a plain list rather than a pandas Index: some
# scikit-learn releases validate this parameter strictly and reject an Index.
plt.figure(figsize=(20,8))
plot_tree(reg_big, filled=True, feature_names=list(X.columns), max_depth=3)
plt.title("Árbol de regresión grande (sin poda, max_depth completo)")
plt.show()


In [None]:
# ============================================================
# 📌 Cell 6: Pruned regression tree (max_depth=5)
# ============================================================
# Same estimator as the unpruned tree, but growth is capped at 5 levels to
# limit model complexity. The fixed random_state keeps the fit reproducible.
reg_pruned = DecisionTreeRegressor(max_depth=5, random_state=42)
reg_pruned.fit(X_train, y_train)

# Predictions on both partitions, so train and test scores can be compared.
y_train_pred_pruned = reg_pruned.predict(X_train)
y_test_pred_pruned = reg_pruned.predict(X_test)

# Metrics
print("🔹 Árbol podado (max_depth=5)")
print("MAE train:", mean_absolute_error(y_train, y_train_pred_pruned))
print("MAE test:", mean_absolute_error(y_test, y_test_pred_pruned))
print("MSE test:", mean_squared_error(y_test, y_test_pred_pruned))
print("R² train:", r2_score(y_train, y_train_pred_pruned))
print("R² test:", r2_score(y_test, y_test_pred_pruned))

# Visualisation of the pruned tree. The model has depth 5, but only the top
# 3 levels are drawn (plot_tree's max_depth=3) to keep the figure readable.
# feature_names is passed as a plain list rather than a pandas Index: some
# scikit-learn releases validate this parameter strictly and reject an Index.
plt.figure(figsize=(20,8))
plot_tree(reg_pruned, filled=True, feature_names=list(X.columns), max_depth=3)
plt.title("Árbol de regresión podado (max_depth=5)")
plt.show()


In [None]:
# ============================================================
# 📌 Cell 7: Predicted vs. actual comparison plot
# ============================================================
# One panel per model, side by side. Relies on y_test, y_test_pred and
# y_test_pred_pruned computed in the earlier cells.
fig, (ax_big, ax_pruned) = plt.subplots(1, 2, figsize=(10, 5))

# End points of the dashed y = x reference line (a perfect prediction).
diag = [y_test.min(), y_test.max()]

panels = (
    (ax_big, y_test_pred, "red", "Árbol grande (overfitting)"),
    (ax_pruned, y_test_pred_pruned, "blue", "Árbol podado (max_depth=5)"),
)
for ax, preds, color, title in panels:
    ax.scatter(y_test, preds, alpha=0.3, color=color)
    ax.plot(diag, diag, "k--")
    ax.set_xlabel("Valores reales")
    ax.set_ylabel("Predicción")
    ax.set_title(title)

fig.tight_layout()
plt.show()


## 📌 Conclusión

El **árbol sin poda** obtiene un R² muy alto en entrenamiento, pero cae significativamente en el conjunto de prueba, mostrando un claro **sobreajuste**.  
En cambio, el **árbol podado (max_depth=5)** tiene un rendimiento más equilibrado: menor ajuste (R²) en entrenamiento, pero una brecha menor entre entrenamiento y prueba, es decir, mejor capacidad de generalización en test.  

👉 Este ejemplo ilustra por qué la **poda o la limitación de hiperparámetros** (como `max_depth`) es fundamental en regresión con árboles: no buscamos memorizar los datos, sino capturar patrones que sean **robustos y aplicables a nuevos casos**.
