# Lab 04: Funciones de Pérdida y Optimización

## Objetivos
1. Implementar funciones de pérdida desde cero
2. Comparar MSE, MAE y Cross-Entropy
3. Implementar Gradient Descent
4. Experimentar con learning rates
5. Detectar overfitting

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import sys
sys.path.append('codigo/')

plt.style.use('seaborn-v0_8-darkgrid')
%matplotlib inline

## Parte 1: Funciones de Pérdida para Regresión

### Mean Squared Error (MSE)

In [None]:
def mse(y_pred, y_true):
    """Return the Mean Squared Error between predictions and targets.

    MSE = (1/n) * sum((y_pred - y_true) ** 2)

    Args:
        y_pred: Predicted values; any array-like (list, tuple, ndarray).
        y_true: Target values, broadcast-compatible with ``y_pred``.

    Returns:
        The mean of the squared element-wise differences (NumPy float).
    """
    # Coerce to float arrays so plain Python lists are accepted and integer
    # inputs are not squared in (overflow-prone) integer arithmetic.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    return np.mean((y_pred - y_true) ** 2)

def mse_gradient(y_pred, y_true):
    """Return the gradient of the MSE with respect to each prediction.

    dMSE/dy_pred = 2 * (y_pred - y_true) / n, where n is the number of
    elements that are averaged.

    Args:
        y_pred: Predicted values; any array-like (list, tuple, ndarray).
        y_true: Target values, broadcast-compatible with ``y_pred``.

    Returns:
        An array with the same shape as ``y_pred - y_true`` holding the
        partial derivative of the loss for each element.
    """
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    diff = y_pred - y_true
    # Divide by the total element count rather than the length of the first
    # axis: the MSE is a mean over every element, so this is its exact
    # derivative for multi-column inputs too. For 1-D and (n, 1) inputs the
    # two counts are the same.
    return 2 * diff / diff.size

# Quick check on a five-sample toy example.
y_true = np.array([1, 2, 3, 4, 5])
y_pred = np.array([1.1, 2.3, 2.8, 4.2, 4.9])

sample_mse = mse(y_pred, y_true)
sample_grad = mse_gradient(y_pred, y_true)
print(f"MSE: {sample_mse:.4f}")
print(f"Gradiente: {sample_grad}")

### Mean Absolute Error (MAE)

In [None]:
def mae(y_pred, y_true):
    """Return the Mean Absolute Error between predictions and targets.

    MAE = (1/n) * sum(|y_pred - y_true|)

    Args:
        y_pred: Predicted values; any array-like (list, tuple, ndarray).
        y_true: Target values, broadcast-compatible with ``y_pred``.

    Returns:
        The mean of the absolute element-wise differences (NumPy float).
    """
    # Coerce to float arrays so plain Python lists are accepted.
    y_pred = np.asarray(y_pred, dtype=float)
    y_true = np.asarray(y_true, dtype=float)
    return np.mean(np.abs(y_pred - y_true))

# Quick check; reuses the y_true / y_pred arrays defined in the MSE cell.
sample_mae = mae(y_pred, y_true)
print(f"MAE: {sample_mae:.4f}")

### Comparación: MSE vs MAE con Outliers

In [None]:
# Baseline: predictions close to the targets, no outlier.
y_true_clean = np.array([1, 2, 3, 4, 5])
y_pred_clean = np.array([1.1, 2.2, 2.9, 4.1, 5.0])

# Same data, but one prediction is replaced by a gross error.
y_true_outlier = y_true_clean.copy()
y_pred_outlier = y_pred_clean.copy()
y_pred_outlier[2] = 10  # outlier: the target at this index is 3

print("SIN OUTLIER:")
print(f"  MSE: {mse(y_pred_clean, y_true_clean):.4f}")
print(f"  MAE: {mae(y_pred_clean, y_true_clean):.4f}")

print("\nCON OUTLIER:")
print(f"  MSE: {mse(y_pred_outlier, y_true_outlier):.4f}")
print(f"  MAE: {mae(y_pred_outlier, y_true_outlier):.4f}")

# The message below was mis-encoded (mojibake); restored to proper UTF-8.
print("\n⚠️ MSE aumenta dramáticamente con el outlier debido al término cuadrático.")

## Parte 2: Funciones de Pérdida para Clasificación

### Binary Cross-Entropy

In [None]:
def binary_crossentropy(y_pred, y_true):
    """Return the mean Binary Cross-Entropy of predicted probabilities.

    BCE = -mean(y * log(p) + (1 - y) * log(1 - p))

    Args:
        y_pred: Predicted probabilities of the positive class; any
            array-like. Values are clipped to [1e-15, 1 - 1e-15].
        y_true: Binary targets (0 or 1), broadcast-compatible with
            ``y_pred``.

    Returns:
        The mean loss over all elements (NumPy float).
    """
    # Work in float64: the upper clipping bound 1 - 1e-15 rounds to exactly
    # 1.0 in float32, which would leave log(1 - p) free to hit log(0).
    # Coercing also lets plain Python lists be passed in.
    y_pred = np.asarray(y_pred, dtype=np.float64)
    y_true = np.asarray(y_true, dtype=np.float64)

    # Keep probabilities strictly inside (0, 1) so both logs stay finite.
    epsilon = 1e-15
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    return -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))

# Worked examples: the true class is always 1, only the predicted
# probability changes. Output strings were mis-encoded (mojibake) and are
# restored to proper UTF-8 here.
print("EJEMPLOS DE BINARY CROSS-ENTROPY:\n")

# Case 1: correct prediction with high confidence
print("1. Clase real=1, Predicción=0.9 (correcto, alta confianza)")
print(f"   BCE: {binary_crossentropy(np.array([0.9]), np.array([1])):.4f} (baja pérdida)\n")

# Case 2: wrong prediction with high confidence
print("2. Clase real=1, Predicción=0.1 (incorrecto, alta confianza)")
print(f"   BCE: {binary_crossentropy(np.array([0.1]), np.array([1])):.4f} (alta pérdida)\n")

# Case 3: uncertain prediction
print("3. Clase real=1, Predicción=0.5 (incierto)")
print(f"   BCE: {binary_crossentropy(np.array([0.5]), np.array([1])):.4f} (pérdida media)")

### Visualización de Binary Cross-Entropy

In [None]:
# Predicted probabilities to evaluate; the endpoints 0 and 1 are excluded
# because the loss is unbounded there.
y_pred_range = np.linspace(0.01, 0.99, 100)

# Per-sample loss when y_true = 1 (vectorized instead of a Python loop)
loss_class_1 = -np.log(y_pred_range)

# Per-sample loss when y_true = 0
loss_class_0 = -np.log(1 - y_pred_range)

# Labels and messages below were mis-encoded (mojibake); restored to UTF-8.
plt.figure(figsize=(10, 6))
plt.plot(y_pred_range, loss_class_1, label='Clase Real = 1', linewidth=2)
plt.plot(y_pred_range, loss_class_0, label='Clase Real = 0', linewidth=2)
plt.xlabel('Probabilidad Predicha', fontsize=12)
plt.ylabel('Pérdida (Binary Cross-Entropy)', fontsize=12)
plt.title('Binary Cross-Entropy vs Predicción', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.ylim(0, 5)
plt.show()

print("Observaciones:")
print("- Cuando y=1: pérdida baja si predicción cercana a 1")
print("- Cuando y=0: pérdida baja si predicción cercana a 0")
print("- Predicciones confiadas pero incorrectas tienen pérdida muy alta")

## Parte 3: Gradient Descent

Implementemos gradient descent para minimizar una función simple.

In [None]:
def funcion_objetivo(x):
    """Objective to minimize: f(x) = x^2 - 4x + 4 = (x - 2)^2."""
    shifted = x - 2
    return shifted ** 2

def gradiente(x):
    """Derivative of f(x) = (x - 2)^2, i.e. f'(x) = 2 * (x - 2)."""
    shifted = x - 2
    return 2 * shifted

def gradient_descent(x_init, learning_rate, num_iterations, grad_fn=None):
    """Minimize a one-parameter function with plain gradient descent.

    Args:
        x_init: Starting value of the parameter.
        learning_rate: Step size multiplied by the gradient at each update.
        num_iterations: Number of update steps to perform.
        grad_fn: Optional callable returning the gradient at ``x``. When
            omitted, the module-level ``gradiente`` function is used, which
            matches the original behavior.

    Returns:
        A tuple ``(x, history)``: ``x`` is the final parameter value and
        ``history`` lists every value visited, starting with ``x_init``
        (``num_iterations + 1`` entries for a non-negative iteration count).
    """
    if grad_fn is None:
        # Looked up at call time so the default still works if ``gradiente``
        # is redefined in a later notebook cell.
        grad_fn = gradiente

    x = x_init
    history = [x]

    for _ in range(num_iterations):
        # Step against the gradient, then record the new position.
        x = x - learning_rate * grad_fn(x)
        history.append(x)

    return x, history

# Run gradient descent from the same starting point with several learning
# rates and plot each trajectory on top of the objective curve.
x_init = 5.0
iterations = 15

lrs = [0.01, 0.1, 0.5, 1.0]

# The objective curve is identical in every panel, so evaluate it once
# instead of once per learning rate.
x_range = np.linspace(-1, 6, 100)
y_range = funcion_objetivo(x_range)

plt.figure(figsize=(15, 4))

for idx, lr in enumerate(lrs):
    x_final, history = gradient_descent(x_init, lr, iterations)

    # One panel per learning rate; the column count follows the list so the
    # grid stays valid if rates are added or removed.
    plt.subplot(1, len(lrs), idx + 1)

    # Objective function
    plt.plot(x_range, y_range, 'b-', alpha=0.3, linewidth=2)

    # Gradient-descent trajectory
    y_history = [funcion_objetivo(x_step) for x_step in history]
    plt.plot(history, y_history, 'ro-', markersize=4, linewidth=1)

    # Mark the minimum at (2, 0)
    plt.plot(2, 0, 'g*', markersize=15, label='Mínimo')

    plt.title(f'LR = {lr}', fontsize=12, fontweight='bold')
    plt.xlabel('x')
    plt.ylabel('f(x)')
    # Without a legend call the label set on the minimum marker is never drawn.
    plt.legend(fontsize=9)
    plt.grid(True, alpha=0.3)
    plt.ylim(-1, 12)

    print(f"Learning Rate {lr}:")
    print(f"  Inicio: x = {x_init}")
    print(f"  Final: x = {x_final:.4f}")
    print(f"  Error: {abs(x_final - 2):.4f}\n")

plt.tight_layout()
plt.show()

# Messages below were mis-encoded (mojibake); restored to proper UTF-8.
print("\n⚠️ Observaciones:")
print("- LR muy pequeño (0.01): Convergencia lenta")
print("- LR moderado (0.1, 0.5): Buena convergencia")
print("- LR muy grande (1.0): Puede oscilar o diverger")

## Parte 4: Entrenamiento con Gradient Descent

Apliquemos GD a un problema real de regresión lineal.

In [None]:
# Generate synthetic data: y = 4 + 3*X + Gaussian noise (std 0.5).
# The seed is fixed so the dataset is the same on every run.
np.random.seed(42)
X = 2 * np.random.rand(100, 1)
y = 4 + 3 * X + np.random.randn(100, 1) * 0.5

# Visualize the data
plt.figure(figsize=(8, 6))
plt.scatter(X, y, alpha=0.6)
plt.xlabel('X', fontsize=12)
plt.ylabel('y', fontsize=12)
plt.title('Datos de Entrenamiento', fontsize=14, fontweight='bold')
plt.grid(True, alpha=0.3)
plt.show()

# Messages below were mis-encoded (mojibake); restored to proper UTF-8.
print(f"Número de muestras: {len(X)}")
# Plain string: there is nothing to interpolate, so no f-prefix is needed.
print("Relación real: y = 4 + 3*X + ruido")

In [None]:
# Initialize parameters
theta = np.random.randn(2, 1)  # [bias, weight]
X_b = np.c_[np.ones((len(X), 1)), X]  # prepend a column of ones for the bias

# Hyperparameters
learning_rate = 0.1
n_iterations = 100

# MSE recorded at every iteration (before that iteration's update)
loss_history = []

# Batch gradient descent on the full dataset
for iteration in range(n_iterations):
    # Predictions with the current parameters
    y_pred = X_b.dot(theta)

    # Loss (MSE)
    loss = mse(y_pred, y)
    loss_history.append(loss)

    # Gradient of the MSE with respect to theta
    gradients = 2/len(X) * X_b.T.dot(y_pred - y)

    # Parameter update
    theta = theta - learning_rate * gradients

    if iteration % 20 == 0:
        print(f"Iteración {iteration}: Loss = {loss:.4f}")

# Messages were mis-encoded (mojibake, including the theta symbols);
# restored to proper UTF-8.
print("\nParámetros finales:")
print(f"  Bias (θ0): {theta[0][0]:.4f} (esperado: ~4)")
print(f"  Weight (θ1): {theta[1][0]:.4f} (esperado: ~3)")

In [None]:
# Plot the fitted line next to the learning curve.
# Titles, labels and the final message were mis-encoded (mojibake) and are
# restored to proper UTF-8 here.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# Model fit: a straight line only needs its two endpoints (X = 0 and X = 2)
ax1.scatter(X, y, alpha=0.6, label='Datos')
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new]
y_predict = X_new_b.dot(theta)
ax1.plot(X_new, y_predict, 'r-', linewidth=2, label='Modelo')
ax1.set_xlabel('X', fontsize=12)
ax1.set_ylabel('y', fontsize=12)
ax1.set_title('Regresión Lineal con Gradient Descent', fontsize=14, fontweight='bold')
ax1.legend(fontsize=11)
ax1.grid(True, alpha=0.3)

# Learning curve
ax2.plot(loss_history, linewidth=2)
ax2.set_xlabel('Iteración', fontsize=12)
ax2.set_ylabel('MSE Loss', fontsize=12)
ax2.set_title('Curva de Aprendizaje', fontsize=14, fontweight='bold')
ax2.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("✓ El modelo converge correctamente!")

## Parte 5: Overfitting

Demostremos el concepto de overfitting.

In [None]:
# Training data: 15 noisy samples of 2*sin(x) on [0, 3]
np.random.seed(42)
n = 15
X_train = np.linspace(0, 3, n)
y_train = 2 * np.sin(X_train) + np.random.randn(n) * 0.3

# Larger, noise-free test set on the same interval
X_test = np.linspace(0, 3, 100)
y_test = 2 * np.sin(X_test)

# Polynomial degrees to fit and the regime each one illustrates; any degree
# not listed in the mapping is labelled as overfitting.
degrees = [1, 3, 10]
status_by_degree = {1: "Underfitting", 3: "Good fit"}

plt.figure(figsize=(15, 5))

for i, degree in enumerate(degrees):
    # One panel per degree; the column count follows the list.
    plt.subplot(1, len(degrees), i + 1)

    # Least-squares polynomial fit
    coeffs = np.polyfit(X_train, y_train, degree)
    poly = np.poly1d(coeffs)

    # Predictions
    y_train_pred = poly(X_train)
    y_test_pred = poly(X_test)

    # Errors on both sets
    train_mse = mse(y_train_pred, y_train)
    test_mse = mse(y_test_pred, y_test)

    # Plot the samples, the true function and the fitted model
    plt.scatter(X_train, y_train, s=50, alpha=0.7, label='Train')
    plt.plot(X_test, y_test, 'g--', alpha=0.5, label='Verdadero')
    plt.plot(X_test, y_test_pred, 'r-', linewidth=2, label='Modelo')

    plt.xlabel('X', fontsize=11)
    plt.ylabel('y', fontsize=11)

    status = status_by_degree.get(degree, "Overfitting")
    plt.title(f'Grado {degree} - {status}\nTrain MSE={train_mse:.3f}, Test MSE={test_mse:.3f}',
              fontsize=11, fontweight='bold')
    plt.legend(fontsize=9)
    plt.grid(True, alpha=0.3)
    plt.ylim(-3, 3)

plt.tight_layout()
plt.show()

# The heading below was mis-encoded (mojibake); restored to proper UTF-8.
print("⚠️ Observaciones:")
print("- Grado 1: Modelo muy simple, no captura la complejidad (underfitting)")
print("- Grado 3: Balance adecuado")
print("- Grado 10: Modelo muy complejo, memoriza el ruido (overfitting)")
print("\n  En overfitting: Train MSE bajo, pero Test MSE alto")

## Desafío: Mini-batch Gradient Descent

Implementa mini-batch gradient descent y compáralo con batch GD.

In [None]:
def minibatch_gradient_descent(X, y, learning_rate, n_epochs, batch_size):
    """Fit a linear model by mini-batch gradient descent on the MSE loss.

    Args:
        X: 2-D design matrix with one row per sample. Include a column of
            ones if a bias term is wanted; none is added here.
        y: Targets as a column of shape (m, 1) so they line up with
            ``X.dot(theta)``.
        learning_rate: Step size applied to each mini-batch gradient.
        n_epochs: Number of full passes over the data.
        batch_size: Samples per mini-batch. ``len(X)`` gives batch GD and
            ``1`` gives stochastic GD. The last batch of an epoch is smaller
            when ``len(X)`` is not a multiple of ``batch_size``.

    Returns:
        A tuple ``(theta, loss_history)``: the fitted parameters with shape
        (n_features, 1) and the full-dataset MSE recorded after each epoch.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # range() rejects a zero step with an unrelated message and silently
        # yields nothing for a negative one, which would skip all training.
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    m = len(X)
    # One parameter per column of X instead of a hard-coded two; for a
    # two-column X this draws exactly the same random values as before.
    theta = np.random.randn(X.shape[1], 1)
    loss_history = []

    for epoch in range(n_epochs):
        # Reshuffle every epoch so the batches differ between passes.
        indices = np.random.permutation(m)
        X_shuffled = X[indices]
        y_shuffled = y[indices]

        # Walk through the shuffled data one mini-batch at a time.
        for i in range(0, m, batch_size):
            X_batch = X_shuffled[i:i+batch_size]
            y_batch = y_shuffled[i:i+batch_size]

            # MSE gradient on this batch; dividing by the actual batch
            # length keeps a short final batch correctly scaled.
            y_pred = X_batch.dot(theta)
            gradients = 2/len(X_batch) * X_batch.T.dot(y_pred - y_batch)
            theta = theta - learning_rate * gradients

        # Track the loss on the full dataset once per epoch.
        y_pred_all = X.dot(theta)
        loss = mse(y_pred_all, y)
        loss_history.append(loss)

    return theta, loss_history

# Compare batch sizes: full batch, mini-batch and stochastic
batch_sizes = [len(X_b), 32, 1]
labels = ['Batch GD', 'Mini-batch (32)', 'SGD (1)']

plt.figure(figsize=(10, 6))

for batch_size, label in zip(batch_sizes, labels):
    # Re-seed before each run so every variant starts from the same random
    # theta; otherwise differences between the curves mix the effect of the
    # batch size with the effect of the initialization, and the plot changes
    # on every execution.
    np.random.seed(42)
    theta, loss_hist = minibatch_gradient_descent(X_b, y, 0.1, 50, batch_size)
    plt.plot(loss_hist, label=label, linewidth=2)

# Labels and the final message were mis-encoded (mojibake); restored to UTF-8.
plt.xlabel('Época', fontsize=12)
plt.ylabel('MSE Loss', fontsize=12)
plt.title('Comparación de Variantes de Gradient Descent', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(True, alpha=0.3)
plt.show()

print("\n✓ Mini-batch GD balancea velocidad y estabilidad.")

## Resumen

### Funciones de Pérdida:
- **MSE**: Regresión, sensible a outliers
- **MAE**: Regresión, robusta a outliers
- **Binary CE**: Clasificación binaria
- **Categorical CE**: Clasificación multiclase

### Gradient Descent:
- Minimiza la pérdida iterativamente
- Learning rate es crucial
- Mini-batch es el estándar

### Overfitting:
- Modelo muy complejo
- Memoriza en lugar de aprender
- Detectar: comparar train vs test loss

**Próximo paso**: En Lab 05 aprenderemos Backpropagation, el algoritmo que hace posible calcular gradientes en redes profundas.

---

**¡Excelente trabajo! 🎉**