In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell runs, so edits to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Tuning de Hiperparámetros - Random Forest
## Predicción de Características de Vuelo en Planeador

**Objetivo:** Optimizar hiperparámetros de RF para maximizar R² en targets problemáticos

**Autor:** Estanislao  
**Fecha:** Diciembre 2024

---

## Estrategia

1. **Grid Search** en espacio de hiperparámetros
2. **Foco especial** en targets problemáticos:
   - intensidad_termicas_mean_ms
   - tasa_ascenso_mean_ms
3. **Trade-off**: Performance vs Overfitting
4. **Justificación** de hiperparámetros finales

## 1. Setup

In [None]:
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score

sys.path.append('../01_Preprocesamiento')
from eda_functions import configurar_visualizacion
from modelo_utils import preparar_datos, evaluar_modelo

# Suppress every warning for the rest of the session. This keeps cell output
# compact but also hides deprecation and data warnings raised by any library.
warnings.filterwarnings('ignore')
# Project helper imported from eda_functions; presumably applies the shared
# plotting style -- TODO confirm against its definition.
configurar_visualizacion()
# Seed NumPy's legacy global RNG. The random forests created later in this
# notebook additionally pass random_state=42 explicitly.
np.random.seed(42)

## 2. Cargar Datos

In [None]:
# Load the processed development and test splits; the first CSV column is
# used as the index of each frame.
dev = pd.read_csv('../data/processed/dev.csv', index_col=0)
test = pd.read_csv('../data/processed/test.csv', index_col=0)

# Targets that get a dedicated, wider search later in the notebook.
targets_problematicos = ['intensidad_termicas_mean_ms', 'tasa_ascenso_mean_ms']

# All regression targets (the problematic ones are included here as well).
targets_reg = [
    'altura_max_m',
    'ganancia_altura_m',
    'duracion_min',
    'distancia_km',
    'velocidad_promedio_kmh',
    'num_termicas',
    'intensidad_termicas_mean_ms',
    'tiempo_en_termicas_min',
    'tasa_ascenso_mean_ms',
]

# Quick sanity summary of what was loaded.
print(
    f"Dev: {dev.shape}, Test: {test.shape}",
    f"Targets: {len(targets_reg)}",
    f"Targets problemáticos: {targets_problematicos}",
    sep="\n",
)

## 3. Preparar Features

In [None]:
# Build the feature matrices and target frames with the project helper in its
# 'simple' mode (14 averaged features, per the original author's note).
X_dev, y_dev, X_test, y_test = preparar_datos(
    dev, test, targets_reg, modo='simple'
)

# Report the problem size and the samples-per-feature ratio on the dev split.
n_features = X_dev.shape[1]
n_dev = X_dev.shape[0]
n_test = X_test.shape[0]

print(f"Features: {n_features}")
print(f"Samples - Dev: {n_dev}, Test: {n_test}")
print(f"Ratio: {n_dev/n_features:.1f}:1")

## 4. Grid de Hiperparámetros

Se explora sistemáticamente el espacio de hiperparámetros con una grilla sobre `max_depth`, `min_samples_leaf` y `max_features`.

In [None]:
# Search space for the exhaustive grid search. The summary printed below is
# derived from this dict, so it cannot drift from the actual grid.
param_grid = dict(
    max_depth=[5, 8, 10, 12, 15, None],
    min_samples_leaf=[2, 5, 8, 10, 15],
    max_features=[0.2, 0.3, 0.5, 0.7, 'sqrt'],
)

print("Grid de Hiperparámetros:")
n_combinaciones = 1
for nombre, valores in param_grid.items():
    print(f"  {nombre}: {valores}")
    # The grid is a full cartesian product, so the sizes multiply.
    n_combinaciones *= len(valores)

print(f"\nCombinaciones totales: {n_combinaciones}")
print("\n⚠️ Esto tomará tiempo (~10-15 min)")

## 5. Grid Search en Targets Problemáticos

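Primero se optimizan los dos targets problemáticos (`intensidad_termicas_mean_ms` y `tasa_ascenso_mean_ms`), que son los que el modelo no logra predecir bien.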

In [None]:
# Exhaustive grid search (5-fold CV, R² scoring) for each problematic target.
# For every target we keep the full cv_results_ table and the winning
# parameters, and report the refit model's R² on the test split.
resultados_grid = {}
mejores_params = {}

print("="*70)
print("GRID SEARCH - TARGETS PROBLEMÁTICOS")
print("="*70)

for target in targets_problematicos:
    print(f"\n{target}:")
    t0 = time()

    # Parallelism is applied at the grid-search level only (n_jobs=-1 below).
    # The forest itself stays single-threaded: requesting all cores at both
    # levels makes every search worker spawn its own full thread pool and
    # oversubscribes the CPU. The fitted trees do not depend on n_jobs, so
    # the scores are unchanged.
    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)

    grid_search = GridSearchCV(
        rf,
        param_grid,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_dev, y_dev[target])

    # Keep the per-combination CV table and the best parameters for this target.
    resultados_grid[target] = pd.DataFrame(grid_search.cv_results_)
    mejores_params[target] = grid_search.best_params_

    print(f"  Mejor CV R²: {grid_search.best_score_:.4f}")
    print(f"  Mejores params: {grid_search.best_params_}")

    # Test-split score of the refit best estimator. Reporting only: this
    # number must not be used to choose hyperparameters.
    y_pred_test = grid_search.predict(X_test)
    r2_test = evaluar_modelo(y_test[target], y_pred_test, target)['R2']

    print(f"  Test R²: {r2_test:.4f}")
    print(f"  Tiempo: {time()-t0:.1f}s")

print("\n" + "="*70)

## 6. Grid Search en TODOS los Targets

Se busca una configuración óptima general para los 9 targets, usando una grilla reducida para acotar el tiempo de cómputo.

In [None]:
# Smaller grid, run once per regression target.
param_grid_reduced = {
    'max_depth': [8, 10, 12],
    'min_samples_leaf': [5, 8, 10],
    'max_features': [0.2, 0.3, 0.5]
}

print("="*70)
print("GRID SEARCH - TODOS LOS TARGETS")
print("="*70)
print(f"Grid reducido: {len(param_grid_reduced['max_depth']) * len(param_grid_reduced['min_samples_leaf']) * len(param_grid_reduced['max_features'])} combinaciones\n")

# One summary record per target; turned into a DataFrame after the loop.
resultados_todos = []

for target in targets_reg:
    print(f"\n{target}:")
    t0 = time()

    # Parallelism is applied at the grid-search level only (n_jobs=-1 below).
    # The forest itself stays single-threaded: requesting all cores at both
    # levels makes every search worker spawn its own full thread pool and
    # oversubscribes the CPU. The fitted trees do not depend on n_jobs, so
    # the scores are unchanged.
    rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)

    grid_search = GridSearchCV(
        rf,
        param_grid_reduced,
        cv=5,
        scoring='r2',
        n_jobs=-1,
        verbose=0
    )

    grid_search.fit(X_dev, y_dev[target])

    # Predictions of the refit best estimator on the data it was trained on
    # (dev) and on the test split.
    y_pred_dev = grid_search.predict(X_dev)
    y_pred_test = grid_search.predict(X_test)

    r2_dev = evaluar_modelo(y_dev[target], y_pred_dev, target)['R2']
    r2_test = evaluar_modelo(y_test[target], y_pred_test, target)['R2']

    resultados_todos.append({
        'target': target,
        'CV_R2': grid_search.best_score_,
        'Dev_R2': r2_dev,
        'Test_R2': r2_test,
        # Dev (training) R² minus test R²: a large gap suggests overfitting.
        'Gap': r2_dev - r2_test,
        'max_depth': grid_search.best_params_['max_depth'],
        'min_samples_leaf': grid_search.best_params_['min_samples_leaf'],
        'max_features': grid_search.best_params_['max_features']
    })

    print(f"  CV: {grid_search.best_score_:.4f}")
    print(f"  Test: {r2_test:.4f}")
    print(f"  Params: {grid_search.best_params_}")
    print(f"  Tiempo: {time()-t0:.1f}s")

df_todos = pd.DataFrame(resultados_todos)
print("\n" + "="*70)

## 7. Análisis de Resultados

In [None]:
separador = "=" * 70

# Per-target summary table, best test R² first.
print(separador)
print("ANÁLISIS: Hiperparámetros Óptimos por Target")
print(separador)

tabla_ordenada = df_todos.sort_values('Test_R2', ascending=False)
print("\n" + tabla_ordenada.to_string(index=False))

# How often each candidate value was selected as best across the targets.
print("\n" + separador)
print("FRECUENCIA DE HIPERPARÁMETROS:")
print(separador)

for columna in ('max_depth', 'min_samples_leaf', 'max_features'):
    print(f"\n{columna}:")
    print(df_todos[columna].value_counts().sort_index())

## 8. Visualización

In [None]:
# One bar panel per tuned hyperparameter. Each bar is the mean test R² of the
# targets whose grid search selected that value.
hiperparametros = ('max_depth', 'min_samples_leaf', 'max_features')
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for ax, nombre in zip(axes, hiperparametros):
    r2_medio = df_todos.groupby(nombre)['Test_R2'].mean()
    r2_medio.plot(kind='bar', ax=ax, alpha=0.7)
    ax.set_xlabel(nombre)
    ax.set_ylabel('R² Test (promedio)')
    ax.set_title(f'Impacto de {nombre}')
    ax.grid(True, alpha=0.3)

# Persist the figure next to the processed data, then render it inline.
plt.tight_layout()
plt.savefig('../data/processed/tuning_hiperparametros.png', dpi=300, bbox_inches='tight')
plt.show()

## 9. Recomendación Final

In [None]:
# Recommended general configuration.
# Selection uses the cross-validated R² (CV_R2), not the test R². Picking
# hyperparameters on the test split would leak it into model selection and
# make the reported test scores optimistic.
# NOTE(review): each group below only contains the targets whose own search
# picked that value, so the ranking is also influenced by how predictable
# those targets are -- treat the result as a heuristic.
METRICA_SELECCION = 'CV_R2'
# Test R² above which a problematic target is reported as improved.
UMBRAL_R2_MEJORA = 0.1

max_depth_rec = df_todos.groupby('max_depth')[METRICA_SELECCION].mean().idxmax()
min_samples_leaf_rec = df_todos.groupby('min_samples_leaf')[METRICA_SELECCION].mean().idxmax()
max_features_rec = df_todos.groupby('max_features')[METRICA_SELECCION].mean().idxmax()

print("="*70)
print("RECOMENDACIÓN FINAL")
print("="*70)
print("\nHiperparámetros óptimos (basado en promedio de R² CV):")
print(f"  max_depth: {max_depth_rec}")
print(f"  min_samples_leaf: {min_samples_leaf_rec}")
print(f"  max_features: {max_features_rec}")

# Mean test R² of the per-target best models found by the search (each target
# keeps its own winning parameters). This is NOT the score of the single
# recommended configuration printed above.
r2_promedio = df_todos['Test_R2'].mean()
print(f"\nR² Test promedio: {r2_promedio:.4f}")

# Per-target verdict for the problematic targets (test R² is used here only
# for reporting, after selection is done).
print("\n" + "="*70)
print("TARGETS PROBLEMÁTICOS:")
print("="*70)

for target in targets_problematicos:
    row = df_todos[df_todos['target']==target].iloc[0]
    print(f"\n{target}:")
    print(f"  Test R²: {row['Test_R2']:.4f}")
    print(f"  Params: max_depth={row['max_depth']}, min_samples_leaf={row['min_samples_leaf']}, max_features={row['max_features']}")

    if row['Test_R2'] > UMBRAL_R2_MEJORA:
        print("  ✓ MEJORÓ con tuning")
    else:
        print("  ❌ No predecible con features meteorológicas")

print("\n" + "="*70)

## 10. Guardar Resultados

In [None]:
# Persist the per-target tuning summary (row index omitted from the CSV).
df_todos.to_csv('../data/processed/tuning_resultados.csv', index=False)

# Closing banner listing the artefacts produced by this notebook.
separador = "=" * 70
for linea in (
    "\n" + separador,
    "ARCHIVOS GUARDADOS",
    separador,
    "  - data/processed/tuning_resultados.csv",
    "  - data/processed/tuning_hiperparametros.png",
    "\n" + separador,
    "✓ TUNING DE HIPERPARÁMETROS COMPLETADO",
    separador,
):
    print(linea)