In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local helper modules used below) are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

# Random Forest Regularizado
## Predicción de Características de Vuelo en Planeador

**Objetivo:** Mejorar sobre baseline con Random Forest regularizado

**Autor:** Estanislao  
**Fecha:** Diciembre 2024

---

## Estrategia Anti-Overfitting

- `max_depth=10`: Limitar profundidad de árboles
- `min_samples_leaf=5`: Mínimo 5 muestras por hoja
- `max_features=0.3`: Solo 30% de las features se consideran en cada split
- Cross-validation para validar

## 1. Setup

In [None]:
import sys
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from time import time

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Make the helper modules in the sibling preprocessing folder importable.
# NOTE(review): this path is relative to the kernel's working directory, and the
# output recorded for this cell is "ModuleNotFoundError: No module named
# 'eda_functions'" -- confirm where eda_functions.py / modelo_utils.py actually
# live (and from which directory the notebook is launched) before relying on it.
sys.path.append('../01_Preprocesamiento')
from eda_functions import configurar_visualizacion
from modelo_utils import preparar_datos, evaluar_modelo

# Suppresses every warning category for the rest of the session.
warnings.filterwarnings('ignore')
# Project helper; presumably applies the shared matplotlib/seaborn styling -- TODO confirm.
configurar_visualizacion()
# Seeds NumPy's global RNG. The forests trained below also pass random_state=42
# explicitly, so they do not depend on this global seed.
np.random.seed(42)

ModuleNotFoundError: No module named 'eda_functions'

## 2. Cargar Datos

In [None]:
# Regression targets handled by this notebook, one name per line for easy diffing.
targets_reg = [
    'altura_max_m',
    'ganancia_altura_m',
    'duracion_min',
    'distancia_km',
    'velocidad_promedio_kmh',
    'num_termicas',
    'intensidad_termicas_mean_ms',
    'tiempo_en_termicas_min',
    'tasa_ascenso_mean_ms',
]

# Read the processed Dev and Test splits (in that order); the first CSV column
# becomes the row index of each frame.
dev, test = (
    pd.read_csv(ruta_csv, index_col=0)
    for ruta_csv in ('../data/processed/dev.csv', '../data/processed/test.csv')
)

# Quick sanity report of what was loaded.
print(f"Dev: {dev.shape}, Test: {test.shape}")
print(f"Targets: {len(targets_reg)}")

## 3. Preparar Features

In [None]:
# Build the feature matrices in 'simple' mode (per the original note: averages only).
X_dev, y_dev, X_test, y_test = preparar_datos(
    dev, test, targets_reg, modo='simple'
)

# Report dimensionality and the samples-per-feature ratio on Dev.
n_muestras_dev, n_features = X_dev.shape
n_muestras_test = X_test.shape[0]

print(f"\nFeatures: {n_features}")
print(f"Samples - Dev: {n_muestras_dev}, Test: {n_muestras_test}")
print(f"Ratio: {n_muestras_dev/n_features:.1f}:1")

## 4. Random Forest Regularizado

In [None]:
# Train one regularised Random Forest per target on the simple feature set and
# collect Dev/Test metrics plus 5-fold CV R2 for each.
#
# Outputs left in the notebook namespace for later cells:
#   resultados_rf : list of per-(target, split) metric records
#   modelos_rf    : {target: fitted RandomForestRegressor}
#   df_rf         : DataFrame of the records, with 'modelo', 'R2' (clipped at 0)
#                   and 'R2_raw' (unclipped) columns

# Hyper-parameters live in one place so the banner printed below cannot drift
# from what is actually passed to the estimator.
RF_PARAMS = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_leaf=5,
    max_features=0.3,   # fraction of features considered at each split
    random_state=42,
    n_jobs=-1,
)

# Dev-minus-Test R2 gaps above these thresholds are flagged in the log.
GAP_ALTO = 0.3
GAP_MEDIO = 0.15

resultados_rf = []
modelos_rf = {}

print("="*70)
print("RANDOM FOREST - REGULARIZADO")
print("="*70)
print(
    f"max_depth={RF_PARAMS['max_depth']}, "
    f"min_samples_leaf={RF_PARAMS['min_samples_leaf']}, "
    f"max_features={RF_PARAMS['max_features']}\n"
)

for target in targets_reg:
    print(f"\n{target}:")
    t0 = time()

    # Regularised model
    rf = RandomForestRegressor(**RF_PARAMS)

    # Cross-validated R2 on Dev. The statistics are computed once and reused
    # for both the Dev and Test records and for the log line.
    cv_scores = cross_val_score(rf, X_dev, y_dev[target],
                                cv=5, scoring='r2', n_jobs=-1)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Fit on the whole Dev split and keep the model for later inspection
    rf.fit(X_dev, y_dev[target])
    modelos_rf[target] = rf

    # Predictions
    y_pred_dev = rf.predict(X_dev)
    y_pred_test = rf.predict(X_test)

    # Metrics: same record layout for both splits
    metrics_dev = evaluar_modelo(y_dev[target], y_pred_dev, target)
    metrics_test = evaluar_modelo(y_test[target], y_pred_test, target)
    for split, metrics in (('Dev', metrics_dev), ('Test', metrics_test)):
        metrics['split'] = split
        metrics['CV_mean'] = cv_mean
        metrics['CV_std'] = cv_std
        resultados_rf.append(metrics)

    # Overfitting gap, computed on the unclipped scores
    gap = metrics_dev['R2'] - metrics_test['R2']
    status = "üö®" if gap > GAP_ALTO else ("‚ö†Ô∏è" if gap > GAP_MEDIO else "‚úì")

    print(f"  CV:   {cv_mean:.4f} ¬± {cv_std:.4f}")
    print(f"  Dev:  {metrics_dev['R2']:.4f}")
    print(f"  Test: {metrics_test['R2']:.4f}")
    print(f"  Gap:  {gap:.4f} {status}")
    print(f"  Tiempo: {time()-t0:.1f}s")

df_rf = pd.DataFrame(resultados_rf)
df_rf['modelo'] = 'RandomForest'

# 'R2' is clipped at 0 as before (downstream cells and the saved CSV rely on
# it), but the unclipped value is preserved in 'R2_raw' so that a model that is
# worse than predicting the mean is not silently reported as R2 = 0.
df_rf['R2_raw'] = df_rf['R2']
df_rf['R2'] = df_rf['R2'].clip(lower=0)

print("\n" + "="*70)

## 5. Comparación con Baseline

In [None]:
# Compare Test R2 of the saved baseline against the Random Forest, per target.

# Load the baseline results
df_baseline = pd.read_csv('../data/processed/resultados_hibrido.csv')

# Baseline (Ridge/Dummy, per the original note) and RF rows in a single frame;
# only the Test split is compared. Both names are reused by the plotting cell.
df_comp = pd.concat([df_baseline, df_rf], ignore_index=True)
df_test_comp = df_comp[df_comp['split'] == 'Test'].copy()

# Pick rows by model label rather than by position. The previous
# `.iloc[0]` / `.iloc[1]` lookup only worked when the baseline file had exactly
# one Test row per target; with more than one, `.iloc[1]` was a second baseline
# row and the Random Forest was never compared at all.
# Baseline rows are everything not labelled 'RandomForest' (this also covers a
# baseline CSV without a 'modelo' column, where the label is NaN after concat).
# If several baseline rows exist for a target, the first one is used, which is
# the same row the positional lookup used as baseline.
es_rf = df_test_comp['modelo'] == 'RandomForest'
r2_base_por_target = (
    df_test_comp[~es_rf].drop_duplicates('target').set_index('target')['R2']
)
r2_rf_por_target = (
    df_test_comp[es_rf].drop_duplicates('target').set_index('target')['R2']
)

print("="*70)
print("COMPARACI√ìN: Baseline vs Random Forest (Test)")
print("="*70)

for target in targets_reg:
    # A KeyError here means the target is missing from one of the two result sets.
    r2_base = r2_base_por_target.loc[target]
    r2_rf = r2_rf_por_target.loc[target]
    mejora = r2_rf - r2_base

    status = "‚úì‚úì" if mejora > 0.05 else ("‚úì" if mejora > 0 else "‚Üí")

    print(f"\n{target}:")
    print(f"  Baseline: {r2_base:.4f}")
    print(f"  RF:       {r2_rf:.4f}")
    print(f"  Mejora:   {mejora:+.4f} {status}")

print("\n" + "="*70)

## 6. Feature Importance

In [None]:
# Print the five most important features of each per-target forest.
print("="*70)
print("FEATURE IMPORTANCE (Top 5 por target)")
print("="*70)

for target in targets_reg:
    rf = modelos_rf[target]

    # Importances labelled by feature name, highest first.
    ranking = pd.Series(
        rf.feature_importances_, index=X_dev.columns
    ).sort_values(ascending=False)

    print(f"\n{target}:")
    for nombre, peso in ranking.head(5).items():
        print(f"  {nombre:30s}: {peso:.4f}")

## 7. Visualización

In [None]:
# Grouped bar chart: Test R2 of the baseline vs the Random Forest, per target.
fig, ax = plt.subplots(figsize=(14, 8))

x = np.arange(len(targets_reg))
width = 0.35

# Select each series by model label instead of row position. The previous
# `.iloc[0]` / `.iloc[1]` lookup plotted a second baseline row as "Random
# Forest" whenever the baseline results had more than one Test row per target.
# Baseline rows are everything not labelled 'RandomForest'; if a target has
# several baseline rows, the first one is used.
es_rf = df_test_comp['modelo'] == 'RandomForest'
r2_base_por_target = (
    df_test_comp[~es_rf].drop_duplicates('target').set_index('target')['R2']
)
r2_rf_por_target = (
    df_test_comp[es_rf].drop_duplicates('target').set_index('target')['R2']
)

# Ordered like targets_reg so bars line up with the tick labels; a missing
# target raises KeyError instead of silently shifting the bars.
r2_baseline = r2_base_por_target.loc[targets_reg].tolist()
r2_rf = r2_rf_por_target.loc[targets_reg].tolist()

ax.bar(x - width/2, r2_baseline, width, label='Baseline', alpha=0.7)
ax.bar(x + width/2, r2_rf, width, label='Random Forest', alpha=0.7)

ax.set_xlabel('Target', fontsize=12)
ax.set_ylabel('R¬≤', fontsize=12)
ax.set_title('Baseline vs Random Forest - R¬≤ en Test', fontsize=14, fontweight='bold')
ax.set_xticks(x)
ax.set_xticklabels(targets_reg, rotation=45, ha='right')
ax.legend(fontsize=11)
# Reference line at R2 = 0
ax.axhline(y=0, color='r', linestyle='--', alpha=0.3)
ax.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
# Save before show() so the written file contains the rendered figure.
plt.savefig('../data/processed/rf_vs_baseline.png', dpi=300, bbox_inches='tight')
plt.show()

## 8. Guardar Resultados

In [None]:
# Persist the simple-feature RF metrics, then print a summary of written files.
separador = "=" * 70

df_rf.to_csv('../data/processed/resultados_rf.csv', index=False)

for linea in (
    "\n" + separador,
    "ARCHIVOS GUARDADOS",
    separador,
    "  - data/processed/resultados_rf.csv",
    "  - data/processed/rf_vs_baseline.png",
    "\n" + separador,
    "‚úì RANDOM FOREST (FEATURES SIMPLES) COMPLETADO",
    separador,
):
    print(linea)

---
# PARTE 2: Random Forest con Features COMPLETAS

Probar si features horarias (09h-18h) mejoran targets problemáticos:
- intensidad_termicas_mean_ms
- tasa_ascenso_mean_ms

## 9. Preparar Features Completas

In [None]:
# Build the feature matrices in 'completo' mode (per the original note: all hourly features).
X_dev_full, y_dev_full, X_test_full, y_test_full = preparar_datos(
    dev, test, targets_reg, modo='completo'
)

# Report dimensionality and the samples-per-feature ratio on Dev.
n_muestras_dev_full, n_features_full = X_dev_full.shape
n_muestras_test_full = X_test_full.shape[0]

print(f"\nFeatures COMPLETAS: {n_features_full}")
print(f"Samples - Dev: {n_muestras_dev_full}, Test: {n_muestras_test_full}")
print(f"Ratio: {n_muestras_dev_full/n_features_full:.1f}:1")
print(f"\n‚ö†Ô∏è Ratio bajo - regularizaci√≥n m√°s agresiva necesaria")

## 10. RF con Features Completas (REGULARIZACIÓN AGRESIVA)

In [None]:
# Train one Random Forest per target on the full (hourly) feature set, with
# stronger regularisation than the simple-feature run, and collect Dev/Test
# metrics plus 5-fold CV R2 for each.
#
# Outputs left in the notebook namespace for later cells:
#   resultados_rf_full : list of per-(target, split) metric records
#   modelos_rf_full    : {target: fitted RandomForestRegressor}
#   df_rf_full         : DataFrame of the records, with 'modelo', 'R2' (clipped
#                        at 0) and 'R2_raw' (unclipped) columns

# More aggressive regularisation (low samples-per-feature ratio). Kept in one
# place so the banner printed below always matches the estimator settings.
RF_FULL_PARAMS = dict(
    n_estimators=100,
    max_depth=8,           # shallower than the simple-feature run
    min_samples_leaf=8,    # larger leaves than the simple-feature run
    max_features=0.2,      # smaller fraction of features considered per split
    random_state=42,
    n_jobs=-1,
)

# Dev-minus-Test R2 gaps above these thresholds are flagged in the log.
GAP_ALTO_FULL = 0.3
GAP_MEDIO_FULL = 0.15

resultados_rf_full = []
modelos_rf_full = {}

print("="*70)
print("RANDOM FOREST - FEATURES COMPLETAS (110+)")
print("="*70)
print(
    f"max_depth={RF_FULL_PARAMS['max_depth']}, "
    f"min_samples_leaf={RF_FULL_PARAMS['min_samples_leaf']}, "
    f"max_features={RF_FULL_PARAMS['max_features']} (m√°s restrictivo)\n"
)

for target in targets_reg:
    print(f"\n{target}:")
    t0 = time()

    rf_full = RandomForestRegressor(**RF_FULL_PARAMS)

    # Cross-validated R2 on Dev. The statistics are computed once and reused
    # for both the Dev and Test records and for the log line.
    cv_scores = cross_val_score(rf_full, X_dev_full, y_dev_full[target],
                                cv=5, scoring='r2', n_jobs=-1)
    cv_mean = cv_scores.mean()
    cv_std = cv_scores.std()

    # Fit on the whole Dev split and keep the model for later inspection
    rf_full.fit(X_dev_full, y_dev_full[target])
    modelos_rf_full[target] = rf_full

    # Predictions
    y_pred_dev = rf_full.predict(X_dev_full)
    y_pred_test = rf_full.predict(X_test_full)

    # Metrics: same record layout for both splits
    metrics_dev = evaluar_modelo(y_dev_full[target], y_pred_dev, target)
    metrics_test = evaluar_modelo(y_test_full[target], y_pred_test, target)
    for split, metrics in (('Dev', metrics_dev), ('Test', metrics_test)):
        metrics['split'] = split
        metrics['CV_mean'] = cv_mean
        metrics['CV_std'] = cv_std
        resultados_rf_full.append(metrics)

    # Overfitting gap, computed on the unclipped scores
    gap = metrics_dev['R2'] - metrics_test['R2']
    status = "üö®" if gap > GAP_ALTO_FULL else ("‚ö†Ô∏è" if gap > GAP_MEDIO_FULL else "‚úì")

    print(f"  CV:   {cv_mean:.4f} ¬± {cv_std:.4f}")
    print(f"  Dev:  {metrics_dev['R2']:.4f}")
    print(f"  Test: {metrics_test['R2']:.4f}")
    print(f"  Gap:  {gap:.4f} {status}")
    print(f"  Tiempo: {time()-t0:.1f}s")

df_rf_full = pd.DataFrame(resultados_rf_full)
df_rf_full['modelo'] = 'RF_Full'

# 'R2' is clipped at 0 as before (downstream cells and the saved CSV rely on
# it), but the unclipped value is preserved in 'R2_raw' so that a model that is
# worse than predicting the mean is not silently reported as R2 = 0.
df_rf_full['R2_raw'] = df_rf_full['R2']
df_rf_full['R2'] = df_rf_full['R2'].clip(lower=0)

print("\n" + "="*70)

## 11. Comparación: RF Simple vs RF Full

In [None]:
# Per-target comparison of Test R2 (and Dev-Test gap) between the forest
# trained on simple features (df_rf) and the one trained on full features
# (df_rf_full), followed by an overall verdict.


def _r2_por_target(df, split):
    """Return the 'R2' column of `df` for one split, indexed by target.

    If a target appears more than once in the split, its first row is kept.
    """
    filas = df[df['split'] == split].drop_duplicates('target')
    return filas.set_index('target')['R2']


# Build the four lookups once instead of re-filtering the frames per target.
r2_test_simple = _r2_por_target(df_rf, 'Test')
r2_dev_simple = _r2_por_target(df_rf, 'Dev')
r2_test_full = _r2_por_target(df_rf_full, 'Test')
r2_dev_full = _r2_por_target(df_rf_full, 'Dev')

print("="*70)
print("COMPARACI√ìN: RF Simple (14 features) vs RF Full (110+ features)")
print("="*70)

mejoras = []

for target in targets_reg:
    r2_simple = r2_test_simple.loc[target]
    r2_full = r2_test_full.loc[target]

    # 'R2' in both frames was clipped at 0 when they were built, so these gaps
    # are understated whenever the raw Test R2 was negative.
    gap_simple = r2_dev_simple.loc[target] - r2_simple
    gap_full = r2_dev_full.loc[target] - r2_full

    mejora = r2_full - r2_simple
    mejoras.append({'target': target, 'mejora': mejora})

    status = "‚úì‚úì" if mejora > 0.05 else ("‚úì" if mejora > 0 else ("‚Üí" if mejora > -0.05 else "‚ùå"))

    print(f"\n{target}:")
    print(f"  Simple (14): {r2_simple:.4f} (gap: {gap_simple:.3f})")
    print(f"  Full (110):  {r2_full:.4f} (gap: {gap_full:.3f})")
    print(f"  Mejora:      {mejora:+.4f} {status}")

# Explicit columns keep this working even if no target was compared.
df_mejoras = pd.DataFrame(mejoras, columns=['target', 'mejora']).sort_values(
    'mejora', ascending=False
)

print("\n" + "="*70)
print("RESUMEN:")
print("="*70)
# The denominator follows the actual number of targets instead of a literal 9.
n_targets = len(targets_reg)
mejoraron = int((df_mejoras['mejora'] > 0).sum())
empeoraron = int((df_mejoras['mejora'] < 0).sum())
print(f"Mejoraron: {mejoraron}/{n_targets}")
print(f"Empeoraron: {empeoraron}/{n_targets}")

# Verdict thresholds are absolute counts of targets, not fractions of n_targets.
if mejoraron > 5:
    print("\n‚úì‚úì FEATURES COMPLETAS FUNCIONAN MEJOR")
elif empeoraron > 5:
    print("\n‚ùå OVERFITTING - Usar features simples")
else:
    print("\n‚Üí RESULTADOS MIXTOS - Analizar caso por caso")

print("="*70)

## 12. Guardar Resultados Completos

In [None]:
# Persist the full-feature RF metrics, then print a summary of written files.
separador = "=" * 70

df_rf_full.to_csv('../data/processed/resultados_rf_full.csv', index=False)

for linea in (
    "\n" + separador,
    "ARCHIVOS GUARDADOS",
    separador,
    "  - data/processed/resultados_rf_full.csv",
    "\n" + separador,
    "‚úì RANDOM FOREST (FEATURES COMPLETAS) COMPLETADO",
    separador,
):
    print(linea)