# Diagnóstico: Targets con R² Negativo

Analizar por qué algunos targets dan R² < 0

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load the processed dev and test splits; the first CSV column becomes the index.
dev, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('../data/processed/dev.csv', '../data/processed/test.csv')
)

# Target columns examined by the diagnostic cells below.
targets_problematicos = [
    'num_termicas',
    'tasa_ascenso_mean_ms',
    'intensidad_termicas_mean_ms',
]

## 1. Comparar Distribuciones Dev vs Test

In [None]:
# Compare the mean and standard deviation of every problematic target between
# the dev and test splits, and flag targets whose means are far apart relative
# to their average spread.

# Mean shift, in units of the average standard deviation, above which the two
# splits are reported as having different distributions.
UMBRAL_DIFERENCIA_STD = 0.3

print("=" * 70)
print("ESTADÍSTICAS: Dev vs Test")
print("=" * 70)

for target in targets_problematicos:
    # Compute each statistic once and reuse it for printing and comparison.
    dev_mean, dev_std = dev[target].mean(), dev[target].std()
    test_mean, test_std = test[target].mean(), test[target].std()

    print(f"\n{target}:")
    print(f"  Dev:  mean={dev_mean:.4f}, std={dev_std:.4f}")
    print(f"  Test: mean={test_mean:.4f}, std={test_std:.4f}")

    # Difference between the split means, normalised by the average spread.
    diff_mean = abs(dev_mean - test_mean)
    avg_std = (dev_std + test_std) / 2

    if avg_std > 0:
        diff_en_std = diff_mean / avg_std
        print(f"  Δ media: {diff_mean:.4f} ({diff_en_std:.2f} desv. std)")
        distribuciones_difieren = diff_en_std > UMBRAL_DIFERENCIA_STD
    else:
        # Zero or undefined (NaN) spread: the shift cannot be normalised, so
        # avoid the division and treat any non-zero mean difference as a
        # mismatch between the splits.
        print(f"  Δ media: {diff_mean:.4f} (desv. std nula o indefinida)")
        distribuciones_difieren = diff_mean > 0

    if distribuciones_difieren:
        print("  ⚠️ DISTRIBUCIONES DIFERENTES - Split aleatorio problemático")

In [None]:
# Visualise the distributions: overlay density-normalised histograms of the
# dev and test splits, one subplot per problematic target.

# Size the grid from the target list instead of assuming exactly three
# targets; keep at least one column because subplots() needs a positive count.
n_cols = max(len(targets_problematicos), 1)

# squeeze=False always returns a 2-D array of axes (even for a single column);
# flatten it so `axes` stays indexable by target position.
fig, axes = plt.subplots(1, n_cols, figsize=(5 * n_cols, 4), squeeze=False)
axes = axes.ravel()

for ax, target in zip(axes, targets_problematicos):
    ax.hist(dev[target], bins=30, alpha=0.5, label='Dev', density=True)
    ax.hist(test[target], bins=30, alpha=0.5, label='Test', density=True)
    ax.set_xlabel(target)
    ax.set_ylabel('Densidad')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Correlaciones con Features Meteorológicas

In [None]:
# Compute per-variable averages over the hourly columns, then rank the
# resulting `*_avg` features by absolute correlation with each problematic
# target.

# Absolute correlation below which a target is reported as not predictable.
UMBRAL_CORRELACION = 0.15

variables_meteo = [
    'solar_rad', 'temp_2m', 'cloud_cover', 'boundary_layer_height',
    'cape', 'skin_temp', 'wind_speed'
]

for var in variables_meteo:
    # The pattern is "<var>_" + two digits + "h". fullmatch anchors both ends,
    # so longer column names that merely start with that prefix are not
    # averaged in; re.escape keeps the variable name literal.
    pattern = re.compile(rf'{re.escape(var)}_\d{{2}}h')
    cols_horarias = [col for col in dev.columns if pattern.fullmatch(col)]
    if cols_horarias:
        dev[f'{var}_avg'] = dev[cols_horarias].mean(axis=1)

features_avg = [col for col in dev.columns if col.endswith('_avg')]

print("=" * 70)
print("CORRELACIONES: Targets vs Features Meteorológicas")
print("=" * 70)

for target in targets_problematicos:
    print(f"\n{target}:")

    # Skip the correlation call when there are no averaged features, and drop
    # undefined (NaN) correlations so they cannot enter the ranking or make
    # the threshold comparison below silently evaluate to False.
    corrs = pd.Series(dtype=float)
    if features_avg:
        corrs = (
            dev[features_avg]
            .corrwith(dev[target])
            .abs()
            .dropna()
            .sort_values(ascending=False)
        )

    if corrs.empty:
        print("  ⚠️ No se pudo calcular ninguna correlación")
        continue

    print("  Top 5:")
    for feat, corr in corrs.head(5).items():
        print(f"    {feat}: {corr:.4f}")

    max_corr = corrs.max()
    if max_corr < UMBRAL_CORRELACION:
        print(f"\n  🚨 CORRELACIÓN MÁXIMA < {UMBRAL_CORRELACION}")
        print("  → Target NO predecible con features meteorológicas disponibles")
        print("  → R² negativo es ESPERADO - eliminar del análisis")

## 3. Recomendaciones

In [None]:
# Classify each problematic target as "drop" or "keep" from its strongest
# absolute correlation with the averaged (`*_avg`) meteorological features,
# and print a summary of the targets to drop.

# Absolute correlation below which a target is recommended for removal.
UMBRAL_CORRELACION = 0.15

print("=" * 70)
print("RECOMENDACIONES")
print("=" * 70)

targets_eliminar = []

for target in targets_problematicos:
    # Keep only defined correlations; skip the call when no features exist.
    corrs = pd.Series(dtype=float)
    if features_avg:
        corrs = dev[features_avg].corrwith(dev[target]).abs().dropna()

    if corrs.empty:
        # With no defined correlation the maximum would be NaN, and
        # `NaN < threshold` is False, so the target would be reported as
        # "keep". Report it as undetermined instead.
        print(f"\n⚠️ {target}:")
        print("   Correlación máxima: no calculable")
        print("   → Revisar datos antes de decidir")
        continue

    max_corr = corrs.max()

    if max_corr < UMBRAL_CORRELACION:
        targets_eliminar.append(target)
        print(f"\n❌ {target}:")
        print(f"   Correlación máxima: {max_corr:.4f}")
        print("   → ELIMINAR del análisis (no predecible)")
    else:
        print(f"\n✓ {target}:")
        print(f"   Correlación máxima: {max_corr:.4f}")
        print("   → Mantener (puede mejorar con modelos no lineales)")

print("\n" + "=" * 70)
print(f"TARGETS A ELIMINAR: {targets_eliminar}")
print("Razón: Sin correlación con condiciones meteorológicas")
print("=" * 70)