In [None]:
# Importación de librerías
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# Modelos de ML
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import xgboost as xgb
import lightgbm as lgb

# Métricas
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, precision_recall_curve

# Advanced optimisation: import Optuna, installing it on demand if it is missing
try:
    import optuna
except ImportError:
    # The flag stays False if the installation below raises
    OPTUNA_AVAILABLE = False
    print("⚠️ Optuna no disponible. Instalando...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "optuna"])
    import optuna
    OPTUNA_AVAILABLE = True
    print("✅ Optuna instalado exitosamente")
else:
    OPTUNA_AVAILABLE = True
    print("✅ Optuna disponible para optimización bayesiana")

# Notebook-wide configuration: silence warnings, plotting defaults, NumPy seed
warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
np.random.seed(42)

print("🚀 Librerías importadas exitosamente")
print("📊 Iniciando Parte II: Preprocesamiento y Optimización")


In [None]:
# Load the combined dataset and parse the date column
print("📈 Cargando dataset financiero combinado...")
df = pd.read_csv('selected_dataset/financial_data_combined.csv')
df['Date'] = pd.to_datetime(df['Date'])

print(f"✅ Dataset cargado exitosamente")
print(f"📊 Forma del dataset: {df.shape}")
print(f"🗓️ Período: {df['Date'].min()} a {df['Date'].max()}")
# map(str, ...) keeps the join from raising if a label is NaN or not a string
print(f"🏢 Acciones: {', '.join(map(str, df['Stock'].unique()))}")
print(f"🏭 Sectores: {', '.join(map(str, df['Sector'].unique()))}")

# Basic dataset information
print("\n" + "="*50)
print("INFORMACIÓN DEL DATASET")
print("="*50)
# DataFrame.info() writes to stdout itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
df.info()

print("\n" + "="*50)
print("ESTADÍSTICAS DESCRIPTIVAS")
print("="*50)
print(df.describe())

# Per-column count and percentage of missing values (only columns with nulls)
print("\n" + "="*50)
print("VALORES NULOS")
print("="*50)
null_counts = df.isnull().sum()
null_percentages = (null_counts / len(df)) * 100
null_summary = pd.DataFrame({
    'Valores Nulos': null_counts,
    'Porcentaje': null_percentages
})
print(null_summary[null_summary['Valores Nulos'] > 0])

print("\n" + "="*50)
print("PRIMERAS 5 FILAS")
print("="*50)
print(df.head())


In [None]:
def create_technical_indicators(df):
    """
    Add technical-analysis indicator columns, computed independently per stock.

    The input frame is not modified. Rows are grouped by the 'Stock' column
    (in order of first appearance), sorted by 'Date' inside each group, and
    the groups are concatenated with a fresh integer index.

    Reads the columns 'Stock', 'Date', 'Adjusted Close', 'Close', 'High',
    'Low' and 'Volume'. Returns a new DataFrame with the original columns
    followed by the indicator columns.
    """
    df = df.copy()

    print("🔧 Creando indicadores técnicos...")

    source_columns = df.columns
    per_stock_frames = []

    for stock in df['Stock'].unique():
        frame = (
            df.loc[df['Stock'] == stock]
            .copy()
            .sort_values('Date')
            .reset_index(drop=True)
        )
        adj_close = frame['Adjusted Close']
        close = frame['Close']
        high = frame['High']
        low = frame['Low']
        volume = frame['Volume']

        # ---- Price-based indicators ----
        # Simple and absolute one-step returns
        frame['Daily_Return'] = adj_close.pct_change()
        frame['Price_Change'] = adj_close.diff()

        # Simple moving averages; the 50-period one tolerates a 30-row warm-up
        for window in (5, 10, 20):
            frame[f'MA_{window}'] = adj_close.rolling(window=window).mean()
        frame['MA_50'] = adj_close.rolling(window=50, min_periods=30).mean()

        # Exponential moving averages
        for span in (12, 26):
            frame[f'EMA_{span}'] = adj_close.ewm(span=span).mean()

        # ---- RSI: rolling-mean gains vs. losses over 14 rows ----
        delta = adj_close.diff()
        avg_gain = (delta.where(delta > 0, 0)).rolling(window=14).mean()
        avg_loss = (-delta.where(delta < 0, 0)).rolling(window=14).mean()
        relative_strength = avg_gain / avg_loss
        frame['RSI'] = 100 - (100 / (1 + relative_strength))

        # ---- MACD, its signal line and histogram ----
        frame['MACD'] = frame['EMA_12'] - frame['EMA_26']
        frame['MACD_Signal'] = frame['MACD'].ewm(span=9).mean()
        frame['MACD_Histogram'] = frame['MACD'] - frame['MACD_Signal']

        # ---- Bollinger bands: 20-row mean +/- two rolling standard deviations ----
        rolling_std_20 = adj_close.rolling(window=20).std()
        frame['BB_Middle'] = frame['MA_20']
        frame['BB_Upper'] = frame['BB_Middle'] + (rolling_std_20 * 2)
        frame['BB_Lower'] = frame['BB_Middle'] - (rolling_std_20 * 2)
        frame['BB_Width'] = frame['BB_Upper'] - frame['BB_Lower']
        frame['BB_Position'] = (adj_close - frame['BB_Lower']) / frame['BB_Width']

        # ---- Stochastic oscillator and Williams %R share the 14-row range ----
        high_14 = high.rolling(window=14).max()
        low_14 = low.rolling(window=14).min()
        frame['Stoch_K'] = 100 * (close - low_14) / (high_14 - low_14)
        frame['Stoch_D'] = frame['Stoch_K'].rolling(window=3).mean()
        frame['Williams_R'] = -100 * (high_14 - close) / (high_14 - low_14)

        # ---- Volume indicators ----
        frame['Volume_MA'] = volume.rolling(window=20).mean()
        frame['Volume_Ratio'] = volume / frame['Volume_MA']

        # Money Flow Index: 14-row sums of money flow on rising vs. falling typical price
        typical_price = (high + low + close) / 3
        money_flow = typical_price * volume
        previous_typical = typical_price.shift(1)
        positive_flow = money_flow.where(typical_price > previous_typical, 0).rolling(window=14).sum()
        negative_flow = money_flow.where(typical_price < previous_typical, 0).rolling(window=14).sum()
        frame['MFI'] = 100 - (100 / (1 + positive_flow / negative_flow))

        # ---- Volatility ----
        frame['Volatility'] = frame['Daily_Return'].rolling(window=20).std()
        # NOTE: this is the 14-row mean of High - Low, not the classic true-range ATR
        frame['ATR'] = (high - low).rolling(window=14).mean()

        # ---- Price ratios ----
        frame['Price_to_MA20'] = adj_close / frame['MA_20']
        frame['MA5_to_MA20'] = frame['MA_5'] / frame['MA_20']
        frame['High_Low_Ratio'] = high / low

        per_stock_frames.append(frame)
        added = sum(1 for col in frame.columns if col not in source_columns)
        print(f"   ✅ {stock}: {added} nuevos indicadores")

    # Stack the per-stock frames back into one table
    df_combined = pd.concat(per_stock_frames, ignore_index=True)

    print("🎯 Feature Engineering completado")
    print(f"📊 Nuevas columnas creadas: {len(df_combined.columns) - len(df.columns)}")
    print(f"📈 Dataset final: {df_combined.shape}")

    return df_combined

# Run the feature-engineering step on the loaded dataset
df_features = create_technical_indicators(df)

# List the columns the step added, in creation order
new_columns = [col for col in df_features.columns if col not in df.columns]
print(f"\n📋 Indicadores técnicos creados ({len(new_columns)}):")
for i, col in enumerate(new_columns, start=1):
    print(f"   {i:2d}. {col}")

# Rolling windows leave warm-up NaNs; show the first ten affected columns
print("\n⚠️ Valores nulos después del feature engineering:")
null_summary_new = df_features.isnull().sum()
print(null_summary_new.loc[null_summary_new > 0].head(10))


In [None]:
def create_trading_signals(df, forward_days=5, rsi_buy=30, rsi_sell=70):
    """
    Build the 'Trading_Signal' target from technical indicators and future returns.

    Parameters:
    - df: frame with 'Stock', 'Date', 'Adjusted Close', 'RSI', 'BB_Position',
      'MACD', 'MACD_Signal' and 'Stoch_K' columns (not modified)
    - forward_days: look-ahead horizon, in rows, used for the future return
    - rsi_buy: RSI level below which a BUY may be emitted
    - rsi_sell: RSI level above which a SELL may be emitted

    Signal codes stored in 'Trading_Signal':
    - 0: BUY  - oversold indicators and future return above +2%
    - 1: HOLD - neither BUY nor SELL conditions hold
    - 2: SELL - overbought indicators and future return below -2%
    - NaN: the future return is unknown (the last `forward_days` rows of each
      stock), so no label can be assigned. These rows used to be silently
      labelled HOLD because comparisons against NaN evaluate to False.

    The column is float-typed so that it can hold NaN; callers are expected to
    drop unlabeled rows with dropna(subset=['Trading_Signal']).

    Returns a new DataFrame, grouped by stock and sorted by date within each
    stock, with the added 'Future_Return' and 'Trading_Signal' columns.
    """
    signal_names = ['BUY', 'HOLD', 'SELL']

    df = df.copy()
    print(f"🎯 Creando señales de trading...")
    print(f"   📊 RSI Buy threshold: {rsi_buy}")
    print(f"   📊 RSI Sell threshold: {rsi_sell}")
    print(f"   📅 Forward days: {forward_days}")

    # Each stock is processed on its own so shifts never cross stock boundaries
    dfs_with_signals = []

    for stock in df['Stock'].unique():
        stock_df = df[df['Stock'] == stock].copy()
        stock_df = stock_df.sort_values('Date').reset_index(drop=True)

        # Return realised `forward_days` rows after the current one
        stock_df['Future_Return'] = (
            stock_df['Adjusted Close'].pct_change(periods=forward_days).shift(-forward_days)
        )

        # BUY (0): every oversold condition plus a future gain above 2%
        buy_conditions = (
            (stock_df['RSI'] < rsi_buy) &  # RSI oversold
            (stock_df['BB_Position'] < 0.2) &  # price near the lower Bollinger band
            (stock_df['MACD'] > stock_df['MACD_Signal']) &  # bullish MACD
            (stock_df['Stoch_K'] < 20) &  # stochastic oversold
            (stock_df['Future_Return'] > 0.02)  # future return above +2%
        )

        # SELL (2): every overbought condition plus a future loss beyond 2%
        sell_conditions = (
            (stock_df['RSI'] > rsi_sell) &  # RSI overbought
            (stock_df['BB_Position'] > 0.8) &  # price near the upper Bollinger band
            (stock_df['MACD'] < stock_df['MACD_Signal']) &  # bearish MACD
            (stock_df['Stoch_K'] > 80) &  # stochastic overbought
            (stock_df['Future_Return'] < -0.02)  # future return below -2%
        )

        # Start from HOLD (1), apply BUY/SELL, then blank out rows whose
        # future return is unknown so they are not mistaken for real HOLDs.
        signals = pd.Series(1.0, index=stock_df.index)
        signals[buy_conditions] = 0.0
        signals[sell_conditions] = 2.0
        signals[stock_df['Future_Return'].isna()] = np.nan
        stock_df['Trading_Signal'] = signals

        dfs_with_signals.append(stock_df)

        # Per-stock statistics (unlabeled rows are not counted)
        n_buy = int((signals == 0).sum())
        n_hold = int((signals == 1).sum())
        n_sell = int((signals == 2).sum())
        print(f"   📈 {stock}: BUY={n_buy}, HOLD={n_hold}, SELL={n_sell}")

    # Combine every stock back into a single frame
    df_with_signals = pd.concat(dfs_with_signals, ignore_index=True)

    # Overall statistics, computed over labeled rows only
    print(f"\n📊 Distribución de señales general:")
    labeled = df_with_signals['Trading_Signal'].dropna().astype(int)
    signal_distribution = labeled.value_counts().sort_index()
    total_signals = len(labeled)

    for signal, count in signal_distribution.items():
        signal_name = signal_names[signal]
        percentage = (count / total_signals) * 100
        print(f"   {signal_name} ({signal}): {count:,} ({percentage:.1f}%)")

    unlabeled = len(df_with_signals) - total_signals
    if unlabeled:
        print(f"   ⚠️ Sin etiqueta (retorno futuro desconocido): {unlabeled:,}")

    return df_with_signals

# Create the trading signals
df_with_signals = create_trading_signals(df_features)

# Signal code -> display label / colour (index 0 = BUY, 1 = HOLD, 2 = SELL)
labels = ['BUY', 'HOLD', 'SELL']
colors = ['green', 'orange', 'red']

# Explicit axes: DataFrame.plot() opens a NEW figure unless it is given `ax`,
# which previously left the second panel of this figure empty.
fig, (ax_pie, ax_bar) = plt.subplots(1, 2, figsize=(12, 5))

# Panel 1: overall distribution. Labels and colours are looked up per class
# actually present, so a missing class cannot shift or break the labelling.
signal_counts = df_with_signals['Trading_Signal'].value_counts().sort_index()
pie_codes = [int(code) for code in signal_counts.index]
ax_pie.pie(
    signal_counts.values,
    labels=[labels[code] for code in pie_codes],
    colors=[colors[code] for code in pie_codes],
    autopct='%1.1f%%',
    startangle=90,
)
ax_pie.set_title('Distribución de Señales de Trading')

# Panel 2: distribution per stock
signal_by_stock = df_with_signals.groupby(['Stock', 'Trading_Signal']).size().unstack(fill_value=0)
bar_codes = [int(code) for code in signal_by_stock.columns]
signal_by_stock.plot(
    kind='bar',
    ax=ax_bar,
    color=[colors[code] for code in bar_codes],
    alpha=0.7,
)
ax_bar.set_title('Señales por Acción')
ax_bar.set_xlabel('Acción')
ax_bar.set_ylabel('Cantidad de Señales')
ax_bar.legend([labels[code] for code in bar_codes])
ax_bar.tick_params(axis='x', labelrotation=45)

plt.tight_layout()
plt.show()

print(f"\n✅ Variable objetivo creada exitosamente")
print(f"📊 Dataset con señales: {df_with_signals.shape}")


In [None]:
# ============================================================
# DATA PREPARATION FOR MACHINE LEARNING
# ============================================================

print("🔧 Iniciando preprocesamiento de datos...")

# 1. INITIAL CLEAN-UP - drop rows whose target label is missing
print("\n1️⃣ Limpieza inicial...")
data_clean = df_with_signals.dropna(subset=['Trading_Signal']).copy()
print(f"   Datos después de remover NaN en Trading_Signal: {data_clean.shape}")

# 2. FEATURE SELECTION FOR THE MODEL
print("\n2️⃣ Selección de features...")

# Technical features (indicators)
technical_features = [
    'RSI', 'MACD', 'MACD_Signal', 'MACD_Histogram',
    'BB_Position', 'BB_Width', 'Stoch_K', 'Stoch_D', 'Williams_R',
    'MFI', 'Volatility', 'ATR', 'Volume_Ratio',
    'Price_to_MA20', 'MA5_to_MA20', 'High_Low_Ratio',
    'Daily_Return', 'Price_Change'
]

# Candidate categorical features
categorical_features = ['Stock', 'Sector', 'Market']

# Columns that must never be used as features (date, look-ahead return, target)
exclude_features = ['Date', 'Future_Return', 'Trading_Signal']

# Keep only the technical features that exist in the data
available_technical = [f for f in technical_features if f in data_clean.columns]
missing_technical = [f for f in technical_features if f not in data_clean.columns]

print(f"   ✅ Features técnicos disponibles: {len(available_technical)}")
print(f"   ⚠️ Features técnicos faltantes: {len(missing_technical)}")
if missing_technical:
    print(f"      Faltantes: {missing_technical}")

# Apply the same existence check to the categorical features: selecting a
# column that is absent from the dataset would raise a KeyError below.
missing_categorical = [f for f in categorical_features if f not in data_clean.columns]
categorical_features = [f for f in categorical_features if f in data_clean.columns]

print(f"   ✅ Features categóricas disponibles: {len(categorical_features)}")
if missing_categorical:
    print(f"   ⚠️ Features categóricas faltantes: {missing_categorical}")

# 3. MISSING-VALUE ANALYSIS
print("\n3️⃣ Análisis de valores nulos...")
null_analysis = data_clean[available_technical + categorical_features].isnull().sum()
null_features = null_analysis[null_analysis > 0]

if len(null_features) > 0:
    print("   Columnas con valores nulos:")
    for col, count in null_features.items():
        percentage = (count / len(data_clean)) * 100
        print(f"      {col}: {count} ({percentage:.1f}%)")
else:
    print("   ✅ No hay valores nulos en las features seleccionadas")

# 4. OUTLIER DETECTION
print("\n4️⃣ Detección de outliers...")

def detect_outliers_iqr(data, column, factor=1.5):
    """
    Count the outliers of one column with the IQR rule.

    A value is an outlier when it lies strictly below Q1 - factor * IQR or
    strictly above Q3 + factor * IQR. Returns a tuple
    (outlier_count, lower_bound, upper_bound).
    """
    values = data[column]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    lower_bound = first_quartile - factor * spread
    upper_bound = third_quartile + factor * spread

    is_outlier = (values < lower_bound) | (values > upper_bound)
    return is_outlier.sum(), lower_bound, upper_bound

# Per-feature outlier report (IQR rule) for the numeric technical features
outlier_summary = []
for feature in available_technical:
    if data_clean[feature].dtype in ['float64', 'int64']:
        outlier_count, lower, upper = detect_outliers_iqr(data_clean, feature)
        outlier_percentage = (outlier_count / len(data_clean)) * 100
        outlier_summary.append({
            'Feature': feature,
            'Outliers': outlier_count,
            'Percentage': outlier_percentage,
            'Lower_Bound': lower,
            'Upper_Bound': upper
        })

# Explicit columns keep sort_values() working even when no feature qualified
outlier_df = pd.DataFrame(
    outlier_summary,
    columns=['Feature', 'Outliers', 'Percentage', 'Lower_Bound', 'Upper_Bound']
)
outlier_df = outlier_df.sort_values('Percentage', ascending=False)

print("   Top 10 features con más outliers:")
print(outlier_df.head(10)[['Feature', 'Outliers', 'Percentage']].to_string(index=False))

# 5. BUILD THE X / y DATASETS
print("\n5️⃣ Preparación de datasets X y y...")

features_to_use = available_technical + categorical_features
# Ratio features can produce +/-inf (division by zero). notna() does not flag
# infinities and StandardScaler rejects them, so treat them as missing here.
X = data_clean[features_to_use].replace([np.inf, -np.inf], np.nan)
# data_clean has no missing labels (dropped in step 1); store them as integer class codes
y = data_clean['Trading_Signal'].astype(int)

# Keep only the rows where every selected feature is present
mask_complete = X.notna().all(axis=1)
X = X[mask_complete]
y = y[mask_complete]

print(f"   📊 Shape final de X: {X.shape}")
print(f"   🎯 Shape final de y: {y.shape}")
print(f"   📈 Distribución de clases:")

class_distribution = y.value_counts().sort_index()
for class_val, count in class_distribution.items():
    class_name = ['BUY', 'HOLD', 'SELL'][int(class_val)]
    percentage = (count / len(y)) * 100
    print(f"      {class_name} ({class_val}): {count:,} ({percentage:.1f}%)")

# 6. TEMPORAL TRAIN / TEST SPLIT
print("\n6️⃣ División temporal de datos...")

# Dates of the retained rows, re-indexed positionally to line up with X and y
dates_complete = data_clean[mask_complete]['Date'].reset_index(drop=True)

# Stable sort: rows sharing a date keep their existing relative order, so the
# rows that land on each side of the split boundary are reproducible.
sort_idx = dates_complete.argsort(kind='mergesort')
X_sorted = X.iloc[sort_idx].reset_index(drop=True)
y_sorted = y.iloc[sort_idx].reset_index(drop=True)
dates_sorted = dates_complete.iloc[sort_idx].reset_index(drop=True)

# Chronological 80/20 split: the oldest 80% of rows train, the newest 20% test
split_index = int(0.8 * len(X_sorted))

X_train = X_sorted.iloc[:split_index]
X_test = X_sorted.iloc[split_index:]
y_train = y_sorted.iloc[:split_index]
y_test = y_sorted.iloc[split_index:]

print(f"   📅 Período de entrenamiento: {dates_sorted.iloc[0]} a {dates_sorted.iloc[split_index-1]}")
print(f"   📅 Período de prueba: {dates_sorted.iloc[split_index]} a {dates_sorted.iloc[-1]}")
print(f"   📊 Train set: {X_train.shape}")
print(f"   📊 Test set: {X_test.shape}")

print(f"\n✅ Preprocesamiento inicial completado")
print(f"📊 Datos listos para creación de pipelines")


In [None]:
# ============================================================
# BUILDING THE PREPROCESSING PIPELINES
# ============================================================

print("🔧 Creando pipelines de preprocesamiento...")

# Column groups by type (aliases of the lists selected earlier)
numerical_features = available_technical
categorical_features_clean = categorical_features

print(f"📊 Features numéricas: {len(numerical_features)}")
print(f"📊 Features categóricas: {len(categorical_features_clean)}")

# Numeric branch: median imputation followed by standardisation
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with 'unknown', then dense one-hot encoding
# that ignores categories not seen during fit
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])

# Route each column group to its branch; any other column is dropped
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features_clean),
    ],
    remainder='drop',
)

print("✅ Pipelines de preprocesamiento creados")

# ------------------------------------------------------------
# Smoke test: fit on the training split only, then transform both splits
# ------------------------------------------------------------

print("\n🧪 Probando el preprocessor...")

X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

print(f"📊 Shape original X_train: {X_train.shape}")
print(f"📊 Shape procesado X_train: {X_train_processed.shape}")
print(f"📊 Shape original X_test: {X_test.shape}")
print(f"📊 Shape procesado X_test: {X_test_processed.shape}")

# Recover the output column names of the fitted preprocessor
def get_feature_names(preprocessor, numerical_features, categorical_features):
    """
    Return the feature names produced by the fitted preprocessor as a list.

    Numeric names are passed through unchanged and come first; they are
    followed by the one-hot names taken from the 'onehot' step of the 'cat'
    transformer. When the encoder lacks get_feature_names_out, the names are
    rebuilt as "<feature>_<category>" from its categories_ attribute.
    """
    encoder = preprocessor.named_transformers_['cat'].named_steps['onehot']

    if hasattr(encoder, 'get_feature_names_out'):
        # Encoder can report its own output names
        encoded_names = encoder.get_feature_names_out(categorical_features)
    else:
        # Fallback: one name per (feature, category) pair, in encoder order
        categories = encoder.categories_
        encoded_names = [
            f"{cat_name}_{category}"
            for position, cat_name in enumerate(categorical_features)
            for category in categories[position]
        ]

    return list(numerical_features) + list(encoded_names)

feature_names_processed = get_feature_names(preprocessor, numerical_features, categorical_features_clean)

print(f"📋 Total de features después del preprocesamiento: {len(feature_names_processed)}")
print(f"   - Features numéricas: {len(numerical_features)}")
print(f"   - Features categóricas (one-hot): {len(feature_names_processed) - len(numerical_features)}")

# Show a sample of the resulting feature names
print("\n📋 Primeras 10 features procesadas:")
for i, name in enumerate(feature_names_processed[:10], start=1):
    print(f"   {i:2d}. {name}")

if len(feature_names_processed) > 10:
    print(f"   ... y {len(feature_names_processed) - 10} más")

# ------------------------------------------------------------
# Sanity checks on the transformed matrices
# ------------------------------------------------------------

print("\n🔍 Verificación de calidad del preprocesamiento:")

# Count the NaN cells left after preprocessing
train_nulls = np.isnan(X_train_processed).sum()
test_nulls = np.isnan(X_test_processed).sum()

print(f"   ✅ Valores nulos en train: {train_nulls}")
print(f"   ✅ Valores nulos en test: {test_nulls}")

# The numeric block occupies the first len(numerical_features) columns;
# after scaling its column means should be ~0 and its deviations ~1
train_means = np.mean(X_train_processed[:, :len(numerical_features)], axis=0)
train_stds = np.std(X_train_processed[:, :len(numerical_features)], axis=0)

print("   📊 Media de features numéricas (debería estar cerca de 0):")
print(f"      Min: {train_means.min():.6f}, Max: {train_means.max():.6f}")
print("   📊 Desviación estándar de features numéricas (debería estar cerca de 1):")
print(f"      Min: {train_stds.min():.6f}, Max: {train_stds.max():.6f}")

print("\n✅ Pipelines de preprocesamiento completados y verificados")


In [None]:
# ============================================================
# TRAINING AND COMPARISON OF MULTIPLE MODELS
# ============================================================

print("🤖 Iniciando entrenamiento y comparación de modelos...")

# Candidate classifiers, keyed by display name (insertion order is the evaluation order)
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=42, n_estimators=100)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Support Vector Machine', SVC(random_state=42, probability=True)),
    ('XGBoost', xgb.XGBClassifier(random_state=42, eval_metric='mlogloss')),
    ('LightGBM', lgb.LGBMClassifier(random_state=42, verbose=-1)),
])

print(f"📊 Modelos a evaluar: {len(models)}")
for name in models:
    print(f"   • {name}")

# ============================================================
# FUNCIÓN PARA EVALUAR MODELOS
# ============================================================

def evaluate_model(name, model, X_train, y_train, cv_folds=5):
    """
    Cross-validate one classifier inside a preprocessing pipeline.

    Parameters:
    - name: display name stored in the result
    - model: unfitted estimator; it is cloned, so the caller's object is not fitted
    - X_train, y_train: training features and labels
    - cv_folds: forwarded to cross_val_score as `cv`

    Returns a dict with the mean and standard deviation of the
    cross-validated accuracy, weighted precision / recall / F1 computed on the
    training data itself (in-sample, hence optimistic), and the pipeline
    fitted on the whole training set under the 'Pipeline' key.
    """
    # Local import so the function does not depend on an extra notebook-level import
    from sklearn.base import clone

    # Clone both steps: Pipeline does not copy its steps, so without this every
    # pipeline would share (and re-fit) the single module-level `preprocessor`
    # and the estimator instance held in the caller's model dictionary.
    pipeline = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('classifier', clone(model))
    ])

    # Cross-validated accuracy
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=cv_folds,
                               scoring='accuracy', n_jobs=-1)

    # Fit on the full training set to obtain the in-sample metrics
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_train)

    # zero_division=0 scores a class with no predictions as 0 instead of warning
    accuracy = cv_scores.mean()
    accuracy_std = cv_scores.std()
    precision = precision_score(y_train, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y_train, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y_train, y_pred, average='weighted', zero_division=0)

    return {
        'Model': name,
        'CV_Accuracy_Mean': accuracy,
        'CV_Accuracy_Std': accuracy_std,
        'Train_Precision': precision,
        'Train_Recall': recall,
        'Train_F1': f1,
        'Pipeline': pipeline
    }

# ============================================================
# EVALUATE EVERY MODEL
# ============================================================

print("\n🔄 Evaluando modelos con validación cruzada (5-fold)...")

results = []
pipelines = {}

for name, model in models.items():
    print(f"\n   🔄 Evaluando {name}...")
    try:
        result = evaluate_model(name, model, X_train, y_train)
    except Exception as e:
        # A failing model is reported and skipped; the comparison continues
        print(f"      ❌ Error: {e}")
    else:
        results.append(result)
        pipelines[name] = result['Pipeline']

        print(f"      ✅ CV Accuracy: {result['CV_Accuracy_Mean']:.4f} ± {result['CV_Accuracy_Std']:.4f}")
        print(f"      📊 Train F1: {result['Train_F1']:.4f}")

# ============================================================
# RESULTS AND COMPARISON
# ============================================================

print("\n📊 Resumen de resultados:")
print("=" * 80)

# One row per successfully evaluated model, best cross-validated accuracy first
results_df = pd.DataFrame(results).sort_values('CV_Accuracy_Mean', ascending=False)

summary_columns = ['Model', 'CV_Accuracy_Mean', 'CV_Accuracy_Std', 'Train_F1']
print(results_df[summary_columns].to_string(index=False))

# The first row is the winner
best_model_name = results_df.iloc[0]['Model']
best_accuracy = results_df.iloc[0]['CV_Accuracy_Mean']

print(f"\n🏆 MEJOR MODELO: {best_model_name}")
print(f"   📊 CV Accuracy: {best_accuracy:.4f}")

# ============================================================
# RESULT VISUALISATION
# ============================================================

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_accuracy, ax_f1), (ax_scatter, ax_ranking) = axes

x_pos = range(len(results_df))

# Top-left: cross-validated accuracy with +/- one standard deviation
ax_accuracy.bar(x_pos, results_df['CV_Accuracy_Mean'],
                yerr=results_df['CV_Accuracy_Std'], capsize=5, alpha=0.7)
ax_accuracy.set_xlabel('Modelos')
ax_accuracy.set_ylabel('CV Accuracy')
ax_accuracy.set_title('Comparación de Accuracy (Validación Cruzada)')
ax_accuracy.set_xticks(x_pos)
ax_accuracy.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_accuracy.grid(True, alpha=0.3)

# Top-right: in-sample F1 score
ax_f1.bar(x_pos, results_df['Train_F1'], alpha=0.7, color='orange')
ax_f1.set_xlabel('Modelos')
ax_f1.set_ylabel('F1 Score')
ax_f1.set_title('F1 Score en Entrenamiento')
ax_f1.set_xticks(x_pos)
ax_f1.set_xticklabels(results_df['Model'], rotation=45, ha='right')
ax_f1.grid(True, alpha=0.3)

# Bottom-left: accuracy against F1, one annotated point per model
ax_scatter.scatter(results_df['CV_Accuracy_Mean'], results_df['Train_F1'],
                   s=100, alpha=0.7, c=range(len(results_df)), cmap='viridis')
ax_scatter.set_xlabel('CV Accuracy')
ax_scatter.set_ylabel('Train F1 Score')
ax_scatter.set_title('Accuracy vs F1 Score')
ax_scatter.grid(True, alpha=0.3)

for model_label, cv_accuracy, train_f1 in zip(results_df['Model'],
                                              results_df['CV_Accuracy_Mean'],
                                              results_df['Train_F1']):
    ax_scatter.annotate(model_label, (cv_accuracy, train_f1),
                        xytext=(5, 5), textcoords='offset points', fontsize=8)

# Bottom-right: ranking by the mean of CV accuracy and training F1
ranking_scores = (results_df['CV_Accuracy_Mean'] + results_df['Train_F1']) / 2
ax_ranking.barh(range(len(results_df)), ranking_scores, alpha=0.7, color='green')
ax_ranking.set_ylabel('Modelos')
ax_ranking.set_xlabel('Score Promedio (Accuracy + F1) / 2')
ax_ranking.set_title('Ranking General de Modelos')
ax_ranking.set_yticks(range(len(results_df)))
ax_ranking.set_yticklabels(results_df['Model'])
ax_ranking.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n✅ Comparación de modelos completada")
print(f"🎯 Modelo seleccionado para optimización: {best_model_name}")


In [None]:
# ============================================================
# HYPERPARAMETER OPTIMISATION
# ============================================================

print("🔍 Iniciando optimización de hiperparámetros...")
print(f"🎯 Modelo seleccionado: {best_model_name}")

# Estimator that won the comparison, looked up by name
best_model = models[best_model_name]

# Pipeline searched by the optimisers: preprocessing followed by that estimator
optimization_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', best_model),
])

# ============================================================
# DEFINIR ESPACIOS DE BÚSQUEDA SEGÚN EL MODELO
# ============================================================

def get_param_grid(model_name):
    """
    Return the hyperparameter grid for a model as a dict of lists.

    Keys use the 'classifier__' prefix so they address the 'classifier' step
    of the pipeline. A new dict is built on every call.

    Every listed model gets a grid made of parameters its estimator accepts.
    An unknown model name yields an empty dict, i.e. a search over the
    estimator's current settings only. The previous fallback tuned
    'classifier__random_state', which KNeighborsClassifier does not have, so
    the search failed whenever that model was selected.
    """
    grids = {
        'Random Forest': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [10, 20, 30, None],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__max_features': ['sqrt', 'log2', None]
        },
        'XGBoost': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 6, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__subsample': [0.8, 0.9, 1.0],
            'classifier__colsample_bytree': [0.8, 0.9, 1.0]
        },
        'LightGBM': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 6, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__num_leaves': [31, 50, 100],
            'classifier__feature_fraction': [0.8, 0.9, 1.0]
        },
        'Gradient Boosting': {
            'classifier__n_estimators': [100, 200, 300],
            'classifier__max_depth': [3, 6, 9],
            'classifier__learning_rate': [0.01, 0.1, 0.2],
            'classifier__subsample': [0.8, 0.9, 1.0],
            'classifier__max_features': ['sqrt', 'log2', None]
        },
        'Support Vector Machine': {
            'classifier__C': [0.1, 1, 10, 100],
            'classifier__gamma': ['scale', 'auto', 0.01, 0.1, 1],
            'classifier__kernel': ['rbf', 'poly', 'sigmoid']
        },
        'Logistic Regression': {
            'classifier__C': [0.01, 0.1, 1, 10, 100],
            # 'elasticnet' is left out on purpose: it needs an l1_ratio and is
            # not supported by liblinear, so every such candidate would fail.
            'classifier__penalty': ['l1', 'l2'],
            'classifier__solver': ['liblinear', 'saga'],
            'classifier__max_iter': [1000, 2000]
        },
        'Decision Tree': {
            'classifier__max_depth': [None, 5, 10, 20],
            'classifier__min_samples_split': [2, 5, 10],
            'classifier__min_samples_leaf': [1, 2, 4],
            'classifier__criterion': ['gini', 'entropy']
        },
        'K-Nearest Neighbors': {
            'classifier__n_neighbors': [3, 5, 7, 11, 15],
            'classifier__weights': ['uniform', 'distance'],
            'classifier__p': [1, 2]
        },
    }

    # Unknown model: no tunable parameters rather than a possibly invalid one
    return grids.get(model_name, {})

param_grid = get_param_grid(best_model_name)
print(f"📋 Parámetros a optimizar: {len(param_grid)}")
for param, values in param_grid.items():
    print(f"   • {param}: {values}")

# ============================================================
# 1. GRIDSEARCHCV - EXHAUSTIVE SEARCH
# ============================================================

print("\n1️⃣ Ejecutando GridSearchCV...")
# Grid size = product of the number of candidate values per parameter
print(f"   🔄 Búsqueda exhaustiva en {np.prod([len(v) for v in param_grid.values()])} combinaciones")

start_time = datetime.now()

# Three folds instead of five to keep the exhaustive search affordable
grid_search = GridSearchCV(
    estimator=optimization_pipeline,
    param_grid=param_grid,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)

grid_search.fit(X_train, y_train)
grid_time = (datetime.now() - start_time).total_seconds()

print(f"   ✅ GridSearchCV completado en {grid_time:.1f} segundos")
print(f"   🏆 Mejor score: {grid_search.best_score_:.4f}")
print(f"   📋 Mejores parámetros: {grid_search.best_params_}")

# ============================================================
# 2. RANDOMIZEDSEARCHCV - RANDOM SEARCH
# ============================================================

print("\n2️⃣ Ejecutando RandomizedSearchCV...")

# Wider search space for the randomised search
def get_random_param_dist(model_name):
    """
    Return the candidate values sampled by the randomised search.

    Random Forest and XGBoost get a space wider than their exhaustive grid;
    any other model reuses the module-level `param_grid` object as is.
    """
    expanded_spaces = {
        'Random Forest': {
            'classifier__n_estimators': [50, 100, 200, 300, 500],
            'classifier__max_depth': [5, 10, 15, 20, 25, 30, None],
            'classifier__min_samples_split': [2, 5, 10, 15],
            'classifier__min_samples_leaf': [1, 2, 4, 6],
            'classifier__max_features': ['sqrt', 'log2', None, 0.5, 0.8]
        },
        'XGBoost': {
            'classifier__n_estimators': [50, 100, 200, 300, 500],
            'classifier__max_depth': [3, 6, 9, 12],
            'classifier__learning_rate': [0.01, 0.05, 0.1, 0.15, 0.2],
            'classifier__subsample': [0.6, 0.7, 0.8, 0.9, 1.0],
            'classifier__colsample_bytree': [0.6, 0.7, 0.8, 0.9, 1.0]
        },
    }

    if model_name in expanded_spaces:
        return expanded_spaces[model_name]

    # No expanded space defined: fall back to the grid-search space
    return param_grid

# Candidate space for the random search (the plain grid for models without a dedicated one)
random_param_dist = get_random_param_dist(best_model_name)

# Wall-clock timer; random_time is reported again in the technique comparison
start_time = datetime.now()

random_search = RandomizedSearchCV(
    optimization_pipeline,
    random_param_dist,
    n_iter=50,  # Number of random candidates requested
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
# NOTE(review): when random_param_dist is the plain grid it may hold fewer than
# 50 combinations; scikit-learn then evaluates fewer candidates than n_iter.

random_search.fit(X_train, y_train)
random_time = (datetime.now() - start_time).total_seconds()

print(f"   ✅ RandomizedSearchCV completado en {random_time:.1f} segundos")
print(f"   🏆 Mejor score: {random_search.best_score_:.4f}")
print(f"   📋 Mejores parámetros: {random_search.best_params_}")

# ============================================================
# 3. OPTUNA - BAYESIAN OPTIMIZATION
# ============================================================

print(f"\n3️⃣ Ejecutando Optuna (Optimización Bayesiana)...")

def objective(trial, model_name, pipeline_template, X_train, y_train):
    """Optuna objective: mean 3-fold CV accuracy of one sampled configuration.

    Args:
        trial: Optuna trial used to sample hyperparameters.
        model_name: Display name of the model; 'Random Forest' and 'XGBoost'
            have dedicated search spaces, any other name falls back to the
            module-level ``param_grid``.
        pipeline_template: Pipeline exposing ``named_steps['preprocessor']``
            and ``named_steps['classifier']``; only the classifier's class is
            reused, its instance settings are not copied.
        X_train: Training features passed to ``cross_val_score``.
        y_train: Training labels passed to ``cross_val_score``.

    Returns:
        Mean accuracy across the cross-validation folds.
    """

    # Suggest parameters according to the model
    if model_name == 'Random Forest':
        params = {
            'classifier__n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
            'classifier__max_depth': trial.suggest_int('max_depth', 5, 30),
            'classifier__min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'classifier__min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
            'classifier__max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
        }

    elif model_name == 'XGBoost':
        params = {
            'classifier__n_estimators': trial.suggest_int('n_estimators', 50, 500, step=50),
            'classifier__max_depth': trial.suggest_int('max_depth', 3, 12),
            'classifier__learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
            'classifier__subsample': trial.suggest_float('subsample', 0.6, 1.0),
            'classifier__colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0)
        }

    else:
        # Other models: derive the search space from the grid-search grid
        params = {}
        for param, values in param_grid.items():
            trial_name = param.split('__')[1]
            # Inspect every value, not just the first: lists such as
            # ['scale', 0.01] or [5, None] have no numeric range, and bool is
            # an int subclass that must stay categorical.
            all_numeric = all(
                isinstance(v, (int, float)) and not isinstance(v, bool) for v in values
            )
            if all_numeric and all(isinstance(v, int) for v in values):
                params[param] = trial.suggest_int(trial_name, min(values), max(values))
            elif all_numeric:
                params[param] = trial.suggest_float(trial_name, min(values), max(values))
            else:
                params[param] = trial.suggest_categorical(trial_name, values)

    # Strip the 'classifier__' prefix to obtain constructor keyword arguments.
    classifier_kwargs = {name.split('__')[1]: value for name, value in params.items()}
    # The fallback grid may already contain random_state; passing it a second
    # time as an explicit keyword would raise TypeError, so only fill it in
    # when it has not been sampled.
    classifier_kwargs.setdefault('random_state', 42)

    # Build a pipeline with the suggested parameters
    classifier_cls = pipeline_template.named_steps['classifier'].__class__
    pipeline_copy = Pipeline([
        ('preprocessor', pipeline_template.named_steps['preprocessor']),
        ('classifier', classifier_cls(**classifier_kwargs))
    ])

    # Evaluate with cross-validation
    scores = cross_val_score(pipeline_copy, X_train, y_train, cv=3, scoring='accuracy', n_jobs=1)
    return scores.mean()

# Create the Optuna study
start_time = datetime.now()

study = optuna.create_study(
    direction='maximize',
    # Seed the sampler so the suggested configurations are repeatable, in line
    # with random_state=42 used by the other two searches. The wall-clock
    # timeout below can still change how many trials complete.
    sampler=optuna.samplers.TPESampler(seed=42),
    # objective() never reports intermediate values, so this pruner has
    # nothing to act on; it is kept only to leave the study setup unchanged.
    pruner=optuna.pruners.MedianPruner()
)
study.optimize(
    lambda trial: objective(trial, best_model_name, optimization_pipeline, X_train, y_train),
    n_trials=50,
    timeout=300,  # 5 minutes at most
    show_progress_bar=True
)

optuna_time = (datetime.now() - start_time).total_seconds()

print(f"   ✅ Optuna completado en {optuna_time:.1f} segundos")
print(f"   🏆 Mejor score: {study.best_value:.4f}")
print(f"   📋 Mejores parámetros: {study.best_params}")

# ============================================================
# COMPARISON OF OPTIMIZATION TECHNIQUES
# ============================================================

print(f"\n📊 COMPARACIÓN DE TÉCNICAS DE OPTIMIZACIÓN")
print("="*80)

# One row per technique: best CV accuracy, wall-clock time, and number of
# configurations actually evaluated.
optimization_results = pd.DataFrame({
    'Technique': ['GridSearchCV', 'RandomizedSearchCV', 'Optuna'],
    'Best_Score': [grid_search.best_score_, random_search.best_score_, study.best_value],
    'Time_Seconds': [grid_time, random_time, optuna_time],
    'Evaluations': [
        len(grid_search.cv_results_['mean_test_score']),
        # Read the real count instead of assuming n_iter: the random search
        # evaluates fewer candidates when the space holds fewer combinations.
        len(random_search.cv_results_['mean_test_score']),
        len(study.trials)
    ]
})

print(optimization_results.to_string(index=False))

# Identify the best technique (ties resolve to the first row)
best_technique_idx = optimization_results['Best_Score'].idxmax()
# idxmax returns an index label, so look the row up by label
best_technique = optimization_results.loc[best_technique_idx]

print(f"\n🏆 MEJOR TÉCNICA: {best_technique['Technique']}")
print(f"   📊 Score: {best_technique['Best_Score']:.4f}")
print(f"   ⏱️ Tiempo: {best_technique['Time_Seconds']:.1f} segundos")
print(f"   🔄 Evaluaciones: {best_technique['Evaluations']}")

# Visualization: three side-by-side panels built through explicit Axes handles
fig, (ax_score, ax_time, ax_eff) = plt.subplots(1, 3, figsize=(15, 5))

# Panel 1: best score per technique
bars = ax_score.bar(
    optimization_results['Technique'],
    optimization_results['Best_Score'],
    color=['blue', 'orange', 'green'],
    alpha=0.7,
)
ax_score.set_title('Comparación de Best Scores')
ax_score.set_ylabel('Accuracy')
plt.setp(ax_score.get_xticklabels(), rotation=45)
ax_score.grid(True, alpha=0.3)

# Write each value just above its bar
for bar, score in zip(bars, optimization_results['Best_Score']):
    ax_score.text(
        bar.get_x() + bar.get_width()/2,
        bar.get_height() + 0.001,
        f'{score:.4f}',
        ha='center',
        va='bottom',
    )

# Panel 2: wall-clock time per technique
ax_time.bar(
    optimization_results['Technique'],
    optimization_results['Time_Seconds'],
    color=['blue', 'orange', 'green'],
    alpha=0.7,
)
ax_time.set_title('Tiempo de Ejecución')
ax_time.set_ylabel('Segundos')
plt.setp(ax_time.get_xticklabels(), rotation=45)
ax_time.grid(True, alpha=0.3)

# Panel 3: efficiency, i.e. best score divided by elapsed seconds
efficiency = optimization_results['Best_Score'] / optimization_results['Time_Seconds']
ax_eff.bar(
    optimization_results['Technique'],
    efficiency,
    color=['blue', 'orange', 'green'],
    alpha=0.7,
)
ax_eff.set_title('Eficiencia (Score/Tiempo)')
ax_eff.set_ylabel('Score por Segundo')
plt.setp(ax_eff.get_xticklabels(), rotation=45)
ax_eff.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

print(f"\n✅ Optimización de hiperparámetros completada")


In [None]:
# ============================================================
# FINAL EVALUATION OF THE OPTIMIZED MODEL
# ============================================================

print("🎯 Evaluación final del modelo optimizado...")

# Pick the estimator produced by the winning optimization technique
if best_technique['Technique'] == 'GridSearchCV':
    final_model = grid_search.best_estimator_
    best_params = grid_search.best_params_
elif best_technique['Technique'] == 'RandomizedSearchCV':
    final_model = random_search.best_estimator_
    best_params = random_search.best_params_
else:  # Optuna
    # Optuna stores bare constructor argument names; this is the same mapping
    # with the pipeline step prefix (not used further in this cell).
    optuna_params = {f"classifier__{k}": v for k, v in study.best_params.items()}
    # The study may already have sampled random_state; passing it again as an
    # explicit keyword would raise TypeError, so only add it when missing.
    classifier_kwargs = dict(study.best_params)
    classifier_kwargs.setdefault('random_state', 42)
    # Optuna returns parameters only, so rebuild the pipeline and fit it here
    final_model = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', best_model.__class__(**classifier_kwargs))
    ])
    final_model.fit(X_train, y_train)
    best_params = study.best_params

print(f"🏆 Modelo final: {best_model_name} optimizado con {best_technique['Technique']}")
print(f"📋 Parámetros finales: {best_params}")

# ============================================================
# EVALUATION ON THE TEST SET
# ============================================================

print(f"\n📊 Evaluando en conjunto de prueba...")

# Hard predictions and class probabilities from the selected model
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)

# Headline metrics; the three class-averaged scores share the same settings
test_accuracy = accuracy_score(y_test, y_pred)
test_precision, test_recall, test_f1 = (
    scorer(y_test, y_pred, average='weighted', zero_division=0)
    for scorer in (precision_score, recall_score, f1_score)
)

print(f"✅ Resultados en conjunto de prueba:")
print("\n".join(
    f"   📊 {label}: {value:.4f}"
    for label, value in (
        ('Accuracy', test_accuracy),
        ('Precision', test_precision),
        ('Recall', test_recall),
        ('F1-Score', test_f1),
    )
))

# ============================================================
# CONFUSION MATRIX
# ============================================================

print(f"\n📋 Matriz de Confusión:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Open the 2x3 evaluation figure; the following sections fill one panel each
plt.figure(figsize=(15, 12))

# Panel 1: confusion matrix heatmap
# NOTE(review): the tick labels assume the matrix rows/columns are ordered
# BUY, HOLD, SELL — confirm against the label encoding used upstream.
plt.subplot(2, 3, 1)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=['BUY', 'HOLD', 'SELL'],
            yticklabels=['BUY', 'HOLD', 'SELL'])
plt.title('Matriz de Confusión')
plt.xlabel('Predicción')
plt.ylabel('Real')
# Panel 2: how many samples fall in each class, actual vs. predicted
plt.subplot(2, 3, 2)
class_names = ['BUY', 'HOLD', 'SELL']
real_counts = [(y_test == code).sum() for code in range(3)]
pred_counts = [(y_pred == code).sum() for code in range(3)]

width = 0.35
x = np.arange(len(class_names))

# Two bar series, shifted half a bar width to either side of each tick
for offset, counts, series_label in (
    (-width/2, real_counts, 'Real'),
    (width/2, pred_counts, 'Predicción'),
):
    plt.bar(x + offset, counts, width, label=series_label, alpha=0.7)

plt.title('Distribución Real vs Predicha')
plt.xlabel('Clases')
plt.ylabel('Cantidad')
plt.xticks(x, class_names)
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 3: per-class metrics
plt.subplot(2, 3, 3)
# average=None returns one score per class instead of a single aggregate
precision_per_class = precision_score(y_test, y_pred, average=None, zero_division=0)
recall_per_class = recall_score(y_test, y_pred, average=None, zero_division=0)
f1_per_class = f1_score(y_test, y_pred, average=None, zero_division=0)

x = np.arange(len(class_names))
width = 0.25

# Three bars per class, one bar width apart, centred on the class tick
plt.bar(x - width, precision_per_class, width, label='Precision', alpha=0.7)
plt.bar(x, recall_per_class, width, label='Recall', alpha=0.7)
plt.bar(x + width, f1_per_class, width, label='F1-Score', alpha=0.7)
plt.xlabel('Clases')
plt.ylabel('Score')
plt.title('Métricas por Clase')
plt.xticks(x, class_names)
plt.legend()
plt.grid(True, alpha=0.3)

# Panel 4: ROC curves, one per class (each class against the rest)
plt.subplot(2, 3, 4)
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

# Binarize the labels so each class gets its own 0/1 indicator column
# NOTE(review): assumes labels are encoded as 0, 1, 2 and that the
# predict_proba columns follow the same order — confirm against the encoder.
y_test_bin = label_binarize(y_test, classes=[0, 1, 2])
y_pred_proba_bin = y_pred_proba

# Compute and draw the ROC curve for each class
colors = ['blue', 'orange', 'green']
for i, class_name in enumerate(class_names):
    fpr, tpr, _ = roc_curve(y_test_bin[:, i], y_pred_proba_bin[:, i])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, color=colors[i], lw=2, 
             label=f'{class_name} (AUC = {roc_auc:.2f})')

# Diagonal reference line (TPR equal to FPR)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Tasa de Falsos Positivos')
plt.ylabel('Tasa de Verdaderos Positivos')
plt.title('Curvas ROC por Clase')
plt.legend(loc="lower right")
plt.grid(True, alpha=0.3)

# Panel 5: base model vs. optimized model
plt.subplot(2, 3, 5)
# NOTE(review): the base figure is read from a 'CV_Accuracy_Mean' column while
# the optimized figure is test-set accuracy, so the two bars (and the
# 'improvement' derived from them) do not appear to be measured on the same
# data — confirm this comparison is intended.
base_accuracy = results_df[results_df['Model'] == best_model_name]['CV_Accuracy_Mean'].iloc[0]
comparison_data = ['Modelo Base', 'Modelo Optimizado']
comparison_scores = [base_accuracy, test_accuracy]

bars = plt.bar(comparison_data, comparison_scores, color=['red', 'green'], alpha=0.7)
plt.ylabel('Accuracy')
plt.title('Mejora del Modelo')
plt.grid(True, alpha=0.3)

# Write each value just above its bar
for bar, score in zip(bars, comparison_scores):
    plt.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.01, 
             f'{score:.4f}', ha='center', va='bottom')

# Relative change in percent; also printed in the conclusions section
improvement = ((test_accuracy - base_accuracy) / base_accuracy) * 100
plt.text(0.5, max(comparison_scores) * 0.8, f'Mejora: {improvement:.1f}%', 
         ha='center', va='center', fontsize=12, fontweight='bold',
         bbox=dict(boxstyle="round,pad=0.3", facecolor="yellow", alpha=0.7))

# Panel 6: feature importances (only when the classifier exposes them)
plt.subplot(2, 3, 6)
try:
    if hasattr(final_model.named_steps['classifier'], 'feature_importances_'):
        # Importances as reported by the fitted classifier
        importances = final_model.named_steps['classifier'].feature_importances_
        
        # Feature names, cut to the number of importances
        # NOTE(review): presumably feature_names_processed lists the
        # preprocessor's output columns in order — verify where it is defined.
        feature_names_short = feature_names_processed[:len(importances)]
        
        # Positions of the 10 largest importances (argsort is ascending)
        indices = np.argsort(importances)[-10:]
        
        plt.barh(range(len(indices)), importances[indices], alpha=0.7)
        plt.yticks(range(len(indices)), [feature_names_short[i] for i in indices])
        plt.xlabel('Importancia')
        plt.title('Top 10 Features Importantes')
        plt.grid(True, alpha=0.3)
    else:
        # No importances on this classifier: show a placeholder message
        plt.text(0.5, 0.5, 'Importancia de features\nno disponible para\neste modelo', 
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Importancia de Features')
except Exception as e:
    # Best effort: any failure is shown (first 30 characters) inside the panel
    # so the rest of the figure still renders
    plt.text(0.5, 0.5, f'Error al calcular\nimportancia: {str(e)[:30]}...', 
            ha='center', va='center', transform=plt.gca().transAxes)
    plt.title('Importancia de Features')

plt.tight_layout()
plt.show()

# ============================================================
# DETAILED REPORT
# ============================================================

print(f"\n📋 REPORTE DE CLASIFICACIÓN DETALLADO")
print("="*80)
print(classification_report(y_test, y_pred, target_names=['BUY', 'HOLD', 'SELL']))

# ============================================================
# ERROR ANALYSIS
# ============================================================

print(f"\n🔍 ANÁLISIS DE ERRORES")
print("="*50)

# One row per test sample: true label, predicted label, and whether they agree
results_analysis = pd.DataFrame({
    'Real': y_test,
    'Prediccion': y_pred,
    'Correcto': y_test == y_pred
})

# Count every (true, predicted) pair and pivot predictions into columns
pair_counts = results_analysis.groupby(['Real', 'Prediccion']).size()
error_stats = pair_counts.unstack(fill_value=0)
print("Matriz de errores detallada:")
print(error_stats)

# Share of correctly predicted samples within each true class
accuracy_per_class = []
for i in range(3):
    class_mask = y_test == i
    # Classes with no test samples are skipped
    if not class_mask.sum() > 0:
        continue
    class_accuracy = (y_pred[class_mask] == i).mean()
    accuracy_per_class.append(class_accuracy)
    print(f"Accuracy clase {class_names[i]}: {class_accuracy:.4f}")

# ============================================================
# FINAL CONCLUSIONS
# ============================================================

# Derive the threshold-based wording first so the report below reads linearly
if test_accuracy > 0.7:
    performance_label = 'buen'
elif test_accuracy > 0.6:
    performance_label = 'regular'
else:
    performance_label = 'bajo'

if test_accuracy > 0.75:
    production_advice = 'Se recomienda usar en producción'
else:
    production_advice = 'Requiere más optimización antes de producción'

if test_accuracy > 0.7:
    readiness_label = 'implementación'
else:
    readiness_label = 'más desarrollo'

print(f"\n🎯 CONCLUSIONES FINALES")
print("="*80)

print(f"✅ MODELO FINAL SELECCIONADO:")
print(f"   • Algoritmo: {best_model_name}")
print(f"   • Técnica de optimización: {best_technique['Technique']}")
print(f"   • Accuracy en prueba: {test_accuracy:.4f}")
print(f"   • F1-Score: {test_f1:.4f}")

# Per-class precision and recall, in the fixed BUY / HOLD / SELL order
print(f"\n📊 RENDIMIENTO POR OBJETIVO:")
for class_idx, class_label in enumerate(('BUY', 'HOLD', 'SELL')):
    print(f"   • {class_label}: Precision={precision_per_class[class_idx]:.3f}, Recall={recall_per_class[class_idx]:.3f}")

print(f"\n🚀 MEJORAS OBTENIDAS:")
print(f"   • Mejora en accuracy: {improvement:.1f}%")
print(f"   • Tiempo de optimización total: {optimization_results['Time_Seconds'].sum():.1f} segundos")

print(f"\n💡 RECOMENDACIONES:")
print(f"   • El modelo muestra {performance_label} rendimiento")
print(f"   • {production_advice}")
if test_f1 < 0.7:
    print(f"   • Considerar balanceo de clases o más feature engineering")
if improvement < 5:
    print(f"   • La optimización mostró mejoras limitadas, considerar otros modelos")

print(f"\n✅ Evaluación final completada")
print(f"🎯 Sistema de recomendación de trading listo para {readiness_label}")
