# 🚀 CatBoost con 10-Fold Cross-Validation - Predicción de Fertilizantes

> **Objetivo**: Combinar la importación de datos del modelado TIER 1 con validación cruzada robusta de 10-fold para predecir **nombres de fertilizantes** optimizando MAP@3.
> 
> **Variable Objetivo**: `Fertilizer Name` (nombres de fertilizantes codificados)
> 
> **Estrategia**: Usar ModelTrainer para cargar datos preprocesados y aplicar 10-fold CV con CatBoost optimizado
> 
> **Métrica Principal**: MAP@3 (Mean Average Precision at 3) - requerida por la competencia de Kaggle
> 
> **Mejoras**: Configuración corregida de early stopping y ensemble de modelos

---

## 📚 1. Importar Librerías y Datos

### Importación de librerías necesarias y carga de datos preprocesados

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import joblib
import time
from collections import Counter

# Project utilities (made importable by adding ../src to the module path)
import sys
sys.path.append('../src')
from model_utils import ModelTrainer, print_feature_selection_summary, print_training_config
from visuals import plot_confusion_matrix, plot_feature_importance
from metrics import mapk

# Scikit-learn
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# CatBoost
from catboost import CatBoostClassifier, Pool

# Configuration: seed NumPy's global random generator
np.random.seed(513)

In [None]:
# Build the project trainer for this model family and ask it for the
# preprocessed data bundle.
trainer = ModelTrainer('CatBoostClassifier', 'CAT')
data = trainer.load_data()

# Unpack every entry of the bundle into its own notebook-level name.
X_train = data['X_train']
y_train = data['y_train']
X_val = data['X_val']
y_val = data['y_val']
X_test = data['X_test']
feature_info = data['feature_info']
label_encoders = data['label_encoders']
# Encoder for the target column; its classes_ array is printed below.
fertilizer_encoder = label_encoders['Fertilizer Name']

# Sanity report on what was loaded.
print("üìä Datos cargados exitosamente:")
print(f"  ‚Ä¢ X_train shape: {X_train.shape}")
print(f"  ‚Ä¢ X_val shape: {X_val.shape}")
print(f"  ‚Ä¢ X_test shape: {X_test.shape}")
print(f"  ‚Ä¢ Variable objetivo: {y_train.name}")
print(f"  ‚Ä¢ Clases objetivo: {len(fertilizer_encoder.classes_)}")

# List each class next to its position in the encoder's classes_ array.
print(f"\nüß™ CLASES DE FERTILIZANTES:")
for encoded_label, fertilizer in enumerate(fertilizer_encoder.classes_):
    print(f"  {encoded_label}: {fertilizer}")

## 🎯 2. Selección de Features

### Definiendo las características que utilizaremos para el modelo

In [None]:
# =============================================================================
# FEATURE SELECTION FOR THE MODEL
# =============================================================================

# Candidate feature list. Commented-out names are optional engineered or
# encoded features kept here as a toggle menu; uncomment a name to enable it.
candidate_features = [
    # Original climate variables
    'Temparature',
    'Humidity',
    'Moisture',

    # Soil and crop variables, used here in their raw (un-encoded) form.
    # The categorical-detection cell treats these two names as categorical
    # and their positions are later passed to CatBoost as cat_features.
    'Soil Type',
    'Crop Type',

    # Chemical variables (NPK)
    'Nitrogen',
    'Potassium',
    'Phosphorous',

    # Engineered features - NPK ratios
    # 'N_P_ratio',
    # 'N_K_ratio',
    # 'P_K_ratio',
    # 'Total_NPK',

    # Engineered features - climate indices
    # 'Temp_Hum_index',
    # 'Moist_Balance',
    # 'Environ_Stress',

    # Engineered features - categories
    # 'Temp_Cat',
    # 'Hum_Cat',
    # 'N_Level',
    # 'K_Level',
    # 'P_Level',

    # Engineered features - combinations
    # 'Soil_Crop_Combo',
    # 'NPK_Balance',
    # 'Dominant_NPK_Level',
    # 'Temp_Moist_inter',

    # Label-encoded versions of the categorical features
    # 'Soil Type_encoded',
    # 'Crop Type_encoded',
    # 'Temp_Cat_encoded',
    # 'Hum_Cat_encoded',
    # 'N_Level_encoded',
    # 'K_Level_encoded',
    # 'P_Level_encoded',
    # 'Soil_Crop_Combo_encoded'
]

# Let the trainer validate the candidates against X_train; the returned list
# is what every later cell uses as features_to_use.
features_to_use = trainer.validate_features(candidate_features, X_train)

# NOTE(review): both arguments are the validated list, so the summary cannot
# contrast requested vs. validated features - confirm this is intended.
print_feature_selection_summary(features_to_use, features_to_use)

selected_count = len(features_to_use)
print(f"\n‚úÖ Features seleccionadas: {selected_count}")
print(f"üìä Dimensi√≥n final: {X_train[features_to_use].shape}")

In [None]:
# =============================================================================
# CATEGORICAL FEATURE SETUP FOR CATBOOST
# =============================================================================

# Names and positions (within features_to_use) of the categorical columns.
categorical_features = []
categorical_feature_indices = []

print(f"üîç AN√ÅLISIS DE CARACTER√çSTICAS CATEG√ìRICAS:")
print(f"Features seleccionadas: {features_to_use}")

# Raw categorical columns are matched by exact name; engineered categorical
# columns are matched by substring, skipping the '_encoded' variants.
raw_categorical_names = ('Soil Type', 'Crop Type')
engineered_markers = ('_Cat', 'Level', 'Combo')

for position, column in enumerate(features_to_use):
    is_raw = column in raw_categorical_names
    is_engineered = (
        not is_raw
        and '_encoded' not in column
        and any(marker in column for marker in engineered_markers)
    )
    if not (is_raw or is_engineered):
        continue

    categorical_features.append(column)
    categorical_feature_indices.append(position)
    if is_raw:
        print(f"  ‚úÖ Categ√≥rica encontrada: {column} (√≠ndice {position})")
    else:
        print(f"  ‚úÖ Categ√≥rica engineered: {column} (√≠ndice {position})")

# Report the outcome; with no categorical columns the index list stays empty.
if categorical_features:
    print(f"  ‚úÖ Total caracter√≠sticas categ√≥ricas: {len(categorical_features)}")
    print(f"  üìä √çndices: {categorical_feature_indices}")
else:
    print(f"  ‚ùå No se encontraron caracter√≠sticas categ√≥ricas nativas")
    print(f"  üìä Todas las caracter√≠sticas ser√°n tratadas como num√©ricas")
    categorical_feature_indices = []

# Final summary of the numeric / categorical split.
total_feature_count = len(features_to_use)
categorical_count = len(categorical_feature_indices)
print(f"\n‚öôÔ∏è CONFIGURACI√ìN CATBOOST:")
print(f"  ‚Ä¢ Features totales: {total_feature_count}")
print(f"  ‚Ä¢ Features categ√≥ricas: {categorical_count}")
print(f"  ‚Ä¢ Features num√©ricas: {total_feature_count - categorical_count}")
print(f"  ‚Ä¢ cat_features parameter: {categorical_feature_indices}")

## 🔄 3. Configuración de Validación Cruzada

### Configuración de validación cruzada estratificada con k-fold = 10

In [None]:
# =============================================================================
# STRATIFIED CROSS-VALIDATION SETUP
# =============================================================================

# Stack the train and validation splits so CV runs over all labelled rows,
# then renumber the rows from zero.
stacked_features = pd.concat([X_train, X_val], axis=0)
stacked_target = pd.concat([y_train, y_val], axis=0)
X_full = stacked_features.reset_index(drop=True)
y_full = stacked_target.reset_index(drop=True)

print(f"üìä DATOS COMBINADOS PARA CV:")
print(f"  ‚Ä¢ X_full shape: {X_full.shape}")
print(f"  ‚Ä¢ y_full shape: {y_full.shape}")
print(f"  ‚Ä¢ Features a usar: {len(features_to_use)}")

# Cross-validation settings, reused by the training and saving cells.
N_SPLITS = 10
RANDOM_STATE = 513
SHUFFLE = True

# Stratified splitter built from the settings above.
cv_settings = {
    'n_splits': N_SPLITS,
    'shuffle': SHUFFLE,
    'random_state': RANDOM_STATE,
}
skf = StratifiedKFold(**cv_settings)

print(f"\nüîÑ CONFIGURACI√ìN DE VALIDACI√ìN CRUZADA:")
print(f"  ‚Ä¢ N√∫mero de folds: {N_SPLITS}")
print(f"  ‚Ä¢ Estratificada: S√≠ (mantiene proporci√≥n de clases)")
print(f"  ‚Ä¢ Shuffle: {SHUFFLE}")
print(f"  ‚Ä¢ Random state: {RANDOM_STATE}")

## ⚙️ 4. Configuración de Hiperparámetros CatBoost

### Definiendo los parámetros optimizados del modelo CatBoost

In [None]:
# =============================================================================
# CATBOOST HYPERPARAMETER CONFIGURATION
# =============================================================================

# Training-loop settings. EARLY_STOPPING_ROUNDS is defined before the
# parameter dict and referenced inside it, so the value handed to CatBoost and
# the value printed below / saved with the hyperparameters cannot drift apart.
EARLY_STOPPING_ROUNDS = 50
VERBOSE_EVAL = 200  # logging period passed as `verbose` for the first fold

catboost_params = {
    # Main parameters
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'iterations': 1500,
    'learning_rate': 0.075,

    # Tree structure
    'depth': 7,

    # Regularization
    'l2_leaf_reg': 0.81,

    # Sampling - Bayesian bootstrap
    'bootstrap_type': 'Bayesian',
    'bagging_temperature': 0.34,  # only valid with the Bayesian bootstrap
    'random_strength': 6.65,

    # Execution
    'task_type': 'CPU',

    # Early stopping
    'early_stopping_rounds': EARLY_STOPPING_ROUNDS,
    'use_best_model': True,

    # NOTE: cat_features is supplied through the Pool() objects in the
    # training cell rather than through these classifier parameters.
}

# Echo the effective configuration.
print(f"‚öôÔ∏è CONFIGURACI√ìN CATBOOST: {N_SPLITS}-fold CV")
print(f"  ‚Ä¢ Iterations: {catboost_params['iterations']} | Early stopping: {EARLY_STOPPING_ROUNDS}")
print(f"  ‚Ä¢ Learning rate: {catboost_params['learning_rate']} | Max depth: {catboost_params['depth']}")
print(f"  ‚Ä¢ Features categ√≥ricas: {len(categorical_feature_indices)} de {len(features_to_use)}")
print(f"  ‚Ä¢ Features categ√≥ricas indices: {categorical_feature_indices}")
print(f"  ‚Ä¢ Clases objetivo: {len(fertilizer_encoder.classes_)}")
print(f"  ‚Ä¢ Loss function: {catboost_params['loss_function']}")
print(f"  ‚Ä¢ Eval metric: {catboost_params['eval_metric']}")
print(f"  ‚Ä¢ Bootstrap type: {catboost_params['bootstrap_type']}")
print(f"  ‚Ä¢ Task type: {catboost_params['task_type']}")

## 🚀 5. Entrenamiento del Modelo con Validación Cruzada

### Entrenamiento con 10-fold CV y ensemble de modelos

In [None]:
# =============================================================================
# TRAINING WITH 10-FOLD CROSS-VALIDATION
# =============================================================================
# Trains one CatBoostClassifier per stratified fold, collecting out-of-fold
# probabilities, per-fold metrics, feature importances and the fitted models.

print(f"üöÄ INICIANDO ENTRENAMIENTO CON {N_SPLITS}-FOLD CROSS-VALIDATION")
print("=" * 60)

# Accumulators for the results of every fold
fold_results = []
# One row per sample in X_full, one column per encoder class
oof_predictions = np.zeros((len(X_full), len(fertilizer_encoder.classes_)))
feature_importance_folds = []
trained_models = []

# Global class weights (max class count / class count). In this cell they are
# only printed; the weights passed to training are recomputed per fold below.
class_counts = Counter(y_full)
max_count = max(class_counts.values())
class_weights = {cls: max_count / count for cls, count in class_counts.items()}

print(f"‚öñÔ∏è CLASS WEIGHTS PARA BALANCE:")
for cls, weight in class_weights.items():
    # Uses the label value as a position in classes_
    # (assumes integer-encoded labels - TODO confirm)
    class_name = fertilizer_encoder.classes_[cls]
    print(f"  {cls} ({class_name:15}): {weight:.3f}")

# Start of the overall timer
start_time = time.time()

# Per-fold training; folds are numbered from 1
for fold, (train_idx, val_idx) in enumerate(skf.split(X_full[features_to_use], y_full), 1):
    print(f"\n{'='*20} FOLD {fold}/{N_SPLITS} {'='*20}")
    fold_start_time = time.time()
    
    # Slice the selected features and the target for this fold
    X_train_fold = X_full[features_to_use].iloc[train_idx]
    X_val_fold = X_full[features_to_use].iloc[val_idx]
    y_train_fold = y_full.iloc[train_idx]
    y_val_fold = y_full.iloc[val_idx]
    
    print(f"üìä Train size: {len(X_train_fold):,} | Val size: {len(X_val_fold):,}")
    
    # Per-sample weights for this fold: the most frequent class gets 1.0,
    # rarer classes get proportionally larger weights
    fold_class_counts = Counter(y_train_fold)
    fold_max_count = max(fold_class_counts.values())
    sample_weights = y_train_fold.map(lambda cls: fold_max_count / fold_class_counts[cls])
    
    # Build the CatBoost pools; only the training pool carries sample weights
    train_pool = Pool(
        data=X_train_fold,
        label=y_train_fold,
        weight=sample_weights,
        cat_features=categorical_feature_indices  # positions of the categorical features
    )
    
    val_pool = Pool(
        data=X_val_fold,
        label=y_val_fold,
        cat_features=categorical_feature_indices  # positions of the categorical features
    )
    
    # Fresh model for every fold
    model = CatBoostClassifier(**catboost_params)
    
    # Fit with the validation fold as eval_set.
    # NOTE(review): catboost_params enables early stopping / use_best_model,
    # which are driven by this same eval_set, so the fold metrics computed
    # below are measured on data that also selected the best iteration.
    model.fit(
        train_pool,
        eval_set=val_pool,
        verbose=VERBOSE_EVAL if fold == 1 else 0  # verbose output for the first fold only
    )
    
    # Predictions on the validation fold
    val_pred_proba = model.predict_proba(X_val_fold)
    val_pred = model.predict(X_val_fold)
    
    # Store the out-of-fold probabilities at this fold's validation positions
    oof_predictions[val_idx] = val_pred_proba
    
    # Metrics
    accuracy = accuracy_score(y_val_fold, val_pred)
    
    # MAP@3: the three highest-probability column indices per row, best first
    top3_preds = np.argsort(val_pred_proba, axis=1)[:, ::-1][:, :3]
    map3_score = mapk(y_val_fold.tolist(), top3_preds.tolist(), k=3)
    
    # Record this fold's results
    fold_time = time.time() - fold_start_time
    fold_results.append({
        'fold': fold,
        'accuracy': accuracy,
        'map3': map3_score,
        'best_iteration': model.get_best_iteration(),
        'training_time': fold_time
    })
    
    # Keep the fitted model and its feature importances
    trained_models.append(model)
    feature_importance_folds.append(model.get_feature_importance())
    
    print(f"‚úÖ Fold {fold} completado:")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   MAP@3: {map3_score:.4f}")

# Total elapsed time
total_time = time.time() - start_time
print(f"\n‚è±Ô∏è ENTRENAMIENTO COMPLETADO en {total_time:.1f}s ({total_time/60:.1f}min)")

## 📊 6. Evaluación del Modelo

### Análisis completo de rendimiento y métricas de validación cruzada

In [None]:
# =============================================================================
# FULL EVALUATION OF THE CROSS-VALIDATION RESULTS
# =============================================================================

print("üìä RESULTADOS DE VALIDACI√ìN CRUZADA")
print("=" * 50)

# One row per fold
results_df = pd.DataFrame(fold_results)

# Accuracy statistics across folds
accuracy_per_fold = results_df['accuracy']
accuracy_mean = accuracy_per_fold.mean()
accuracy_std = accuracy_per_fold.std()
accuracy_min = accuracy_per_fold.min()
accuracy_max = accuracy_per_fold.max()

# MAP@3 statistics across folds
map3_per_fold = results_df['map3']
map3_mean = map3_per_fold.mean()
map3_std = map3_per_fold.std()
map3_min = map3_per_fold.min()
map3_max = map3_per_fold.max()

print(f"\nüéØ M√âTRICAS FINALES:")
print(f"  üìà Accuracy promedio: {accuracy_mean:.4f} ¬± {accuracy_std:.4f}")
print(f"  üìà MAP@3 promedio:    {map3_mean:.4f} ¬± {map3_std:.4f}")
print(f"  üìä Rango Accuracy:    [{accuracy_min:.4f}, {accuracy_max:.4f}]")
print(f"  üìä Rango MAP@3:       [{map3_min:.4f}, {map3_max:.4f}]")

# Stability: coefficient of variation (std / mean) of each metric
accuracy_cv = accuracy_std / accuracy_mean
map3_cv = map3_std / map3_mean

# A metric is reported as stable when its coefficient is below 0.05
stable_label = '‚úÖ Modelo estable'
variable_label = '‚ö†Ô∏è Modelo variable'
accuracy_verdict = stable_label if accuracy_cv < 0.05 else variable_label
map3_verdict = stable_label if map3_cv < 0.05 else variable_label

print(f"\nüîç AN√ÅLISIS DE ESTABILIDAD:")
print(f"  üìä Coeficiente de variaci√≥n (Accuracy): {accuracy_cv:.3f}")
print(f"  üìä Coeficiente de variaci√≥n (MAP@3):    {map3_cv:.3f}")
print(f"  {accuracy_verdict} (Accuracy CV < 0.05)")
print(f"  {map3_verdict} (MAP@3 CV < 0.05)")

# Timing summary
avg_fold_time = results_df['training_time'].mean()
print(f"\n‚è±Ô∏è TIEMPOS DE ENTRENAMIENTO:")
print(f"  üìä Tiempo promedio por fold: {avg_fold_time:.1f}s")
print(f"  üìä Tiempo total: {total_time:.1f}s ({total_time/60:.1f}min)")

In [None]:
# =============================================================================
# OUT-OF-FOLD (OOF) EVALUATION
# =============================================================================

print("\nüîç EVALUACI√ìN OUT-OF-FOLD (OOF)")
print("=" * 50)

# Hard predictions: the most probable column for every row
oof_pred = np.argmax(oof_predictions, axis=1)
oof_accuracy = accuracy_score(y_full, oof_pred)

# MAP@3: sort columns ascending, reverse, keep the three best per row
oof_ranked = np.argsort(oof_predictions, axis=1)
oof_top3 = oof_ranked[:, ::-1][:, :3]
oof_map3 = mapk(y_full.tolist(), oof_top3.tolist(), k=3)

print(f"üìä M√âTRICAS OOF (predicciones combinadas):")
print(f"  üéØ OOF Accuracy: {oof_accuracy:.4f}")
print(f"  üéØ OOF MAP@3:    {oof_map3:.4f}")

# Mean of the per-fold metrics vs. the pooled OOF metrics
accuracy_gap = abs(accuracy_mean-oof_accuracy)
map3_gap = abs(map3_mean-oof_map3)
print(f"\nüìä COMPARACI√ìN CV vs OOF:")
print(f"  Accuracy: CV={accuracy_mean:.4f} | OOF={oof_accuracy:.4f} | Diff={accuracy_gap:.4f}")
print(f"  MAP@3:    CV={map3_mean:.4f} | OOF={oof_map3:.4f} | Diff={map3_gap:.4f}")

# Per-class report as a dict keyed by class name
print(f"\nüìä M√âTRICAS POR CLASE (OOF):")
class_report = classification_report(
    y_full,
    oof_pred,
    target_names=fertilizer_encoder.classes_,
    output_dict=True,
)


def _report_row(label, stats):
    """Format one row of the per-class metrics table."""
    return (
        f"{label:15} {stats['precision']:8.3f} {stats['recall']:7.3f} "
        f"{stats['f1-score']:8.3f} {stats['support']:8.0f}"
    )


separator = "-" * 55
print("Clase            Precision  Recall  F1-Score  Support")
print(separator)
for fertilizer in fertilizer_encoder.classes_:
    print(_report_row(fertilizer, class_report[fertilizer]))

# Aggregate rows
print(separator)
print(_report_row('Macro avg', class_report['macro avg']))
print(_report_row('Weighted avg', class_report['weighted avg']))

In [None]:
# =============================================================================
# FEATURE IMPORTANCE ANALYSIS
# =============================================================================
# Averages the per-fold importances and reports how total importance splits
# between numeric, engineered and categorical features.

print(f"\nüîç AN√ÅLISIS DE IMPORTANCIA DE CARACTER√çSTICAS")
print("=" * 50)

# Mean and spread of each feature's importance across the folds
feature_importance_mean = np.mean(feature_importance_folds, axis=0)
feature_importance_std = np.std(feature_importance_folds, axis=0)

# One row per feature, most important first
importance_df = pd.DataFrame({
    'feature': features_to_use,
    'importance_mean': feature_importance_mean,
    'importance_std': feature_importance_std
}).sort_values('importance_mean', ascending=False)

print(f"\nüìä AN√ÅLISIS POR TIPO DE CARACTER√çSTICA:")
numeric_features = ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous']
engineered_features = [
    f for f in features_to_use
    if any(keyword in f for keyword in ['ratio', 'index', 'Balance', 'Total', 'inter'])
]
encoded_features = [f for f in features_to_use if '_encoded' in f]

# The categorical bucket must also include the native (un-encoded) categorical
# columns identified earlier (e.g. 'Soil Type', 'Crop Type'); counting only the
# '*_encoded' columns silently drops them from the breakdown and skews the
# percentages. Features already counted in another bucket are not added twice.
already_grouped = set(numeric_features) | set(engineered_features) | set(encoded_features)
native_categorical = [f for f in categorical_features if f not in already_grouped]
categorical_group = encoded_features + native_categorical


def _group_importance(feature_names):
    """Sum the mean importance of the given features."""
    in_group = importance_df['feature'].isin(feature_names)
    return importance_df.loc[in_group, 'importance_mean'].sum()


numeric_importance = _group_importance(numeric_features)
engineered_importance = _group_importance(engineered_features)
encoded_importance = _group_importance(encoded_features)  # encoded-only subtotal
categorical_importance = _group_importance(categorical_group)
total_importance = numeric_importance + engineered_importance + categorical_importance

# Skip the breakdown when nothing was matched, to avoid dividing by zero
if total_importance > 0:
    print(f"  üî¢ Caracter√≠sticas num√©ricas:    {numeric_importance:.3f} ({numeric_importance/total_importance*100:.1f}%)")
    print(f"  ‚öôÔ∏è Features engineered:         {engineered_importance:.3f} ({engineered_importance/total_importance*100:.1f}%)")
    print(f"  üè∑Ô∏è Caracter√≠sticas categ√≥ricas: {categorical_importance:.3f} ({categorical_importance/total_importance*100:.1f}%)")

## 🎯 7. Generación de Predicciones para Test

### Predicciones finales usando ensemble de 10 modelos

In [None]:
# =============================================================================
# TEST-SET PREDICTIONS
# =============================================================================

print("üéØ GENERANDO PREDICCIONES PARA CONJUNTO DE TEST")
print("=" * 60)

# Collect class probabilities from every fold model
n_models = len(trained_models)
print(f"üìä Ensemble de {n_models} modelos entrenados...")

test_features = X_test[features_to_use]
test_predictions_all = []
for model_number, fold_model in enumerate(trained_models, start=1):
    test_predictions_all.append(fold_model.predict_proba(test_features))
    if model_number <= 3:  # progress line for the first three models only
        print(f"  ‚úÖ Modelo {model_number}: Predicciones generadas")

if n_models > 3:
    print(f"  ‚úÖ ... y {n_models-3} modelos m√°s")

# Ensemble: element-wise mean of the per-model probabilities
test_predictions_ensemble = np.mean(test_predictions_all, axis=0)
print(f"üìä Shape de predicciones ensemble: {test_predictions_ensemble.shape}")

# Three most probable class indices per sample, best first
ranked_classes = np.argsort(test_predictions_ensemble, axis=1)
test_top3_indices = ranked_classes[:, ::-1][:, :3]
print(f"üìä Shape de top-3 √≠ndices: {test_top3_indices.shape}")

# Map every index to its fertilizer name through the encoder's classes_
test_top3_names = [
    [fertilizer_encoder.classes_[sample_top3[rank]] for rank in range(3)]
    for sample_top3 in test_top3_indices
]

print(f"‚úÖ Conversi√≥n a nombres completada")

## 🎉 8. Resumen Final y Conclusiones

### Resultados del modelo combinado con 10-fold CV

In [None]:
# =============================================================================
# OUTPUT LOCATION AND FILE NAMES
# =============================================================================

import os
import json
import joblib
from datetime import datetime

# Model name derived from the fold count and the OOF MAP@3 score. The fold
# count comes from N_SPLITS (previously hard-coded as "10") so the name always
# agrees with the directory built below. The trailing replace() strips the
# decimal point from the score, e.g. "0.33123" becomes "033123".
model_name = f"CAT_{N_SPLITS}CV_MAP@3-{oof_map3:.5f}".replace('.', '')
model_dir = f"../models/CAT/{N_SPLITS}CV/{model_name}"

# Create the directory (and any missing parents) if it does not exist yet
os.makedirs(model_dir, exist_ok=True)

print(f"üìÅ DIRECTORIO DEL MODELO:")
print(f"  {model_dir}")

# Artifact file names, all sharing the model name as prefix
base_filename = model_name
files_to_create = {
    'hparams': f"{base_filename}_hparams.json",
    'metrics': f"{base_filename}_metrics.json",
    'metrics_pkl': f"{base_filename}_metrics.pkl",
    'model_pkl': f"{base_filename}_model.pkl",
    'feature_import': f"{base_filename}_feature_importance.csv",
    'submission': f"{base_filename}_submission.csv",
    'submission_info': f"{base_filename}_submission_info.json"
}

print(f"\nüìù ARCHIVOS A CREAR:")
for file_type, filename in files_to_create.items():
    print(f"  {file_type:15}: {filename}")

In [None]:
# =============================================================================
# SAVE HYPERPARAMETERS
# =============================================================================

# Description of the run configuration; keyword order defines the JSON order.
hparams_data = dict(
    model_type="CatBoostClassifier",
    model_abbreviation="CAT",
    cv_strategy=f"{N_SPLITS}-Fold Stratified Cross Validation",
    ensemble_method="Average of fold predictions",
    hyperparameters=catboost_params,
    early_stopping_rounds=EARLY_STOPPING_ROUNDS,
    features_selected=features_to_use,
    num_features=len(features_to_use),
    categorical_features=categorical_features,
    categorical_feature_indices=categorical_feature_indices,
    num_categorical_features=len(categorical_feature_indices),
    class_weights_used=True,
    random_state=RANDOM_STATE,
    cv_splits=N_SPLITS,
    total_models=len(trained_models),
)

# Write the configuration as indented JSON inside the model directory
hparams_file = os.path.join(model_dir, files_to_create['hparams'])
with open(hparams_file, 'w') as hparams_handle:
    json.dump(hparams_data, hparams_handle, indent=2)

In [None]:
# =============================================================================
# SAVE METRICS
# =============================================================================

# The JSON payload is assembled from themed sections, merged in order below.
run_identity = {
    "model_type": "CatBoostClassifier",
    "model_abbreviation": "CAT",
    "tier": "10_FOLD_CV",
    "target_variable": "Fertilizer Name",
    "cv_strategy": f"{N_SPLITS}-Fold Stratified Cross Validation",
}

# Headline scores: mean/std over folds plus the pooled OOF values
score_summary = {
    "map3_score_cv_mean": float(map3_mean),
    "map3_score_cv_std": float(map3_std),
    "map3_score_oof": float(oof_map3),
    "accuracy_cv_mean": float(accuracy_mean),
    "accuracy_cv_std": float(accuracy_std),
    "accuracy_oof": float(oof_accuracy),
}

# Model and feature information
model_details = {
    "num_classes": len(fertilizer_encoder.classes_),
    "features_used": len(features_to_use),
    "features_list": features_to_use,
    "categorical_features": categorical_features,
    "categorical_feature_indices": categorical_feature_indices,
    "num_categorical_features": len(categorical_feature_indices),
    "cv_folds": N_SPLITS,
    "total_models_trained": len(trained_models),
}

# Stability coefficients (std / mean of each metric)
stability_stats = {
    "accuracy_cv_coefficient": float(accuracy_cv),
    "map3_cv_coefficient": float(map3_cv),
}

# Timings
timing_stats = {
    "training_time_total": float(total_time),
    "training_time_per_fold_avg": float(avg_fold_time),
}

# Metadata
run_metadata = {
    "timestamp": datetime.now().isoformat(),
    "kaggle_competition": "playground-series-s5e6",
}

metrics_data = {
    **run_identity,
    **score_summary,
    **model_details,
    "fold_results": fold_results,  # per-fold metric records
    **stability_stats,
    **timing_stats,
    **run_metadata,
}

# JSON copy of the metrics
metrics_file = os.path.join(model_dir, files_to_create['metrics'])
with open(metrics_file, 'w') as metrics_handle:
    json.dump(metrics_data, metrics_handle, indent=2)

# Pickled copy: the same metrics plus objects that JSON cannot hold
metrics_pkl_data = dict(metrics_data)
metrics_pkl_data["oof_predictions"] = oof_predictions
metrics_pkl_data["trained_models"] = trained_models
metrics_pkl_data["feature_importance_folds"] = feature_importance_folds
metrics_pkl_data["class_report"] = class_report
metrics_pkl_data["confusion_matrix"] = confusion_matrix(y_full, oof_pred)
metrics_pkl_data["fertilizer_encoder"] = fertilizer_encoder

metrics_pkl_file = os.path.join(model_dir, files_to_create['metrics_pkl'])
joblib.dump(metrics_pkl_data, metrics_pkl_file, compress=3)

In [None]:
# =============================================================================
# SAVE FEATURE IMPORTANCE
# =============================================================================

feature_importance_file = os.path.join(model_dir, files_to_create['feature_import'])
importance_df.to_csv(feature_importance_file, index=False)

# =============================================================================
# SAVE TRAINED MODELS
# =============================================================================

# Headline scores stored alongside the models
training_info = {
    "map3_cv_mean": float(map3_mean),
    "map3_oof": float(oof_map3),
    "timestamp": datetime.now().isoformat()
}

# Bundle the fold models with what is needed to reuse them
model_data = {
    "ensemble_models": trained_models,
    "model_type": "CatBoostClassifier",
    "cv_folds": N_SPLITS,
    "features_used": features_to_use,
    "categorical_features": categorical_features,
    "categorical_feature_indices": categorical_feature_indices,
    "hyperparameters": catboost_params,
    "label_encoder": fertilizer_encoder,
    "training_info": training_info
}

model_file = os.path.join(model_dir, files_to_create['model_pkl'])
joblib.dump(model_data, model_file, compress=3)

# =============================================================================
# SAVE SUBMISSION AND ITS METADATA
# =============================================================================

print(f"üìù Creando submission desde ensemble de {len(trained_models)} modelos...")

# One space-separated string of the top-3 fertilizer names per test row
submission_predictions = [' '.join(names) for names in test_top3_names]

# NOTE(review): the id column is taken from X_test's index; this assumes the
# index holds the competition row ids - confirm against the data loader.
submission = pd.DataFrame({
    'id': X_test.index,
    'Fertilizer Name': submission_predictions
})

submission_file = os.path.join(model_dir, files_to_create['submission'])
submission.to_csv(submission_file, index=False)

# Metadata describing the submission file
submission_info = {
    "model_type": "CatBoostClassifier",
    "model_abbreviation": "CAT",
    "cv_strategy": f"{N_SPLITS}-Fold Stratified Cross Validation",
    "map3_score_cv_mean": float(map3_mean),
    "map3_score_oof": float(oof_map3),
    "submission_file": files_to_create['submission'],
    "num_predictions": len(submission),
    "format": "MAP@3 - Top 3 fertilizer names separated by spaces",
    "target_variable": "Fertilizer Name",
    "ensemble_models": len(trained_models),
    "features_used": len(features_to_use),
    "total_training_time_minutes": float(total_time / 60),
    "timestamp": datetime.now().isoformat(),
    "kaggle_competition": "playground-series-s5e6"
}

submission_info_file = os.path.join(model_dir, files_to_create['submission_info'])
with open(submission_info_file, 'w') as info_handle:
    json.dump(submission_info, info_handle, indent=2)

In [None]:
# =============================================================================
# FINAL SUMMARY OF SAVED FILES
# =============================================================================
# Lists every expected artifact with its size on disk, then prints the headline
# metrics and whether the MAP@3 target was reached.

print(f"\nüíæ RESUMEN FINAL - ARCHIVOS GUARDADOS")
print("=" * 60)

print(f"üìÅ DIRECTORIO: {model_dir}")
print(f"\nüìÑ ARCHIVOS CREADOS:")

# Check each expected file and show a human-readable size
for file_type, filename in files_to_create.items():
    file_path = os.path.join(model_dir, filename)
    if os.path.exists(file_path):
        file_size = os.path.getsize(file_path)
        if file_size > 1024*1024:  # > 1MB
            size_str = f"{file_size/(1024*1024):.1f} MB"
        elif file_size > 1024:  # > 1KB
            size_str = f"{file_size/1024:.1f} KB"
        else:
            size_str = f"{file_size} bytes"

        print(f"  ‚úÖ {filename:35} ({size_str})")
    else:
        print(f"  ‚ùå {filename:35} (NO CREADO)")

print(f"\nüéØ M√âTRICAS PRINCIPALES:")
print(f"  üìä MAP@3 (CV Mean): {map3_mean:.5f} ¬± {map3_std:.5f}")
print(f"  üìä MAP@3 (OOF):     {oof_map3:.5f}")
print(f"  üìä Accuracy (OOF):  {oof_accuracy:.5f}")
print(f"  ü§ñ Modelos:         {len(trained_models)} (ensemble)")
print(f"  üìä Features:        {len(features_to_use)}")
print(f"  ‚è±Ô∏è Tiempo total:    {total_time/60:.1f} minutos")

print(f"\nüéâ TODOS LOS ARCHIVOS GUARDADOS EXITOSAMENTE")
print(f"üìÇ Ubicaci√≥n: {os.path.abspath(model_dir)}")

# Target score for the run, used for both the printed goal and the verdict
MAP3_TARGET = 0.32

# The status text is built outside the f-string: a backslash escape inside an
# f-string replacement field is a SyntaxError on Python versions before 3.12.
target_status = '\u2705 OBJETIVO ALCANZADO' if oof_map3 > MAP3_TARGET else 'üìà MEJORA NECESARIA'

print(f"\nüçÉ ENTRENAMIENTO CATBOOST COMPLETADO")
print(f"  ‚Ä¢ Algoritmo: CatBoost con 10-fold cross-validation")
print(f"  ‚Ä¢ MAP@3 objetivo: > {MAP3_TARGET}")
print(f"  ‚Ä¢ MAP@3 alcanzado: {oof_map3:.5f}")
print(f"  ‚Ä¢ Estado: {target_status}")
print(f"  ‚Ä¢ Archivos listos para ensemble")