# 🚀 Kaggle S5E6: XGBoost Simple con Optuna

**Pipeline simplificado de XGBoost con optimización Optuna para predicción de fertilizantes**

---

## 🎯 Objetivos

- **Implementación simple**: XGBoost sin validación cruzada
- **Optimización inteligente**: Uso de Optuna para hiperparámetros
- **Train/Validation split**: División simple 70/30
- **Foco en resultados**: Predicciones rápidas y efectivas

---

## 📊 Métrica: MAP@3

El objetivo es maximizar Mean Average Precision @ 3 para las predicciones de fertilizantes.

## 📚 1. Librerías

In [1]:
# Core libraries
import pandas as pd
import numpy as np
import warnings
import time
import gc
import os
from datetime import datetime

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Optuna for hyperparameter optimization
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner

# Global configuration: silence every warning and seed NumPy's global RNG
warnings.filterwarnings('ignore')
np.random.seed(513)

print("‚úÖ Librer√≠as importadas correctamente")

‚úÖ Librer√≠as importadas correctamente


## 📂 2. Datos

In [2]:
# Locations of the competition CSV files, relative to the notebook directory
train_path = "../data/train.csv"
test_path = "../data/test.csv"
sample_submission_path = "../data/sample_submission.csv"

# Read the three CSVs in one pass, in the order train / test / sample submission
train_df, test_df, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (train_path, test_path, sample_submission_path)
)

# Report the frame shapes and the number of distinct target labels
print(f"‚úÖ Datos: Train {train_df.shape} | Test {test_df.shape} | {train_df['Fertilizer Name'].nunique()} clases")

‚úÖ Datos: Train (750000, 10) | Test (250000, 9) | 7 clases


## ⚙️ 3. Feature Engineering

In [3]:
def create_features(df):
    """
    Return a copy of ``df`` extended with 16 engineered columns.

    The input frame is not modified. It must contain the columns
    'Nitrogen', 'Phosphorous', 'Potassium', 'Temparature', 'Humidity',
    'Moisture', 'Soil Type' and 'Crop Type'.

    Note: the ``*_Cat`` / ``*_Level`` columns are produced by ``pd.cut`` with
    ``bins=3`` on the frame passed in, so the bin edges are derived from that
    frame's own value range on every call.

    Args:
        df: DataFrame with the raw agricultural measurements

    Returns:
        New DataFrame holding the original columns followed by the engineered ones
    """
    eps = 0.001  # keeps the ratio denominators away from zero
    nutrients = ['Nitrogen', 'Phosphorous', 'Potassium']

    out = df.copy()
    nitrogen, phosphorous, potassium = (out[name] for name in nutrients)
    temperature = out['Temparature']
    humidity = out['Humidity']
    moisture = out['Moisture']

    # Pairwise nutrient ratios
    out['N_P_ratio'] = nitrogen / (phosphorous + eps)
    out['N_K_ratio'] = nitrogen / (potassium + eps)
    out['P_K_ratio'] = phosphorous / (potassium + eps)

    # Row-wise nutrient total and coefficient of variation (std / mean)
    npk = out[nutrients]
    out['Total_NPK'] = nitrogen + phosphorous + potassium
    out['NPK_Balance'] = npk.std(axis=1) / (npk.mean(axis=1) + eps)

    # Climate interaction terms
    out['Temp_Hum_index'] = temperature * humidity / 100
    out['Moist_Balance'] = moisture - humidity
    # Euclidean distance from the reference point (25, 65) in (temperature, humidity)
    out['Environ_Stress'] = np.sqrt((temperature - 25)**2 + (humidity - 65)**2)
    out['Temp_Moist_inter'] = temperature * moisture / 100

    # Name of the nutrient column holding the row maximum
    out['Dominant_NPK'] = npk.idxmax(axis=1)

    # Three equal-width bins per variable, labelled Low / Medium / High
    level_names = ['Low', 'Medium', 'High']
    binned_sources = {
        'Temp_Cat': temperature,
        'Hum_Cat': humidity,
        'N_Level': nitrogen,
        'K_Level': potassium,
        'P_Level': phosphorous,
    }
    for new_column, values in binned_sources.items():
        out[new_column] = pd.cut(values, bins=3, labels=level_names)

    # Soil/crop pair as a single "<soil>_<crop>" label
    soil_text = out['Soil Type'].astype(str)
    crop_text = out['Crop Type'].astype(str)
    out['Soil_Crop_Combo'] = soil_text + '_' + crop_text

    return out

# Pull the label column out of the training frame; the test frame has no label
target_column = 'Fertilizer Name'
y_raw = train_df[target_column]
X_raw = train_df.drop(columns=[target_column])
X_test_raw = test_df.copy()

# Run the same feature builder over the training and test predictors
X_train_featured, X_test_featured = (
    create_features(frame) for frame in (X_raw, X_test_raw)
)

print(f"‚úÖ Feature engineering: {X_raw.shape[1]} ‚Üí {X_train_featured.shape[1]} features (+{X_train_featured.shape[1] - X_raw.shape[1]})")

‚úÖ Feature engineering: 9 ‚Üí 25 features (+16)


## 🔢 4. Encoding

In [4]:
def encode_categorical_features(X_train, X_test, y_train):
    """
    Integer-encode every object/category column and the target with LabelEncoder.

    Each feature encoder is fitted on the string form of the train and test
    values concatenated, so any value present in either frame gets a code.
    The inputs are left untouched; encoded copies are returned.

    Args:
        X_train: Training features
        X_test: Test features
        y_train: Training target

    Returns:
        Tuple of (X_train_encoded, X_test_encoded, y_encoded, encoders_dict,
        target_encoder). ``encoders_dict`` maps each encoded column name to its
        fitted encoder and also stores the target encoder under the key
        'target' (the same object returned as the fifth element).
    """
    fitted = {}
    train_out, test_out = X_train.copy(), X_test.copy()

    # Columns to encode are chosen from the training frame's dtypes
    text_columns = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

    for name in text_columns:
        every_value = pd.concat([X_train[name], X_test[name]]).astype(str)
        column_encoder = LabelEncoder().fit(every_value)
        train_out[name] = column_encoder.transform(X_train[name].astype(str))
        test_out[name] = column_encoder.transform(X_test[name].astype(str))
        fitted[name] = column_encoder

    # The target is encoded from the training labels only
    target_encoder = LabelEncoder()
    y_encoded = target_encoder.fit_transform(y_train)
    fitted['target'] = target_encoder

    return train_out, test_out, y_encoded, fitted, target_encoder

# Apply encoding
# Encode the engineered train/test features and the raw target in one call.
# label_encoders holds one encoder per encoded column plus the 'target' entry.
X_train_encoded, X_test_encoded, y_encoded, label_encoders, target_encoder = encode_categorical_features(
    X_train_featured, X_test_featured, y_raw
)

# len(label_encoders)-1 excludes the 'target' entry from the feature count
print(f"‚úÖ Encoding: {len(label_encoders)-1} categorical features + target ({len(target_encoder.classes_)} classes)")

‚úÖ Encoding: 9 categorical features + target (7 classes)


## 🔄 5. Train/Val Split

In [5]:
# Single stratified train/validation split (70/30)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_encoded, y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded
)

# Balanced class weights computed on the training labels only.
# np.unique is called once: `classes` are the sorted distinct labels and
# `class_positions[i]` is the index into `classes` of y_train[i].
classes, class_positions = np.unique(y_train, return_inverse=True)
class_weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))

# Per-sample weight = weight of that sample's class. Indexing the weight
# array with the inverse positions yields the same values as looking each
# label up in class_weight_dict, without a Python-level loop over every row.
sample_weights = class_weights[class_positions]

print(f"‚úÖ Train/Val split: {X_train.shape} / {X_val.shape} | Balanced weights ready")

‚úÖ Train/Val split: (525000, 25) / (225000, 25) | Balanced weights ready


## 🎯 5.1. Feature Selection

In [6]:
# =============================================================================
# FEATURE SELECTION FOR THE MODEL
# =============================================================================
# Columns fed to the model. Commented-out entries are available in the encoded
# frames but currently disabled; uncomment a line to include that column.

features_to_use = [
    # ORIGINAL CLIMATE VARIABLES
    'Temparature',
    'Humidity', 
    'Moisture',
    
    # CHEMICAL VARIABLES (NPK)
    'Nitrogen',
    'Potassium', 
    'Phosphorous',
    
    # ENGINEERED FEATURES - NPK RATIOS (from create_features)
    # 'N_P_ratio',
    # 'N_K_ratio',
    # 'P_K_ratio',
    # 'Total_NPK',
    # 'NPK_Balance',
    
    # ENGINEERED FEATURES - CLIMATE INDICES (from create_features)
    # 'Temp_Hum_index',
    # 'Moist_Balance',
    # 'Environ_Stress',
    # 'Temp_Moist_inter',
    
    # ENGINEERED FEATURES - CATEGORICAL LEVELS (from create_features, encoded)
    # 'Temp_Cat',
    # 'Hum_Cat',
    # 'N_Level',
    # 'K_Level',
    # 'P_Level',

    # ENGINEERED FEATURES - COMBINATIONS (from create_features)
    'Soil_Crop_Combo', # Encoded during preprocessing
    # 'Dominant_NPK', # Encoded during preprocessing
    
    # ENCODED CATEGORICAL FEATURES (from preprocessing)
    # 'Soil Type',      # Encoded during preprocessing
    'Crop Type',      # Encoded during preprocessing
]

# Split the requested features into those present in the encoded training
# frame and those that are not, preserving the order of features_to_use
known_columns = set(X_train_encoded.columns)
available_features, missing_features = [], []
for feature in features_to_use:
    bucket = available_features if feature in known_columns else missing_features
    bucket.append(feature)

if missing_features:
    print(f"‚ö†Ô∏è Missing features: {missing_features}")

# Restrict train, validation and test frames to the same column subset
X_train, X_val, X_test_encoded = (
    frame[available_features].copy()
    for frame in (X_train, X_val, X_test_encoded)
)

print(f"üéØ Selected {len(available_features)} features: {available_features}")
print(f"‚úÖ Final shapes: Train {X_train.shape} | Val {X_val.shape} | Test {X_test_encoded.shape}")

üéØ Selected 8 features: ['Temparature', 'Humidity', 'Moisture', 'Nitrogen', 'Potassium', 'Phosphorous', 'Soil_Crop_Combo', 'Crop Type']
‚úÖ Final shapes: Train (525000, 8) | Val (225000, 8) | Test (250000, 8)


## 📊 6. MAP@3

In [7]:
def mapk(actual, predicted, k=3):
    """
    Mean average precision at k for single-label targets.

    Each sample scores 1/rank of the first position (1-based, within the
    first ``k`` entries of its prediction list) that equals the true label,
    or 0.0 if the label is not among them. The result is the mean over samples.

    Args:
        actual: Sequence of true labels, one per sample
        predicted: Sequence of ranked prediction lists, best guess first
        k: Number of leading predictions considered per sample

    Returns:
        Mean of the per-sample scores (via np.mean)
    """
    def reciprocal_rank(label, ranked):
        # zip stops at whichever runs out first: k positions or the predictions
        for position, guess in zip(range(1, k + 1), ranked):
            if guess == label:
                return 1.0 / position
        return 0.0

    per_sample = [reciprocal_rank(label, ranked) for label, ranked in zip(actual, predicted)]
    return np.mean(per_sample)

## ⚙️ 7. Optuna

In [8]:
def objective(trial):
    """
    Optuna objective: fit an XGBClassifier with trial-suggested
    hyperparameters and return its MAP@3 on the validation split.

    Reads the module-level X_train, y_train, sample_weights, X_val, y_val and
    target_encoder. The function does not call trial.report(), so no
    intermediate values are made available to a study pruner.

    Args:
        trial: Optuna trial used to sample the hyperparameters

    Returns:
        MAP@3 of the top-3 predicted classes on (X_val, y_val), as computed by mapk
    """
    # Fixed settings followed by the hyperparameters suggested by the trial.
    # The suggest_* calls run in dictionary order.
    params = {
        'objective': 'multi:softprob',
        'num_class': len(target_encoder.classes_),
        'eval_metric': 'mlogloss',
        'random_state': 42,
        # 'n_jobs': -1,
        'verbosity': 0,
        'device': 'cpu',  # Switch to 'gpu' if a GPU is available
        'tree_method': 'hist',  # Efficient on CPU; switch to 'gpu_hist' when using a GPU
        
        # Hyperparameters to optimize
        'max_depth': trial.suggest_int('max_depth', 5, 12, step=1),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 4, step=1),
        # 'gamma': trial.suggest_float('gamma', 0.0, 0.5),
        'learning_rate': trial.suggest_float('learning_rate', 0.1, 0.5, step=0.1),
        'n_estimators': trial.suggest_int('n_estimators', 1000, 5000, step=25),
        'subsample': trial.suggest_float('subsample', 0.6, 0.8, step=0.025),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.4, 0.6, step=0.025),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.8, 1.0, step=0.05),
        'colsample_bynode': trial.suggest_float('colsample_bynode', 0.8, 1.0, step=0.05),
        'alpha': trial.suggest_float('alpha', 0.6, 1.0, step=0.025),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.3, 5.0, step=0.25),
    }
    
    # Train the model.
    # NOTE(review): the original comment mentioned an EarlyStopping callback,
    # but neither params nor fit() configures early stopping here; eval_set is
    # passed only as an evaluation set. Confirm whether early stopping was intended.
    model = XGBClassifier(**params)
    
    model.fit(
        X_train, y_train,
        sample_weight=sample_weights,
        eval_set=[(X_val, y_val)],
        verbose=False
    )
    
    # Class probabilities on the validation split
    y_pred_proba = model.predict_proba(X_val)
    
    # argsort is ascending, so take the last three columns and reverse them
    # to get the three most probable class indices, most probable first
    top3_indices = np.argsort(y_pred_proba, axis=1)[:, -3:][:, ::-1]
    
    # Compute MAP@3
    map3_score = mapk(y_val.tolist(), top3_indices.tolist(), k=3)
    
    return map3_score

print("‚öôÔ∏è Funci√≥n objetivo para Optuna definida")

‚öôÔ∏è Funci√≥n objetivo para Optuna definida


## 🚀 8. Optimización

In [9]:
# Optimization settings
N_TRIALS = 25
N_STARTUP_TRIALS = 5  # Trials completed before the pruner may act
N_WARMUP_STEPS = 2  # Warm-up steps for the pruner
TIMEOUT = 3600 * 3  # 3 hours

# Create the models directory and point Optuna at a SQLite database inside it
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)
DB_PATH = os.path.join(models_dir, "xgb_optuna.db")
STUDY_NAME = "xgboost_fertilizer"  # Fixed name so trials accumulate across runs

# Create the study, or reopen it if it already exists in the database.
# NOTE(review): objective() in this file never calls trial.report(), so the
# MedianPruner configured here receives no intermediate values to act on.
study = optuna.create_study(
    study_name=STUDY_NAME,
    storage=f"sqlite:///{DB_PATH}",
    direction='maximize',
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=N_STARTUP_TRIALS, n_warmup_steps=N_WARMUP_STEPS),
    load_if_exists=True
)

# Because the study may have been reloaded, remember how many trials it
# already holds so the timing line reports only the trials run in this session.
trials_before = len(study.trials)

# Run the optimization
start_time = time.time()
study.optimize(objective, n_trials=N_TRIALS, timeout=TIMEOUT, show_progress_bar=True)
optimization_time = time.time() - start_time
trials_this_run = len(study.trials) - trials_before

# The elapsed time belongs to this session's trials; the cumulative count and
# best value cover the whole stored study, including earlier sessions.
print(f"üèÜ Optimizaci√≥n completada: {trials_this_run} trials ({len(study.trials)} total) en {optimization_time/60:.1f}min")
print(f"üìä Mejor MAP@3: {study.best_value:.6f}")

[I 2025-06-18 12:37:44,661] Using an existing study with name 'xgboost_fertilizer' instead of creating a new one.


  0%|          | 0/25 [00:00<?, ?it/s]

[I 2025-06-18 13:00:11,795] Trial 189 finished with value: 0.3390525925925927 and parameters: {'max_depth': 9, 'min_child_weight': 2, 'learning_rate': 0.1, 'n_estimators': 1075, 'subsample': 0.675, 'colsample_bytree': 0.47500000000000003, 'colsample_bylevel': 1.0, 'colsample_bynode': 0.9, 'alpha': 0.725, 'reg_lambda': 4.55}. Best is trial 136 with value: 0.3410785185185184.
[I 2025-06-18 13:19:37,621] Trial 190 finished with value: 0.32727999999999985 and parameters: {'max_depth': 9, 'min_child_weight': 2, 'learning_rate': 0.30000000000000004, 'n_estimators': 1075, 'subsample': 0.7, 'colsample_bytree': 0.47500000000000003, 'colsample_bylevel': 1.0, 'colsample_bynode': 0.9, 'alpha': 0.725, 'reg_lambda': 4.8}. Best is trial 136 with value: 0.3410785185185184.
[I 2025-06-18 13:37:15,837] Trial 191 finished with value: 0.3233592592592593 and parameters: {'max_depth': 9, 'min_child_weight': 2, 'learning_rate': 0.4, 'n_estimators': 1075, 'subsample': 0.675, 'colsample_bytree': 0.475000000000

## 🏋️ 9. Modelo Final

In [10]:
# Fixed settings first; the study's best hyperparameters are merged last so
# they take precedence on any shared key
base_settings = dict(
    objective='multi:softprob',
    num_class=len(target_encoder.classes_),
    eval_metric='mlogloss',
    random_state=42,
    n_jobs=-1,
    verbosity=0,
)
final_params = {**base_settings, **study.best_params}

# Fit on the training split with the class-balanced sample weights; the
# validation split is supplied as the evaluation set
final_model = XGBClassifier(**final_params)
final_model.fit(
    X_train,
    y_train,
    sample_weight=sample_weights,
    eval_set=[(X_val, y_val)],
    verbose=False,
)

## 📊 10. Métricas

In [11]:
# Score the final model on the validation split
y_val_pred_proba = final_model.predict_proba(X_val)

# Walk the ascending argsort backwards from the last column and keep three
# entries: the three most probable class indices, most probable first
val_top3_indices = np.argsort(y_val_pred_proba, axis=1)[:, :-4:-1]

val_map3 = mapk(y_val.tolist(), val_top3_indices.tolist(), k=3)
val_predicted_labels = final_model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_predicted_labels)

print(f"üìä M√âTRICAS DE VALIDACI√ìN:")
print(f"  ‚Ä¢ MAP@3: {val_map3:.6f}")
print(f"  ‚Ä¢ Accuracy: {val_accuracy:.4f}")

# Feature importances, sorted from most to least important
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': final_model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Print the five most important features with a 1-based rank
print(f"\nüîç TOP 5 FEATURES:")
for rank, (_, row) in enumerate(feature_importance.head(5).iterrows(), start=1):
    print(f"  {rank}. {row['feature']}: {row['importance']:.4f}")

üìä M√âTRICAS DE VALIDACI√ìN:
  ‚Ä¢ MAP@3: 0.340123
  ‚Ä¢ Accuracy: 0.2042

üîç TOP 5 FEATURES:
  1. Phosphorous: 0.1426
  2. Moisture: 0.1357
  3. Nitrogen: 0.1346
  4. Soil_Crop_Combo: 0.1245
  5. Potassium: 0.1217


## 🔮 11. Predicciones

In [12]:
# Predict class probabilities for the test set and take the three most
# probable class indices per row, most probable first
test_pred_proba = final_model.predict_proba(X_test_encoded)
test_top3_indices = np.argsort(test_pred_proba, axis=1)[:, -3:][:, ::-1]

# Convert class indices to fertilizer names for the whole matrix at once.
# LabelEncoder.classes_[i] is the original label for code i, so indexing it
# with the (n_rows, 3) index matrix gives the same names as calling
# inverse_transform row by row, without one call per test row.
top3_names = target_encoder.classes_[test_top3_indices]
test_predictions = [' '.join(row_names) for row_names in top3_names]

# Build the submission frame
submission = pd.DataFrame({
    'id': sample_submission['id'].copy(),  # Use exact same IDs as sample_submission
    'Fertilizer Name': test_predictions
})

print(f"üîÆ Predicciones generadas: {len(submission)} muestras")
print(f"üìã Ejemplo: {submission.iloc[0, 1]}")

üîÆ Predicciones generadas: 250000 muestras
üìã Ejemplo: DAP 28-28 Urea


## 💾 12. Resultados

In [13]:
import os
import json
from datetime import datetime

# Name the output directory after the validation MAP@3. Removing the '.' from
# the formatted score turns e.g. "0.34012" into "034012".
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
model_name = f"XGB_Optuna_MAP3-{val_map3:.5f}".replace('.', '')
results_dir = f"../models/XGB/{model_name}"
os.makedirs(results_dir, exist_ok=True)

print(f"üíæ Guardando resultados en: {results_dir}")

# Output file locations
submission_file = os.path.join(results_dir, f"{model_name}_submission.csv")
metrics_file = os.path.join(results_dir, f"{model_name}_metrics.json")

# Write the submission CSV
submission.to_csv(submission_file, index=False)

# Collect the headline metrics. n_trials is the number of trials stored in
# the study object at this point.
elapsed_minutes = optimization_time / 60
total_trials = len(study.trials)
metrics = dict(
    validation_map3=float(val_map3),
    validation_accuracy=float(val_accuracy),
    optuna_best_map3=float(study.best_value),
    optimization_time_min=float(elapsed_minutes),
    n_trials=total_trials,
    best_params=study.best_params,
)

with open(metrics_file, 'w') as handle:
    json.dump(metrics, handle, indent=2)

# Final summary
print(f"üíæ Resultados guardados en: {results_dir}")
print(f"\nüéØ RESUMEN FINAL:")
print(f"  üìà MAP@3: {val_map3:.6f}")
print(f"  üìà Accuracy: {val_accuracy:.4f}")
print(f"  ‚è∞ Tiempo: {elapsed_minutes:.1f}min | {total_trials} trials")
print(f"  üìÑ Submission: {model_name}_submission.csv")

üíæ Guardando resultados en: ../models/XGB/XGB_Optuna_MAP3-034012
üíæ Resultados guardados en: ../models/XGB/XGB_Optuna_MAP3-034012

üéØ RESUMEN FINAL:
  üìà MAP@3: 0.340123
  üìà Accuracy: 0.2042
  ‚è∞ Tiempo: 184.1min | 199 trials
  üìÑ Submission: XGB_Optuna_MAP3-034012_submission.csv
