# In [1]:  (notebook cell marker left over from the Jupyter export)
"""
Kaggle Submission Script - Mitsui Commodity Prediction Challenge
"""

import os
import pandas as pd
import polars as pl
import xgboost as xgb
import kaggle_evaluation.mitsui_inference_server

# Number of target columns; one model file per target is loaded below.
NUM_TARGET_COLUMNS = 424

# ============================================================================
# LOAD MODELS AND AUXILIARY DATA (runs once, when the module is executed)
# ============================================================================

print("üîÑ Cargando modelos y datos auxiliares...")

# Load pairs_df (needed for feature engineering).

pairs_df = pd.read_csv("/kaggle/input/mitsui-commodity-prediction-challenge/target_pairs.csv")


# Load the trained models (one per target) from your own dataset.
# IMPORTANT: change this path to the path of your dataset on Kaggle.
MODELS_PATH = "/kaggle/input/models1-xgbmutsui/models"  # adjust this path

# Maps 'target_<i>' -> XGBRegressor loaded from '<MODELS_PATH>/target_<i>.json'.
trained_models = {}
for i in range(NUM_TARGET_COLUMNS):
    target_name = f'target_{i}'
    model = xgb.XGBRegressor()
    model.load_model(f'{MODELS_PATH}/{target_name}.json')
    trained_models[target_name] = model

print(f"‚úÖ {len(trained_models)} modelos cargados exitosamente")

# ============================================================================
# FEATURE ENGINEERING FUNCTION (must be identical to the one used in training)
# ============================================================================

def create_targeted_features(df, pairs_df):
    """Append one engineered feature per entry of ``pairs_df['pair']``.

    Each entry is handled according to its form:

    * ``"A - B"`` (two instruments): adds ``ratio_A_vs_B``, computed as
      ``df[A] / (df[B] + 1e-6)``. The small constant guards against a
      division by zero when ``df[B]`` is 0.
    * ``"A"`` (single instrument): adds ``ma_10_A``, the rolling mean of
      ``df[A]`` over a 10-row window. Because ``min_periods=1`` is used, the
      mean is taken over however many rows are available, so on a one-row
      frame the feature equals the raw value.

    Entries that are not strings (e.g. NaN from a blank CSV row) and entries
    whose instrument columns are missing from ``df`` are skipped.

    This function must be applied to both training and inference data so the
    feature definitions stay consistent.

    Args:
        df: pandas DataFrame holding the instrument columns.
        pairs_df: pandas DataFrame with a ``pair`` column describing each
            target's instrument or instrument pair.

    Returns:
        A new DataFrame with the columns of ``df`` followed by the engineered
        features. ``df`` itself is not modified.
    """
    # Collect the new columns in a dict and attach them with a single concat;
    # inserting them into ``df`` one by one would fragment the frame.
    new_features = {}

    # Iterate the column directly: only the ``pair`` value is needed, and
    # ``iterrows()`` would build a full Series for every row.
    for pair_string in pairs_df['pair']:
        if not isinstance(pair_string, str):
            continue

        if ' - ' in pair_string:  # a pair of instruments
            instrument_1, instrument_2 = pair_string.split(' - ')
            if instrument_1 in df.columns and instrument_2 in df.columns:
                new_features[f'ratio_{instrument_1}_vs_{instrument_2}'] = (
                    df[instrument_1] / (df[instrument_2] + 1e-6)
                )
        elif pair_string in df.columns:  # a single instrument
            new_features[f'ma_10_{pair_string}'] = (
                df[pair_string].rolling(window=10, min_periods=1).mean()
            )

    # Pin the index so the result keeps df's rows even when no feature was built.
    new_features_df = pd.DataFrame(new_features, index=df.index)

    return pd.concat([df, new_features_df], axis=1)

# ============================================================================
# PREDICTION FUNCTION (required by Kaggle)
# ============================================================================

def predict(
    test: pl.DataFrame,
    label_lags_1_batch: pl.DataFrame,
    label_lags_2_batch: pl.DataFrame,
    label_lags_3_batch: pl.DataFrame,
    label_lags_4_batch: pl.DataFrame,
) -> pl.DataFrame:
    """Predict every target for one test batch.

    This is the callback handed to the Kaggle inference server. Per the
    original author's note, it must return within 5 minutes (except on the
    first call).

    Steps:
        1. Convert ``test`` to pandas and coerce every column except
           ``date_id`` to numeric (unparseable values become NaN).
        2. Add the engineered features via ``create_targeted_features``.
        3. Left-merge each lag batch on ``date_id``, renaming ``target_<j>``
           to ``target_<j>_lag_<k>`` for lag batch ``k``.
        4. For each of the ``NUM_TARGET_COLUMNS`` models, build the input
           from the columns the model reports in ``feature_names_in_``
           (columns absent from the frame are filled with 0) and keep the
           prediction for the first row.

    Args:
        test: Feature batch; must contain a ``date_id`` column.
        label_lags_1_batch: Lagged labels for lag 1; must contain ``date_id``.
            Only ``date_id`` and columns starting with ``target_`` are used.
        label_lags_2_batch: Same, for lag 2.
        label_lags_3_batch: Same, for lag 3.
        label_lags_4_batch: Same, for lag 4.

    Returns:
        A one-row polars DataFrame with one ``target_<i>`` column per model.
    """
    test_df = test.to_pandas()

    # Make sure every feature is numeric; date_id is the merge key and is left as-is.
    for col in test_df.columns:
        if col != 'date_id':
            test_df[col] = pd.to_numeric(test_df[col], errors='coerce')

    # 1. Engineered features.
    # NOTE(review): on a one-row batch the ma_10_* features equal the raw
    # value; confirm this matches how the models were trained.
    test_df_featured = create_targeted_features(test_df, pairs_df)

    # 2. Lagged targets. The batches are listed explicitly rather than looked
    # up by name through locals().
    lag_batches = (
        label_lags_1_batch,
        label_lags_2_batch,
        label_lags_3_batch,
        label_lags_4_batch,
    )
    for lag, lag_batch in enumerate(lag_batches, start=1):
        lag_df = lag_batch.to_pandas()
        lag_df = lag_df.rename(
            columns={f'target_{j}': f'target_{j}_lag_{lag}' for j in range(NUM_TARGET_COLUMNS)}
        )
        cols_to_merge = [
            col for col in lag_df.columns if col.startswith('target_') or col == 'date_id'
        ]
        test_df_featured = pd.merge(
            test_df_featured, lag_df[cols_to_merge], on='date_id', how='left'
        )

    # 3. One prediction per target.
    predictions = {}
    for i in range(NUM_TARGET_COLUMNS):
        current_target = f'target_{i}'
        model = trained_models[current_target]

        # reindex selects the model's columns in order and creates any missing
        # one filled with 0, without copying the whole frame for every model.
        X_test = test_df_featured.reindex(columns=model.feature_names_in_, fill_value=0)

        # Only the first row's prediction is kept.
        predictions[current_target] = model.predict(X_test)[0]

    # 4. Return the predictions as a polars DataFrame.
    predictions_df = pl.DataFrame(predictions)

    assert isinstance(predictions_df, (pd.DataFrame, pl.DataFrame))
    assert len(predictions_df) == 1
    return predictions_df

# ============================================================================
# START THE INFERENCE SERVER
# ============================================================================

# Wrap the predict callback in the competition's inference server.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    # Variable unset or empty: run the local gateway against the competition data.
    inference_server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge/',))
else:
    # Variable set: serve requests.
    inference_server.serve()

# Notebook cell output:
#   🔄 Cargando modelos y datos auxiliares...
#   ✅ 424 modelos cargados exitosamente
