In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input tree and print the full path of every file found,
# one path per line, in the order os.walk yields them.
input_root = '/kaggle/input'
for folder, _subdirs, names in os.walk(input_root):
    paths = [os.path.join(folder, name) for name in names]
    if paths:
        print('\n'.join(paths))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mitsui-commodity-prediction-challenge/target_pairs.csv
/kaggle/input/mitsui-commodity-prediction-challenge/train_labels.csv
/kaggle/input/mitsui-commodity-prediction-challenge/train.csv
/kaggle/input/mitsui-commodity-prediction-challenge/test.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_1.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_4.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_3.csv
/kaggle/input/mitsui-commodity-prediction-challenge/lagged_test_labels/test_labels_lag_2.csv
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/mitsui_inference_server.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/mitsui_gateway.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/__init__.py
/kaggle/input/mitsui-commodity-prediction-challenge/kaggle_evaluation/core/templates.py
/kaggle/inpu

In [2]:
import os
import pandas as pd
import polars as pl
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
import warnings
warnings.filterwarnings('ignore')

import kaggle_evaluation.mitsui_inference_server

# Number of `target_<i>` columns every prediction row must contain;
# `predict` asserts that its result has exactly this many columns.
NUM_TARGET_COLUMNS = 424

# Simple global model storage: stays None until the first `predict` call
# triggers `load_and_train_simple_model`, which assigns the predictor here.
SIMPLE_MODEL = None

class SimpleReliablePredictor:
    """Ridge-based predictor with per-target mean and lag-based fallbacks.

    ``fit`` trains one Ridge model per target for (at most) the first 50
    target columns and records the training mean of every target column.
    ``predict_single`` uses the Ridge model when one exists for a target and
    otherwise falls back to the stored mean, then to a heuristic derived from
    the most recent lag-1 labels, then to small seeded noise.
    """

    def __init__(self):
        self.models = {}           # target name -> fitted Ridge model
        self.scaler = None         # StandardScaler fitted in fit()
        self.feature_columns = []  # ordered feature names the scaler/models expect
        self.feature_means = {}    # feature name -> training mean (fills columns absent at predict time)
        self.target_means = {}     # target name -> training mean (fallback prediction)
        self.is_fitted = False
        # Private seeded generator: keeps the fallback noise reproducible and
        # leaves the process-wide numpy RNG untouched.
        self._rng = np.random.RandomState(42)

    @staticmethod
    def _to_pandas(frame):
        """Return ``frame`` as a pandas DataFrame.

        pandas frames (and ``None``) pass through unchanged; any other object
        exposing ``to_pandas()`` (e.g. a polars DataFrame) is converted.
        """
        if frame is None or isinstance(frame, pd.DataFrame):
            return frame
        if hasattr(frame, "to_pandas"):
            return frame.to_pandas()
        return frame

    def create_simple_features(self, df, lags_1=None):
        """Build the feature frame used for both training and inference.

        Args:
            df: pandas DataFrame, or any frame exposing ``to_pandas()``.
            lags_1: optional lag-1 label frame (pandas or ``to_pandas()``-capable).
                Only its last row is used.

        Returns:
            A pandas DataFrame holding every input column plus, when the
            matching source columns exist, ``LME_mean``, ``US_mean``,
            ``FX_mean`` and (only when ``lags_1`` has rows and ``target_*``
            columns) ``lag1_target_mean`` / ``lag1_target_std``. NaN and
            +/-inf values are replaced by 0.0.
        """
        df = self._to_pandas(df)
        features = df.copy()

        # Candidate base columns: float64/int64 columns other than date_id.
        numeric_cols = [
            col for col in df.columns
            if col != 'date_id' and df[col].dtype in (np.float64, np.int64)
        ]

        # 1. Keep the first 20 columns (in column order) from the key families.
        key_features = [
            col for col in numeric_cols
            if any(tag in col for tag in ('LME_', 'US_Stock_', 'FX_'))
        ][:20]

        # 2. Row-wise means per family (at most five columns for US/FX).
        lme_cols = [col for col in key_features if col.startswith('LME_')]
        if lme_cols:
            features['LME_mean'] = features[lme_cols].mean(axis=1)

        us_cols = [col for col in key_features if col.startswith('US_Stock_')]
        if us_cols:
            features['US_mean'] = features[us_cols[:5]].mean(axis=1)

        fx_cols = [col for col in key_features if col.startswith('FX_')]
        if fx_cols:
            features['FX_mean'] = features[fx_cols[:5]].mean(axis=1)

        # 3. Lag-1 summary. Convert first and test emptiness with len() so
        #    that pandas lag frames work as well as polars ones.
        lag_data = self._to_pandas(lags_1)
        if lag_data is not None and len(lag_data) > 0:
            latest = lag_data.iloc[-1]
            target_cols = [
                col for col in latest.index
                if isinstance(col, str) and col.startswith('target_')
            ]
            if target_cols:
                # to_numeric(coerce) turns None / non-numeric entries into NaN
                # instead of raising the way np.isnan does on object values.
                recent = pd.to_numeric(latest[target_cols[:10]], errors='coerce').dropna()
                recent_values = recent.to_numpy(dtype=float)
                if recent_values.size:
                    features['lag1_target_mean'] = float(np.mean(recent_values))
                    features['lag1_target_std'] = (
                        float(np.std(recent_values)) if recent_values.size > 1 else 0.0
                    )
                else:
                    features['lag1_target_mean'] = 0.0
                    features['lag1_target_std'] = 0.0

        # 4. Clean up: NaN first, then infinities.
        features = features.fillna(0.0)
        features = features.replace([np.inf, -np.inf], 0.0)

        return features

    def fit(self, train_data, train_labels):
        """Fit the scaler, the per-target Ridge models and the fallback means.

        Args:
            train_data: feature frame containing a ``date_id`` column.
            train_labels: label frame containing ``date_id`` and ``target_*`` columns.

        Returns:
            ``self``.
        """
        print("Training simple reliable predictor...")

        train_data = self._to_pandas(train_data)
        train_labels = self._to_pandas(train_labels)

        # Start from a clean slate so a refit never keeps stale models or means.
        self.models = {}
        self.feature_means = {}
        self.target_means = {}

        # Take a sample for faster training
        sample_size = min(1000, len(train_data))
        train_sample = train_data.sample(n=sample_size, random_state=42)
        labels_sample = train_labels[train_labels['date_id'].isin(train_sample['date_id'])]

        train_features = self.create_simple_features(train_sample)

        # Only numeric columns can be scaled; anything else is left out.
        feature_cols = [
            col for col in train_features.columns
            if col != 'date_id' and pd.api.types.is_numeric_dtype(train_features[col])
        ]
        target_cols = [col for col in labels_sample.columns if col.startswith('target_')]

        # Training means, used for feature columns that are absent at predict time.
        self.feature_means = train_features[feature_cols].mean().to_dict()

        train_full = train_features.merge(labels_sample, on='date_id', how='left')

        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(train_full[feature_cols])

        # Fallback prediction per target; an all-NaN column falls back to 0.0
        # so that NaN never reaches the predictions.
        for target in target_cols:
            mean_value = train_full[target].mean()
            self.target_means[target] = 0.0 if pd.isna(mean_value) else float(mean_value)

        # Train models for a subset of targets
        max_models = 50  # Very conservative for speed
        print(f"Training {max_models} models...")

        for i, target in enumerate(target_cols[:max_models]):
            y = train_full[target]
            mask = y.notna().to_numpy()

            # Require at least 30 labelled rows before fitting a model.
            if mask.sum() >= 30:
                model = Ridge(alpha=1.0, random_state=42)
                model.fit(X_scaled[mask], y.to_numpy()[mask])
                self.models[target] = model

                if (i + 1) % 10 == 0:
                    print(f"Trained {i + 1} models...")

        self.feature_columns = feature_cols
        self.is_fitted = True

        print(f"Training completed: {len(self.models)} models, {len(feature_cols)} features")
        return self

    def predict_single(self, test_data, lags_1=None, lags_2=None, lags_3=None, lags_4=None):
        """Predict every ``target_<i>`` for the first row of ``test_data``.

        Args:
            test_data: feature frame for the row to predict.
            lags_1: optional lag-1 label frame, forwarded to feature creation.
            lags_2, lags_3, lags_4: accepted for interface compatibility; unused.

        Returns:
            dict mapping ``target_0`` .. ``target_{NUM_TARGET_COLUMNS - 1}`` to
            a float clipped to [-0.1, 0.1]; non-finite values become 0.0.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if not self.is_fitted:
            raise RuntimeError("SimpleReliablePredictor.predict_single called before fit()")

        features = self.create_simple_features(test_data, lags_1)

        # Align to the training columns in one step; columns missing from the
        # batch are filled with their training mean (0.0 if none is known).
        model_input = (
            features.reindex(columns=self.feature_columns)
            .fillna(value=self.feature_means)
            .fillna(0.0)
        )
        X_scaled = self.scaler.transform(model_input)

        lag_mean = (
            features['lag1_target_mean'].iloc[0]
            if 'lag1_target_mean' in features.columns else None
        )

        predictions = {}
        for i in range(NUM_TARGET_COLUMNS):
            target = f'target_{i}'
            model = self.models.get(target)

            if model is not None:
                # Use trained model (first row only)
                pred = model.predict(X_scaled)[0]
            elif target in self.target_means:
                pred = self.target_means[target]
            elif lag_mean is not None:
                # Slight momentum + noise
                pred = lag_mean * 0.8 + self._rng.normal(0, 0.001)
            else:
                # Small random value
                pred = self._rng.normal(0, 0.001)

            if not np.isfinite(pred):
                pred = 0.0

            # Clip extreme values
            predictions[target] = float(np.clip(pred, -0.1, 0.1))

        return predictions

def load_and_train_simple_model(data_dir='/kaggle/input/mitsui-commodity-prediction-challenge'):
    """Train the global predictor from the CSV files in ``data_dir``.

    Reads ``train.csv`` and ``train_labels.csv`` from ``data_dir``, fits a
    ``SimpleReliablePredictor`` and stores it in the module-level
    ``SIMPLE_MODEL``. If loading or training raises, the error and its
    traceback are printed and ``SIMPLE_MODEL`` is set to a ``DummyPredictor``
    instead, so that inference can still proceed.

    Args:
        data_dir: directory holding the two CSV files. Defaults to the
            competition input directory.

    Returns:
        The predictor that was stored in ``SIMPLE_MODEL``.
    """
    global SIMPLE_MODEL
    import traceback  # only needed on the failure path

    print("Loading training data for simple model...")

    try:
        # Load data
        train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
        train_labels = pd.read_csv(os.path.join(data_dir, 'train_labels.csv'))

        print(f"Loaded: train {train.shape}, labels {train_labels.shape}")

        # Train model; publish it only once fitting has succeeded.
        model = SimpleReliablePredictor()
        model.fit(train, train_labels)
        SIMPLE_MODEL = model

        print("Simple model training completed!")

    except Exception as e:
        print(f"Error loading model: {e}")
        # Show where the failure happened: the fallback below silently
        # degrades every later prediction, so the cause must be visible.
        traceback.print_exc()
        SIMPLE_MODEL = DummyPredictor()

    return SIMPLE_MODEL

class DummyPredictor:
    """Fallback predictor that returns small, fixed pseudo-random values.

    Every call returns the same values because the generator is re-created
    from seed 42 each time.
    """

    def __init__(self):
        self.is_fitted = True

    def predict_single(self, test_data, lags_1=None, lags_2=None, lags_3=None, lags_4=None):
        """Return one small pseudo-random value per target.

        All arguments are accepted for interface compatibility and ignored.

        Returns:
            dict mapping ``target_0`` .. ``target_{NUM_TARGET_COLUMNS - 1}`` to
            draws from N(0, 0.001).
        """
        # A local RandomState(42) produces the same stream as
        # np.random.seed(42) followed by np.random.normal, but leaves the
        # process-wide RNG state untouched.
        rng = np.random.RandomState(42)
        return {
            f'target_{i}': rng.normal(0, 0.001)
            for i in range(NUM_TARGET_COLUMNS)
        }

def predict(
    test: pl.DataFrame,
    label_lags_1_batch: pl.DataFrame,
    label_lags_2_batch: pl.DataFrame,
    label_lags_3_batch: pl.DataFrame,
    label_lags_4_batch: pl.DataFrame,
) -> pl.DataFrame:
    """Return a one-row frame of predictions for the current batch.

    On the first call the global model is trained via
    ``load_and_train_simple_model``. Any exception raised while training or
    predicting is printed and replaced by fixed small pseudo-random values
    (seed 42), so the function always returns a frame.

    Args:
        test: feature frame for the batch to predict.
        label_lags_1_batch .. label_lags_4_batch: lagged label frames,
            forwarded unchanged to the model's ``predict_single``.

    Returns:
        A polars DataFrame with exactly one row and the columns
        ``target_0`` .. ``target_{NUM_TARGET_COLUMNS - 1}``.
    """

    global SIMPLE_MODEL

    target_names = [f'target_{i}' for i in range(NUM_TARGET_COLUMNS)]

    try:
        # Load model on first call
        if SIMPLE_MODEL is None:
            load_and_train_simple_model()

        # Make predictions
        predictions = SIMPLE_MODEL.predict_single(
            test,
            label_lags_1_batch,
            label_lags_2_batch,
            label_lags_3_batch,
            label_lags_4_batch
        )

        # Convert to polars DataFrame
        result = pl.DataFrame({name: [predictions[name]] for name in target_names})

    except Exception as e:
        print(f"Error in prediction: {e}")
        # Fallback to fixed pseudo-random predictions. A local RandomState(42)
        # gives the same values as np.random.seed(42) + np.random.normal
        # without resetting the process-wide RNG on every failure.
        rng = np.random.RandomState(42)
        result = pl.DataFrame({name: [rng.normal(0, 0.001)] for name in target_names})

    assert isinstance(result, pl.DataFrame)
    assert len(result) == 1
    assert len(result.columns) == NUM_TARGET_COLUMNS

    return result

# Wrap `predict` in the competition's inference server.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

# The KAGGLE_IS_COMPETITION_RERUN environment variable selects the mode:
# when set (non-empty) the server is started, otherwise the local gateway
# is run against the competition input directory.
is_competition_rerun = bool(os.getenv('KAGGLE_IS_COMPETITION_RERUN'))

if not is_competition_rerun:
    print("Running local gateway for testing...")
    inference_server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge/',))
else:
    print("Starting inference server for competition...")
    inference_server.serve()

Running local gateway for testing...
Loading training data for simple model...
Loaded: train (1961, 558), labels (1961, 425)
Training simple reliable predictor...
Training 50 models...
Trained 10 models...
Trained 20 models...
Trained 30 models...
Trained 40 models...
Trained 50 models...
Training completed: 50 models, 559 features
Simple model training completed!
