In [1]:
# ------------------ Setup ------------------
import os
import gc
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import polars as pl
from tqdm.auto import tqdm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import joblib
import kaggle_evaluation.mitsui_inference_server

# Make folders
os.makedirs("models", exist_ok=True)
os.makedirs("oof", exist_ok=True)

# ------------------ Data Loading ------------------
train_df = pl.read_csv('/kaggle/input/mitsui-commodity-prediction-challenge/train.csv').to_pandas()
train_labels_df = pl.read_csv('/kaggle/input/mitsui-commodity-prediction-challenge/train_labels.csv').to_pandas()
target_pairs_df = pl.read_csv('/kaggle/input/mitsui-commodity-prediction-challenge/target_pairs.csv').to_pandas()
test_df = pl.read_csv('/kaggle/input/mitsui-commodity-prediction-challenge/test.csv').to_pandas()

# Remove last 90 rows from training data
train_df = train_df[:1827]
train_labels_train = train_labels_df[:1827]
train_labels_valid = train_labels_df[1827:]

# ------------------ Utility Functions ------------------
def generate_log_returns(data, lag):
    log_returns = pd.Series(np.nan, index=data.index)
    for t in range(len(data)):
        try:
            log_returns.iloc[t] = np.log(data.iloc[t] / data.iloc[t - lag])
        except Exception:
            log_returns.iloc[t] = np.nan
    return log_returns

def generate_features(column_a: pd.Series, lag: int, column_b: pd.Series = None) -> pd.Series:
    a_returns = generate_log_returns(column_a, lag)
    if column_b is not None:
        b_returns = generate_log_returns(column_b, lag)
        return a_returns - b_returns
    else:
        return a_returns

# ------------------ Training Models ------------------
models = {}

for i, row in tqdm(target_pairs_df.iterrows(), total=len(target_pairs_df)):
    target_name = row['target']
    lag = row['lag']
    pair = row['pair']

    feature_df = pd.DataFrame(index=train_df.index)
    
    if ' - ' in pair:
        a, b = pair.split(' - ')
        feature = generate_features(train_df[a], lag, train_df[b])
        
        feature_df[f'{target_name}_lag_{lag}'] = feature
    else:
        feature = generate_features(train_df[pair], lag)
        feature_df[f'{target_name}_lag_{lag}'] = feature
    
    df = pd.concat([feature_df, train_labels_train[target_name].rename('target')], axis=1).fillna(0)

    if df.empty:
        continue

    X = df.drop(columns='target')
    y = df['target']

    model = LinearRegression()
    model.fit(X, y)

    # Save model to file
    joblib.dump(model, f"models/{target_name}_model.pkl")

    # Store model in dictionary
    models[target_name] = model

  0%|          | 0/424 [00:00<?, ?it/s]

# Calculate score

In [2]:
# ------------------ Generating Predictions ------------------
predictions = {}

for i, row in tqdm(target_pairs_df.iterrows(), total=len(target_pairs_df)):
    target_name = row['target']
    lag = row['lag']
    pair = row['pair']

    feature_df = pd.DataFrame(index=test_df.index)
    
    if ' - ' in pair:
        a, b = pair.split(' - ')
        feature = generate_features(test_df[a], lag, test_df[b])
        feature_df[f'{target_name}_lag_{lag}'] = feature
    else:
        feature = generate_features(test_df[pair], lag)
        feature_df[f'{target_name}_lag_{lag}'] = feature

    X = feature_df.fillna(0)

    if target_name in models:
        model = models[target_name]
        pred = model.predict(X)
        predictions[target_name] = pred
    else:
        predictions[target_name] = np.full(len(test_df), 0.0)

pred_df = pd.DataFrame(predictions)

# ------------------ Scoring ------------------
SOL_FILL = -999999
def compute_sharpe(preds: pd.DataFrame, truths: pd.DataFrame) -> float:
    scores = []
    for p_row, t_row in zip(preds.values, truths.values):
        mask = t_row != SOL_FILL
        if mask.sum() < 2 or np.std(p_row[mask]) == 0 or np.std(t_row[mask]) == 0:
            scores.append(0.0)
        else:
            scores.append(
                np.corrcoef(
                    pd.Series(p_row[mask]).rank(),
                    pd.Series(t_row[mask]).rank()
                )[0, 1]
            )
    arr = np.array(scores)
    print(f"mean Spearman rank correlation: {arr.mean()}")
    print(f"standard deviation: {arr.std(ddof=0)}")
    return float(arr.mean() / arr.std(ddof=0)) if arr.std(ddof=0) > 0 else 0.0

TARGETS = [f"target_{i}" for i in range(424)]
valid_truth = train_labels_valid[TARGETS].fillna(SOL_FILL)
cv_score = compute_sharpe(pred_df, valid_truth)

print(f"\n✅ CV RankCorr-Sharpe Score = {cv_score:.4f}")

  0%|          | 0/424 [00:00<?, ?it/s]

mean Spearman rank correlation: 0.046269742138653785
standard deviation: 0.18765547355491607

✅ CV RankCorr-Sharpe Score = 0.2466


# inference

In [3]:
def predict(
    test: pl.DataFrame,
    lag1, lag2, lag3, lag4
) -> pl.DataFrame:
   
    test_df = test.to_pandas()
    predictions = {}

    for i, row in tqdm(target_pairs_df.iterrows(), total=len(target_pairs_df)):
        target_name = row['target']
        lag = row['lag']
        pair = row['pair']

        if ' - ' in pair:
            a, b = pair.split(' - ')
            feature = generate_features(test_df[a], lag, test_df[b])
        else:
            feature = generate_features(test_df[pair], lag)

        feature_df = pd.DataFrame(index=test_df.index)
        feature_df[f'{target_name}_lag_{lag}'] = feature

        X = feature_df.fillna(0)
        
        model = models[target_name]
        pred = model.predict(X)  
        predictions[target_name] = pred

    pred_df = pd.DataFrame(predictions)
    return pl.DataFrame(pred_df)

server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)
if os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    server.serve()
else:
    server.run_local_gateway(('/kaggle/input/mitsui-commodity-prediction-challenge',))

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]

  0%|          | 0/424 [00:00<?, ?it/s]