# **MITSUI: RandomForestRegressor**


---

### 🌳 RandomForestRegressor and Multi-Output Regression

`RandomForestRegressor` (from `sklearn.ensemble`) natively supports **multi-output regression**.
This means you can pass a **2D target array** `y` with shape `(n_samples, n_outputs)` directly to `.fit()`.
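The outputs are predicted jointly rather than by independent models: every tree stores a vector of `n_outputs` values in each leaf and picks its splits using the impurity reduction averaged over all outputs, so a single forest is trained for all targets, which keeps training efficient.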


#### Key points:

* Unlike LightGBM/XGBoost, **no wrapper is required** (`MultiOutputRegressor` is not needed).
* Predictions preserve the same dimensionality: if `y` has 3 target variables, the output will have 3 columns.
* It is especially useful when the targets are related (e.g., predicting multiple sensor readings at once).

---


In [1]:
import pandas as pd
import polars as pl
import numpy as np
import random
import time
import os, gc
import warnings
warnings.simplefilter('ignore')

from sklearn.preprocessing import StandardScaler

class CFG:
    """Notebook-wide configuration constants."""

    # Seed passed to the model as its random_state.
    seed = 42
    # Root folder of the competition data.  File names are appended with
    # plain string concatenation, so the trailing slash is required.
    path = "/kaggle/input/mitsui-commodity-prediction-challenge/"
    # Value substituted for missing entries in the label table.
    solution_null_filler = 0.0
    # Label column names: target_0 ... target_423.
    targets = ["target_" + str(idx) for idx in range(424)]

# Load the feature and label tables.  Features and labels are later paired
# purely by row position, so BOTH tables are ordered by date_id and given a
# fresh 0..n-1 index; sorting only one of them could silently misalign rows.
# NOTE(review): assumes train_labels.csv also carries a date_id column.
train = (
    pd.read_csv(CFG.path + "train.csv")
    .sort_values("date_id")
    .reset_index(drop=True)
)
train_labels = (
    pd.read_csv(CFG.path + "train_labels.csv")
    .sort_values("date_id")
    .reset_index(drop=True)
)
# Every column except the date key is used as a model input.
CFG.features = [c for c in train.columns if c not in ["date_id"]]

print(f"Train shape: {train.shape}, Train labels shape: {train_labels.shape}")

def preprocess_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the GOLD price/volume columns made numeric.

    Only the five ``US_Stock_GOLD_adj_*`` columns listed below are touched,
    and only when they are present and stored as text (``object`` or any
    pandas string dtype).  Values that cannot be parsed become ``NaN``.

    Args:
        df: Feature frame; it is not modified.

    Returns:
        A new DataFrame with the same columns as *df*.
    """
    df = df.copy()
    columns = [
        "US_Stock_GOLD_adj_open", "US_Stock_GOLD_adj_high",
        "US_Stock_GOLD_adj_low", "US_Stock_GOLD_adj_close",
        "US_Stock_GOLD_adj_volume"
    ]

    for col in columns:
        if col not in df.columns:
            continue
        dtype = df[col].dtype
        # Comparing against "object" alone misses StringDtype / Arrow-backed
        # string columns, which would then stay as text.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df

# Build the feature matrix.  Text columns are coerced to numbers BEFORE the
# gaps are filled, so values turned into NaN by the coercion also receive the
# -1 missing-value marker instead of leaking through as NaN.
X_train = (
    preprocess_columns(train[CFG.features])
    .fillna(-1)
    .reindex(
        columns=CFG.features,
        fill_value=0.0
    )
).copy()

# Labels: missing targets are replaced with the configured filler value.
y_train = (
    train_labels[CFG.targets]
    .fillna(CFG.solution_null_filler)
    .copy()
)

# Standardise the features; the fitted scaler is reused by predict().
scaler = StandardScaler()
X_train[CFG.features] = scaler.fit_transform(X_train)

Train shape: (1961, 558), Train labels shape: (1961, 425)


In [2]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with library-default hyper-parameters; only the seed (for
# reproducibility) and the parallelism setting are specified explicitly.
model = RandomForestRegressor(n_jobs=-1, random_state=CFG.seed)

In [3]:
# Fit a single forest on all target columns at once (y_train is 2-D).
model.fit(X_train, y_train)

# The training frames are not used after fitting: drop the references and
# run a garbage-collection pass.
del X_train, y_train
del train, train_labels
gc.collect()

def predict(
    test: pl.DataFrame,
    label_lags_1_batch: pl.DataFrame,
    label_lags_2_batch: pl.DataFrame,
    label_lags_3_batch: pl.DataFrame,
    label_lags_4_batch: pl.DataFrame,
) -> pd.DataFrame:
    """Predict every target for one batch of feature rows.

    Args:
        test: Feature rows; must contain every column in ``CFG.features``.
        label_lags_1_batch: Unused by this model.
        label_lags_2_batch: Unused by this model.
        label_lags_3_batch: Unused by this model.
        label_lags_4_batch: Unused by this model.
            (The lag frames are presumably part of the callback signature
            expected by the inference server -- confirm against its docs.)

    Returns:
        A DataFrame with one column per entry of ``CFG.targets`` and one row
        per row of ``test``.
    """
    features = test.select(pl.col(CFG.features)).to_pandas()

    # Coerce text columns to numbers first, THEN fill gaps: filling before the
    # coercion would leave the NaN produced by unparseable strings in place.
    X_test = (
        preprocess_columns(features)
        .fillna(-1)
        .reindex(
            columns=CFG.features,
            fill_value=0.0
        )
    )

    # Apply the scaler fitted on the training features before predicting.
    pred = model.predict(scaler.transform(X_test))
    predictions = pd.DataFrame(pred, columns=CFG.targets)
    return predictions

import kaggle_evaluation.mitsui_inference_server

# Hand the prediction callback to the competition's inference server.
inference_server = kaggle_evaluation.mitsui_inference_server.MitsuiInferenceServer(predict)

# When KAGGLE_IS_COMPETITION_RERUN is set, serve requests; otherwise run the
# local gateway against the data folder.
if not os.getenv('KAGGLE_IS_COMPETITION_RERUN'):
    inference_server.run_local_gateway((CFG.path,))
else:
    inference_server.serve()