# –û—Ç–±–æ—Ä —Å–∞–º—ã—Ö –∑–Ω–∞—á–∏–º—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤

In [1]:
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder


In [2]:
RANDOM_STATE = 42
SAMPLE_ROWS = None
VALID_SIZE = 0.2
TOP_K = 12
PERMUTATION_SAMPLE = 15000

TARGET_COLS = ["real_weight", "real_height", "real_length", "real_width"]
ID_COLS = ["item_id", "seller_id", "buyer_id"]


In [None]:
train_df = pd.read_parquet("train.parquet")

if SAMPLE_ROWS is not None and len(train_df) > SAMPLE_ROWS:
    train_df = train_df.sample(SAMPLE_ROWS, random_state=RANDOM_STATE).reset_index(drop=True)

print("train shape:", train_df.shape)
train_df.head(2)


train shape: (312908, 16)


Unnamed: 0,item_id,order_date,item_condition,item_price,category_name,subcategory_name,microcat_name,seller_id,buyer_id,title,description,image_name,real_weight,real_height,real_length,real_width
0,185689,2024-09-27,–ë/—É,3000.0,–¢—Ä–∞–Ω—Å–ø–æ—Ä—Ç,–ó–∞–ø—á–∞—Å—Ç–∏ –∏ –∞–∫—Å–µ—Å—Å—É–∞—Ä—ã,–°–∞–ª–æ–Ω,1942218,1935418,–†—É—á–∫–∞ –ê–ö–ü–ü mercedes w203 avangarde,–†—É—á–∫–∞ —Ä—ã—á–∞–≥ –∞–∫–ø–ø –Ω–∞ –ú–µ—Ä—Å–µ–¥–µ—Å –í203 mercedes w20...,185689.jpg,0.37,10.0,23.0,19.0
1,1914373,2024-11-07,–ù–æ–≤–æ–µ —Å –±–∏—Ä–∫–æ–π,5990.0,–õ–∏—á–Ω—ã–µ –≤–µ—â–∏,"–û–¥–µ–∂–¥–∞, –æ–±—É–≤—å, –∞–∫—Å–µ—Å—Å—É–∞—Ä—ã",–ó–∏–º–Ω–∏–µ –∫—É—Ä—Ç–∫–∏ –∏ –ø—É—Ö–æ–≤–∏–∫–∏,2164034,1753243,–ü—É—Ö–æ–≤–∏–∫ Moncler –≥–æ–ª—É–±–æ–π (52 —Ä–∞–∑–º–µ—Ä),–û–±—ä—è–≤–ª–µ–Ω–∏–µ –¥–ª—è –∑–∞–∫–∞–∑–∞ üì≤\n\n–ê–≤–∏—Ç–æ –¥–æ—Å—Ç–∞–≤–∫–∞ üöö\n\...,1914373.jpg,2.486,14.0,37.0,24.0


In [4]:
def build_candidate_features(df: pd.DataFrame):
    data = df.copy()

    # 1) –°—Ä–∞–∑—É –≤—ã–±—Ä–∞—Å—ã–≤–∞–µ–º ID –∏ —Ç–∞—Ä–≥–µ—Ç—ã, —á—Ç–æ–±—ã –æ–Ω–∏ –Ω–µ —É—á–∞—Å—Ç–≤–æ–≤–∞–ª–∏ –≤ –æ–±—É—á–µ–Ω–∏–∏.
    drop_cols = [c for c in (TARGET_COLS + ID_COLS) if c in data.columns]
    data = data.drop(columns=drop_cols)

    # 2) –î–∞—Ç–∞: –¥–æ–±–∞–≤–ª—è–µ–º –ø—Ä–æ—Å—Ç—ã–µ –∫–∞–ª–µ–Ω–¥–∞—Ä–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏ –∏ —É–¥–∞–ª—è–µ–º –∏—Å—Ö–æ–¥–Ω—É—é —Å—Ç—Ä–æ–∫–æ–≤—É—é –¥–∞—Ç—É.
    if "order_date" in data.columns:
        order_dt = pd.to_datetime(data["order_date"], errors="coerce")
        data["order_year"] = order_dt.dt.year.fillna(0)
        data["order_month"] = order_dt.dt.month.fillna(0)
        data["order_day"] = order_dt.dt.day.fillna(0)
        data["order_dow"] = order_dt.dt.dayofweek.fillna(0)
        data = data.drop(columns=["order_date"])

    # 3) –¢–µ–∫—Å—Ç: –∑–∞–º–µ–Ω—è–µ–º —Å—ã—Ä—ã–µ —Ç–µ–∫—Å—Ç—ã –Ω–∞ –ø—Ä–æ—Å—Ç—ã–µ —á–∏—Å–ª–æ–≤—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏ –¥–ª–∏–Ω—ã.
    for text_col in ["title", "description"]:
        if text_col in data.columns:
            text = data[text_col].fillna("").astype(str)
            data[f"{text_col}_len"] = text.str.len()
            data[f"{text_col}_words"] = text.str.split().str.len().fillna(0)
            data = data.drop(columns=[text_col])

    # 4) –ò–∑ –∏–∑–æ–±—Ä–∞–∂–µ–Ω–∏—è –æ—Å—Ç–∞–≤–ª—è–µ–º —Ç–æ–ª—å–∫–æ —Ñ–∞–∫—Ç –Ω–∞–ª–∏—á–∏—è —Ñ–∞–π–ª–∞.
    if "image_name" in data.columns:
        data["has_image"] = data["image_name"].notna().astype(np.int8)
        data = data.drop(columns=["image_name"])

    # 5) –ü—Ä–æ—Å—Ç–∞—è —É–Ω–∏–≤–µ—Ä—Å–∞–ª—å–Ω–∞—è –æ–±—Ä–∞–±–æ—Ç–∫–∞: object -> –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã–µ, –æ—Å—Ç–∞–ª—å–Ω—ã–µ -> —á–∏—Å–ª–æ–≤—ã–µ.
    categorical_cols = data.select_dtypes(include=["object", "category"]).columns.tolist()
    numeric_cols = [c for c in data.columns if c not in categorical_cols]

    for col in categorical_cols:
        data[col] = data[col].fillna("unknown").astype(str)

    for col in numeric_cols:
        data[col] = pd.to_numeric(data[col], errors="coerce")

    if numeric_cols:
        data[numeric_cols] = data[numeric_cols].fillna(data[numeric_cols].median())
        data[numeric_cols] = data[numeric_cols].astype(np.float32)

    return data, numeric_cols, categorical_cols


X, NUMERIC_COLS, CATEGORICAL_COLS = build_candidate_features(train_df)
y = train_df[TARGET_COLS].to_numpy(dtype=np.float32)
ALL_FEATURE_COLS = X.columns.tolist()

print("X shape:", X.shape)
print("y shape:", y.shape)
print("numeric features:", len(NUMERIC_COLS))
print("categorical features:", len(CATEGORICAL_COLS))
print("dropped id features:", ID_COLS)
X.head(2)


X shape: (312908, 14)
y shape: (312908, 4)
numeric features: 10
categorical features: 4
dropped id features: ['item_id', 'seller_id', 'buyer_id']


Unnamed: 0,item_condition,item_price,category_name,subcategory_name,microcat_name,order_year,order_month,order_day,order_dow,title_len,title_words,description_len,description_words,has_image
0,–ë/—É,3000.0,–¢—Ä–∞–Ω—Å–ø–æ—Ä—Ç,–ó–∞–ø—á–∞—Å—Ç–∏ –∏ –∞–∫—Å–µ—Å—Å—É–∞—Ä—ã,–°–∞–ª–æ–Ω,2024.0,9.0,27.0,4.0,34.0,5.0,111.0,15.0,1.0
1,–ù–æ–≤–æ–µ —Å –±–∏—Ä–∫–æ–π,5990.0,–õ–∏—á–Ω—ã–µ –≤–µ—â–∏,"–û–¥–µ–∂–¥–∞, –æ–±—É–≤—å, –∞–∫—Å–µ—Å—Å—É–∞—Ä—ã",–ó–∏–º–Ω–∏–µ –∫—É—Ä—Ç–∫–∏ –∏ –ø—É—Ö–æ–≤–∏–∫–∏,2024.0,11.0,7.0,3.0,35.0,5.0,77.0,12.0,1.0


In [None]:
def log_mae_per_target(y_true: np.ndarray, y_pred: np.ndarray):
    y_true = np.nan_to_num(y_true, nan=0.0)
    y_pred = np.nan_to_num(y_pred, nan=0.0)
    y_true = np.maximum(y_true, 0.0)
    y_pred = np.maximum(y_pred, 0.0)

    log_true = np.log1p(y_true)
    log_pred = np.log1p(y_pred)
    per_target = np.mean(np.abs(log_true - log_pred), axis=0)
    return dict(zip(TARGET_COLS, per_target))


def macro_log_mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return float(np.mean(list(log_mae_per_target(y_true, y_pred).values())))


In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=VALID_SIZE, random_state=RANDOM_STATE
)


def make_preprocessor(numeric_cols, categorical_cols):
    transformers = []

    if numeric_cols:
        transformers.append(("num", "passthrough", numeric_cols))

    if categorical_cols:
        transformers.append(
            (
                "cat",
                OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
                categorical_cols,
            )
        )

    return ColumnTransformer(transformers=transformers, remainder="drop")


preprocessor = make_preprocessor(NUMERIC_COLS, CATEGORICAL_COLS)

base_model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        (
            "model",
            ExtraTreesRegressor(
                n_estimators=500,
                random_state=RANDOM_STATE,
                n_jobs=-1,
                min_samples_leaf=2,
            ),
        ),
    ]
)

base_model.fit(X_train, y_train)
base_valid_pred = np.maximum(base_model.predict(X_valid), 0.0)
base_macro = macro_log_mae(y_valid, base_valid_pred)

print(f"Baseline Macro Log-MAE (all features, no IDs): {base_macro:.6f}")
print("Baseline Log-MAE per target:", log_mae_per_target(y_valid, base_valid_pred))


Baseline Macro Log-MAE (all features, no IDs): 0.392568
Baseline Log-MAE per target: {'real_weight': np.float64(0.384934408847202), 'real_height': np.float64(0.5369063572152105), 'real_length': np.float64(0.3258375229537271), 'real_width': np.float64(0.32259296060241166)}


In [7]:
# –í—Å—Ç—Ä–æ–µ–Ω–Ω–∞—è –≤–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –∏–∑ –¥–µ—Ä–µ–≤—å–µ–≤ (impurity-based).
model_feature_names = ALL_FEATURE_COLS
impurity_importance = pd.DataFrame(
    {
        "feature": model_feature_names,
        "impurity_importance": base_model.named_steps["model"].feature_importances_,
    }
).sort_values("impurity_importance", ascending=False)

impurity_importance.head(20)


Unnamed: 0,feature,impurity_importance
13,has_image,0.14217
5,order_year,0.121497
3,subcategory_name,0.114529
12,description_words,0.110083
4,microcat_name,0.097655
2,category_name,0.094557
0,item_condition,0.084232
7,order_day,0.075131
8,order_dow,0.06981
6,order_month,0.06046


Permutation Importance

In [None]:
rng = np.random.default_rng(RANDOM_STATE)

if len(X_valid) > PERMUTATION_SAMPLE:
    perm_idx = rng.choice(len(X_valid), size=PERMUTATION_SAMPLE, replace=False)
    X_perm_eval = X_valid.iloc[perm_idx].copy()
    y_perm_eval = y_valid[perm_idx]
else:
    X_perm_eval = X_valid.copy()
    y_perm_eval = y_valid

baseline_perm_score = macro_log_mae(y_perm_eval, np.maximum(base_model.predict(X_perm_eval), 0.0))

perm_rows = []
for feature in ALL_FEATURE_COLS:
    X_shuffled = X_perm_eval.copy()
    X_shuffled[feature] = rng.permutation(X_shuffled[feature].values)
    shuffled_pred = np.maximum(base_model.predict(X_shuffled), 0.0)
    shuffled_score = macro_log_mae(y_perm_eval, shuffled_pred)
    perm_rows.append(
        {
            "feature": feature,
            "baseline_macro_log_mae": baseline_perm_score,
            "shuffled_macro_log_mae": shuffled_score,
            "permutation_delta": shuffled_score - baseline_perm_score,
        }
    )

permutation_importance = pd.DataFrame(perm_rows).sort_values("permutation_delta", ascending=False)
permutation_importance.head(20)


Unnamed: 0,feature,baseline_macro_log_mae,shuffled_macro_log_mae,permutation_delta
2,category_name,0.393418,0.442011,0.04859345
1,item_price,0.393418,0.441363,0.04794542
3,subcategory_name,0.393418,0.435001,0.0415836
4,microcat_name,0.393418,0.425186,0.03176861
0,item_condition,0.393418,0.421234,0.02781586
6,order_month,0.393418,0.407079,0.01366099
9,title_len,0.393418,0.399304,0.005885751
10,title_words,0.393418,0.398261,0.004843208
12,description_words,0.393418,0.39591,0.002492103
11,description_len,0.393418,0.395738,0.002319998


In [None]:
importance_df = impurity_importance.merge(
    permutation_importance[["feature", "permutation_delta"]], on="feature", how="inner"
)

importance_df["rank_impurity"] = importance_df["impurity_importance"].rank(ascending=False, method="min")
importance_df["rank_permutation"] = importance_df["permutation_delta"].rank(ascending=False, method="min")
importance_df["rank_mean"] = (importance_df["rank_impurity"] + importance_df["rank_permutation"]) / 2

importance_df = importance_df.sort_values("rank_mean").reset_index(drop=True)
selected_features = importance_df.head(TOP_K).copy()
selected_feature_names = selected_features["feature"].tolist()

print("–í—ã–±—Ä–∞–Ω–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏ (top-K):")
print(selected_feature_names)
selected_features


–í—ã–±—Ä–∞–Ω–Ω—ã–µ –ø—Ä–∏–∑–Ω–∞–∫–∏ (top-K):
['subcategory_name', 'category_name', 'microcat_name', 'item_condition', 'description_words', 'has_image', 'order_year', 'item_price', 'order_month', 'order_day', 'order_dow', 'title_words']


Unnamed: 0,feature,impurity_importance,permutation_delta,rank_impurity,rank_permutation,rank_mean
0,subcategory_name,0.114529,0.0415836,3.0,3.0,3.0
1,category_name,0.094557,0.04859345,6.0,1.0,3.5
2,microcat_name,0.097655,0.03176861,5.0,4.0,4.5
3,item_condition,0.084232,0.02781586,7.0,5.0,6.0
4,description_words,0.110083,0.002492103,4.0,9.0,6.5
5,has_image,0.14217,0.0,1.0,14.0,7.5
6,order_year,0.121497,5.5511150000000004e-17,2.0,13.0,7.5
7,item_price,0.0,0.04794542,13.0,2.0,7.5
8,order_month,0.06046,0.01366099,10.0,6.0,8.0
9,order_day,0.075131,0.0003499837,8.0,12.0,10.0


In [10]:
# –ü—Ä–æ–≤–µ—Ä—è–µ–º –∫–∞—á–µ—Å—Ç–≤–æ –º–æ–¥–µ–ª–∏ —Ç–æ–ª—å–∫–æ –Ω–∞ –≤—ã–±—Ä–∞–Ω–Ω—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–∞—Ö.
selected_numeric = [c for c in selected_feature_names if c in NUMERIC_COLS]
selected_categorical = [c for c in selected_feature_names if c in CATEGORICAL_COLS]

preprocessor_selected = make_preprocessor(selected_numeric, selected_categorical)

selected_model = Pipeline(
    steps=[
        ("preprocess", preprocessor_selected),
        (
            "model",
            ExtraTreesRegressor(
                n_estimators=500,
                random_state=RANDOM_STATE,
                n_jobs=-1,
                min_samples_leaf=2,
            ),
        ),
    ]
)

selected_model.fit(X_train[selected_feature_names], y_train)
selected_valid_pred = np.maximum(selected_model.predict(X_valid[selected_feature_names]), 0.0)
selected_macro = macro_log_mae(y_valid, selected_valid_pred)

comparison = pd.DataFrame(
    [
        {"setup": "all_features_without_ids", "macro_log_mae": base_macro, "n_features": len(ALL_FEATURE_COLS)},
        {"setup": "selected_top_k", "macro_log_mae": selected_macro, "n_features": len(selected_feature_names)},
    ]
)
comparison


Unnamed: 0,setup,macro_log_mae,n_features
0,all_features_without_ids,0.392568,14
1,selected_top_k,0.392401,12


In [16]:
best_model_name = "ExtraTrees (selected top-k features)"
best_model = ExtraTreesRegressor(
    n_estimators=500,
    random_state=RANDOM_STATE,
    n_jobs=-1,
    min_samples_leaf=2,
)

print("Best model:", best_model_name)
print(f"Best validation Macro Log-MAE: {selected_macro:.6f}")

final_model = Pipeline(
    steps=[
        ("preprocess", make_preprocessor(selected_numeric, selected_categorical)),
        ("model", best_model),
    ]
)

X_full = X[selected_feature_names].copy()
y_full = y.copy()

low_q_full = np.quantile(y_full, 0.01, axis=0)
high_q_full = np.quantile(y_full, 0.99, axis=0)
inlier_mask_full = ((y_full >= low_q_full) & (y_full <= high_q_full)).all(axis=1)

X_full = X_full.iloc[inlier_mask_full].copy()
y_full = y_full[inlier_mask_full]
final_model.fit(X_full, y_full)

test_df = pd.read_parquet("test.parquet")
X_test_full, _, _ = build_candidate_features(test_df)
X_test = X_test_full.reindex(columns=selected_feature_names)

for col in selected_feature_names:
    if col in selected_categorical:
        X_test[col] = X_test[col].fillna("unknown")
    else:
        X_test[col] = pd.to_numeric(X_test[col], errors="coerce").fillna(0.0).astype(np.float32)

test_pred = np.maximum(final_model.predict(X_test), 0.0)

submission = pd.DataFrame(test_pred, columns=["weight", "height", "length", "width"])
submission.insert(0, "item_id", test_df["item_id"].values)
submission = submission[["item_id", "weight", "height", "length", "width"]]
submission.head()


Best model: ExtraTrees (selected top-k features)
Best validation Macro Log-MAE: 0.392401


Unnamed: 0,item_id,weight,height,length,width
0,163755,0.605421,9.592833,28.4705,22.838556
1,1339648,2.338517,16.5204,41.7474,29.353633
2,21095,1.476608,12.313167,35.343267,28.7455
3,925424,0.719614,6.210129,34.372962,27.10331
4,780125,3.729366,20.506833,41.860433,32.724567


In [17]:
submission.to_csv("result.csv", index=False)
