In [3]:
!pip install lightgbm -q

import pandas as pd
import numpy as np
import lightgbm as lgb

from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder

pd.set_option("display.max_columns", None)
np.random.seed(42)

In [4]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
interactions = pd.read_csv("interactions.csv")

print(f"Train shape: {train.shape}")
print(f"Test shape: {test.shape}")
print(f"Interactions shape: {interactions.shape}")


Train shape: (67200, 4)
Test shape: (5900, 4)
Interactions shape: (54761, 11)


In [5]:
for df in [train, test, interactions]:
    df["service_date"] = pd.to_datetime(
        df["service_date"], format="mixed", dayfirst=True
    )

interactions["interaction_date"] = pd.to_datetime(
    interactions["interaction_date"], format="mixed", dayfirst=True
)


In [6]:
for col in ["origin_hub_id", "destination_hub_id"]:
    train[col] = train[col].astype(str)
    test[col] = test[col].astype(str)
    interactions[col] = interactions[col].astype(str)


In [7]:
interactions_15 = interactions[
    interactions["days_before_service"] >= 15
].copy()

print(f"Interactions after 15-day filter: {len(interactions_15)}")


Interactions after 15-day filter: 28800


In [8]:
test_routes = set(
    zip(
        test["service_date"],
        test["origin_hub_id"],
        test["destination_hub_id"]
    )
)

inter_routes = set(
    zip(
        interactions_15["service_date"],
        interactions_15["origin_hub_id"],
        interactions_15["destination_hub_id"]
    )
)

coverage = len(test_routes.intersection(inter_routes)) / len(test_routes) * 100
print(f"Test set coverage with interactions: {coverage:.1f}%")


Test set coverage with interactions: 0.0%


In [9]:
features_basic = (
    interactions_15
    .groupby(["service_date", "origin_hub_id", "destination_hub_id"])
    .agg(
        max_commitment=("cumulative_commitments", "max"),
        max_interest=("cumulative_interest_signals", "max"),
        mean_commitment=("cumulative_commitments", "mean"),
        mean_interest=("cumulative_interest_signals", "mean"),
        std_commitment=("cumulative_commitments", lambda x: x.std() if len(x) > 1 else 0),
        std_interest=("cumulative_interest_signals", lambda x: x.std() if len(x) > 1 else 0),
        days_active=("interaction_date", "nunique"),
        origin_region=("origin_region", "first"),
        destination_region=("destination_region", "first"),
        origin_hub_tier=("origin_hub_tier", "first"),
        destination_hub_tier=("destination_hub_tier", "first"),
    )
    .reset_index()
)


In [10]:
features_basic["commitment_per_day"] = (
    features_basic["max_commitment"] / (features_basic["days_active"] + 1)
)
features_basic["interest_per_day"] = (
    features_basic["max_interest"] / (features_basic["days_active"] + 1)
)
features_basic["commit_interest_ratio"] = (
    features_basic["max_commitment"] / (features_basic["max_interest"] + 1)
)

features_basic["log_max_commitment"] = np.log1p(features_basic["max_commitment"])
features_basic["log_max_interest"] = np.log1p(features_basic["max_interest"])


In [11]:
interactions_sorted = interactions_15.sort_values(
    ["service_date", "origin_hub_id", "destination_hub_id", "days_before_service"]
)

trend_data = []

for (service_date, origin, dest), group in interactions_sorted.groupby(
    ["service_date", "origin_hub_id", "destination_hub_id"]
):
    if len(group) >= 2:
        early_commit = group["cumulative_commitments"].iloc[0]
        late_commit = group["cumulative_commitments"].iloc[-1]
        early_interest = group["cumulative_interest_signals"].iloc[0]
        late_interest = group["cumulative_interest_signals"].iloc[-1]

        trend_data.append({
            "service_date": service_date,
            "origin_hub_id": origin,
            "destination_hub_id": dest,
            "commitment_growth": late_commit - early_commit,
            "interest_growth": late_interest - early_interest,
            "growth_rate": (late_commit - early_commit) / (early_commit + 1)
        })

features_trend = pd.DataFrame(trend_data)


In [12]:
features = features_basic.merge(
    features_trend,
    on=["service_date", "origin_hub_id", "destination_hub_id"],
    how="left"
)

for col in ["commitment_growth", "interest_growth", "growth_rate"]:
    features[col] = features[col].fillna(0)


In [13]:
route_prior = (
    train
    .groupby(["origin_hub_id", "destination_hub_id"])
    .agg(
        route_mean=("final_service_units", "mean"),
        route_median=("final_service_units", "median"),
        route_std=("final_service_units", lambda x: x.std() if len(x) > 1 else 0),
        route_count=("final_service_units", "count"),
    )
    .reset_index()
)

origin_stats = (
    train.groupby("origin_hub_id")["final_service_units"]
    .mean()
    .reset_index()
    .rename(columns={"final_service_units": "origin_mean"})
)

dest_stats = (
    train.groupby("destination_hub_id")["final_service_units"]
    .mean()
    .reset_index()
    .rename(columns={"final_service_units": "dest_mean"})
)

global_mean = train["final_service_units"].mean()


In [14]:
def merge_all(df):
    return (
        df
        .merge(features, on=["service_date", "origin_hub_id", "destination_hub_id"], how="left")
        .merge(route_prior, on=["origin_hub_id", "destination_hub_id"], how="left")
        .merge(origin_stats, on="origin_hub_id", how="left")
        .merge(dest_stats, on="destination_hub_id", how="left")
    )

train_data = merge_all(train)
test_data = merge_all(test)


In [15]:
for df in [train_data, test_data]:
    df["month"] = df["service_date"].dt.month
    df["day_of_week"] = df["service_date"].dt.dayofweek
    df["day_of_month"] = df["service_date"].dt.day
    df["quarter"] = df["service_date"].dt.quarter
    df["is_weekend"] = (df["day_of_week"] >= 5).astype(int)
    df["is_month_start"] = (df["day_of_month"] <= 7).astype(int)
    df["is_month_end"] = (df["day_of_month"] >= 23).astype(int)


In [16]:
interaction_features = [
    "max_commitment","max_interest","mean_commitment","mean_interest",
    "std_commitment","std_interest","days_active","commitment_per_day",
    "interest_per_day","commit_interest_ratio","log_max_commitment",
    "log_max_interest","commitment_growth","interest_growth","growth_rate"
]

for col in interaction_features:
    train_data[col] = train_data[col].fillna(0)
    test_data[col] = test_data[col].fillna(0)

for df in [train_data, test_data]:
    df["route_mean"] = df["route_mean"].fillna(
        (df["origin_mean"].fillna(global_mean) + df["dest_mean"].fillna(global_mean)) / 2
    )
    df["route_median"] = df["route_median"].fillna(df["route_mean"])
    df["route_std"] = df["route_std"].fillna(df["route_mean"] * 0.3)
    df["route_count"] = df["route_count"].fillna(0)
    df["origin_mean"] = df["origin_mean"].fillna(global_mean)
    df["dest_mean"] = df["dest_mean"].fillna(global_mean)


In [17]:
cat_cols = [
    "origin_region","destination_region",
    "origin_hub_tier","destination_hub_tier"
]

for col in cat_cols:
    train_data[col] = train_data[col].fillna("unknown")
    test_data[col] = test_data[col].fillna("unknown")

    le = LabelEncoder()
    le.fit(pd.concat([train_data[col], test_data[col]]))
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])

train_data["has_interactions"] = (train_data["max_commitment"] > 0).astype(int)
test_data["has_interactions"] = (test_data["max_commitment"] > 0).astype(int)


In [18]:
feature_cols = [
    "max_commitment","max_interest","mean_commitment","mean_interest",
    "std_commitment","std_interest","days_active",
    "commitment_per_day","interest_per_day","commit_interest_ratio",
    "log_max_commitment","log_max_interest",
    "commitment_growth","interest_growth","growth_rate",
    "route_mean","route_median","route_std","route_count",
    "origin_mean","dest_mean",
    "month","day_of_week","day_of_month","quarter",
    "is_weekend","is_month_start","is_month_end",
    "origin_region","destination_region",
    "origin_hub_tier","destination_hub_tier",
    "has_interactions"
]

X = train_data[feature_cols]
y = train_data["final_service_units"]
X_test = test_data[feature_cols]


In [19]:
train_data = train_data.sort_values("service_date")
cutoff = train_data["service_date"].quantile(0.8)

train_idx = train_data[train_data["service_date"] < cutoff].index
val_idx = train_data[train_data["service_date"] >= cutoff].index

X_train, X_val = X.loc[train_idx], X.loc[val_idx]
y_train, y_val = y.loc[train_idx], y.loc[val_idx]


In [20]:
model = lgb.LGBMRegressor(
    objective="mae",
    n_estimators=2000,
    learning_rate=0.05,
    num_leaves=64,
    max_depth=8,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="mae",
    callbacks=[lgb.early_stopping(100, verbose=False)]
)

val_pred = model.predict(X_val)
print("Validation MAE:", mean_absolute_error(y_val, val_pred))


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.010106 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3313
[LightGBM] [Info] Number of data points in the train set: 53700, number of used features: 32
[LightGBM] [Info] Start training from score 1635.500000
Validation MAE: 623.7808368453458


In [21]:
folds = np.array_split(train_data.index, 5)
test_preds = np.zeros(len(X_test))

for i in range(5):
    val_idx = folds[i]
    train_idx = train_data.index.difference(val_idx)

    model = lgb.LGBMRegressor(
        objective="mae",
        n_estimators=2000,
        learning_rate=0.05,
        num_leaves=64,
        max_depth=8,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42 + i
    )

    model.fit(
        X.loc[train_idx], y.loc[train_idx],
        eval_set=[(X.loc[val_idx], y.loc[val_idx])],
        eval_metric="mae",
        callbacks=[lgb.early_stopping(100, verbose=False)]
    )

    test_preds += model.predict(X_test) / 5

submission = pd.DataFrame({
    "service_key": test["service_key"],
    "final_service_units": test_preds
})

submission.to_csv("submission.csv", index=False)
submission.head()


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011642 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 452
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 12
[LightGBM] [Info] Start training from score 1725.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3313
[LightGBM] [Info] Number of data points in the train set: 53760, number of used features: 32
[LightGBM] [Info] Start training from score 1748.000000
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.016126 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tot

Unnamed: 0,service_key,final_service_units
0,2025-02-11_46_45,3427.011742
1,2025-01-20_17_23,1328.816061
2,2025-01-08_02_14,1195.822921
3,2025-01-08_08_47,1168.661377
4,2025-01-08_09_46,3566.419501
