In [14]:
import sys
import subprocess

for package in ['catboost', 'lightgbm', 'xgboost']:
    try:
        __import__(package)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", package])

import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
import warnings

warnings.filterwarnings("ignore")
np.random.seed(42)

In [15]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
interactions = pd.read_csv("interactions.csv")

for df in [train, test, interactions]:
    df["service_date"] = pd.to_datetime(df["service_date"], format="mixed", dayfirst=True)
interactions["interaction_date"] = pd.to_datetime(interactions["interaction_date"], format="mixed", dayfirst=True)

for col in ["origin_hub_id", "destination_hub_id"]:
    for df in [train, test, interactions]:
        df[col] = df[col].astype(str).str.strip().str.lower()

print(f"âœ“ Train: {train.shape}, Test: {test.shape}, Interactions: {interactions.shape}")

âœ“ Train: (67200, 4), Test: (5900, 4), Interactions: (1048575, 11)


In [None]:
# Multiple time windows for interactions
def create_interaction_features(interactions_df, days_threshold):
    """Create interaction features for specific time window"""
    filtered = interactions_df[interactions_df.days_before_service >= days_threshold].copy()

    route_features = filtered.groupby(["origin_hub_id", "destination_hub_id"]).agg({
        "cumulative_commitments": ["mean", "max", "std", "sum"],
        "cumulative_interest_signals": ["mean", "max", "std", "sum"],
    }).reset_index()
    route_features.columns = ["origin_hub_id", "destination_hub_id"] + \
                             [f"route_{days_threshold}d_c_{stat}" for stat in ["mean", "max", "std", "sum"]] + \
                             [f"route_{days_threshold}d_i_{stat}" for stat in ["mean", "max", "std", "sum"]]
    return route_features

# Create features for different time windows
route_15d = create_interaction_features(interactions, 15)
route_30d = create_interaction_features(interactions, 30)
route_7d = create_interaction_features(interactions, 7)

# Recent trend features (last interactions before service)
recent_inter = interactions[interactions.days_before_service <= 30].copy()
route_recent = recent_inter.groupby(["origin_hub_id", "destination_hub_id"]).agg({
    "cumulative_commitments": ["last", "mean", "max"],
    "cumulative_interest_signals": ["last", "mean", "max"],
    "days_before_service": ["min", "mean"]
}).reset_index()
route_recent.columns = ["origin_hub_id", "destination_hub_id",
                        "recent_c_last", "recent_c_mean", "recent_c_max",
                        "recent_i_last", "recent_i_mean", "recent_i_max",
                        "recent_days_min", "recent_days_mean"]

# Route statistics from training
route_train = train.groupby(["origin_hub_id", "destination_hub_id"]).final_service_units.agg(
    ["mean", "median", "std", "count", "min", "max"]
).reset_index()
route_train.columns = ["origin_hub_id", "destination_hub_id",
                       "route_mean", "route_median", "route_std", "route_count", "route_min", "route_max"]

# Hub statistics
origin_stats = train.groupby("origin_hub_id").final_service_units.agg(
    ["mean", "median", "std", "count", "min", "max"]
).reset_index()
origin_stats.columns = ["origin_hub_id", "o_mean", "o_median", "o_std", "o_count", "o_min", "o_max"]

dest_stats = train.groupby("destination_hub_id").final_service_units.agg(
    ["mean", "median", "std", "count", "min", "max"]
).reset_index()
dest_stats.columns = ["destination_hub_id", "d_mean", "d_median", "d_std", "d_count", "d_min", "d_max"]

# Hub interaction patterns
inter_15 = interactions[interactions.days_before_service >= 15].copy()

origin_inter = inter_15.groupby("origin_hub_id").agg({
    "cumulative_commitments": ["mean", "max", "std"],
    "cumulative_interest_signals": ["mean", "max", "std"],
}).reset_index()
origin_inter.columns = ["origin_hub_id", "o_c_mean", "o_c_max", "o_c_std", "o_i_mean", "o_i_max", "o_i_std"]

dest_inter = inter_15.groupby("destination_hub_id").agg({
    "cumulative_commitments": ["mean", "max", "std"],
    "cumulative_interest_signals": ["mean", "max", "std"],
}).reset_index()
dest_inter.columns = ["destination_hub_id", "d_c_mean", "d_c_max", "d_c_std", "d_i_mean", "d_i_max", "d_i_std"]

# Hub metadata
origin_meta = inter_15.groupby("origin_hub_id")[["origin_region", "origin_hub_tier"]].agg(
    lambda x: x.mode()[0] if len(x.mode()) > 0 else "unknown"
).reset_index()

dest_meta = inter_15.groupby("destination_hub_id")[["destination_region", "destination_hub_tier"]].agg(
    lambda x: x.mode()[0] if len(x.mode()) > 0 else "unknown"
).reset_index()

In [17]:
def add_temporal(df):
    df["year"] = df.service_date.dt.year
    df["month"] = df.service_date.dt.month
    df["day"] = df.service_date.dt.day
    df["dow"] = df.service_date.dt.dayofweek
    df["quarter"] = df.service_date.dt.quarter
    df["week_of_year"] = df.service_date.dt.isocalendar().week
    df["is_weekend"] = (df.dow >= 5).astype(int)
    df["is_month_start"] = (df.day <= 7).astype(int)
    df["is_month_end"] = (df.day >= 24).astype(int)
    df["is_quarter_end"] = df.month.isin([3, 6, 9, 12]).astype(int)

    # Cyclical encoding
    df["month_sin"] = np.sin(2 * np.pi * df.month / 12)
    df["month_cos"] = np.cos(2 * np.pi * df.month / 12)
    df["dow_sin"] = np.sin(2 * np.pi * df.dow / 7)
    df["dow_cos"] = np.cos(2 * np.pi * df.dow / 7)
    df["day_sin"] = np.sin(2 * np.pi * df.day / 31)
    df["day_cos"] = np.cos(2 * np.pi * df.day / 31)

    return df

train = add_temporal(train)
test = add_temporal(test)

# Temporal patterns
month_stats = train.groupby("month").final_service_units.agg(["mean", "std", "median", "min", "max"]).reset_index()
month_stats.columns = ["month", "month_mean", "month_std", "month_median", "month_min", "month_max"]

dow_stats = train.groupby("dow").final_service_units.agg(["mean", "std", "median"]).reset_index()
dow_stats.columns = ["dow", "dow_mean", "dow_std", "dow_median"]

quarter_stats = train.groupby("quarter").final_service_units.agg(["mean", "std"]).reset_index()
quarter_stats.columns = ["quarter", "quarter_mean", "quarter_std"]

# Month-Route interaction
month_route_stats = train.groupby(["month", "origin_hub_id", "destination_hub_id"]).final_service_units.agg(
    ["mean", "count"]
).reset_index()
month_route_stats.columns = ["month", "origin_hub_id", "destination_hub_id", "month_route_mean", "month_route_count"]

In [None]:
def enrich(df):
    # Route features (multiple time windows)
    df = df.merge(route_train, on=["origin_hub_id", "destination_hub_id"], how="left")
    df = df.merge(route_15d, on=["origin_hub_id", "destination_hub_id"], how="left")
    df = df.merge(route_30d, on=["origin_hub_id", "destination_hub_id"], how="left")
    df = df.merge(route_7d, on=["origin_hub_id", "destination_hub_id"], how="left")
    df = df.merge(route_recent, on=["origin_hub_id", "destination_hub_id"], how="left")

    # Hub features
    df = df.merge(origin_stats, on="origin_hub_id", how="left")
    df = df.merge(dest_stats, on="destination_hub_id", how="left")
    df = df.merge(origin_inter, on="origin_hub_id", how="left")
    df = df.merge(dest_inter, on="destination_hub_id", how="left")
    df = df.merge(origin_meta, on="origin_hub_id", how="left")
    df = df.merge(dest_meta, on="destination_hub_id", how="left")

    # Temporal features
    df = df.merge(month_stats, on="month", how="left")
    df = df.merge(dow_stats, on="dow", how="left")
    df = df.merge(quarter_stats, on="quarter", how="left")
    df = df.merge(month_route_stats, on=["month", "origin_hub_id", "destination_hub_id"], how="left")

    return df

train_df = enrich(train)
test_df = enrich(test)

# Global fallbacks
global_mean = train.final_service_units.mean()
global_std = train.final_service_units.std()
global_median = train.final_service_units.median()

# Smart imputation with hierarchy: route â†’ hub â†’ temporal â†’ global
for df in [train_df, test_df]:
    # Route-level (highest priority)
    df["route_mean"] = df["route_mean"].fillna(
        df.groupby(["origin_hub_id", "destination_hub_id"])["o_mean"].transform("first")
    ).fillna(df["o_mean"]).fillna(df["month_mean"]).fillna(global_mean)

    df["route_median"] = df["route_median"].fillna(df["o_median"]).fillna(df["month_median"]).fillna(global_median)
    df["route_std"] = df["route_std"].fillna(df["o_std"]).fillna(df["month_std"]).fillna(global_std)

    # Hub-level
    df["o_mean"] = df["o_mean"].fillna(df["month_mean"]).fillna(global_mean)
    df["d_mean"] = df["d_mean"].fillna(df["month_mean"]).fillna(global_mean)
    df["o_std"] = df["o_std"].fillna(global_std)
    df["d_std"] = df["d_std"].fillna(global_std)
    df["o_median"] = df["o_median"].fillna(global_median)
    df["d_median"] = df["d_median"].fillna(global_median)

    # Month-route
    df["month_route_mean"] = df["month_route_mean"].fillna(df["route_mean"])

    # Fill all numeric columns
    for col in df.select_dtypes(include=[np.number]).columns:
        if df[col].isnull().any():
            df[col] = df[col].fillna(0)

    # Temporal
    df["month_mean"] = df["month_mean"].fillna(global_mean)
    df["dow_mean"] = df["dow_mean"].fillna(global_mean)
    df["quarter_mean"] = df["quarter_mean"].fillna(global_mean)

# Create interaction features
for df in [train_df, test_df]:
    df["hub_avg"] = (df["o_mean"] + df["d_mean"]) / 2
    df["hub_diff"] = df["o_mean"] - df["d_mean"]
    df["hub_ratio"] = df["o_mean"] / (df["d_mean"] + 1)
    df["route_vs_hub"] = df["route_mean"] / (df["hub_avg"] + 1)
    df["commitment_ratio"] = df["route_15d_c_mean"] / (df["route_15d_i_mean"] + 1)
    df["recent_vs_historical"] = df["recent_c_mean"] / (df["route_15d_c_mean"] + 1)
    df["hub_interaction_strength"] = df["o_c_mean"] * df["d_c_mean"]
    df["route_popularity"] = df["route_15d_c_sum"] + df["route_15d_i_sum"]

# Encode categoricals
cat_cols = ["origin_region", "destination_region", "origin_hub_tier", "destination_hub_tier"]
for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]]).fillna("unknown")
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].fillna("unknown"))
    test_df[col] = le.transform(test_df[col].fillna("unknown"))

# Encode hub IDs
for col in ["origin_hub_id", "destination_hub_id"]:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]])
    le.fit(combined)
    train_df[col + "_enc"] = le.transform(train_df[col])
    test_df[col + "_enc"] = le.transform(test_df[col])

print(f"âœ“ Features: Train {train_df.shape}, Test {test_df.shape}")

âœ“ Features: Train (67200, 108), Test (5900, 108)


In [None]:
exclude = ["service_date", "final_service_units", "service_key", "origin_hub_id", "destination_hub_id"]
features = [c for c in train_df.columns if c not in exclude]

X = train_df[features].copy()
y = train_df.final_service_units.copy()
X_test = test_df[features].copy()

print(f"âœ“ Features: {len(features)} columns")

âœ“ Features: 104 columns


In [None]:
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)

fold_scores = []
test_preds_all = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X), 1):
    print(f"\n{'='*70}")
    print(f"FOLD {fold}/{n_folds}")
    print("="*70)

    X_train_fold = X.iloc[train_idx]
    X_val_fold = X.iloc[val_idx]
    y_train_fold = y.iloc[train_idx]
    y_val_fold = y.iloc[val_idx]

    print(f"Train: {len(X_train_fold)}, Val: {len(X_val_fold)}")

    # Train ensemble for this fold
    fold_val_preds = []
    fold_test_preds = []
    fold_model_maes = []

    # Model 1: LightGBM
    print(f"\n[Fold {fold}] Training LightGBM...")
    m1 = lgb.LGBMRegressor(
        objective="mae", n_estimators=3000, learning_rate=0.03,
        num_leaves=100, max_depth=9, subsample=0.8, colsample_bytree=0.8,
        min_child_samples=10, reg_alpha=0.5, reg_lambda=0.5,
        random_state=42+fold, verbose=-1
    )
    m1.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)],
           callbacks=[lgb.early_stopping(200, verbose=False)])
    v1 = m1.predict(X_val_fold)
    t1 = m1.predict(X_test)
    mae1 = mean_absolute_error(y_val_fold, v1)
    print(f"   MAE: {mae1:.4f}")
    fold_val_preds.append(v1)
    fold_test_preds.append(t1)
    fold_model_maes.append(mae1)

    # Model 2: XGBoost
    print(f"[Fold {fold}] Training XGBoost...")
    m2 = xgb.XGBRegressor(
        objective="reg:absoluteerror", n_estimators=3000, learning_rate=0.03,
        max_depth=9, subsample=0.8, colsample_bytree=0.8,
        reg_alpha=0.5, reg_lambda=0.5, random_state=43+fold, verbosity=0
    )
    m2.fit(X_train_fold, y_train_fold, eval_set=[(X_val_fold, y_val_fold)], verbose=False)
    v2 = m2.predict(X_val_fold)
    t2 = m2.predict(X_test)
    mae2 = mean_absolute_error(y_val_fold, v2)
    print(f"   MAE: {mae2:.4f}")
    fold_val_preds.append(v2)
    fold_test_preds.append(t2)
    fold_model_maes.append(mae2)

    # Model 3: CatBoost
    print(f"[Fold {fold}] Training CatBoost...")
    m3 = CatBoostRegressor(
        loss_function="MAE", iterations=3000, learning_rate=0.03, depth=9,
        l2_leaf_reg=5, random_seed=44+fold, verbose=False
    )
    m3.fit(X_train_fold, y_train_fold, eval_set=(X_val_fold, y_val_fold),
           early_stopping_rounds=200, verbose=False)
    v3 = m3.predict(X_val_fold)
    t3 = m3.predict(X_test)
    mae3 = mean_absolute_error(y_val_fold, v3)
    print(f"   MAE: {mae3:.4f}")
    fold_val_preds.append(v3)
    fold_test_preds.append(t3)
    fold_model_maes.append(mae3)

    # Weighted ensemble for this fold
    weights = np.array([1/mae for mae in fold_model_maes])
    weights = weights / weights.sum()

    fold_ensemble_val = sum(w * pred for w, pred in zip(weights, fold_val_preds))
    fold_ensemble_test = sum(w * pred for w, pred in zip(weights, fold_test_preds))

    fold_mae = mean_absolute_error(y_val_fold, fold_ensemble_val)
    print(f"\nâœ¨ Fold {fold} Ensemble MAE: {fold_mae:.4f}")

    fold_scores.append(fold_mae)
    test_preds_all.append(fold_ensemble_test)


ðŸŽ¯ K-FOLD CROSS-VALIDATION TRAINING

FOLD 1/5
Train: 53760, Val: 13440

[Fold 1] Training LightGBM...
   MAE: 261.5687
[Fold 1] Training XGBoost...
   MAE: 255.6218
[Fold 1] Training CatBoost...
   MAE: 244.6397

âœ¨ Fold 1 Ensemble MAE: 249.8675

FOLD 2/5
Train: 53760, Val: 13440

[Fold 2] Training LightGBM...
   MAE: 271.3213
[Fold 2] Training XGBoost...
   MAE: 261.3880
[Fold 2] Training CatBoost...
   MAE: 253.7790

âœ¨ Fold 2 Ensemble MAE: 258.0515

FOLD 3/5
Train: 53760, Val: 13440

[Fold 3] Training LightGBM...
   MAE: 266.6723
[Fold 3] Training XGBoost...
   MAE: 260.0295
[Fold 3] Training CatBoost...
   MAE: 250.4105

âœ¨ Fold 3 Ensemble MAE: 254.9920

FOLD 4/5
Train: 53760, Val: 13440

[Fold 4] Training LightGBM...
   MAE: 265.0039
[Fold 4] Training XGBoost...
   MAE: 255.7322
[Fold 4] Training CatBoost...
   MAE: 245.8679

âœ¨ Fold 4 Ensemble MAE: 251.3138

FOLD 5/5
Train: 53760, Val: 13440

[Fold 5] Training LightGBM...
   MAE: 259.8517
[Fold 5] Training XGBoost...
   MA

In [None]:
# Average predictions across all folds
final_test_preds = np.mean(test_preds_all, axis=0)

avg_cv_score = np.mean(fold_scores)
std_cv_score = np.std(fold_scores)

print(f"\nCross-Validation Results:")
for i, score in enumerate(fold_scores, 1):
    print(f"   Fold {i}: {score:.4f}")
print(f"\nâœ¨ Average CV MAE: {avg_cv_score:.4f} (+/- {std_cv_score:.4f})")

# Light calibration (less aggressive than before)
train_mean = y.mean()
pred_mean = final_test_preds.mean()
adjustment = (train_mean - pred_mean) * 0.3  # Only 30% adjustment to reduce overfitting

final_test_preds = final_test_preds + adjustment

print(f"\nCalibration:")
print(f"   Adjustment: {adjustment:.2f}")
print(f"   Predictions: mean={final_test_preds.mean():.1f}, std={final_test_preds.std():.1f}")
print(f"   Target:      mean={train_mean:.1f}, std={y.std():.1f}")

# Clip to safe range
final_test_preds = np.clip(final_test_preds, y.min() * 0.8, y.max() * 1.2)



ðŸ”¥ FINAL ENSEMBLE

Cross-Validation Results:
   Fold 1: 249.8675
   Fold 2: 258.0515
   Fold 3: 254.9920
   Fold 4: 251.3138
   Fold 5: 247.9288

âœ¨ Average CV MAE: 252.4307 (+/- 3.6398)

Calibration:
   Adjustment: 5.49
   Predictions: mean=1988.9, std=1046.2
   Target:      mean=2001.7, std=1194.7


In [None]:
submission = pd.DataFrame({
    "service_key": test.service_key,
    "final_service_units": final_test_preds
})

submission.to_csv("submission.csv", index=False)

print(f"\n FINAL STATISTICS:")
print(f"   CV MAE: {avg_cv_score:.4f} (+/- {std_cv_score:.4f})")
print(f"   Predictions: mean={final_test_preds.mean():.1f}, std={final_test_preds.std():.1f}")
print(f"   Predictions: min={final_test_preds.min():.0f}, max={final_test_preds.max():.0f}")
print("\nFirst 10 predictions:")
print(submission.head(10))


SUBMISSION CREATED

 FINAL STATISTICS:
   CV MAE: 252.4307 (+/- 3.6398)
   Predictions: mean=1988.9, std=1046.2
   Predictions: min=66, max=8577

First 10 predictions:
        service_key  final_service_units
0  2025-02-11_46_45          3888.687677
1  2025-01-20_17_23          1618.974891
2  2025-01-08_02_14          1194.920319
3  2025-01-08_08_47           847.954392
4  2025-01-08_09_46          3094.590083
5  2025-01-21_45_05          1368.470940
6  2025-02-26_47_08          1367.945887
7  2025-01-03_02_19          2481.110630
8  2025-02-11_02_30          1490.038922
9  2025-01-25_05_45          2548.422487
