In [None]:
# =========================
# 1. Setup and data loading
# =========================
# Mounts Google Drive, reads the four raw CSVs (travel + survey, train + test)
# and joins travel with survey on the shared "ID" key.

from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np

# Root folder on Drive that holds the hackathon files; edit if yours differs.
BASE_PATH = "/content/drive/MyDrive/MIT_AAIDSP_GL/Hackathon2026_ShinkansenBulletTrain"

# File locations: train pair first, then test pair.
travel_train_path = f"{BASE_PATH}/Traveldata_train_(2).csv"
survey_train_path = f"{BASE_PATH}/Surveydata_train_(2).csv"
travel_test_path  = f"{BASE_PATH}/Traveldata_test_(2) (1).csv"
survey_test_path  = f"{BASE_PATH}/Surveydata_test_(2).csv"

# Read all four tables in one pass (order matches the unpacking targets).
df_travel_train, df_survey_train, df_travel_test, df_survey_test = (
    pd.read_csv(_csv_path)
    for _csv_path in (
        travel_train_path,
        survey_train_path,
        travel_test_path,
        survey_test_path,
    )
)

# Report the raw shapes before joining.
for _label, _frame in (
    ("Train travel", df_travel_train),
    ("Train survey", df_survey_train),
    ("Test travel", df_travel_test),
    ("Test survey", df_survey_test),
):
    print(f"{_label} shape:", _frame.shape)

# Inner join keeps only IDs present in both tables. The suffixes only apply
# to column names that exist on both sides (other than the join key).
df_train_full = df_travel_train.merge(
    df_survey_train,
    on="ID",
    how="inner",
    suffixes=("_travel", "_survey_train"),
)

df_test_full = df_travel_test.merge(
    df_survey_test,
    on="ID",
    how="inner",
    suffixes=("_travel", "_survey"),
)

print("Full train shape:", df_train_full.shape)
print("Full test shape:", df_test_full.shape)
df_train_full.head()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Train travel shape: (94379, 9)
Train survey shape: (94379, 17)
Test travel shape: (35602, 9)
Test survey shape: (35602, 16)
Full train shape: (94379, 25)
Full test shape: (35602, 24)


Unnamed: 0,ID,Gender,Customer_Type,Age,Type_Travel,Travel_Class,Travel_Distance,Departure_Delay_in_Mins,Arrival_Delay_in_Mins,Overall_Experience,...,Onboard_Wifi_Service,Onboard_Entertainment,Online_Support,Ease_of_Online_Booking,Onboard_Service,Legroom,Baggage_Handling,CheckIn_Service,Cleanliness,Online_Boarding
0,98800001,Female,Loyal Customer,52.0,,Business,272,0.0,5.0,0,...,Good,Needs Improvement,Acceptable,Needs Improvement,Needs Improvement,Acceptable,Needs Improvement,Good,Needs Improvement,Poor
1,98800002,Male,Loyal Customer,48.0,Personal Travel,Eco,2200,9.0,0.0,0,...,Good,Poor,Good,Good,Excellent,Needs Improvement,Poor,Needs Improvement,Good,Good
2,98800003,Female,Loyal Customer,43.0,Business Travel,Business,1061,77.0,119.0,1,...,Needs Improvement,Good,Excellent,Excellent,Excellent,Excellent,Excellent,Good,Excellent,Excellent
3,98800004,Female,Loyal Customer,44.0,Business Travel,Business,780,13.0,18.0,0,...,Acceptable,Needs Improvement,Acceptable,Acceptable,Acceptable,Acceptable,Acceptable,Good,Acceptable,Acceptable
4,98800005,Female,Loyal Customer,50.0,Business Travel,Business,1981,0.0,0.0,1,...,Needs Improvement,Good,Excellent,Good,Good,Good,Good,Good,Good,Good


In [None]:
# ==========================================
# 2. Clean column names and define X and y
# ==========================================


def _strip_merge_suffix(name, suffixes):
    """Return `name` without its trailing merge suffix.

    `suffixes` is checked in order and only the first one that `name` ends
    with is removed. Only the trailing occurrence is cut, so a column whose
    name happens to contain the suffix text elsewhere is left intact
    (``str.replace`` would have removed every occurrence).
    """
    for suffix in suffixes:
        if name.endswith(suffix):
            return name[: -len(suffix)]
    return name


# Undo the suffixes that the train/test merges may have attached.
# The check order mirrors the suffixes used by each merge.
df_train_full.columns = [
    _strip_merge_suffix(col, ("_survey_train", "_travel"))
    for col in df_train_full.columns
]
df_test_full.columns = [
    _strip_merge_suffix(col, ("_survey", "_travel"))
    for col in df_test_full.columns
]

# Target
TARGET_COL = "Overall_Experience"
assert TARGET_COL in df_train_full.columns, "Overall_Experience not found in train data."

y = df_train_full[TARGET_COL]

# Features: columns present in BOTH train and test, excluding ID and target.
train_feature_cols = df_train_full.columns.difference(["ID", TARGET_COL])
test_feature_cols  = df_test_full.columns.difference(["ID"])

# Sorted so that train and test share one deterministic column order.
common_cols = sorted(set(train_feature_cols) & set(test_feature_cols))
X = df_train_full[common_cols].copy()
X_test_final = df_test_full[common_cols].copy()

print("Number of features:", len(common_cols))
X.head()

Number of features: 23


Unnamed: 0,Age,Arrival_Delay_in_Mins,Arrival_Time_Convenient,Baggage_Handling,Catering,CheckIn_Service,Cleanliness,Customer_Type,Departure_Delay_in_Mins,Ease_of_Online_Booking,...,Onboard_Service,Onboard_Wifi_Service,Online_Boarding,Online_Support,Platform_Location,Seat_Class,Seat_Comfort,Travel_Class,Travel_Distance,Type_Travel
0,52.0,5.0,Excellent,Needs Improvement,Excellent,Good,Needs Improvement,Loyal Customer,0.0,Needs Improvement,...,Needs Improvement,Good,Poor,Acceptable,Very Convenient,Green Car,Needs Improvement,Business,272,
1,48.0,0.0,Excellent,Poor,Poor,Needs Improvement,Good,Loyal Customer,9.0,Good,...,Excellent,Good,Good,Good,Needs Improvement,Ordinary,Poor,Eco,2200,Personal Travel
2,43.0,119.0,Needs Improvement,Excellent,Needs Improvement,Good,Excellent,Loyal Customer,77.0,Excellent,...,Excellent,Needs Improvement,Excellent,Excellent,Needs Improvement,Green Car,Needs Improvement,Business,1061,Business Travel
3,44.0,18.0,Needs Improvement,Acceptable,,Good,Acceptable,Loyal Customer,13.0,Acceptable,...,Acceptable,Acceptable,Acceptable,Acceptable,Needs Improvement,Ordinary,Acceptable,Business,780,Business Travel
4,50.0,0.0,Acceptable,Good,Acceptable,Good,Good,Loyal Customer,0.0,Good,...,Good,Needs Improvement,Good,Excellent,Manageable,Ordinary,Acceptable,Business,1981,Business Travel


In [None]:
# ==========================================
# 3. Handle missing values (light touch)
# ==========================================

# CatBoost tolerates missing values on its own; the fill below just makes the
# treatment explicit and identical for train and test.

# Numeric columns: fill with the TRAIN median (the same value is reused for
# test so that no statistic is computed from the test set).
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
for col, med in X[num_cols].median().items():
    X[col] = X[col].fillna(med)
    X_test_final[col] = X_test_final[col].fillna(med)

# Categorical (object) columns: fill with an explicit "Missing" label.
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
for _frame in (X, X_test_final):
    for col in cat_cols:
        _frame[col] = _frame[col].fillna("Missing")

print("Numeric cols:", len(num_cols))
print("Categorical cols:", len(cat_cols))

Numeric cols: 4
Categorical cols: 19


In [None]:
# ==========================================================
# 4. Train/validation split and CatBoost with Stratified K-Fold
# ==========================================================

!pip install catboost -q

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Indices of categorical features for CatBoost
cat_feature_indices = [X.columns.get_loc(c) for c in cat_cols]

X_np = X.values
y_np = y.values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []

cat_params = {
    "iterations": 1200,
    "learning_rate": 0.03,
    "depth": 8,
    "l2_leaf_reg": 3,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "random_seed": 42,
    "bootstrap_type": "Bayesian",
    "bagging_temperature": 0.5,
    "od_type": "Iter",
    "od_wait": 50,
    "verbose": 200
}

fold_idx = 1
for train_idx, val_idx in skf.split(X_np, y_np):
    X_tr, X_val = X_np[train_idx], X_np[val_idx]
    y_tr, y_val = y_np[train_idx], y_np[val_idx]

    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool   = Pool(X_val, y_val, cat_features=cat_feature_indices)

    model = CatBoostClassifier(**cat_params)
    model.fit(train_pool, eval_set=val_pool, use_best_model=True)

    y_val_pred = model.predict(val_pool)
    acc = accuracy_score(y_val, y_val_pred)
    fold_accuracies.append(acc)
    print(f"Fold {fold_idx} accuracy: {acc:.5f}")
    fold_idx += 1

print("\nCV accuracies:", [round(a, 5) for a in fold_accuracies])
print("Mean CV accuracy:", np.mean(fold_accuracies))
print("Std CV accuracy:", np.std(fold_accuracies))

0:	learn: 0.8480988	test: 0.8448294	best: 0.8448294 (0)	total: 648ms	remaining: 12m 56s
200:	learn: 0.9556177	test: 0.9487179	best: 0.9487179 (200)	total: 1m 41s	remaining: 8m 25s
400:	learn: 0.9636173	test: 0.9547044	best: 0.9548633 (398)	total: 3m 29s	remaining: 6m 56s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9557109557
bestIteration = 499

Shrink model to first 500 iterations.
Fold 1 accuracy: 0.95571
0:	learn: 0.8479001	test: 0.8525111	best: 0.8525111 (0)	total: 401ms	remaining: 8m 1s
200:	learn: 0.9540018	test: 0.9522674	best: 0.9524264 (195)	total: 1m 41s	remaining: 8m 23s
400:	learn: 0.9621604	test: 0.9579360	best: 0.9579360 (396)	total: 3m 24s	remaining: 6m 48s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.9588366179
bestIteration = 484

Shrink model to first 485 iterations.
Fold 2 accuracy: 0.95884
0:	learn: 0.8482842	test: 0.8500212	best: 0.8500212 (0)	total: 612ms	remaining: 12m 14s
200:	learn: 0.9546376	test: 0.9499364	best: 

In [None]:
# ==========================================================
# Phase 2: High‑Impact CatBoost Hyperparameter Search
# ==========================================================
# Evaluates a small hand-picked grid with 5-fold stratified CV, keeps the
# configuration with the best mean accuracy in `best_params`, and retrains it
# on the full training set as `best_model`.

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
import numpy as np

# Relies on cat_feature_indices, X_np and y_np defined by the baseline CV cell.
print("Starting Phase 2 hyperparameter search...")

# Parameter grid (small but powerful)
param_grid = [
    # Slightly deeper, more regularized
    {"iterations": 1500, "learning_rate": 0.025, "depth": 8, "l2_leaf_reg": 5, "random_strength": 1.5},

    # Shallower but more trees
    {"iterations": 2000, "learning_rate": 0.02, "depth": 7, "l2_leaf_reg": 3, "random_strength": 1.0},

    # Stronger regularization
    {"iterations": 1500, "learning_rate": 0.02, "depth": 7, "l2_leaf_reg": 7, "random_strength": 2.0},

    # More aggressive bagging
    {"iterations": 1800, "learning_rate": 0.025, "depth": 8, "l2_leaf_reg": 4, "bagging_temperature": 1.0},

    # Higher border count for numeric precision
    {"iterations": 1600, "learning_rate": 0.03, "depth": 8, "l2_leaf_reg": 3, "border_count": 128},

    # Balanced setup
    {"iterations": 1400, "learning_rate": 0.02, "depth": 6, "l2_leaf_reg": 5, "random_strength": 1.2},

    # Slightly deeper with stronger L2
    {"iterations": 1700, "learning_rate": 0.025, "depth": 9, "l2_leaf_reg": 6, "random_strength": 1.0},

    # High‑iteration, low‑learning‑rate
    {"iterations": 2200, "learning_rate": 0.015, "depth": 7, "l2_leaf_reg": 4, "random_strength": 1.5},
]

# Settings shared by every configuration in the grid.
base_params = {
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "bootstrap_type": "Bayesian",
    "od_type": "Iter",
    "od_wait": 50,
    "verbose": 0,
    "random_seed": 42,
}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_acc = 0
best_params = None
# Mean number of trees the winning configuration actually kept per fold
# after early stopping / use_best_model shrinking.
best_tree_count = None

for idx, p in enumerate(param_grid, start=1):
    params = base_params.copy()
    params.update(p)

    print(f"\nTesting configuration {idx}/{len(param_grid)}:")
    print(params)

    fold_accs = []
    fold_tree_counts = []

    for train_idx, val_idx in skf.split(X_np, y_np):
        X_tr, X_val = X_np[train_idx], X_np[val_idx]
        y_tr, y_val = y_np[train_idx], y_np[val_idx]

        train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices)
        val_pool   = Pool(X_val, y_val, cat_features=cat_feature_indices)

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, use_best_model=True)

        # With use_best_model=True the fitted model is shrunk to its best
        # iteration, so tree_count_ is the size that produced this fold's score.
        fold_tree_counts.append(model.tree_count_)

        preds = model.predict(val_pool)
        acc = accuracy_score(y_val, preds)
        fold_accs.append(acc)

    mean_acc = np.mean(fold_accs)
    print(f"Mean CV accuracy: {mean_acc:.5f}")

    if mean_acc > best_acc:
        best_acc = mean_acc
        best_params = params
        best_tree_count = int(round(np.mean(fold_tree_counts)))

print("\n======================================")
print("Best CV accuracy:", best_acc)
print("Best parameters:", best_params)
print("======================================")

if best_params is None:
    # Only reachable if no configuration scored above 0 (e.g. NaN accuracies).
    raise RuntimeError("Hyperparameter search produced no valid configuration to retrain.")

# Retrain best model on full dataset.
# The full-data fit has no eval set, so the overfitting detector cannot stop
# it; without an explicit cap it would train the whole `iterations` budget,
# i.e. a different (larger) model than the one the CV score describes.
# `best_params` itself is left untouched for later cells.
final_params = dict(best_params, iterations=max(1, best_tree_count))

print("\nRetraining best model on full dataset...")
print(f"Using {final_params['iterations']} iterations (mean early-stopped tree count across folds).")
full_pool = Pool(X_np, y_np, cat_features=cat_feature_indices)
best_model = CatBoostClassifier(**final_params)
best_model.fit(full_pool, verbose=200)

print("Best model trained and ready.")

Starting Phase 2 hyperparameter search...

Testing configuration 1/8:
{'loss_function': 'Logloss', 'eval_metric': 'Accuracy', 'bootstrap_type': 'Bayesian', 'od_type': 'Iter', 'od_wait': 50, 'verbose': 0, 'random_seed': 42, 'iterations': 1500, 'learning_rate': 0.025, 'depth': 8, 'l2_leaf_reg': 5, 'random_strength': 1.5}
Mean CV accuracy: 0.95512

Testing configuration 2/8:
{'loss_function': 'Logloss', 'eval_metric': 'Accuracy', 'bootstrap_type': 'Bayesian', 'od_type': 'Iter', 'od_wait': 50, 'verbose': 0, 'random_seed': 42, 'iterations': 2000, 'learning_rate': 0.02, 'depth': 7, 'l2_leaf_reg': 3, 'random_strength': 1.0}
Mean CV accuracy: 0.95530

Testing configuration 3/8:
{'loss_function': 'Logloss', 'eval_metric': 'Accuracy', 'bootstrap_type': 'Bayesian', 'od_type': 'Iter', 'od_wait': 50, 'verbose': 0, 'random_seed': 42, 'iterations': 1500, 'learning_rate': 0.02, 'depth': 7, 'l2_leaf_reg': 7, 'random_strength': 2.0}
Mean CV accuracy: 0.95465

Testing configuration 4/8:
{'loss_function':

In [None]:
# ==========================================================
# PHASE 3 — Advanced Feature Engineering
# ==========================================================
# Builds X_fe / X_test_fe from the imputed X / X_test_final and refits
# CatBoost with the Phase 2 `best_params` on the engineered features.

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool

print("Starting Phase 3 feature engineering...")

# Make copies so we don't overwrite earlier data
X_fe = X.copy()
X_test_fe = X_test_final.copy()

# ==========================================================
# 1. Ordinal encoding for satisfaction-like columns
# ==========================================================

# "Missing" is the placeholder written by the imputation cell; it is kept as
# the sentinel -1 in the encoded column itself.
ordinal_map = {
    "Extremely Poor": 0,
    "Poor": 1,
    "Needs Improvement": 2,
    "Acceptable": 3,
    "Good": 4,
    "Excellent": 5,
    "Missing": -1
}

ordinal_cols = [
    "Seat_Comfort", "Arrival_Time_Convenient", "Catering",
    "Onboard_Wifi_Service", "Onboard_Entertainment", "Online_Support",
    "Ease_of_Online_Booking", "Onboard_Service", "Legroom",
    "Baggage_Handling", "CheckIn_Service", "Cleanliness",
    "Online_Boarding"
]

# Pairs of service ratings whose difference is added as a feature.
pairs = [
    ("Onboard_Service", "Cleanliness"),
    ("Onboard_Wifi_Service", "Online_Support"),
    ("Seat_Comfort", "Legroom"),
]

# Ordinal columns that actually exist in the feature frame.
satisfaction_cols = [c for c in ordinal_cols if c in X_fe.columns]


def _add_row_level_features(frame):
    """Add every feature that depends only on the row itself.

    Applied identically to train and test so both frames end up with the
    same columns in the same order. Mutates and returns `frame`.
    """
    # 1. Ordinal encoding (labels absent from ordinal_map become NaN).
    for col in satisfaction_cols:
        frame[col] = frame[col].map(ordinal_map)

    # 2. Delay-normalized features (+1 avoids division by zero distance).
    frame["Delay_per_km"] = frame["Arrival_Delay_in_Mins"] / (frame["Travel_Distance"] + 1)
    frame["Total_Delay"] = frame["Arrival_Delay_in_Mins"] + frame["Departure_Delay_in_Mins"]

    # View of the ratings with the -1 "Missing" sentinel (and unmapped labels)
    # masked to NaN, so a missing answer is not treated as a score below
    # "Extremely Poor" in the arithmetic features below.
    rated = frame[satisfaction_cols].where(frame[satisfaction_cols] >= 0)

    # 3. Satisfaction deltas; NaN when either rating of the pair is missing.
    for a, b in pairs:
        if a in rated.columns and b in rated.columns:
            frame[f"{a}_minus_{b}"] = rated[a] - rated[b]

    # 4. Binary flags
    frame["Long_Distance"] = (frame["Travel_Distance"] > 1500).astype(int)
    frame["Major_Delay"] = (frame["Arrival_Delay_in_Mins"] > 60).astype(int)

    # 5. Aggregated satisfaction score over the answered items only
    #    (sum is 0 and mean is NaN for a row with no answered item).
    frame["Satisfaction_Sum"] = rated.sum(axis=1)
    frame["Satisfaction_Mean"] = rated.mean(axis=1)

    return frame


X_fe = _add_row_level_features(X_fe)
X_test_fe = _add_row_level_features(X_test_fe)

# ==========================================================
# 6. Frequency encoding for the nominal categories
# ==========================================================

freq_cols = ["Customer_Type", "Type_Travel", "Travel_Class", "Seat_Class"]

for col in freq_cols:
    if col in X_fe.columns:
        # Frequencies come from TRAIN only; test levels never seen in train get 0.
        freq = X_fe[col].value_counts(normalize=True)
        X_fe[col + "_freq"] = X_fe[col].map(freq)
        X_test_fe[col + "_freq"] = X_test_fe[col].map(freq).fillna(0)

# ==========================================================
# 7. Rebuild CatBoost pools
# ==========================================================

# Identify categorical columns again (non-numeric)
cat_cols_fe = X_fe.select_dtypes(include=["object"]).columns.tolist()
cat_feature_indices_fe = [X_fe.columns.get_loc(c) for c in cat_cols_fe]

X_np_fe = X_fe.values
X_test_np_fe = X_test_fe.values
y_np = y.values

# ==========================================================
# 8. Retrain CatBoost using best Phase 2 parameters
# ==========================================================

print("Retraining CatBoost with engineered features...")

train_pool_fe = Pool(X_np_fe, y_np, cat_features=cat_feature_indices_fe)
test_pool_fe = Pool(X_test_np_fe, cat_features=cat_feature_indices_fe)

# `best_params` comes from the Phase 2 search cell, which must have run first.
model_phase3 = CatBoostClassifier(**best_params)
model_phase3.fit(train_pool_fe, verbose=200)

print("Phase 3 model trained.")

Starting Phase 3 feature engineering...
Retraining CatBoost with engineered features...
0:	learn: 0.8798885	total: 313ms	remaining: 8m 20s
200:	learn: 0.9500630	total: 42.2s	remaining: 4m 53s
400:	learn: 0.9586243	total: 1m 23s	remaining: 4m 9s
600:	learn: 0.9634029	total: 2m 3s	remaining: 3m 25s
800:	learn: 0.9669736	total: 2m 44s	remaining: 2m 43s
1000:	learn: 0.9704913	total: 3m 32s	remaining: 2m 7s
1200:	learn: 0.9732674	total: 4m 15s	remaining: 1m 24s
1400:	learn: 0.9761917	total: 4m 57s	remaining: 42.3s
1599:	learn: 0.9786287	total: 5m 40s	remaining: 0us
Phase 3 model trained.


In [None]:
# ==========================================================
# PHASE 4 — Stacking Ensemble
# ==========================================================

print("Starting Phase 4 stacking ensemble...")

!pip install lightgbm xgboost -q

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from catboost import Pool
import lightgbm as lgb
import xgboost as xgb
import numpy as np

# Use engineered features from Phase 3
X_stack = X_fe.copy()
X_test_stack = X_test_fe.copy()

X_np_stack = X_stack.values
X_test_np_stack = X_test_stack.values
y_np = y.values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# OOF prediction matrices
oof_cat = np.zeros(len(X_stack))
oof_lgb = np.zeros(len(X_stack))
oof_xgb = np.zeros(len(X_stack))

# Test prediction matrices (averaged across folds)
test_pred_cat = np.zeros(len(X_test_stack))
test_pred_lgb = np.zeros(len(X_test_stack))
test_pred_xgb = np.zeros(len(X_test_stack))

fold_idx = 1

for train_idx, val_idx in skf.split(X_np_stack, y_np):
    print(f"\n=== Fold {fold_idx} ===")

    X_tr, X_val = X_np_stack[train_idx], X_np_stack[val_idx]
    y_tr, y_val = y_np[train_idx], y_np[val_idx]

    # -------------------------
    # 1. CatBoost (Phase 2 best)
    # -------------------------
    train_pool = Pool(X_tr, y_tr, cat_features=cat_feature_indices_fe)
    val_pool   = Pool(X_val, y_val, cat_features=cat_feature_indices_fe)
    test_pool  = Pool(X_test_np_stack, cat_features=cat_feature_indices_fe)

    model_cat = CatBoostClassifier(**best_params)
    model_cat.fit(train_pool, eval_set=val_pool, verbose=0)

    oof_cat[val_idx] = model_cat.predict(val_pool).astype(float)
    test_pred_cat += model_cat.predict(test_pool).astype(float) / skf.n_splits

# -------------------------
# 2. LightGBM (version‑safe)
# -------------------------
    model_lgb = lgb.LGBMClassifier(
          objective="binary",
          learning_rate=0.03,
          num_leaves=64,
          feature_fraction=0.8,
          bagging_fraction=0.8,
          bagging_freq=3,
          max_depth=-1,
          n_estimators=2000,
          random_state=42
    )

    model_lgb.fit(
          X_tr, y_tr,
          eval_set=[(X_val, y_val)],
          eval_metric="binary_logloss",
          callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )

oof_lgb[val_idx] = model_lgb.predict(X_val)
test_pred_lgb += model_lgb.predict(X_test_np_stack) / skf.n_splits

    # -------------------------
    # 3. XGBoost
    # -------------------------
    model_xgb = xgb.XGBClassifier(
        n_estimators=800,
        learning_rate=0.03,
        max_depth=7,
        subsample=0.8,
        colsample_bytree=0.8,
        eval_metric="logloss",
        random_state=42,
        tree_method="hist"
    )

    model_xgb.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    oof_xgb[val_idx] = model_xgb.predict(X_val)
    test_pred_xgb += model_xgb.predict(X_test_np_stack) / skf.n_splits

    fold_idx += 1

# ==========================================================
# Build meta-features
# ==========================================================

meta_train = np.vstack([oof_cat, oof_lgb, oof_xgb]).T
meta_test  = np.vstack([test_pred_cat, test_pred_lgb, test_pred_xgb]).T

print("\nMeta-feature matrix shape:", meta_train.shape)

# ==========================================================
# Train meta-learner
# ==========================================================

meta_model = LogisticRegression(max_iter=2000)
meta_model.fit(meta_train, y_np)

# OOF accuracy of stacked model
stack_oof_pred = meta_model.predict(meta_train)
stack_acc = accuracy_score(y_np, stack_oof_pred)

print("\nStacked model OOF accuracy:", stack_acc)

# ==========================================================
# Final test predictions
# ==========================================================

final_test_pred = meta_model.predict(meta_test).astype(int)

print("Phase 4 stacking complete.")

IndentationError: unexpected indent (ipython-input-2118763266.py, line 86)

In [None]:
# ==================================================
# 5. Retrain on full data and generate test predictions
# ==================================================
# Fits the baseline configuration (cat_params) on every training row and
# labels the test set. There is no eval set here, so the model trains the
# full number of iterations.

full_train_pool = Pool(data=X_np, label=y_np, cat_features=cat_feature_indices)
test_pool = Pool(data=X_test_final.values, cat_features=cat_feature_indices)

final_model = CatBoostClassifier(**cat_params)
final_model.fit(full_train_pool, verbose=200)

# Hard class labels for the test rows.
test_preds = final_model.predict(test_pool)

# The target is cast to int for the submission file; if the original labels
# were text (e.g. "satisfied"/"dissatisfied") a mapping back would be needed
# here instead. In this notebook the target appears to be 0/1 already.
submission = pd.DataFrame(
    {
        "ID": df_test_full["ID"],
        "Overall_Experience": test_preds.astype(int),
    }
)

submission.head()

0:	learn: 0.8450079	total: 584ms	remaining: 11m 40s
200:	learn: 0.9545344	total: 1m 54s	remaining: 9m 31s
400:	learn: 0.9624175	total: 3m 48s	remaining: 7m 34s
600:	learn: 0.9663802	total: 5m 43s	remaining: 5m 42s
800:	learn: 0.9691033	total: 7m 39s	remaining: 3m 48s
1000:	learn: 0.9726104	total: 9m 38s	remaining: 1m 54s
1199:	learn: 0.9754289	total: 11m 35s	remaining: 0us


Unnamed: 0,ID,Overall_Experience
0,99900001,1
1,99900002,1
2,99900003,1
3,99900004,0
4,99900005,1


In [None]:
# ==========================
# 6. Save submission to CSV
# ==========================
# Writes the current `submission` frame next to the input data on Drive,
# without the DataFrame index column.

output_path = BASE_PATH + "/submission_catboost03.csv"
submission.to_csv(path_or_buf=output_path, index=False)
print("Submission saved to:", output_path)

In [None]:
# ==================================================
# Final Prediction + Submission Block
# ==================================================
# Labels the test set with the Phase 2 `best_model` and writes a second
# submission file. Overwrites `test_pool`, `test_preds`, `submission` and
# `output_path` from the baseline cells.

import pandas as pd

print("Generating final predictions on test set...")

# Test data wrapped the same way as during training (un-engineered features).
test_pool = Pool(data=X_test_final.values, cat_features=cat_feature_indices)

# Predicted class labels, cast to int for the submission file.
test_preds = best_model.predict(test_pool).astype(int)

# One row per test ID with its predicted label.
submission = pd.DataFrame(
    {
        "ID": df_test_full["ID"],
        "Overall_Experience": test_preds,
    }
)

print("Preview of submission:")
print(submission.head())

# Save to Drive
output_path = BASE_PATH + "/submission_catboost_phase2_03.csv"
submission.to_csv(path_or_buf=output_path, index=False)

print(f"\nSubmission saved to: {output_path}")
print("All done — your predictions are ready.")

Generating final predictions on test set...
Preview of submission:
         ID  Overall_Experience
0  99900001                   1
1  99900002                   1
2  99900003                   1
3  99900004                   0
4  99900005                   1

Submission saved to: /content/drive/MyDrive/MIT_AAIDSP_GL/Hackathon2026_ShinkansenBulletTrain/submission_catboost_phase2_02.csv
All done — your predictions are ready.
