The Tabular Models for AML Project

In [176]:
# ============================================
# AML FINAL PROJECT - TABULAR MODELS ONLY (NO TEXT)
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    cohen_kappa_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier


In [177]:
# ============================================
# Helper evaluation functions
# ============================================

def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between `y_true` and `y_pred` with quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights="quadratic")
    return kappa

def evaluate_model(name, y_true, y_pred, verbose=True):
    """Score one set of predictions and optionally print a short report.

    Parameters
    ----------
    name : label stored under "model" and shown in the report header.
    y_true, y_pred : true and predicted labels, handed unchanged to the
        metric functions.
    verbose : when truthy, print accuracy, macro F1 and QWK (4 decimals).

    Returns
    -------
    dict with the keys "model", "accuracy", "macro_f1" and "qwk".
    """
    scores = {
        "model": name,
        "accuracy": accuracy_score(y_true, y_pred),
        "macro_f1": f1_score(y_true, y_pred, average="macro"),
        "qwk": quadratic_weighted_kappa(y_true, y_pred),
    }

    if not verbose:
        return scores

    print(f"\n=== {name} ===")
    print(f"Accuracy   : {scores['accuracy']:.4f}")
    print(f"Macro F1   : {scores['macro_f1']:.4f}")
    print(f"QWK        : {scores['qwk']:.4f}")
    return scores


In [178]:
# ============================================
# 1. Load translated training data
# ============================================

# Name of the translated training CSV, read relative to the working directory.
TRAIN_CSV_PATH = "train_fe_english.csv"  # adjust filename if needed
df = pd.read_csv(TRAIN_CSV_PATH)

# Quick sanity check: (rows, columns), then a preview of the first rows.
print(df.shape)
df.head()


(11565, 26)


Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,FinalBreed,ColorDiversity,Fully_Healthy,lang,Description_en
0,2,3,Tabby,,1,Black,White,,1,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,2,0,en,Nibble is a 3+ month old ball of cuteness. He ...
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0,2,0,en,I just found it alone yesterday near my apartm...
2,1,1,Mixed Breed,,1,Brown,White,,2,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,3,2,0,en,Their pregnant mother was dumped by her irresp...
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,...,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3,2,0,en,"Good guard dog, very alert, active, obedience ..."
4,1,1,Mixed Breed,,1,Black,,,2,1,...,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,3,1,0,en,This handsome yet cute boy is up for adoption....


In [179]:
# ============================================
# 2. BUILD X, y (TABULAR-ONLY FEATURES)
# ============================================

target_col = "AdoptionSpeed"

# PetID is not used as a feature; a copy is kept for record-keeping / fusion.
pet_ids = df["PetID"].copy()

# String / label columns excluded from the feature matrix
string_drop_cols = [
    "Breed1Name", "Breed2Name",
    "Color1Name", "Color2Name", "Color3Name",
    "StateName",
    "lang",
]

# Free-text columns: dropped entirely, this notebook uses no text features
text_drop_cols = ["Description", "Description_en"]

# Remaining non-feature columns (the target itself, identifiers, cleaned text)
raw_drop_cols = [target_col, "PetID", "Name", "RescuerID", "desc_clean"]

# Only drop what is actually present, so absent columns do not raise.
candidate_drop_cols = [*string_drop_cols, *text_drop_cols, *raw_drop_cols]
drop_cols = [col for col in candidate_drop_cols if col in df.columns]

y = df[target_col]

# Drop the listed columns, then keep ONLY numeric columns as a final safety net.
X = df.drop(columns=drop_cols).select_dtypes(include=["number"])

print("Final X shape (tabular-only):", X.shape)
print("First 20 feature columns:", X.columns[:20].tolist())
X.head()


Final X shape (tabular-only): (11565, 15)
First 20 feature columns: ['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Fee', 'VideoAmt', 'PhotoAmt', 'FinalBreed', 'ColorDiversity', 'Fully_Healthy']


Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Fee,VideoAmt,PhotoAmt,FinalBreed,ColorDiversity,Fully_Healthy
0,2,3,1,1,1,2,2,2,1,100,0,1.0,0,2,0
1,2,1,1,2,2,3,3,3,1,0,0,2.0,0,2,0
2,1,1,1,2,2,1,1,2,1,0,0,7.0,3,2,0
3,1,4,2,2,1,1,1,2,1,150,0,8.0,3,2,0
4,1,1,1,2,1,2,2,2,1,0,0,3.0,3,1,0


In [180]:
# ============================================
# 3. 5-FOLD STRATIFIED CROSS-VALIDATION (XGBOOST, TABULAR-ONLY)
# ============================================

# Plain arrays so the fold indices from StratifiedKFold can be used positionally.
X_all = X.to_numpy(dtype=np.float32)
y_all = y.to_numpy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# One entry per fold for each metric.
acc_scores, f1_scores, qwk_scores = [], [], []

for fold, (train_idx, val_idx) in enumerate(skf.split(X_all, y_all), start=1):
    print(f"\n---- Fold {fold} ----")

    X_tr, X_va = X_all[train_idx], X_all[val_idx]
    y_tr, y_va = y_all[train_idx], y_all[val_idx]

    # A fresh model per fold; hyperparameters mirror the tuned XGBoost
    # configuration trained in section 6 of this notebook.
    xgb_cv = XGBClassifier(
        n_estimators=350,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        reg_alpha=0.0,
        objective="multi:softprob",
        num_class=5,
        eval_metric="mlogloss",
        tree_method="hist",
        nthread=-1,
        random_state=42
    )

    xgb_cv.fit(X_tr, y_tr)
    # Predicted class = column index of the highest predicted probability.
    preds = np.argmax(xgb_cv.predict_proba(X_va), axis=1)

    # Reuse the notebook's evaluation helper so CV and hold-out metrics are
    # computed by exactly the same code; verbose=False keeps the fold log short.
    fold_scores = evaluate_model(f"XGBoost_CV_fold_{fold}", y_va, preds, verbose=False)
    acc_scores.append(fold_scores["accuracy"])
    f1_scores.append(fold_scores["macro_f1"])
    qwk_scores.append(fold_scores["qwk"])

# Mean and (population) standard deviation across the folds.
print("\n===== 5-Fold CV Results (XGBoost, TABULAR-ONLY) =====")
print("Accuracy:", np.mean(acc_scores), "+/-", np.std(acc_scores))
print("Macro F1:", np.mean(f1_scores), "+/-", np.std(f1_scores))
print("QWK    :", np.mean(qwk_scores), "+/-", np.std(qwk_scores))



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

===== 5-Fold CV Results (XGBoost, TABULAR-ONLY) =====
Accuracy: 0.4022481625594466 +/- 0.008623401168397387
Macro F1: 0.3213170312576542 +/- 0.007616144918480027
QWK    : 0.35013814970875395 +/- 0.015588322996636655


In [181]:
# ============================================
# 4. TRAIN/VAL SPLIT (80/20) WITH PetID
# ============================================

# Features, target and PetIDs are split together so the rows stay aligned;
# the split is stratified on the target and seeded for repeatability.
split_parts = train_test_split(
    X, y, pet_ids,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val, petid_train, petid_val = split_parts

print(f"Train shape: {X_train.shape}")
print(f"Val shape  : {X_val.shape}")


Train shape: (9252, 15)
Val shape  : (2313, 15)


In [182]:
# ============================================
# 5. SCALE FEATURES FOR MLP
# ============================================

# Scaling statistics are learned from the training split only and then
# applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [183]:
# ============================================
# 6. TRAIN BASELINE TABULAR MODELS
# ============================================

# Collects one metrics dict per model; turned into the leaderboard later.
results = []

# --- Decision Tree ---
dt_params = dict(
    max_depth=None,
    min_samples_split=2,
    random_state=42,
)
dt_clf = DecisionTreeClassifier(**dt_params).fit(X_train, y_train)

y_pred_dt = dt_clf.predict(X_val)
dt_scores = evaluate_model("DecisionTree_TabularOnly", y_val, y_pred_dt)
results.append(dt_scores)

# --- Random Forest ---
rf_params = dict(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

y_pred_rf = rf_clf.predict(X_val)
rf_scores = evaluate_model("RandomForest_TabularOnly", y_val, y_pred_rf)
results.append(rf_scores)

# --- XGBoost (tuned) ---
# The boosted models below are trained on plain arrays (float32 features).
X_train_np, X_val_np = X_train.to_numpy(dtype=np.float32), X_val.to_numpy(dtype=np.float32)
y_train_np, y_val_np = y_train.to_numpy(), y_val.to_numpy()

xgb_params = dict(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    tree_method="hist",
    nthread=-1,
    random_state=42,
)
xgb_clf = XGBClassifier(**xgb_params)
xgb_clf.fit(X_train_np, y_train_np)

# Class probabilities are kept (the ensemble cell blends them); the hard
# prediction is the column with the highest probability.
xgb_proba = xgb_clf.predict_proba(X_val_np)
y_pred_xgb = xgb_proba.argmax(axis=1).astype(int)
xgb_scores = evaluate_model("XGBoost_TabularOnly", y_val, y_pred_xgb)
results.append(xgb_scores)

# --- CatBoost (numeric-only, tuned) ---
# NOTE(review): od_type / od_wait configure the overfitting detector, but no
# eval_set is passed to fit() below, so it presumably never triggers and all
# 400 iterations run (the training log ends at iteration 399) - confirm
# against the CatBoost docs before relying on early stopping here.
cb_params = dict(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5.0,
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=40,
)
cb_clf = CatBoostClassifier(**cb_params)
cb_clf.fit(X_train_np, y_train_np)

# Probabilities are kept for the ensemble cell; hard labels via arg-max.
cb_proba = cb_clf.predict_proba(X_val_np)
y_pred_cb = cb_proba.argmax(axis=1).astype(int)
cb_scores = evaluate_model("CatBoost_TabularOnly", y_val, y_pred_cb)
results.append(cb_scores)

# --- MLP (shallow neural net) ---
# Trained on the standardized features, unlike the tree models above.
mlp_params = dict(
    hidden_layer_sizes=(128,),
    activation="relu",
    solver="adam",
    batch_size=128,
    learning_rate_init=0.001,
    max_iter=50,
    random_state=42,
)
mlp_clf = MLPClassifier(**mlp_params)
mlp_clf.fit(X_train_scaled, y_train)

mlp_proba = mlp_clf.predict_proba(X_val_scaled)
y_pred_mlp = mlp_proba.argmax(axis=1).astype(int)
mlp_scores = evaluate_model("MLP_TabularOnly", y_val, y_pred_mlp)
results.append(mlp_scores)



=== DecisionTree_TabularOnly ===
Accuracy   : 0.3320
Macro F1   : 0.2853
QWK        : 0.2059

=== RandomForest_TabularOnly ===
Accuracy   : 0.3770
Macro F1   : 0.3191
QWK        : 0.3170

=== XGBoost_TabularOnly ===
Accuracy   : 0.3887
Macro F1   : 0.3106
QWK        : 0.3462
0:	learn: 0.2568712	total: 5.94ms	remaining: 2.37s
100:	learn: 0.3311820	total: 561ms	remaining: 1.66s
200:	learn: 0.3501638	total: 1.16s	remaining: 1.15s
300:	learn: 0.3723761	total: 1.74s	remaining: 572ms
399:	learn: 0.3897409	total: 2.3s	remaining: 0us

=== CatBoost_TabularOnly ===
Accuracy   : 0.4060
Macro F1   : 0.3124
QWK        : 0.3565

=== MLP_TabularOnly ===
Accuracy   : 0.3856
Macro F1   : 0.2982
QWK        : 0.3232




In [184]:
# ============================================
# 7. WEIGHTED SOFT ENSEMBLE: XGB + CATBOOST (TABULAR-ONLY)
# ============================================

# Candidate blend weights: `a` multiplies the XGBoost probabilities and
# `1 - a` the CatBoost probabilities.
# NOTE(review): this grid never gives CatBoost the larger weight, and alpha is
# selected on the same validation split the ensemble is then scored on, so the
# reported ensemble QWK is an optimistic estimate.
alphas = [0.5, 0.6, 0.7, 0.8, 0.9]

# Start below the metric's range: QWK itself can be as low as -1, so a -1
# sentinel could leave best_alpha as None and break the formatting below.
best_qwk = -np.inf
best_alpha = None
best_pred = None

for a in alphas:
    blended = a * xgb_proba + (1 - a) * cb_proba
    blended_pred = np.argmax(blended, axis=1).astype(int)
    qwk = quadratic_weighted_kappa(y_val, blended_pred)
    print(f"alpha={a:.1f}, QWK={qwk:.4f}")

    # Strict '>' keeps the first alpha on ties.
    if qwk > best_qwk:
        best_qwk = qwk
        best_alpha = a
        best_pred = blended_pred

# Record the winning blend on the leaderboard under a name carrying its alpha.
results.append(
    evaluate_model(f"WeightedEnsemble_XGB_CB_TabularOnly_alpha_{best_alpha:.1f}", y_val, best_pred)
)

print(f"\nBest weighted ensemble alpha (tabular-only): {best_alpha}, QWK={best_qwk:.4f}")


alpha=0.5, QWK=0.3535
alpha=0.6, QWK=0.3551
alpha=0.7, QWK=0.3519
alpha=0.8, QWK=0.3567
alpha=0.9, QWK=0.3540

=== WeightedEnsemble_XGB_CB_TabularOnly_alpha_0.8 ===
Accuracy   : 0.3934
Macro F1   : 0.3148
QWK        : 0.3567

Best weighted ensemble alpha (tabular-only): 0.8, QWK=0.3567


In [185]:
# ============================================
# 8. LEADERBOARD (TABULAR-ONLY)
# ============================================

# One row per evaluated model, best QWK first, with a fresh 0..n-1 index.
results_df = (
    pd.DataFrame(results)
    .sort_values(by="qwk", ascending=False)
    .reset_index(drop=True)
)
results_df


Unnamed: 0,model,accuracy,macro_f1,qwk
0,WeightedEnsemble_XGB_CB_TabularOnly_alpha_0.8,0.393428,0.314838,0.356666
1,CatBoost_TabularOnly,0.405966,0.312415,0.356459
2,XGBoost_TabularOnly,0.388673,0.310592,0.346168
3,MLP_TabularOnly,0.385646,0.29825,0.323234
4,RandomForest_TabularOnly,0.377,0.319124,0.317043
5,DecisionTree_TabularOnly,0.332036,0.285343,0.205926
