In [7]:
# ============================================
# AML FINAL PROJECT - TABULAR MODELS (TM2)
# Final clean version: translated CSV, no translation step
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    cohen_kappa_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# ============================================
# Helper evaluation functions
# ============================================

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")

def evaluate_model(name, y_true, y_pred, verbose=True):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    qwk = quadratic_weighted_kappa(y_true, y_pred)
    
    if verbose:
        print(f"\n=== {name} ===")
        print(f"Accuracy   : {acc:.4f}")
        print(f"Macro F1   : {macro_f1:.4f}")
        print(f"QWK        : {qwk:.4f}")
    
    return {
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1,
        "qwk": qwk
    }

In [10]:
# ============================================
# 1. Load translated training data
# ============================================

df = pd.read_csv("train_fe_english.csv")  

print(df.shape)
df.head()

(11565, 26)


Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,FinalBreed,ColorDiversity,Fully_Healthy,lang,Description_en
0,2,3,Tabby,,1,Black,White,,1,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,2,0,en,Nibble is a 3+ month old ball of cuteness. He ...
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0,2,0,en,I just found it alone yesterday near my apartm...
2,1,1,Mixed Breed,,1,Brown,White,,2,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,3,2,0,en,Their pregnant mother was dumped by her irresp...
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,...,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3,2,0,en,"Good guard dog, very alert, active, obedience ..."
4,1,1,Mixed Breed,,1,Black,,,2,1,...,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,3,1,0,en,This handsome yet cute boy is up for adoption....


In [11]:
# ============================================
# 2. TEXT FEATURES: VADER, KEYWORDS, LENGTH
# ============================================

analyzer = SentimentIntensityAnalyzer()

# Ensure Description_en exists and is string
if "Description_en" not in df.columns:
    # Fallback: if somehow missing, just copy Description
    df["Description"] = df["Description"].fillna("").astype(str)
    df["Description_en"] = df["Description"]
else:
    df["Description_en"] = df["Description_en"].fillna("").astype(str)

# Description length
df["desc_length"] = df["Description_en"].str.len()

# VADER sentiment (compound)
df["sentiment_vader"] = df["Description_en"].apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

# Keyword features
positive_keywords = [
    "friendly","playful","sweet","gentle","nice","calm",
    "good with kids","good with children","affectionate","loving",
    "healthy","vaccinated","obedient"
]

negative_keywords = [
    "fearful","aggressive","bite","biting","sick","injured",
    "old","anxious","shy","timid","problem","issue"
]

def count_keywords(text, keywords):
    t = text.lower()
    return sum(1 for kw in keywords if kw in t)

df["positive_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, positive_keywords))
df["negative_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, negative_keywords))

df["keyword_sentiment"] = df["positive_kw"] - df["negative_kw"]
df["keyword_sentiment_norm"] = df["keyword_sentiment"] / (df["desc_length"] + 1)

df[[
    "Description_en",
    "desc_length",
    "sentiment_vader",
    "positive_kw",
    "negative_kw",
    "keyword_sentiment",
    "keyword_sentiment_norm"
]].head()


Unnamed: 0,Description_en,desc_length,sentiment_vader,positive_kw,negative_kw,keyword_sentiment,keyword_sentiment_norm
0,Nibble is a 3+ month old ball of cuteness. He ...,359,0.9552,1,1,0,0.0
1,I just found it alone yesterday near my apartm...,118,0.128,0,0,0,0.0
2,Their pregnant mother was dumped by her irresp...,393,0.765,2,0,2,0.005076
3,"Good guard dog, very alert, active, obedience ...",146,0.9538,0,0,0,0.0
4,This handsome yet cute boy is up for adoption....,390,0.988,2,0,2,0.005115


In [12]:
# ============================================
# 3. TEXT EMBEDDINGS: MiniLM ON Description_en + PCA
# ============================================

text_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

descriptions_en = df["Description_en"].tolist()

# 384-d MiniLM embeddings
text_embeddings = text_model.encode(descriptions_en, show_progress_bar=True)

# PCA → 50 components
pca = PCA(n_components=50, random_state=42)
text_pca = pca.fit_transform(text_embeddings)

pca_df = pd.DataFrame(
    text_pca,
    columns=[f"text_pca_{i}" for i in range(50)]
)

print("PCA text embeddings shape:", pca_df.shape)

# Merge PCA embeddings into df
df = pd.concat([df.reset_index(drop=True), pca_df], axis=1)

# Remove duplicate columns, just in case
df = df.loc[:, ~df.columns.duplicated()]

print("DF shape after adding text PCA features:", df.shape)


Batches: 100%|████████████████████████████████| 362/362 [00:19<00:00, 18.18it/s]

PCA text embeddings shape: (11565, 50)
DF shape after adding text PCA features: (11565, 82)



  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


In [13]:
# ============================================
# 4. BUILD X, y
# ============================================

target_col = "AdoptionSpeed"

# Keep PetID for record-keeping / fusion
pet_ids = df["PetID"].copy()

# String / label columns to drop from features
string_drop_cols = [
    "Breed1Name",
    "Breed2Name",
    "Color1Name",
    "Color2Name",
    "Color3Name",
    "StateName",
    "lang"           # language code is string
]

# Non-feature columns
raw_drop_cols = [
    target_col,
    "Description",
    "Description_en",
    "PetID",
    "Name",
    "RescuerID",
    "desc_clean"
]

# Only drop columns that exist
drop_cols = [c for c in (string_drop_cols + raw_drop_cols) if c in df.columns]

y = df[target_col]
X = df.drop(columns=drop_cols)

# Keep only numeric columns
X = X.select_dtypes(include=["number"])

print("Final X shape:", X.shape)
print("First 20 feature columns:", X.columns[:20].tolist())
X.head()


Final X shape: (11565, 71)
First 20 feature columns: ['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Fee', 'VideoAmt', 'PhotoAmt', 'FinalBreed', 'ColorDiversity', 'Fully_Healthy', 'desc_length', 'sentiment_vader', 'positive_kw', 'negative_kw', 'keyword_sentiment']


Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Fee,...,text_pca_40,text_pca_41,text_pca_42,text_pca_43,text_pca_44,text_pca_45,text_pca_46,text_pca_47,text_pca_48,text_pca_49
0,2,3,1,1,1,2,2,2,1,100,...,-0.073293,-0.129809,0.816405,-0.385351,0.669315,-0.579418,-0.029445,0.405019,0.384451,0.022024
1,2,1,1,2,2,3,3,3,1,0,...,-0.573636,0.372404,0.02855,0.021867,-0.259421,-0.468014,-0.248606,-0.197988,-0.625188,-0.52549
2,1,1,1,2,2,1,1,2,1,0,...,-0.149933,0.017911,0.016195,0.147119,-0.067226,0.070558,-0.038472,-0.094989,-0.068937,0.130873
3,1,4,2,2,1,1,1,2,1,150,...,0.504112,-0.125026,0.210927,0.427809,0.009746,0.017263,0.256424,0.148209,-0.091599,0.005495
4,1,1,1,2,1,2,2,2,1,0,...,0.098457,-0.030768,0.117658,-0.075074,-0.053599,-0.154892,-0.345568,0.285998,-0.114181,0.397141


In [14]:
# ============================================
# 5. 5-FOLD STRATIFIED CROSS-VALIDATION (XGBOOST)
# ============================================

X_all = X.to_numpy(dtype=np.float32)
y_all = y.to_numpy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_scores, f1_scores, qwk_scores = [], [], []

fold = 1
for train_idx, val_idx in skf.split(X_all, y_all):
    print(f"\n---- Fold {fold} ----")
    fold += 1
    
    X_tr, X_va = X_all[train_idx], X_all[val_idx]
    y_tr, y_va = y_all[train_idx], y_all[val_idx]
    
    xgb_cv = XGBClassifier(
        n_estimators=350,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="multi:softprob",
        num_class=5,
        eval_metric="mlogloss",
        tree_method="hist",
        nthread=-1,
        random_state=42
    )
    
    xgb_cv.fit(X_tr, y_tr)
    preds = np.argmax(xgb_cv.predict_proba(X_va), axis=1)
    
    acc_scores.append(accuracy_score(y_va, preds))
    f1_scores.append(f1_score(y_va, preds, average="macro"))
    qwk_scores.append(cohen_kappa_score(y_va, preds, weights="quadratic"))

print("\n===== 5-Fold CV Results (XGBoost) =====")
print("Accuracy:", np.mean(acc_scores), "+/-", np.std(acc_scores))
print("Macro F1:", np.mean(f1_scores), "+/-", np.std(f1_scores))
print("QWK    :", np.mean(qwk_scores), "+/-", np.std(qwk_scores))



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

===== 5-Fold CV Results (XGBoost) =====
Accuracy: 0.4293125810635538 +/- 0.008325191458304126
Macro F1: 0.35437679487905954 +/- 0.008894211911809237
QWK    : 0.36834027648427103 +/- 0.024417268615667228


In [15]:
# ============================================
# 6. TRAIN/VAL SPLIT (80/20) WITH PetID
# ============================================

X_train, X_val, y_train, y_val, petid_train, petid_val = train_test_split(
    X,
    y,
    pet_ids,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)


Train shape: (9252, 71)
Val shape  : (2313, 71)


In [16]:
# ============================================
# 7. SCALE FEATURES FOR MLP
# ============================================

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)


In [17]:
# ============================================
# 8. TRAIN BASELINE MODELS
# ============================================

results = []

# --- Decision Tree ---
dt_clf = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=42
)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_val)
results.append(evaluate_model("DecisionTree", y_val, y_pred_dt))

# --- Random Forest ---
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_val)
results.append(evaluate_model("RandomForest", y_val, y_pred_rf))

# --- XGBoost (tuned) ---
X_train_np = X_train.to_numpy(dtype=np.float32)
X_val_np   = X_val.to_numpy(dtype=np.float32)
y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()

xgb_clf = XGBClassifier(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    tree_method="hist",
    nthread=-1,
    random_state=42
)

xgb_clf.fit(X_train_np, y_train_np)
xgb_proba = xgb_clf.predict_proba(X_val_np)
y_pred_xgb = np.argmax(xgb_proba, axis=1).astype(int)
results.append(evaluate_model("XGBoost", y_val, y_pred_xgb))

# --- CatBoost (numeric-only, tuned) ---
cb_clf = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5.0,
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=40
)

cb_clf.fit(X_train_np, y_train_np)
cb_proba = cb_clf.predict_proba(X_val_np)
y_pred_cb = np.argmax(cb_proba, axis=1).astype(int)
results.append(evaluate_model("CatBoost", y_val, y_pred_cb))

# --- MLP (shallow neural net) ---
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation="relu",
    solver="adam",
    batch_size=128,
    learning_rate_init=0.001,
    max_iter=50,
    random_state=42
)
mlp_clf.fit(X_train_scaled, y_train)
mlp_proba = mlp_clf.predict_proba(X_val_scaled)
y_pred_mlp = np.argmax(mlp_proba, axis=1).astype(int)
results.append(evaluate_model("MLP", y_val, y_pred_mlp))



=== DecisionTree ===
Accuracy   : 0.3411
Macro F1   : 0.2937
QWK        : 0.2150

=== RandomForest ===
Accuracy   : 0.4406
Macro F1   : 0.3632
QWK        : 0.3894

=== XGBoost ===
Accuracy   : 0.4306
Macro F1   : 0.3612
QWK        : 0.3903
0:	learn: 0.2480289	total: 57.1ms	remaining: 22.8s
100:	learn: 0.3645355	total: 808ms	remaining: 2.39s
200:	learn: 0.4353537	total: 1.69s	remaining: 1.67s
300:	learn: 0.4907374	total: 2.45s	remaining: 807ms
399:	learn: 0.5473386	total: 3.28s	remaining: 0us

=== CatBoost ===
Accuracy   : 0.4224
Macro F1   : 0.3283
QWK        : 0.3773


  ret = a @ b
  ret = a @ b
  ret = a @ b



=== MLP ===
Accuracy   : 0.3744
Macro F1   : 0.3082
QWK        : 0.2593


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [18]:
# ============================================
# 9. WEIGHTED SOFT ENSEMBLE: XGB + CATBOOST
# ============================================

alphas = [0.5, 0.6, 0.7, 0.8, 0.9]
best_qwk = -1
best_alpha = None
best_pred = None

for a in alphas:
    blended = a * xgb_proba + (1 - a) * cb_proba
    blended_pred = np.argmax(blended, axis=1).astype(int)
    qwk = quadratic_weighted_kappa(y_val, blended_pred)
    print(f"alpha={a:.1f}, QWK={qwk:.4f}")
    
    if qwk > best_qwk:
        best_qwk = qwk
        best_alpha = a
        best_pred = blended_pred

results.append(
    evaluate_model(f"WeightedEnsemble_XGB_CB_alpha_{best_alpha:.1f}", y_val, best_pred)
)

print(f"\nBest weighted ensemble alpha: {best_alpha}, QWK={best_qwk:.4f}")


alpha=0.5, QWK=0.3950
alpha=0.6, QWK=0.3883
alpha=0.7, QWK=0.3849
alpha=0.8, QWK=0.3840
alpha=0.9, QWK=0.3871

=== WeightedEnsemble_XGB_CB_alpha_0.5 ===
Accuracy   : 0.4336
Macro F1   : 0.3519
QWK        : 0.3950

Best weighted ensemble alpha: 0.5, QWK=0.3950


In [19]:
# ============================================
# 10. LEADERBOARD
# ============================================

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="qwk", ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,model,accuracy,macro_f1,qwk
0,WeightedEnsemble_XGB_CB_alpha_0.5,0.433636,0.351893,0.394959
1,XGBoost,0.43061,0.361242,0.390291
2,RandomForest,0.440553,0.363199,0.389354
3,CatBoost,0.422395,0.328304,0.377262
4,MLP,0.374406,0.308206,0.2593
5,DecisionTree,0.341115,0.293734,0.214999


## TAB + TEXT + IMAGES

In [20]:
import pandas as pd
import os

OUTPUT_DIR = "processed"  # same as before

img_full = pd.read_csv(os.path.join(OUTPUT_DIR, "image_all_features_per_pet.csv"))
img_full["PetID"] = img_full["PetID"].astype(str)

df["PetID"] = df["PetID"].astype(str)
df = df.merge(img_full, on="PetID", how="left")

print("DF shape after merging image features:", df.shape)
df.head()

DF shape after merging image features: (11565, 1377)


Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,...,img_emb_1273,img_emb_1274,img_emb_1275,img_emb_1276,img_emb_1277,img_emb_1278,img_emb_1279,img_extra_0,img_extra_1,img_extra_2
0,2,3,Tabby,,1,Black,White,,1,1,...,-0.136594,0.112785,0.083095,-0.089407,0.220778,-0.055448,0.333759,95.860984,16.616995,672.776551
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,...,-0.087675,-0.005474,0.423294,-0.059712,0.219841,0.051374,0.000132,88.262347,50.059973,453.839186
2,1,1,Mixed Breed,,1,Brown,White,,2,2,...,0.649306,0.169462,0.26731,1.148312,-0.097708,-0.162029,-0.073036,170.6451,11.606312,60.519822
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,...,0.426876,0.130353,0.167361,-0.1297,0.855895,-0.047542,-0.136626,78.406617,76.517158,512.140914
4,1,1,Mixed Breed,,1,Black,,,2,1,...,0.164303,0.344191,0.431245,0.458547,0.213458,-0.049097,0.111202,165.758596,47.471541,363.798839


In [22]:
target_col = "AdoptionSpeed"
pet_ids = df["PetID"].copy()

string_drop_cols = [
    "Breed1Name","Breed2Name","Color1Name","Color2Name","Color3Name",
    "Name","Description","Description_en", "desc_clean"
]

raw_drop_cols = [
    target_col,
    "PetID",
    "RescuerID"
]

# Only drop columns that actually exist
drop_cols = [c for c in (string_drop_cols + raw_drop_cols) if c in df.columns]

y = df[target_col]
X_full = df.drop(columns=drop_cols)

# Keep only numeric columns
X_full = X_full.select_dtypes(include=["number"])

# Optional: fill missing image features
X_full = X_full.fillna(0.0)

print("Final X_full shape (tab+text+image):", X_full.shape)
print("Example image feature cols:", [c for c in X_full.columns if c.startswith("img_")][:10])

Final X_full shape (tab+text+image): (11565, 1360)
Example image feature cols: ['img_emb_0', 'img_emb_1', 'img_emb_2', 'img_emb_3', 'img_emb_4', 'img_emb_5', 'img_emb_6', 'img_emb_7', 'img_emb_8', 'img_emb_9']


In [23]:
X_train, X_val, y_train, y_val = train_test_split(
    X_full, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)

In [24]:
from xgboost import XGBClassifier

X_train_np = X_train.to_numpy(dtype=np.float32)
X_val_np   = X_val.to_numpy(dtype=np.float32)
y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()

xgb_full = XGBClassifier(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    objective="multi:softprob",
    num_class=5,
    random_state=42,
    n_jobs=-1
)

xgb_full.fit(X_train_np, y_train_np)
y_pred_xgb_full = xgb_full.predict(X_val_np)

results.append(evaluate_model("XGB_tab+text+image", y_val_np, y_pred_xgb_full))


=== XGB_tab+text+image ===
Accuracy   : 0.4155
Macro F1   : 0.3271
QWK        : 0.4125


Adding image embeddings and image-derived metadata improved the QWK from 0.395 to 0.4125 (+4.4%). This demonstrates that images contain meaningful complementary information beyond text and tabular features.

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_full = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    n_jobs=-1,
    random_state=42
)

rf_full.fit(X_train, y_train)
y_pred_rf_full = rf_full.predict(X_val)

results.append(
    evaluate_model("RF_tab+text+image", y_val, y_pred_rf_full)
)


=== RF_tab+text+image ===
Accuracy   : 0.3969
Macro F1   : 0.2872
QWK        : 0.3625


In [29]:
from catboost import CatBoostClassifier

cb_full = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    loss_function="MultiClass",
    random_seed=42,
    verbose=False
)

cb_full.fit(X_train, y_train)
y_pred_cb_full = cb_full.predict(X_val)

results.append(
    evaluate_model("CatBoost_tab+text+image", y_val, y_pred_cb_full)
)


=== CatBoost_tab+text+image ===
Accuracy   : 0.4107
Macro F1   : 0.3082
QWK        : 0.3761


In [30]:
import lightgbm as lgb

lgb_full = lgb.LGBMClassifier(
    objective="multiclass",
    num_class=5,
    n_estimators=400,
    learning_rate=0.05,
    max_depth=-1,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=42
)

lgb_full.fit(X_train, y_train)
y_pred_lgb_full = lgb_full.predict(X_val)

results.append(
    evaluate_model("LGBM_tab+text+image", y_val, y_pred_lgb_full)
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 341354
[LightGBM] [Info] Number of data points in the train set: 9252, number of used features: 1358
[LightGBM] [Info] Start training from score -3.552865
[LightGBM] [Info] Start training from score -1.556498
[LightGBM] [Info] Start training from score -1.296620
[LightGBM] [Info] Start training from score -1.510910
[LightGBM] [Info] Start training from score -1.323054

=== LGBM_tab+text+image ===
Accuracy   : 0.3990
Macro F1   : 0.3108
QWK        : 0.3718


In [31]:
from sklearn.neural_network import MLPClassifier

mlp_full = MLPClassifier(
    hidden_layer_sizes=(256, 128),
    activation="relu",
    solver="adam",
    alpha=1e-4,
    batch_size=256,
    learning_rate_init=1e-3,
    max_iter=40,
    random_state=42
)

mlp_full.fit(X_train_scaled, y_train)
y_pred_mlp_full = mlp_full.predict(X_val_scaled)

results.append(
    evaluate_model("MLP_tab+text+image", y_val, y_pred_mlp_full)
)

  ret = a @ b
  ret = a @ b
  ret = a @ b



=== MLP_tab+text+image ===
Accuracy   : 0.3476
Macro F1   : 0.3030
QWK        : 0.2872


  ret = a @ b
  ret = a @ b
  ret = a @ b


In [32]:
results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="qwk", ascending=False).reset_index(drop=True)
results_df

Unnamed: 0,model,accuracy,macro_f1,qwk
0,XGB_tab+text+image,0.415478,0.327093,0.412491
1,WeightedEnsemble_XGB_CB_alpha_0.5,0.433636,0.351893,0.394959
2,XGBoost,0.43061,0.361242,0.390291
3,RandomForest,0.440553,0.363199,0.389354
4,CatBoost,0.422395,0.328304,0.377262
5,CatBoost_tab+text+image,0.410722,0.308207,0.376061
6,LGBM_tab+text+image,0.399049,0.310769,0.371839
7,RF_tab+text+image,0.396887,0.287216,0.362499
8,MLP_tab+text+image,0.347601,0.303024,0.287245
9,MLP,0.374406,0.308206,0.2593


## LATE FUSION (ENSEMBLE)

In [35]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

In [36]:
# Inspect columns once (optional)
pd.set_option("display.max_columns", 200)
df.head()

Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Fee,StateName,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,FinalBreed,ColorDiversity,Fully_Healthy,lang,Description_en,desc_length,sentiment_vader,positive_kw,negative_kw,keyword_sentiment,keyword_sentiment_norm,text_pca_0,text_pca_1,text_pca_2,text_pca_3,text_pca_4,text_pca_5,text_pca_6,text_pca_7,text_pca_8,text_pca_9,text_pca_10,text_pca_11,text_pca_12,text_pca_13,text_pca_14,text_pca_15,text_pca_16,text_pca_17,text_pca_18,text_pca_19,text_pca_20,text_pca_21,text_pca_22,text_pca_23,text_pca_24,text_pca_25,text_pca_26,text_pca_27,text_pca_28,text_pca_29,text_pca_30,text_pca_31,text_pca_32,text_pca_33,text_pca_34,text_pca_35,text_pca_36,text_pca_37,text_pca_38,text_pca_39,text_pca_40,text_pca_41,text_pca_42,text_pca_43,text_pca_44,text_pca_45,text_pca_46,text_pca_47,text_pca_48,text_pca_49,meta_num_labels_mean,meta_num_labels_max,meta_num_objects_mean,meta_num_objects_max,meta_pet_bbox_ratio_mean,meta_pet_bbox_ratio_max,has_outdoor,has_indoor,has_cage,has_sofa,has_person,has_leash,img_emb_0,img_emb_1,img_emb_2,img_emb_3,img_emb_4,img_emb_5,...,img_emb_1183,img_emb_1184,img_emb_1185,img_emb_1186,img_emb_1187,img_emb_1188,img_emb_1189,img_emb_1190,img_emb_1191,img_emb_1192,img_emb_1193,img_emb_1194,img_emb_1195,img_emb_1196,img_emb_1197,img_emb_1198,img_emb_1199,img_emb_1200,img_emb_1201,img_emb_1202,img_emb_1203,img_emb_1204,img_emb_1205,img_emb_1206,img_emb_1207,img_emb_1208,img_emb_1209,img_emb_1210,img_emb_1211,img_emb_1212,img_emb_1213,img_emb_1214,img_emb_1215,img_emb_1216,img_emb_1217,img_emb_1218,img_emb_1219,img_emb_1220,img_emb_1221,img_emb_1222,img_emb_1223,img_emb_1224,img_emb_1225,img_emb_1226,img_emb_1227,img_emb_1228,img_emb_1229,img_emb_1230,img_emb_1231,img_emb_1232,img_emb_1233,img_emb_1234,img_emb_1235,img_emb_1236,img_emb_1237,img_emb_1238,img_emb_1239,img_emb_1240,img_emb_1241,img_emb_1242,img_emb_1243,img_emb_1244,img_emb_1245,img_emb_1246,img_emb_1247,img_emb_1248,img_emb_1249,img_emb_1250,img_emb_1251,img_emb_1252,img_emb_1253,img_emb_1254,img_emb_1255,img_emb_1256,img_emb_1257,img_emb_1258,img_emb_1259,img_emb_1260,img_emb_1261,img_emb_1262,img_emb_1263,img_emb_1264,img_emb_1265,img_emb_1266,img_emb_1267,img_emb_1268,img_emb_1269,img_emb_1270,img_emb_1271,img_emb_1272,img_emb_1273,img_emb_1274,img_emb_1275,img_emb_1276,img_emb_1277,img_emb_1278,img_emb_1279,img_extra_0,img_extra_1,img_extra_2
0,2,3,Tabby,,1,Black,White,,1,1,2,2,2,1,100,Selangor,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,2,0,en,Nibble is a 3+ month old ball of cuteness. He ...,359,0.9552,1,1,0,0.0,0.920802,1.291181,0.135128,0.71704,0.717277,-0.197229,0.172093,0.304017,-0.072859,-0.120476,0.453987,0.216476,0.072828,-0.257291,0.320963,-0.055026,-0.633374,0.202543,-0.238153,-0.762101,-0.097006,-0.169448,-0.22502,0.265708,0.214324,0.114597,0.132242,0.530406,0.269161,-0.236024,0.439594,0.390318,0.396609,-0.364095,0.200009,-0.688852,-0.313581,0.628293,0.394892,0.279709,-0.073293,-0.129809,0.816405,-0.385351,0.669315,-0.579418,-0.029445,0.405019,0.384451,0.022024,7.0,7.0,0.0,0.0,1.0,1.0,False,False,False,False,False,False,0.253315,-0.147033,-0.159378,-0.063343,0.088517,0.035912,...,-0.000438,-0.088691,-0.119152,-0.082784,-0.138598,0.360975,0.562074,-0.042902,0.937752,0.104908,-0.138851,0.120384,-0.075085,0.19433,-0.190696,0.395318,0.070946,0.616706,0.427427,-0.108692,-0.073971,0.498128,-0.053334,0.494011,0.179093,0.254009,0.654511,0.07707,-0.014533,0.768697,0.457137,0.219005,0.005789,0.056821,-0.150787,0.11296,0.601744,0.237302,0.703524,-0.10108,0.326818,0.24238,1.098816,0.438159,-0.171865,0.227446,0.64652,0.140478,-0.111385,0.190063,-0.044445,0.030492,0.29508,-0.138532,0.208445,0.960768,-0.120214,0.084768,0.118403,-0.141015,-0.072211,0.471263,0.230499,0.298361,0.533311,1.334119,0.153752,-0.049932,0.353855,1.17452,-0.167671,-0.093367,-0.167355,-0.006237,0.091932,0.316237,0.085592,-0.119665,0.392363,0.093704,0.484011,0.035628,-0.098856,-0.107103,0.192022,0.069002,0.148139,-0.035314,-0.115387,-0.12294,-0.136594,0.112785,0.083095,-0.089407,0.220778,-0.055448,0.333759,95.860984,16.616995,672.776551
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,3,3,3,1,0,Kuala Lumpur,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0,2,0,en,I just found it alone yesterday near my apartm...,118,0.128,0,0,0,0.0,0.929282,-0.801489,-0.267498,1.614028,-0.243638,1.015112,0.179723,0.810935,1.694596,-1.010623,-0.970709,0.382808,0.032167,-0.329348,-0.396295,0.546327,-0.205836,-0.791065,-0.28194,0.394747,-0.998817,0.433103,0.741618,0.904335,-0.041792,1.29067,0.167779,-0.287299,0.226645,-0.024893,0.253999,0.378617,-0.184791,-0.480515,-0.195442,0.357018,0.25391,0.127955,-0.161267,0.219266,-0.573636,0.372404,0.02855,0.021867,-0.259421,-0.468014,-0.248606,-0.197988,-0.625188,-0.52549,6.0,7.0,0.0,0.0,1.0,1.0,False,False,False,False,False,False,0.099347,-0.073799,0.547016,-0.163702,0.150605,-0.128184,...,-0.141972,-0.072291,0.17448,-0.158815,0.053174,0.097632,0.722183,0.328768,1.053139,0.376578,0.517577,-0.057245,0.009764,0.023346,0.003593,0.110963,-0.107202,0.666894,-0.077537,0.574154,0.95616,0.52828,-0.163515,0.364682,-0.11872,1.393114,0.284167,-0.005097,0.191998,-0.010159,0.438819,1.658448,-0.15556,-0.100507,-0.17101,-0.147842,-0.114402,0.209926,0.935389,-0.114541,0.042057,0.073992,0.227654,0.805469,-0.123706,-0.13794,0.671217,-0.0944,-0.183277,-0.130305,-0.114554,0.112258,0.426169,0.13026,0.191637,0.172121,0.102745,0.011389,0.022246,0.084267,-0.121059,0.043679,-0.121993,0.509439,-0.093814,1.36126,-0.047837,0.597112,-0.080614,0.278558,-0.165606,-0.130856,-0.149502,0.309425,0.399702,0.17695,-0.160451,-0.033422,0.968318,-0.158938,1.018635,0.235091,0.449708,-0.055028,-0.040692,0.005303,-0.11361,-0.062992,-0.147972,-0.090161,-0.087675,-0.005474,0.423294,-0.059712,0.219841,0.051374,0.000132,88.262347,50.059973,453.839186
2,1,1,Mixed Breed,,1,Brown,White,,2,2,1,1,2,1,0,Selangor,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,3,2,0,en,Their pregnant mother was dumped by her irresp...,393,0.765,2,0,2,0.005076,-0.619915,-1.145372,0.04372,0.660904,0.595294,-0.607251,0.625584,0.222657,0.537193,-0.25916,-0.481595,0.077827,-0.584054,-0.056226,-0.061655,-0.341129,0.746076,0.125877,0.078089,0.242185,0.146234,0.187441,0.080294,0.007903,-0.201665,-0.009195,0.279357,0.570311,0.442642,-0.031727,0.350558,-0.193185,-0.043421,0.112597,-0.208335,0.213942,-0.333624,0.31043,0.079708,0.293688,-0.149933,0.017911,0.016195,0.147119,-0.067226,0.070558,-0.038472,-0.094989,-0.068937,0.130873,6.428571,8.0,0.0,0.0,1.0,1.0,False,False,False,False,False,False,0.101447,-0.063293,-0.093635,0.224517,-0.098054,-0.013184,...,-0.170245,-0.170508,0.422838,0.057635,0.079133,0.321061,0.9838,0.36884,-0.135446,-0.066677,0.319312,0.242802,0.509065,-0.133684,0.180578,0.342768,0.194621,1.311904,0.521186,0.764609,0.303922,-0.15442,0.083368,-0.071894,-0.127281,0.089745,0.366771,0.127164,-0.127109,0.105359,0.975765,0.211522,-0.153974,0.497649,-0.137671,0.02113,-0.018481,-0.180539,0.611055,0.013289,-0.119799,0.695659,-0.118015,0.36006,0.40164,-0.115903,0.287129,-0.206609,0.298198,0.037929,-0.2124,0.558098,-0.051834,-0.104374,0.631735,-0.122027,-0.132327,-0.161016,0.020294,-0.005839,0.143701,0.039236,0.235688,0.014048,0.24767,0.262652,-0.11358,0.46011,-0.186429,0.09316,-0.004945,-0.175045,-0.184777,0.09751,0.043261,-0.194847,-0.107872,0.487542,0.004651,-0.032771,-0.136951,0.368776,-0.075298,-0.111123,-0.081349,0.904262,0.994807,-0.027412,0.044497,-0.177019,0.649306,0.169462,0.26731,1.148312,-0.097708,-0.162029,-0.073036,170.6451,11.606312,60.519822
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,1,1,2,1,150,Kuala Lumpur,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3,2,0,en,"Good guard dog, very alert, active, obedience ...",146,0.9538,0,0,0,0.0,0.853588,-1.28229,0.353736,-0.968165,-0.290231,-0.327622,1.286836,-0.390838,-0.528601,1.00264,-0.539857,0.437081,-0.571064,0.189213,-0.186179,-0.8769,0.061771,-0.474309,-0.114951,0.822269,0.659218,-0.232731,0.005603,-0.143775,-0.213409,-0.196598,-0.853667,-0.039296,0.101195,-0.982499,0.200745,-0.211118,-0.043626,0.25402,0.276765,-0.144689,-0.158045,-0.063314,0.792604,0.64822,0.504112,-0.125026,0.210927,0.427809,0.009746,0.017263,0.256424,0.148209,-0.091599,0.005495,5.0,7.0,0.0,0.0,1.0,1.0,False,False,False,False,False,False,0.117383,-0.104424,-0.06124,0.065842,0.007137,-0.076127,...,0.257509,-0.123011,0.042316,-0.127414,0.214902,-0.086323,0.217377,0.132903,-0.082446,0.186813,0.122565,-0.093853,-0.042337,0.036589,-0.081807,0.189272,0.337898,-0.131134,0.675439,-0.073681,0.057086,-0.130853,-0.033129,-0.095099,-0.02865,-0.160842,-0.060218,-0.028411,-0.077889,-0.114826,-0.052719,-0.166818,-0.05825,0.382237,0.097473,0.152473,-0.126444,-0.115252,-0.161876,-0.112472,-0.181193,1.412924,-0.117422,0.776293,-0.081474,-0.00396,-0.02528,0.086079,0.462565,0.004173,-0.15648,-0.184923,-0.135586,0.011782,-0.088672,-0.045311,0.206729,-0.088609,0.055138,-0.110499,0.259102,0.223814,1.002972,0.011697,-0.006536,-0.177647,-0.061948,-0.040874,0.081618,-0.047728,1.273801,0.187455,-0.176121,-0.022736,0.048159,-0.111902,-0.206868,-0.112291,-0.072379,-0.076118,-0.072404,1.05652,0.12277,-0.113892,-0.090621,0.422209,-0.033464,-0.063907,-0.124114,0.028317,0.426876,0.130353,0.167361,-0.1297,0.855895,-0.047542,-0.136626,78.406617,76.517158,512.140914
4,1,1,Mixed Breed,,1,Black,,,2,1,2,2,2,1,0,Selangor,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,3,1,0,en,This handsome yet cute boy is up for adoption....,390,0.988,2,0,2,0.005115,-0.721672,0.262817,0.138968,-0.608829,0.315475,0.645163,-1.55072,0.106427,-0.142785,-0.119706,0.182346,0.37799,-0.080663,0.434432,-0.024718,-0.423052,0.119705,0.15798,-0.73891,-0.327601,-0.209316,-0.190467,0.227409,-0.200204,-0.100778,-0.542187,0.353649,-0.031922,-0.020719,-0.106349,-0.069964,-0.33005,-0.176801,0.325704,0.086193,0.351063,-0.627458,0.213067,-0.345617,0.296575,0.098457,-0.030768,0.117658,-0.075074,-0.053599,-0.154892,-0.345568,0.285998,-0.114181,0.397141,6.0,8.0,0.0,0.0,1.0,1.0,False,False,False,False,False,False,0.156996,-0.096605,0.037499,-0.121142,-0.008129,-0.038452,...,-0.163045,-0.199039,0.582212,-0.16463,0.330087,-0.217565,0.49908,0.126534,0.357457,0.199736,0.635899,-0.128257,-0.202145,-0.14035,-0.039296,0.64468,-0.193455,0.017978,-0.072865,0.115932,0.12845,-0.188148,0.428777,0.064106,-0.158436,-0.052995,0.270057,0.073515,-0.074846,0.129783,0.563621,1.423388,-0.145817,0.051896,-0.17237,-0.11799,0.070138,-0.172437,-0.096533,-0.136449,-0.124172,1.154379,-0.037301,0.986156,-0.140456,-0.014885,-0.187959,-0.128891,0.271884,-0.229844,-0.181199,-0.156247,-0.187877,0.036921,-0.104416,0.177885,0.533816,0.364311,-0.201739,-0.185395,0.003838,0.517327,0.29423,-0.02923,-0.045094,-0.13566,-0.19409,0.656312,0.137927,-0.08996,1.058234,-0.225514,-0.150835,-0.153706,0.037287,0.00058,-0.213238,0.02553,0.674092,-0.122738,0.051178,0.349407,-0.04478,-0.147298,-0.169727,0.21685,0.27698,-0.144409,-0.064037,0.725024,0.164303,0.344191,0.431245,0.458547,0.213458,-0.049097,0.111202,165.758596,47.471541,363.798839


In [37]:
# 1. Text features (embeddings / TF-IDF / sentence embeddings)
#    Adjust prefixes as needed.
text_prefixes = ("desc_", "text_", "sent_", "BERT_", "tfidf_")
text_cols = [c for c in df.columns if c.startswith(text_prefixes)]

# 2. Image features (embeddings + quality + metadata)
img_prefixes = ("img_emb_", "img_", "meta_")   # e.g. img_brightness, meta_pet_bbox_ratio_mean, has_outdoor
img_cols = [c for c in df.columns if c.startswith(img_prefixes)]

# 3. Tabular = everything numeric except target, text, image
exclude_cols = set(text_cols) | set(img_cols) | {"AdoptionSpeed", "PetID"}
tab_cols = [c for c in df.columns
            if c not in exclude_cols and np.issubdtype(df[c].dtype, np.number)]

print("Tabular cols:", len(tab_cols))
print("Text cols   :", len(text_cols))
print("Image cols  :", len(img_cols))

Tabular cols: 20
Text cols   : 51
Image cols  : 1289


In [38]:
y = df["AdoptionSpeed"].to_numpy()
pet_ids = df["PetID"].to_numpy()  # useful if you need it later

X_T   = df[tab_cols].to_numpy()
X_X   = df[text_cols].to_numpy()
X_I   = df[img_cols].to_numpy()
X_TX  = np.concatenate([X_T, X_X], axis=1)
X_TXI = np.concatenate([X_T, X_X, X_I], axis=1)

print(X_T.shape, X_X.shape, X_I.shape, X_TX.shape, X_TXI.shape)

(11565, 20) (11565, 51) (11565, 1289) (11565, 71) (11565, 1360)


In [39]:
idx = np.arange(len(df))

idx_train, idx_val = train_test_split(
    idx,
    test_size=0.2,
    stratify=y,
    random_state=42
)

y_train = y[idx_train]
y_val   = y[idx_val]

# Slice all feature matrices with the same indices
X_T_train,   X_T_val   = X_T[idx_train],   X_T[idx_val]
X_X_train,   X_X_val   = X_X[idx_train],   X_X[idx_val]
X_I_train,   X_I_val   = X_I[idx_train],   X_I[idx_val]
X_TX_train,  X_TX_val  = X_TX[idx_train],  X_TX[idx_val]
X_TXI_train, X_TXI_val = X_TXI[idx_train], X_TXI[idx_val]

X_T_train.shape, X_TXI_val.shape

((9252, 20), (2313, 1360))

In [40]:
def make_xgb():
    return XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="multi:softprob",
        num_class=5,
        random_state=42,
        n_jobs=-1
    )

In [41]:
results = []

# 1. Tabular only (T)
xgb_T = make_xgb()
xgb_T.fit(X_T_train, y_train)
y_pred_T = xgb_T.predict(X_T_val)
results.append(evaluate_model("XGB_T (tab only)", y_val, y_pred_T))

# 2. Text only (X)
xgb_X = make_xgb()
xgb_X.fit(X_X_train, y_train)
y_pred_X = xgb_X.predict(X_X_val)
results.append(evaluate_model("XGB_X (text only)", y_val, y_pred_X))

# 3. Image only (I)
xgb_I = make_xgb()
xgb_I.fit(X_I_train, y_train)
y_pred_I = xgb_I.predict(X_I_val)
results.append(evaluate_model("XGB_I (image only)", y_val, y_pred_I))

# 4. Tabular + Text (T+X)
xgb_TX = make_xgb()
xgb_TX.fit(X_TX_train, y_train)
y_pred_TX = xgb_TX.predict(X_TX_val)
results.append(evaluate_model("XGB_TX (tab+text)", y_val, y_pred_TX))

# 5. Full early fusion (T+X+I)
xgb_TXI = make_xgb()
xgb_TXI.fit(X_TXI_train, y_train)
y_pred_TXI = xgb_TXI.predict(X_TXI_val)
results.append(evaluate_model("XGB_TXI (tab+text+image)", y_val, y_pred_TXI))

# Build ablation leaderboard
ablation_df = (
    pd.DataFrame(results)
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)
ablation_df


=== XGB_T (tab only) ===
Accuracy   : 0.3995
Macro F1   : 0.3371
QWK        : 0.3476

=== XGB_X (text only) ===
Accuracy   : 0.3671
Macro F1   : 0.3101
QWK        : 0.2202

=== XGB_I (image only) ===
Accuracy   : 0.3580
Macro F1   : 0.2752
QWK        : 0.2976

=== XGB_TX (tab+text) ===
Accuracy   : 0.4267
Macro F1   : 0.3633
QWK        : 0.3833

=== XGB_TXI (tab+text+image) ===
Accuracy   : 0.4168
Macro F1   : 0.3245
QWK        : 0.4018


Unnamed: 0,model,accuracy,macro_f1,qwk
0,XGB_TXI (tab+text+image),0.416775,0.324516,0.40178
1,XGB_TX (tab+text),0.426719,0.363291,0.38328
2,XGB_T (tab only),0.399481,0.337099,0.347647
3,XGB_I (image only),0.357977,0.275178,0.297599
4,XGB_X (text only),0.367056,0.310101,0.220204


In [42]:
# Probabilities for each of the 3 models
proba_TX  = xgb_TX.predict_proba(X_TX_val)    # (n_val, 5)
proba_I   = xgb_I.predict_proba(X_I_val)      # (n_val, 5)
proba_TXI = xgb_TXI.predict_proba(X_TXI_val)  # (n_val, 5)

print("TX:",  proba_TX.shape)
print("I :",  proba_I.shape)
print("TXI:", proba_TXI.shape)
print("y_val:", y_val.shape)

TX: (2313, 5)
I : (2313, 5)
TXI: (2313, 5)
y_val: (2313,)


In [43]:
fusion_results = []

alphas = np.arange(0.0, 1.01, 0.1)  # weight for T+X
betas  = np.arange(0.0, 1.01, 0.1)  # weight for I

for alpha in alphas:
    for beta in betas:
        gamma = 1.0 - alpha - beta   # weight for T+X+I
        if gamma < 0 or gamma > 1:
            continue  # invalid combo, skip

        # blend probabilities
        proba_blend = (
            alpha * proba_TX +
            beta  * proba_I +
            gamma * proba_TXI
        )

        y_pred_blend = proba_blend.argmax(axis=1)

        name = f"LateFusion_3mod_a{alpha:.1f}_b{beta:.1f}_g{gamma:.1f}"
        res = evaluate_model(name, y_val, y_pred_blend, verbose=False)
        res["alpha_TX"]  = alpha
        res["beta_I"]    = beta
        res["gamma_TXI"] = gamma
        fusion_results.append(res)

fusion_df = (
    pd.DataFrame(fusion_results)
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)

fusion_df.head(10)

Unnamed: 0,model,accuracy,macro_f1,qwk,alpha_TX,beta_I,gamma_TXI
0,LateFusion_3mod_a0.4_b0.1_g0.5,0.450497,0.361742,0.449487,0.4,0.1,0.5
1,LateFusion_3mod_a0.4_b0.0_g0.6,0.451362,0.363411,0.446536,0.4,0.0,0.6
2,LateFusion_3mod_a0.3_b0.0_g0.7,0.449633,0.356134,0.444322,0.3,0.0,0.7
3,LateFusion_3mod_a0.5_b0.2_g0.3,0.450497,0.370332,0.443935,0.5,0.2,0.3
4,LateFusion_3mod_a0.4_b0.3_g0.3,0.450065,0.365483,0.443791,0.4,0.3,0.3
5,LateFusion_3mod_a0.3_b0.1_g0.6,0.446606,0.353597,0.443364,0.3,0.1,0.6
6,LateFusion_3mod_a0.4_b0.2_g0.4,0.453524,0.368294,0.442697,0.4,0.2,0.4
7,LateFusion_3mod_a0.5_b0.1_g0.4,0.452659,0.372893,0.442532,0.5,0.1,0.4
8,LateFusion_3mod_a0.5_b0.0_g0.5,0.450497,0.372424,0.441884,0.5,0.0,0.5
9,LateFusion_3mod_a0.5_b0.4_g0.1,0.450497,0.369317,0.441667,0.5,0.4,0.1


In [44]:
best_fusion = fusion_df.iloc[0].copy()
best_fusion["model"] = (
    f"LateFusion_best (α_TX={best_fusion['alpha_TX']:.2f}, "
    f"β_I={best_fusion['beta_I']:.2f}, γ_TXI={best_fusion['gamma_TXI']:.2f})"
)

cols = ["model", "accuracy", "macro_f1", "qwk"]

final_leaderboard = (
    pd.concat(
        [ablation_df[cols], best_fusion[cols].to_frame().T],
        ignore_index=True
    )
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)

final_leaderboard

Unnamed: 0,model,accuracy,macro_f1,qwk
0,"LateFusion_best (α_TX=0.40, β_I=0.10, γ_TXI=0.50)",0.450497,0.361742,0.449487
1,XGB_TXI (tab+text+image),0.416775,0.324516,0.40178
2,XGB_TX (tab+text),0.426719,0.363291,0.38328
3,XGB_T (tab only),0.399481,0.337099,0.347647
4,XGB_I (image only),0.357977,0.275178,0.297599
5,XGB_X (text only),0.367056,0.310101,0.220204


Late fusion of tabular+text, image-only, and tabular+text+image models improves Quadratic Weighted Kappa from 0.402 (best early-fusion model) to 0.4495.
The optimal weights place 50% importance on the full early-fusion model, 40% on the tabular+text model, and 10% on the image-only model, suggesting that structured data and text embeddings carry most of the predictive signal, while image features provide small but meaningful complementary information.

## LIGHT GBM + CATBOOST + RANDOM FORESTS

In [47]:
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier

# ------------ model factories ------------ #

def make_xgb():
    return XGBClassifier(
        n_estimators=400,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="multi:softprob",
        num_class=5,
        random_state=42,
        n_jobs=-1
    )

def make_catboost():
    return CatBoostClassifier(
        loss_function="MultiClass",
        depth=6,
        learning_rate=0.05,
        iterations=600,
        l2_leaf_reg=3.0,
        random_seed=42,
        verbose=False
    )

def make_lgbm():
    return LGBMClassifier(
        objective="multiclass",
        num_class=5,
        learning_rate=0.05,
        n_estimators=600,
        max_depth=-1,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42,
        n_jobs=-1
    )

def make_rf():
    return RandomForestClassifier(
        n_estimators=400,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        max_features="sqrt",
        n_jobs=-1,
        random_state=42
    )

# ------------ train & evaluate ------------ #

results = []

# 1. XGB – Tabular only (T)
xgb_T = make_xgb()
xgb_T.fit(X_T_train, y_train)
y_pred_T = xgb_T.predict(X_T_val)
results.append(evaluate_model("XGB_T (tab only)", y_val, y_pred_T))

# 2. XGB – Text only (X)
xgb_X = make_xgb()
xgb_X.fit(X_X_train, y_train)
y_pred_X = xgb_X.predict(X_X_val)
results.append(evaluate_model("XGB_X (text only)", y_val, y_pred_X))

# 3. XGB – Image only (I)
xgb_I = make_xgb()
xgb_I.fit(X_I_train, y_train)
y_pred_I = xgb_I.predict(X_I_val)
results.append(evaluate_model("XGB_I (image only)", y_val, y_pred_I))

# 4. XGB – Tab + Text (TX)
xgb_TX = make_xgb()
xgb_TX.fit(X_TX_train, y_train)
y_pred_TX = xgb_TX.predict(X_TX_val)
results.append(evaluate_model("XGB_TX (tab+text)", y_val, y_pred_TX))

# 5. XGB – Full (TXI)
xgb_TXI = make_xgb()
xgb_TXI.fit(X_TXI_train, y_train)
y_pred_TXI = xgb_TXI.predict(X_TXI_val)
results.append(evaluate_model("XGB_TXI (tab+text+image)", y_val, y_pred_TXI))

# 6. CatBoost – Full (TXI)
cb_TXI = make_catboost()
cb_TXI.fit(X_TXI_train, y_train)
y_pred_cb_TXI = cb_TXI.predict(X_TXI_val).astype(int).ravel()
results.append(evaluate_model("CB_TXI (tab+text+image)", y_val, y_pred_cb_TXI))

# 7. LightGBM – Full (TXI)
lgb_TXI = make_lgbm()
lgb_TXI.fit(X_TXI_train, y_train)
y_pred_lgb_TXI = lgb_TXI.predict(X_TXI_val)
results.append(evaluate_model("LGBM_TXI (tab+text+image)", y_val, y_pred_lgb_TXI))

# 8. RandomForest – Full (TXI)
rf_TXI = make_rf()
rf_TXI.fit(X_TXI_train, y_train)
y_pred_rf_TXI = rf_TXI.predict(X_TXI_val)
results.append(evaluate_model("RF_TXI (tab+text+image)", y_val, y_pred_rf_TXI))

# ------------ ablation leaderboard ------------ #

ablation_df = (
    pd.DataFrame(results)
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)
ablation_df


=== XGB_T (tab only) ===
Accuracy   : 0.3995
Macro F1   : 0.3371
QWK        : 0.3476

=== XGB_X (text only) ===
Accuracy   : 0.3671
Macro F1   : 0.3101
QWK        : 0.2202

=== XGB_I (image only) ===
Accuracy   : 0.3580
Macro F1   : 0.2752
QWK        : 0.2976

=== XGB_TX (tab+text) ===
Accuracy   : 0.4267
Macro F1   : 0.3633
QWK        : 0.3833

=== XGB_TXI (tab+text+image) ===
Accuracy   : 0.4168
Macro F1   : 0.3245
QWK        : 0.4018

=== CB_TXI (tab+text+image) ===
Accuracy   : 0.4198
Macro F1   : 0.3188
QWK        : 0.3858
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.029082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 341361
[LightGBM] [Info] Number of data points in the train set: 9252, number of used features: 1360
[LightGBM] [Info] Start training from score -3.552865
[LightGBM] [Info] Start training from score -1.556498
[LightGBM] [Info] Start training from score -1.296620
[LightGBM] 




=== LGBM_TXI (tab+text+image) ===
Accuracy   : 0.4159
Macro F1   : 0.3303
QWK        : 0.3817

=== RF_TXI (tab+text+image) ===
Accuracy   : 0.3843
Macro F1   : 0.2744
QWK        : 0.3428


Unnamed: 0,model,accuracy,macro_f1,qwk
0,XGB_TXI (tab+text+image),0.416775,0.324516,0.40178
1,CB_TXI (tab+text+image),0.419801,0.318776,0.385815
2,XGB_TX (tab+text),0.426719,0.363291,0.38328
3,LGBM_TXI (tab+text+image),0.41591,0.330348,0.381718
4,XGB_T (tab only),0.399481,0.337099,0.347647
5,RF_TXI (tab+text+image),0.384349,0.274437,0.342777
6,XGB_I (image only),0.357977,0.275178,0.297599
7,XGB_X (text only),0.367056,0.310101,0.220204


In [48]:
# Base modalities
proba_TX   = xgb_TX.predict_proba(X_TX_val)     # (n_val, 5)
proba_I    = xgb_I.predict_proba(X_I_val)       # (n_val, 5)

# Full multimodal models
proba_TXI_xgb = xgb_TXI.predict_proba(X_TXI_val)
proba_TXI_cb  = cb_TXI.predict_proba(X_TXI_val)
proba_TXI_lgb = lgb_TXI.predict_proba(X_TXI_val)
proba_TXI_rf  = rf_TXI.predict_proba(X_TXI_val)

print("TX:",  proba_TX.shape)
print("I :",  proba_I.shape)
print("TXI XGB:", proba_TXI_xgb.shape)
print("TXI CB :", proba_TXI_cb.shape)
print("TXI LGB:", proba_TXI_lgb.shape)
print("TXI RF :", proba_TXI_rf.shape)
print("y_val:", y_val.shape)

TX: (2313, 5)
I : (2313, 5)
TXI XGB: (2313, 5)
TXI CB : (2313, 5)
TXI LGB: (2313, 5)
TXI RF : (2313, 5)
y_val: (2313,)




In [49]:
fusion_results = []

alpha_grid = [0.1, 0.2, 0.3]      # TX
beta_grid  = [0.0, 0.1, 0.2]      # I
gamma_grid = [0.2, 0.3, 0.4]      # XGB_TXI
delta_grid = [0.0, 0.1, 0.2]      # CB_TXI
eps_grid   = [0.0, 0.1, 0.2]      # LGBM_TXI
zeta_grid  = [0.0, 0.1, 0.2]      # RF_TXI

for a in alpha_grid:
    for b in beta_grid:
        for g in gamma_grid:
            for d in delta_grid:
                for e in eps_grid:
                    for z in zeta_grid:
                        w_sum = a + b + g + d + e + z
                        if abs(w_sum - 1.0) > 1e-6:
                            continue

                        proba_blend = (
                            a * proba_TX +
                            b * proba_I +
                            g * proba_TXI_xgb +
                            d * proba_TXI_cb +
                            e * proba_TXI_lgb +
                            z * proba_TXI_rf
                        )

                        y_pred_blend = proba_blend.argmax(axis=1)

                        name = (
                            f"LateFusion_6mod_a{a:.1f}_b{b:.1f}"
                            f"_g{g:.1f}_d{d:.1f}_e{e:.1f}_z{z:.1f}"
                        )

                        res = evaluate_model(name, y_val, y_pred_blend, verbose=False)
                        res["alpha_TX"]   = a
                        res["beta_I"]     = b
                        res["gamma_TXI"]  = g
                        res["delta_CB"]   = d
                        res["eps_LGBM"]   = e
                        res["zeta_RF"]    = z
                        fusion_results.append(res)

fusion_df = (
    pd.DataFrame(fusion_results)
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)

fusion_df.head(10)

Unnamed: 0,model,accuracy,macro_f1,qwk,alpha_TX,beta_I,gamma_TXI,delta_CB,eps_LGBM,zeta_RF
0,LateFusion_6mod_a0.3_b0.0_g0.3_d0.2_e0.0_z0.2,0.459144,0.367225,0.46089,0.3,0.0,0.3,0.2,0.0,0.2
1,LateFusion_6mod_a0.3_b0.1_g0.3_d0.2_e0.0_z0.1,0.453956,0.362547,0.457545,0.3,0.1,0.3,0.2,0.0,0.1
2,LateFusion_6mod_a0.3_b0.1_g0.3_d0.1_e0.0_z0.2,0.458279,0.365383,0.457529,0.3,0.1,0.3,0.1,0.0,0.2
3,LateFusion_6mod_a0.3_b0.0_g0.4_d0.2_e0.0_z0.1,0.453956,0.358569,0.454875,0.3,0.0,0.4,0.2,0.0,0.1
4,LateFusion_6mod_a0.3_b0.1_g0.2_d0.2_e0.0_z0.2,0.456982,0.363869,0.454816,0.3,0.1,0.2,0.2,0.0,0.2
5,LateFusion_6mod_a0.3_b0.1_g0.2_d0.2_e0.1_z0.1,0.456982,0.365834,0.454599,0.3,0.1,0.2,0.2,0.1,0.1
6,LateFusion_6mod_a0.3_b0.2_g0.3_d0.2_e0.0_z0.0,0.451362,0.361633,0.453834,0.3,0.2,0.3,0.2,0.0,0.0
7,LateFusion_6mod_a0.3_b0.1_g0.2_d0.1_e0.1_z0.2,0.455253,0.363761,0.453485,0.3,0.1,0.2,0.1,0.1,0.2
8,LateFusion_6mod_a0.3_b0.1_g0.4_d0.2_e0.0_z0.0,0.452659,0.357504,0.453203,0.3,0.1,0.4,0.2,0.0,0.0
9,LateFusion_6mod_a0.3_b0.1_g0.3_d0.2_e0.1_z0.0,0.450497,0.362317,0.453104,0.3,0.1,0.3,0.2,0.1,0.0


In [50]:
best_fusion = fusion_df.iloc[0].copy()
best_fusion["model"] = (
    "LateFusion_best "
    f"(α_TX={best_fusion['alpha_TX']:.2f}, "
    f"β_I={best_fusion['beta_I']:.2f}, "
    f"γ_XGBTXI={best_fusion['gamma_TXI']:.2f}, "
    f"δ_CB={best_fusion['delta_CB']:.2f}, "
    f"ε_LGBM={best_fusion['eps_LGBM']:.2f}, "
    f"ζ_RF={best_fusion['zeta_RF']:.2f})"
)

cols = ["model", "accuracy", "macro_f1", "qwk"]

final_leaderboard = (
    pd.concat(
        [ablation_df[cols], best_fusion[cols].to_frame().T],
        ignore_index=True
    )
    .sort_values("qwk", ascending=False)
    .reset_index(drop=True)
)

final_leaderboard

Unnamed: 0,model,accuracy,macro_f1,qwk
0,"LateFusion_best (α_TX=0.30, β_I=0.00, γ_XGBTXI...",0.459144,0.367225,0.46089
1,XGB_TXI (tab+text+image),0.416775,0.324516,0.40178
2,CB_TXI (tab+text+image),0.419801,0.318776,0.385815
3,XGB_TX (tab+text),0.426719,0.363291,0.38328
4,LGBM_TXI (tab+text+image),0.41591,0.330348,0.381718
5,XGB_T (tab only),0.399481,0.337099,0.347647
6,RF_TXI (tab+text+image),0.384349,0.274437,0.342777
7,XGB_I (image only),0.357977,0.275178,0.297599
8,XGB_X (text only),0.367056,0.310101,0.220204
