In [40]:
# ============================================
# AML FINAL PROJECT - TABULAR MODELS (TM2)
# Final clean version: translated CSV, no translation step
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    cohen_kappa_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA


In [41]:
# ============================================
# Helper evaluation functions
# ============================================

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")

def evaluate_model(name, y_true, y_pred, verbose=True):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    qwk = quadratic_weighted_kappa(y_true, y_pred)
    
    if verbose:
        print(f"\n=== {name} ===")
        print(f"Accuracy   : {acc:.4f}")
        print(f"Macro F1   : {macro_f1:.4f}")
        print(f"QWK        : {qwk:.4f}")
    
    return {
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1,
        "qwk": qwk
    }


In [42]:
# ============================================
# 1. Load translated training data
# ============================================

# Use your actual translated filename here:
df = pd.read_csv("train_fe_english.csv")  # or whatever it's called

print(df.shape)
df.head()


(11565, 26)


Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,FinalBreed,ColorDiversity,Fully_Healthy,lang,Description_en
0,2,3,Tabby,,1,Black,White,,1,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,2,0,en,Nibble is a 3+ month old ball of cuteness. He ...
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0,2,0,en,I just found it alone yesterday near my apartm...
2,1,1,Mixed Breed,,1,Brown,White,,2,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,3,2,0,en,Their pregnant mother was dumped by her irresp...
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,...,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3,2,0,en,"Good guard dog, very alert, active, obedience ..."
4,1,1,Mixed Breed,,1,Black,,,2,1,...,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,3,1,0,en,This handsome yet cute boy is up for adoption....


In [43]:
# ============================================
# 2. TEXT FEATURES: VADER, KEYWORDS, LENGTH
# ============================================

analyzer = SentimentIntensityAnalyzer()

# Ensure Description_en exists and is string
if "Description_en" not in df.columns:
    # Fallback: if somehow missing, just copy Description
    df["Description"] = df["Description"].fillna("").astype(str)
    df["Description_en"] = df["Description"]
else:
    df["Description_en"] = df["Description_en"].fillna("").astype(str)

# Description length
df["desc_length"] = df["Description_en"].str.len()

# VADER sentiment (compound)
df["sentiment_vader"] = df["Description_en"].apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

# Keyword features
positive_keywords = [
    "friendly","playful","sweet","gentle","nice","calm",
    "good with kids","good with children","affectionate","loving",
    "healthy","vaccinated","obedient"
]

negative_keywords = [
    "fearful","aggressive","bite","biting","sick","injured",
    "old","anxious","shy","timid","problem","issue"
]

def count_keywords(text, keywords):
    t = text.lower()
    return sum(1 for kw in keywords if kw in t)

df["positive_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, positive_keywords))
df["negative_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, negative_keywords))

df["keyword_sentiment"] = df["positive_kw"] - df["negative_kw"]
df["keyword_sentiment_norm"] = df["keyword_sentiment"] / (df["desc_length"] + 1)

df[[
    "Description_en",
    "desc_length",
    "sentiment_vader",
    "positive_kw",
    "negative_kw",
    "keyword_sentiment",
    "keyword_sentiment_norm"
]].head()


Unnamed: 0,Description_en,desc_length,sentiment_vader,positive_kw,negative_kw,keyword_sentiment,keyword_sentiment_norm
0,Nibble is a 3+ month old ball of cuteness. He ...,359,0.9552,1,1,0,0.0
1,I just found it alone yesterday near my apartm...,118,0.128,0,0,0,0.0
2,Their pregnant mother was dumped by her irresp...,393,0.765,2,0,2,0.005076
3,"Good guard dog, very alert, active, obedience ...",146,0.9538,0,0,0,0.0
4,This handsome yet cute boy is up for adoption....,390,0.988,2,0,2,0.005115


In [44]:
# ============================================
# 3. TEXT EMBEDDINGS: MiniLM ON Description_en + PCA
# ============================================

text_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

descriptions_en = df["Description_en"].tolist()

# 384-d MiniLM embeddings
text_embeddings = text_model.encode(descriptions_en, show_progress_bar=True)

# PCA → 50 components
pca = PCA(n_components=50, random_state=42)
text_pca = pca.fit_transform(text_embeddings)

pca_df = pd.DataFrame(
    text_pca,
    columns=[f"text_pca_{i}" for i in range(50)]
)

print("PCA text embeddings shape:", pca_df.shape)

# Merge PCA embeddings into df
df = pd.concat([df.reset_index(drop=True), pca_df], axis=1)

# Remove duplicate columns, just in case
df = df.loc[:, ~df.columns.duplicated()]

print("DF shape after adding text PCA features:", df.shape)


Batches:   0%|          | 0/362 [00:00<?, ?it/s]

PCA text embeddings shape: (11565, 50)
DF shape after adding text PCA features: (11565, 82)


In [45]:
# ============================================
# 4. BUILD X, y
# ============================================

target_col = "AdoptionSpeed"

# Keep PetID for record-keeping / fusion
pet_ids = df["PetID"].copy()

# String / label columns to drop from features
string_drop_cols = [
    "Breed1Name",
    "Breed2Name",
    "Color1Name",
    "Color2Name",
    "Color3Name",
    "StateName",
    "lang"           # language code is string
]

# Non-feature columns
raw_drop_cols = [
    target_col,
    "Description",
    "Description_en",
    "PetID",
    "Name",
    "RescuerID",
    "desc_clean"
]

# Only drop columns that exist
drop_cols = [c for c in (string_drop_cols + raw_drop_cols) if c in df.columns]

y = df[target_col]
X = df.drop(columns=drop_cols)

# Keep only numeric columns
X = X.select_dtypes(include=["number"])

print("Final X shape:", X.shape)
print("First 20 feature columns:", X.columns[:20].tolist())
X.head()


Final X shape: (11565, 71)
First 20 feature columns: ['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Fee', 'VideoAmt', 'PhotoAmt', 'FinalBreed', 'ColorDiversity', 'Fully_Healthy', 'desc_length', 'sentiment_vader', 'positive_kw', 'negative_kw', 'keyword_sentiment']


Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Fee,...,text_pca_40,text_pca_41,text_pca_42,text_pca_43,text_pca_44,text_pca_45,text_pca_46,text_pca_47,text_pca_48,text_pca_49
0,2,3,1,1,1,2,2,2,1,100,...,-0.073312,-0.129763,0.816426,-0.385333,0.669316,-0.579443,-0.029467,0.405,0.384466,0.021978
1,2,1,1,2,2,3,3,3,1,0,...,-0.573618,0.372427,0.028509,0.021874,-0.259432,-0.468007,-0.24861,-0.197996,-0.625169,-0.525467
2,1,1,1,2,2,1,1,2,1,0,...,-0.14993,0.017905,0.016204,0.147109,-0.067223,0.070558,-0.038469,-0.094986,-0.068954,0.130841
3,1,4,2,2,1,1,1,2,1,150,...,0.504112,-0.125056,0.210978,0.4278,0.009744,0.017245,0.256411,0.148213,-0.091621,0.005473
4,1,1,1,2,1,2,2,2,1,0,...,0.098453,-0.030766,0.11768,-0.07508,-0.053609,-0.154886,-0.345576,0.286,-0.11419,0.397126


In [46]:
# ============================================
# 5. 5-FOLD STRATIFIED CROSS-VALIDATION (XGBOOST)
# ============================================

X_all = X.to_numpy(dtype=np.float32)
y_all = y.to_numpy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_scores, f1_scores, qwk_scores = [], [], []

fold = 1
for train_idx, val_idx in skf.split(X_all, y_all):
    print(f"\n---- Fold {fold} ----")
    fold += 1
    
    X_tr, X_va = X_all[train_idx], X_all[val_idx]
    y_tr, y_va = y_all[train_idx], y_all[val_idx]
    
    xgb_cv = XGBClassifier(
        n_estimators=350,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="multi:softprob",
        num_class=5,
        eval_metric="mlogloss",
        tree_method="hist",
        nthread=-1,
        random_state=42
    )
    
    xgb_cv.fit(X_tr, y_tr)
    preds = np.argmax(xgb_cv.predict_proba(X_va), axis=1)
    
    acc_scores.append(accuracy_score(y_va, preds))
    f1_scores.append(f1_score(y_va, preds, average="macro"))
    qwk_scores.append(cohen_kappa_score(y_va, preds, weights="quadratic"))

print("\n===== 5-Fold CV Results (XGBoost) =====")
print("Accuracy:", np.mean(acc_scores), "+/-", np.std(acc_scores))
print("Macro F1:", np.mean(f1_scores), "+/-", np.std(f1_scores))
print("QWK    :", np.mean(qwk_scores), "+/-", np.std(qwk_scores))



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

===== 5-Fold CV Results (XGBoost) =====
Accuracy: 0.428966709900562 +/- 0.002670721177947356
Macro F1: 0.3552623158370684 +/- 0.006649124187425148
QWK    : 0.36517375645814354 +/- 0.01724935388774115


In [47]:
# ============================================
# 6. TRAIN/VAL SPLIT (80/20) WITH PetID
# ============================================

X_train, X_val, y_train, y_val, petid_train, petid_val = train_test_split(
    X,
    y,
    pet_ids,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)


Train shape: (9252, 71)
Val shape  : (2313, 71)


In [48]:
# ============================================
# 7. SCALE FEATURES FOR MLP
# ============================================

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)


In [49]:
# ============================================
# 8. TRAIN BASELINE MODELS
# ============================================

results = []

# --- Decision Tree ---
dt_clf = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=42
)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_val)
results.append(evaluate_model("DecisionTree", y_val, y_pred_dt))

# --- Random Forest ---
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_val)
results.append(evaluate_model("RandomForest", y_val, y_pred_rf))

# --- XGBoost (tuned) ---
X_train_np = X_train.to_numpy(dtype=np.float32)
X_val_np   = X_val.to_numpy(dtype=np.float32)
y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()

xgb_clf = XGBClassifier(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    tree_method="hist",
    nthread=-1,
    random_state=42
)

xgb_clf.fit(X_train_np, y_train_np)
xgb_proba = xgb_clf.predict_proba(X_val_np)
y_pred_xgb = np.argmax(xgb_proba, axis=1).astype(int)
results.append(evaluate_model("XGBoost", y_val, y_pred_xgb))

# --- CatBoost (numeric-only, tuned) ---
cb_clf = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5.0,
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=40
)

cb_clf.fit(X_train_np, y_train_np)
cb_proba = cb_clf.predict_proba(X_val_np)
y_pred_cb = np.argmax(cb_proba, axis=1).astype(int)
results.append(evaluate_model("CatBoost", y_val, y_pred_cb))

# --- MLP (shallow neural net) ---
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation="relu",
    solver="adam",
    batch_size=128,
    learning_rate_init=0.001,
    max_iter=50,
    random_state=42
)
mlp_clf.fit(X_train_scaled, y_train)
mlp_proba = mlp_clf.predict_proba(X_val_scaled)
y_pred_mlp = np.argmax(mlp_proba, axis=1).astype(int)
results.append(evaluate_model("MLP", y_val, y_pred_mlp))



=== DecisionTree ===
Accuracy   : 0.3437
Macro F1   : 0.3031
QWK        : 0.2038

=== RandomForest ===
Accuracy   : 0.4401
Macro F1   : 0.3593
QWK        : 0.3850

=== XGBoost ===
Accuracy   : 0.4246
Macro F1   : 0.3615
QWK        : 0.3898
0:	learn: 0.2480289	total: 21.9ms	remaining: 8.74s
100:	learn: 0.3652008	total: 2.01s	remaining: 5.95s
200:	learn: 0.4442797	total: 3.95s	remaining: 3.91s
300:	learn: 0.4923581	total: 5.88s	remaining: 1.93s
399:	learn: 0.5507355	total: 7.78s	remaining: 0us

=== CatBoost ===
Accuracy   : 0.4189
Macro F1   : 0.3265
QWK        : 0.3750

=== MLP ===
Accuracy   : 0.3701
Macro F1   : 0.3047
QWK        : 0.2648




In [50]:
# ============================================
# 9. WEIGHTED SOFT ENSEMBLE: XGB + CATBOOST
# ============================================

alphas = [0.5, 0.6, 0.7, 0.8, 0.9]
best_qwk = -1
best_alpha = None
best_pred = None

for a in alphas:
    blended = a * xgb_proba + (1 - a) * cb_proba
    blended_pred = np.argmax(blended, axis=1).astype(int)
    qwk = quadratic_weighted_kappa(y_val, blended_pred)
    print(f"alpha={a:.1f}, QWK={qwk:.4f}")
    
    if qwk > best_qwk:
        best_qwk = qwk
        best_alpha = a
        best_pred = blended_pred

results.append(
    evaluate_model(f"WeightedEnsemble_XGB_CB_alpha_{best_alpha:.1f}", y_val, best_pred)
)

print(f"\nBest weighted ensemble alpha: {best_alpha}, QWK={best_qwk:.4f}")


alpha=0.5, QWK=0.3911
alpha=0.6, QWK=0.3965
alpha=0.7, QWK=0.3918
alpha=0.8, QWK=0.3848
alpha=0.9, QWK=0.3898

=== WeightedEnsemble_XGB_CB_alpha_0.6 ===
Accuracy   : 0.4272
Macro F1   : 0.3528
QWK        : 0.3965

Best weighted ensemble alpha: 0.6, QWK=0.3965


In [51]:
# ============================================
# 10. LEADERBOARD
# ============================================

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="qwk", ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,model,accuracy,macro_f1,qwk
0,WeightedEnsemble_XGB_CB_alpha_0.6,0.427151,0.352797,0.396509
1,XGBoost,0.424557,0.361458,0.389847
2,RandomForest,0.440121,0.35931,0.38498
3,CatBoost,0.418936,0.32651,0.374986
4,MLP,0.370082,0.304721,0.264796
5,DecisionTree,0.343709,0.303133,0.203755
