In [13]:
# ============================================
# AML FINAL PROJECT - TABULAR MODELS (TM2)
# Final clean version: translated CSV, no translation step
# ============================================

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    cohen_kappa_score,
    classification_report,
    confusion_matrix
)
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA


In [16]:
# ============================================
# Helper evaluation functions
# ============================================

def quadratic_weighted_kappa(y_true, y_pred):
    return cohen_kappa_score(y_true, y_pred, weights="quadratic")

def evaluate_model(name, y_true, y_pred, verbose=True):
    acc = accuracy_score(y_true, y_pred)
    macro_f1 = f1_score(y_true, y_pred, average="macro")
    qwk = quadratic_weighted_kappa(y_true, y_pred)
    
    if verbose:
        print(f"\n=== {name} ===")
        print(f"Accuracy   : {acc:.4f}")
        print(f"Macro F1   : {macro_f1:.4f}")
        print(f"QWK        : {qwk:.4f}")
    
    return {
        "model": name,
        "accuracy": acc,
        "macro_f1": macro_f1,
        "qwk": qwk
    }


In [17]:
# ============================================
# 1. Load translated training data
# ============================================

# Use your actual translated filename here:
df = pd.read_csv("train_fe_english.csv")  # or whatever it's called

print(df.shape)
df.head()


(11565, 26)


Unnamed: 0,Type,Age,Breed1Name,Breed2Name,Gender,Color1Name,Color2Name,Color3Name,MaturitySize,FurLength,...,VideoAmt,Description,PetID,PhotoAmt,AdoptionSpeed,FinalBreed,ColorDiversity,Fully_Healthy,lang,Description_en
0,2,3,Tabby,,1,Black,White,,1,1,...,0,Nibble is a 3+ month old ball of cuteness. He ...,86e1089a3,1.0,2,0,2,0,en,Nibble is a 3+ month old ball of cuteness. He ...
1,2,1,Domestic Medium Hair,,1,Black,Brown,,2,2,...,0,I just found it alone yesterday near my apartm...,6296e909a,2.0,0,0,2,0,en,I just found it alone yesterday near my apartm...
2,1,1,Mixed Breed,,1,Brown,White,,2,2,...,0,Their pregnant mother was dumped by her irresp...,3422e4906,7.0,3,3,2,0,en,Their pregnant mother was dumped by her irresp...
3,1,4,Mixed Breed,,2,Black,Brown,,2,1,...,0,"Good guard dog, very alert, active, obedience ...",5842f1ff5,8.0,2,3,2,0,en,"Good guard dog, very alert, active, obedience ..."
4,1,1,Mixed Breed,,1,Black,,,2,1,...,0,This handsome yet cute boy is up for adoption....,850a43f90,3.0,2,3,1,0,en,This handsome yet cute boy is up for adoption....


In [18]:
# ============================================
# 2. TEXT FEATURES: VADER, KEYWORDS, LENGTH
# ============================================

analyzer = SentimentIntensityAnalyzer()

# Ensure Description_en exists and is string
if "Description_en" not in df.columns:
    # Fallback: if somehow missing, just copy Description
    df["Description"] = df["Description"].fillna("").astype(str)
    df["Description_en"] = df["Description"]
else:
    df["Description_en"] = df["Description_en"].fillna("").astype(str)

# Description length
df["desc_length"] = df["Description_en"].str.len()

# VADER sentiment (compound)
df["sentiment_vader"] = df["Description_en"].apply(
    lambda x: analyzer.polarity_scores(x)["compound"]
)

# Keyword features
positive_keywords = [
    "friendly","playful","sweet","gentle","nice","calm",
    "good with kids","good with children","affectionate","loving",
    "healthy","vaccinated","obedient"
]

negative_keywords = [
    "fearful","aggressive","bite","biting","sick","injured",
    "old","anxious","shy","timid","problem","issue"
]

def count_keywords(text, keywords):
    t = text.lower()
    return sum(1 for kw in keywords if kw in t)

df["positive_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, positive_keywords))
df["negative_kw"] = df["Description_en"].apply(lambda x: count_keywords(x, negative_keywords))

df["keyword_sentiment"] = df["positive_kw"] - df["negative_kw"]
df["keyword_sentiment_norm"] = df["keyword_sentiment"] / (df["desc_length"] + 1)

df[[
    "Description_en",
    "desc_length",
    "sentiment_vader",
    "positive_kw",
    "negative_kw",
    "keyword_sentiment",
    "keyword_sentiment_norm"
]].head()


Unnamed: 0,Description_en,desc_length,sentiment_vader,positive_kw,negative_kw,keyword_sentiment,keyword_sentiment_norm
0,Nibble is a 3+ month old ball of cuteness. He ...,359,0.9552,1,1,0,0.0
1,I just found it alone yesterday near my apartm...,118,0.128,0,0,0,0.0
2,Their pregnant mother was dumped by her irresp...,393,0.765,2,0,2,0.005076
3,"Good guard dog, very alert, active, obedience ...",146,0.9538,0,0,0,0.0
4,This handsome yet cute boy is up for adoption....,390,0.988,2,0,2,0.005115


In [19]:
# ============================================
# 3. TEXT EMBEDDINGS: CLIP TEXT ENCODER ON Description_en + PCA
# ============================================

from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

# Use a CLIP-based text encoder from sentence-transformers
# You can try:
#   "clip-ViT-B-32" (512-d)
#   "clip-ViT-L-14" (768-d, heavier)
clip_model = SentenceTransformer("clip-ViT-B-32")

# Make sure we have clean English descriptions
descriptions_en = df["Description_en"].fillna("").astype(str).tolist()

# Get CLIP text embeddings
clip_text_embeddings = clip_model.encode(
    descriptions_en,
    show_progress_bar=True
)   # shape: (n_samples, 512) for ViT-B-32

print("Raw CLIP text embeddings shape:", clip_text_embeddings.shape)

# PCA → 50 components (you can tune this)
pca_components = 50
pca = PCA(n_components=pca_components, random_state=42)
clip_text_pca = pca.fit_transform(clip_text_embeddings)

clip_pca_df = pd.DataFrame(
    clip_text_pca,
    columns=[f"clip_text_pca_{i}" for i in range(pca_components)]
)

print("PCA-reduced CLIP text shape:", clip_pca_df.shape)

# Merge CLIP PCA features into main df
df = pd.concat([df.reset_index(drop=True), clip_pca_df], axis=1)

# Remove any duplicate columns just in case
df = df.loc[:, ~df.columns.duplicated()]

print("DF shape after adding CLIP text PCA features:", df.shape)


modules.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


vocab.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/604 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Batches:   0%|          | 0/362 [00:00<?, ?it/s]

Raw CLIP text embeddings shape: (11565, 512)
PCA-reduced CLIP text shape: (11565, 50)
DF shape after adding CLIP text PCA features: (11565, 82)


In [20]:
# ============================================
# 4. BUILD X, y
# ============================================

target_col = "AdoptionSpeed"

# Keep PetID for record-keeping / fusion
pet_ids = df["PetID"].copy()

# String / label columns to drop from features
string_drop_cols = [
    "Breed1Name",
    "Breed2Name",
    "Color1Name",
    "Color2Name",
    "Color3Name",
    "StateName",
    "lang"           # language code is string
]

# Non-feature columns
raw_drop_cols = [
    target_col,
    "Description",
    "Description_en",
    "PetID",
    "Name",
    "RescuerID",
    "desc_clean"
]

# Only drop columns that exist
drop_cols = [c for c in (string_drop_cols + raw_drop_cols) if c in df.columns]

y = df[target_col]
X = df.drop(columns=drop_cols)

# Keep only numeric columns
X = X.select_dtypes(include=["number"])

print("Final X shape:", X.shape)
print("First 20 feature columns:", X.columns[:20].tolist())
X.head()


Final X shape: (11565, 71)
First 20 feature columns: ['Type', 'Age', 'Gender', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health', 'Fee', 'VideoAmt', 'PhotoAmt', 'FinalBreed', 'ColorDiversity', 'Fully_Healthy', 'desc_length', 'sentiment_vader', 'positive_kw', 'negative_kw', 'keyword_sentiment']


Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Fee,...,clip_text_pca_40,clip_text_pca_41,clip_text_pca_42,clip_text_pca_43,clip_text_pca_44,clip_text_pca_45,clip_text_pca_46,clip_text_pca_47,clip_text_pca_48,clip_text_pca_49
0,2,3,1,1,1,2,2,2,1,100,...,0.076847,0.187917,0.052599,0.309394,-0.169841,-0.041322,-0.013239,-0.225467,-0.115706,-0.025613
1,2,1,1,2,2,3,3,3,1,0,...,-0.02482,-0.288663,-0.103171,-0.355694,-0.15438,0.102484,0.385159,0.319945,-0.363156,-0.240386
2,1,1,1,2,2,1,1,2,1,0,...,0.177801,-0.320696,0.255823,0.24049,-0.021647,-0.08305,0.378852,-0.236843,-0.333081,-0.241618
3,1,4,2,2,1,1,1,2,1,150,...,-0.313598,-0.060633,-0.190628,0.09734,0.015878,0.357949,0.165794,0.62552,0.091371,-0.015644
4,1,1,1,2,1,2,2,2,1,0,...,-0.084076,0.285581,-0.061182,0.199514,0.067879,0.008248,0.119003,0.264303,0.108563,0.073413


In [21]:
# ============================================
# 5. 5-FOLD STRATIFIED CROSS-VALIDATION (XGBOOST)
# ============================================

X_all = X.to_numpy(dtype=np.float32)
y_all = y.to_numpy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

acc_scores, f1_scores, qwk_scores = [], [], []

fold = 1
for train_idx, val_idx in skf.split(X_all, y_all):
    print(f"\n---- Fold {fold} ----")
    fold += 1
    
    X_tr, X_va = X_all[train_idx], X_all[val_idx]
    y_tr, y_va = y_all[train_idx], y_all[val_idx]
    
    xgb_cv = XGBClassifier(
        n_estimators=350,
        learning_rate=0.05,
        max_depth=6,
        min_child_weight=3,
        subsample=0.9,
        colsample_bytree=0.9,
        reg_lambda=1.0,
        objective="multi:softprob",
        num_class=5,
        eval_metric="mlogloss",
        tree_method="hist",
        nthread=-1,
        random_state=42
    )
    
    xgb_cv.fit(X_tr, y_tr)
    preds = np.argmax(xgb_cv.predict_proba(X_va), axis=1)
    
    acc_scores.append(accuracy_score(y_va, preds))
    f1_scores.append(f1_score(y_va, preds, average="macro"))
    qwk_scores.append(cohen_kappa_score(y_va, preds, weights="quadratic"))

print("\n===== 5-Fold CV Results (XGBoost) =====")
print("Accuracy:", np.mean(acc_scores), "+/-", np.std(acc_scores))
print("Macro F1:", np.mean(f1_scores), "+/-", np.std(f1_scores))
print("QWK    :", np.mean(qwk_scores), "+/-", np.std(qwk_scores))



---- Fold 1 ----

---- Fold 2 ----

---- Fold 3 ----

---- Fold 4 ----

---- Fold 5 ----

===== 5-Fold CV Results (XGBoost) =====
Accuracy: 0.42888024210981407 +/- 0.006021784815038526
Macro F1: 0.3541064980734946 +/- 0.009967433390739031
QWK    : 0.37587070800339395 +/- 0.01458101897852299


In [22]:
# ============================================
# 6. TRAIN/VAL SPLIT (80/20) WITH PetID
# ============================================

X_train, X_val, y_train, y_val, petid_train, petid_val = train_test_split(
    X,
    y,
    pet_ids,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Val shape  :", X_val.shape)


Train shape: (9252, 71)
Val shape  : (2313, 71)


In [23]:
# ============================================
# 7. SCALE FEATURES FOR MLP
# ============================================

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled   = scaler.transform(X_val)


In [24]:
# ============================================
# 8. TRAIN BASELINE MODELS
# ============================================

results = []

# --- Decision Tree ---
dt_clf = DecisionTreeClassifier(
    max_depth=None,
    min_samples_split=2,
    random_state=42
)
dt_clf.fit(X_train, y_train)
y_pred_dt = dt_clf.predict(X_val)
results.append(evaluate_model("DecisionTree", y_val, y_pred_dt))

# --- Random Forest ---
rf_clf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,
    random_state=42
)
rf_clf.fit(X_train, y_train)
y_pred_rf = rf_clf.predict(X_val)
results.append(evaluate_model("RandomForest", y_val, y_pred_rf))

# --- XGBoost (tuned) ---
X_train_np = X_train.to_numpy(dtype=np.float32)
X_val_np   = X_val.to_numpy(dtype=np.float32)
y_train_np = y_train.to_numpy()
y_val_np   = y_val.to_numpy()

xgb_clf = XGBClassifier(
    n_estimators=350,
    learning_rate=0.05,
    max_depth=6,
    min_child_weight=3,
    subsample=0.9,
    colsample_bytree=0.9,
    reg_lambda=1.0,
    reg_alpha=0.0,
    objective="multi:softprob",
    num_class=5,
    eval_metric="mlogloss",
    tree_method="hist",
    nthread=-1,
    random_state=42
)

xgb_clf.fit(X_train_np, y_train_np)
xgb_proba = xgb_clf.predict_proba(X_val_np)
y_pred_xgb = np.argmax(xgb_proba, axis=1).astype(int)
results.append(evaluate_model("XGBoost", y_val, y_pred_xgb))

# --- CatBoost (numeric-only, tuned) ---
cb_clf = CatBoostClassifier(
    iterations=400,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=5.0,
    loss_function="MultiClass",
    eval_metric="TotalF1:average=Macro",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=40
)

cb_clf.fit(X_train_np, y_train_np)
cb_proba = cb_clf.predict_proba(X_val_np)
y_pred_cb = np.argmax(cb_proba, axis=1).astype(int)
results.append(evaluate_model("CatBoost", y_val, y_pred_cb))

# --- MLP (shallow neural net) ---
mlp_clf = MLPClassifier(
    hidden_layer_sizes=(128,),
    activation="relu",
    solver="adam",
    batch_size=128,
    learning_rate_init=0.001,
    max_iter=50,
    random_state=42
)
mlp_clf.fit(X_train_scaled, y_train)
mlp_proba = mlp_clf.predict_proba(X_val_scaled)
y_pred_mlp = np.argmax(mlp_proba, axis=1).astype(int)
results.append(evaluate_model("MLP", y_val, y_pred_mlp))



=== DecisionTree ===
Accuracy   : 0.3316
Macro F1   : 0.2798
QWK        : 0.1927

=== RandomForest ===
Accuracy   : 0.4475
Macro F1   : 0.3648
QWK        : 0.4034

=== XGBoost ===
Accuracy   : 0.4328
Macro F1   : 0.3701
QWK        : 0.3903
0:	learn: 0.2077584	total: 33.4ms	remaining: 13.3s
100:	learn: 0.3617501	total: 1.86s	remaining: 5.5s
200:	learn: 0.4299498	total: 3.63s	remaining: 3.59s
300:	learn: 0.4915156	total: 5.36s	remaining: 1.76s
399:	learn: 0.5441593	total: 7.09s	remaining: 0us

=== CatBoost ===
Accuracy   : 0.4306
Macro F1   : 0.3351
QWK        : 0.3732

=== MLP ===
Accuracy   : 0.3692
Macro F1   : 0.3029
QWK        : 0.2751




In [25]:
# ============================================
# 9. WEIGHTED SOFT ENSEMBLE: XGB + CATBOOST
# ============================================

alphas = [0.5, 0.6, 0.7, 0.8, 0.9]
best_qwk = -1
best_alpha = None
best_pred = None

for a in alphas:
    blended = a * xgb_proba + (1 - a) * cb_proba
    blended_pred = np.argmax(blended, axis=1).astype(int)
    qwk = quadratic_weighted_kappa(y_val, blended_pred)
    print(f"alpha={a:.1f}, QWK={qwk:.4f}")
    
    if qwk > best_qwk:
        best_qwk = qwk
        best_alpha = a
        best_pred = blended_pred

results.append(
    evaluate_model(f"WeightedEnsemble_XGB_CB_alpha_{best_alpha:.1f}", y_val, best_pred)
)

print(f"\nBest weighted ensemble alpha: {best_alpha}, QWK={best_qwk:.4f}")


alpha=0.5, QWK=0.3949
alpha=0.6, QWK=0.4007
alpha=0.7, QWK=0.3962
alpha=0.8, QWK=0.3951
alpha=0.9, QWK=0.3871

=== WeightedEnsemble_XGB_CB_alpha_0.6 ===
Accuracy   : 0.4440
Macro F1   : 0.3677
QWK        : 0.4007

Best weighted ensemble alpha: 0.6, QWK=0.4007


In [26]:
# ============================================
# 10. LEADERBOARD
# ============================================

results_df = pd.DataFrame(results)
results_df = results_df.sort_values(by="qwk", ascending=False).reset_index(drop=True)
results_df


Unnamed: 0,model,accuracy,macro_f1,qwk
0,RandomForest,0.447471,0.364817,0.403379
1,WeightedEnsemble_XGB_CB_alpha_0.6,0.444012,0.367681,0.400667
2,XGBoost,0.432771,0.370134,0.39034
3,CatBoost,0.43061,0.335062,0.37321
4,MLP,0.369217,0.302893,0.275054
5,DecisionTree,0.331604,0.27984,0.192715
