In [None]:
# 03c: TF-IDF + adaptive SVD for Adult / PetFinder / Breast
!pip -q install scikit-learn==1.5.2

import os, numpy as np, pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

# Mount Google Drive. Calling mount on an already-mounted drive only prints a
# notice, so re-running this cell is safe.
from google.colab import drive
try:
    drive.mount('/content/drive')
except Exception as exc:
    # Best-effort: keep the cell running, but never hide the failure. Without a
    # mounted drive, the makedirs call below would create the output tree on
    # the local disk and the later CSV reads would fail with an unrelated error.
    print(f"WARNING: could not mount Google Drive ({exc!r}); "
          "paths under /content/drive may be unavailable.")

# Project layout on the mounted drive.
PROJ = "/content/drive/MyDrive/dissertation"
DATA_DIR = f"{PROJ}/data"            # cleaned input CSVs are read from here
OUT_DIR  = f"{PROJ}/outputs/tfidf"   # TF-IDF/SVD feature CSVs are written here

if not os.path.isdir(DATA_DIR):
    # Surface a missing mount / wrong project path before any work is done.
    print(f"WARNING: data directory not found: {DATA_DIR}")

os.makedirs(OUT_DIR, exist_ok=True)

def cats_and_target(ds):
    """Look up the per-dataset configuration for the TF-IDF step.

    Parameters
    ----------
    ds : str
        Dataset key: "adult", "petfinder" or "breast".

    Returns
    -------
    tuple
        ``(categorical column names, target column name, cleaned CSV path)``.

    Raises
    ------
    ValueError
        If ``ds`` is not one of the known dataset keys.
    """
    # One row per dataset: key, categorical columns, target column, CSV file name.
    registry = (
        ("adult",     ["occupation","workclass","native_country"], "income",            "Adult_clean.csv"),
        ("petfinder", ["Breed1","Color1","MaturitySize"],          "AdoptionSpeed_bin", "Petfinder_clean.csv"),
        ("breast",    ["TNM_PATH_T","TNM_PATH_N","hospid"],        "OS5yr_bin",         "Breast_clean.csv"),
    )
    for key, cat_cols, target_col, csv_name in registry:
        if ds == key:
            return cat_cols, target_col, f"{DATA_DIR}/{csv_name}"
    raise ValueError(ds)

def as_tokens(df, cats):
    """Turn each row's categorical values into one space-separated "document".

    Every value becomes a single ``"<column>:<value>"`` token, so a row with
    ``len(cats)`` columns yields exactly ``len(cats)`` tokens.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed in ``cats``.
    cats : list of str
        Names of the categorical columns to encode.

    Returns
    -------
    pandas.Series
        One string per row, sharing ``df``'s index.
    """
    cols = []
    for c in cats:
        # Record missingness BEFORE the string cast: astype(str) turns NaN into
        # the literal text "nan", after which fillna() can no longer see it.
        missing = df[c].isna()
        text = df[c].astype(str).str.strip()
        # The downstream vectorizer splits documents on spaces, so whitespace
        # inside a value would break it into several tokens (and the extra
        # pieces would lose their column prefix). Collapse it to "_".
        text = text.str.replace(r"\s+", "_", regex=True)
        text = text.mask(missing | (text == ""), "Unknown")
        cols.append(c + ":" + text)
    return pd.Series([" ".join(t) for t in zip(*cols)], index=df.index)

def build_tfidf(ds, svd_cap=192):
    """Build TF-IDF features from a dataset's categorical columns, reduce them
    with truncated SVD, and write the result to a CSV in ``OUT_DIR``.

    Parameters
    ----------
    ds : str
        Dataset key understood by ``cats_and_target``.
    svd_cap : int, default 192
        Upper bound on the number of SVD components.

    Returns
    -------
    str
        Path of the CSV that was written. The file name embeds the number of
        components actually used (``<ds>_tfidf_svd<dim>.csv``).

    Notes
    -----
    * The vectorizer and the SVD are fitted on every row of the file; no
      train/test separation happens here.
    * The output has no key column. Its rows are in the same order as the
      rows of the input CSV.
    """
    cats, _target, path = cats_and_target(ds)  # the target is not needed here
    df = pd.read_csv(path, low_memory=False)
    for c in cats:
        # Record missingness BEFORE the string cast: astype(str) turns NaN into
        # the literal text "nan", after which fillna() can no longer see it.
        missing = df[c].isna()
        cleaned = df[c].astype(str).str.strip()
        df[c] = cleaned.mask(missing | (cleaned == ""), "Unknown")

    corpus = as_tokens(df, cats)
    # Tokens are maximal runs of non-space characters; case is preserved.
    vec = TfidfVectorizer(token_pattern=r"[^ ]+", lowercase=False)
    Xtf = vec.fit_transform(corpus)
    n_features = Xtf.shape[1]

    # Adaptive SVD size: at most svd_cap and n_features - 1, with a floor of 2.
    svd_dim = max(2, min(svd_cap, n_features - 1))
    # The floor must not exceed the vocabulary size: TruncatedSVD cannot
    # produce more components than there are features.
    svd_dim = min(svd_dim, n_features)
    svd = TruncatedSVD(n_components=svd_dim, random_state=42)
    X = svd.fit_transform(Xtf)

    cols = [f"tfidf_svd_{i}" for i in range(svd_dim)]
    out = pd.DataFrame(X, columns=cols)
    out_path = os.path.join(OUT_DIR, f"{ds}_tfidf_svd{svd_dim}.csv")
    out.to_csv(out_path, index=False)
    print(f"{ds}: TF-IDF(vocab={n_features}) → SVD({svd_dim}) → {out_path} | shape={out.shape}")
    return out_path

# Generate the TF-IDF/SVD feature file for each dataset in turn.
for ds in ("adult", "petfinder", "breast"):
    build_tfidf(ds, 192)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ adult: TF-IDF(vocab=66) → SVD(65) → /content/drive/MyDrive/dissertation/outputs/tfidf/adult_tfidf_svd65.csv | shape=(48842, 65)
✅ petfinder: TF-IDF(vocab=229) → SVD(192) → /content/drive/MyDrive/dissertation/outputs/tfidf/petfinder_tfidf_svd192.csv | shape=(11537, 192)
✅ breast: TF-IDF(vocab=1374) → SVD(192) → /content/drive/MyDrive/dissertation/outputs/tfidf/breast_tfidf_svd192.csv | shape=(59784, 192)


In [None]:
# ==================================================
# Bridge Evaluation – TF-IDF SVD Features + Numeric
# ==================================================
!pip -q install lightgbm==4.5.0 scikit-learn==1.5.2

import pandas as pd, numpy as np, os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

# Project layout on the mounted drive for the evaluation step.
PROJ = "/content/drive/MyDrive/dissertation"
DATA_DIR = PROJ + "/data"                  # cleaned input CSVs
EMB_DIR = PROJ + "/outputs/tfidf"          # TF-IDF/SVD feature CSVs
OUT_DIR = PROJ + "/outputs/bridge_eval"    # metric tables are written here
os.makedirs(OUT_DIR, exist_ok=True)

def cats_target_nums(ds):
    """Return the file locations and target column for a dataset.

    Parameters
    ----------
    ds : str
        Dataset key: "adult", "petfinder" or "breast".

    Returns
    -------
    tuple of str
        ``(cleaned CSV path, TF-IDF/SVD feature CSV path, target column name)``.

    Raises
    ------
    ValueError
        If ``ds`` is not a known dataset key. Without this, the function would
        return None and the caller would fail while unpacking the result.

    Notes
    -----
    The feature file names hard-code the SVD dimension (65 / 192 / 192). They
    must match the names of the files actually present in ``EMB_DIR``.
    """
    if ds=="adult":
        return f"{DATA_DIR}/Adult_clean.csv", f"{EMB_DIR}/adult_tfidf_svd65.csv",  "income"
    if ds=="petfinder":
        return f"{DATA_DIR}/Petfinder_clean.csv", f"{EMB_DIR}/petfinder_tfidf_svd192.csv", "AdoptionSpeed_bin"
    if ds=="breast":
        return f"{DATA_DIR}/Breast_clean.csv", f"{EMB_DIR}/breast_tfidf_svd192.csv", "OS5yr_bin"
    raise ValueError(ds)

def run_bridge(ds):
    """Evaluate logistic regression and LightGBM on numeric + TF-IDF/SVD features.

    Loads the cleaned CSV and the matching feature CSV for ``ds``, makes a
    stratified 70/30 split, trains both models, writes the metric table to
    ``OUT_DIR/<ds>_bridge_tfidf.csv`` and displays it.

    Parameters
    ----------
    ds : str
        Dataset key understood by ``cats_target_nums``.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns Model, AUC, F1, ACC, PREC, REC.

    Raises
    ------
    ValueError
        If the raw CSV and the feature CSV have different numbers of rows.

    Notes
    -----
    * Rows of the two CSVs are paired by position, not by key.
    * Every int64/float64 column other than the target is used as a feature.
      NOTE(review): the recorded breast run scores AUC ~ 1.0, which suggests
      one of those columns encodes the outcome - confirm and exclude it.
    """
    raw_path, emb_path, target = cats_target_nums(ds)
    df = pd.read_csv(raw_path, low_memory=False)
    df_emb = pd.read_csv(emb_path)
    if len(df) != len(df_emb):
        raise ValueError(
            f"{ds}: row count mismatch between {raw_path} ({len(df)}) "
            f"and {emb_path} ({len(df_emb)})"
        )

    # --- numeric columns (ignore target and non-numeric)
    num_cols = [c for c in df.select_dtypes(include=["int64","float64"]).columns if c != target]
    y = df[target].astype(int).values

    # Split row positions first, so that imputation and scaling statistics are
    # learned from the training rows only (no test-set leakage).
    tr_idx, te_idx = train_test_split(
        np.arange(len(df)), test_size=0.3, stratify=y, random_state=42
    )
    ytr, yte = y[tr_idx], y[te_idx]

    emb = df_emb.values
    if num_cols:
        num_tr = df.iloc[tr_idx][num_cols]
        num_te = df.iloc[te_idx][num_cols]
        medians = num_tr.median()
        # A column that is entirely missing in the training rows has a NaN
        # median; fall back to 0 so the models never receive NaN.
        num_tr = num_tr.fillna(medians).fillna(0.0)
        num_te = num_te.fillna(medians).fillna(0.0)
        scaler = StandardScaler().fit(num_tr)
        # --- combine numeric + TF-IDF SVD embeddings
        Xtr = np.hstack([scaler.transform(num_tr), emb[tr_idx]])
        Xte = np.hstack([scaler.transform(num_te), emb[te_idx]])
    else:
        # No numeric columns: evaluate on the embeddings alone.
        Xtr, Xte = emb[tr_idx], emb[te_idx]

    def eval_model(name, mdl):
        """Fit ``mdl`` on the training split and score it on the test split."""
        mdl.fit(Xtr, ytr)
        prob = mdl.predict_proba(Xte)[:,1]
        pred = (prob > 0.5).astype(int)  # fixed 0.5 decision threshold
        return {
            "Model": name,
            "AUC": roc_auc_score(yte, prob),
            "F1": f1_score(yte, pred),
            "ACC": accuracy_score(yte, pred),
            "PREC": precision_score(yte, pred, zero_division=0),
            "REC": recall_score(yte, pred, zero_division=0)
        }

    res_lr = eval_model("LR+TFIDF+Num", LogisticRegression(max_iter=300, n_jobs=-1))
    res_lgb = eval_model("LGBM+TFIDF+Num", LGBMClassifier(n_estimators=300, learning_rate=0.05, random_state=42))
    df_res = pd.DataFrame([res_lr, res_lgb])
    out_path = os.path.join(OUT_DIR, f"{ds}_bridge_tfidf.csv")
    df_res.to_csv(out_path, index=False)
    print(f"{ds} results saved → {out_path}")
    display(df_res)
    return df_res

# Evaluate each dataset in turn; every call saves and displays its own table.
for ds in ("adult", "petfinder", "breast"):
    run_bridge(ds)


[LightGBM] [Info] Number of positive: 8181, number of negative: 26008
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 15519
[LightGBM] [Info] Number of data points in the train set: 34189, number of used features: 70
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239287 -> initscore=-1.156590
[LightGBM] [Info] Start training from score -1.156590
✅ adult results saved → /content/drive/MyDrive/dissertation/outputs/bridge_eval/adult_bridge_tfidf.csv


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,LR+TFIDF+Num,0.847855,0.535938,0.823313,0.721177,0.426412
1,LGBM+TFIDF+Num,0.89168,0.628473,0.853068,0.795544,0.519395


[LightGBM] [Info] Number of positive: 1932, number of negative: 6143
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.026219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 47423
[LightGBM] [Info] Number of data points in the train set: 8075, number of used features: 195
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.239257 -> initscore=-1.156757
[LightGBM] [Info] Start training from score -1.156757
✅ petfinder results saved → /content/drive/MyDrive/dissertation/outputs/bridge_eval/petfinder_bridge_tfidf.csv


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,LR+TFIDF+Num,0.650963,0.077178,0.758232,0.448718,0.04222
1,LGBM+TFIDF+Num,0.661117,0.25641,0.757077,0.480132,0.17491


[LightGBM] [Info] Number of positive: 12579, number of negative: 29269
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.170304 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 51959
[LightGBM] [Info] Number of data points in the train set: 41848, number of used features: 252
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.300588 -> initscore=-0.844500
[LightGBM] [Info] Start training from score -0.844500
✅ breast results saved → /content/drive/MyDrive/dissertation/outputs/bridge_eval/breast_bridge_tfidf.csv


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,LR+TFIDF+Num,0.999998,0.998701,0.999219,0.998887,0.998516
1,LGBM+TFIDF+Num,0.999996,0.997867,0.998718,0.997959,0.997774
