In [None]:
# ==================================================
# 02_encoding_benchmarks.ipynb — Adult Dataset
# ==================================================

# ---- Setup
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, json, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score

# category_encoders is installed at run time (quiet mode, no version pin) right before it is imported.
# NOTE(review): consider pinning the version so the benchmark numbers stay reproducible.
!pip install -q category_encoders
import category_encoders as ce

# ---- Load config and data
PROJ = "/content/drive/MyDrive/dissertation"
CONFIG_PATH = f"{PROJ}/config.json"

# The dataset path, target name and categorical columns all come from the project config.
with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = json.load(f)

meta = cfg["datasets"]["adult"]
path = f"{PROJ}/{meta['path']}"
df = pd.read_csv(path, low_memory=False)

target_col = meta["target"]
cat_cols = meta["embed_candidates"]  # ['occupation', 'workclass', 'native_country']

# ---- Keep only numeric + selected categorical + target
# Leave the target (and the encoded categoricals) out of the numeric list: selecting a
# column twice yields duplicate column labels, which made df[target_col] a DataFrame.
num_cols = [
    c for c in df.select_dtypes(include=["int64", "float64"]).columns
    if c != target_col and c not in cat_cols
]
df = df[cat_cols + num_cols + [target_col]]

# ---- Prepare target and split
y = df[target_col].astype(int)
X = df.drop(columns=[target_col])

# 60/20/20: hold out 40 %, then halve the hold-out into validation and test (both stratified).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(" Dataset ready")
print("Categorical columns:", cat_cols)
print("Train:", X_train.shape, "Val:", X_val.shape, "Test:", X_test.shape)

# ---- Helper function
def evaluate_model(model_name, model, X_val, y_val):
    """Score a fitted binary classifier on a hold-out set.

    Args:
        model_name: Label stored under the "Model" key of the result.
        model: Fitted estimator exposing ``predict_proba``.
        X_val: Features to score.
        y_val: True binary labels for ``X_val``.

    Returns:
        dict with keys "Model", "AUC", "F1", "ACC", "PREC", "REC". AUC uses the
        positive-class probability; the other metrics use labels cut at 0.5.
    """
    positive_scores = model.predict_proba(X_val)[:, 1]
    hard_labels = (positive_scores > 0.5).astype(int)

    row = {"Model": model_name, "AUC": roc_auc_score(y_val, positive_scores)}
    label_metrics = (
        ("F1", f1_score),
        ("ACC", accuracy_score),
        ("PREC", precision_score),
        ("REC", recall_score),
    )
    for key, metric in label_metrics:
        row[key] = metric(y_val, hard_labels)
    return row

# ---- 1. One-Hot Encoding
# category_encoders documents handle_unknown as 'error' / 'return_nan' / 'value' / 'indicator';
# "ignore" is the scikit-learn spelling. "value" encodes an unseen category as an all-zero row,
# so no NaN can reach the model at validation time.
pipe_ohe = Pipeline([
    ("encoder", ce.OneHotEncoder(cols=cat_cols, handle_unknown="value", use_cat_names=True)),
    ("model", LogisticRegression(max_iter=200, n_jobs=-1))
])
pipe_ohe.fit(X_train, y_train)
res_ohe = evaluate_model("OneHot_LogReg", pipe_ohe, X_val, y_val)

# ---- 2. Target Encoding
# The encoder sits inside the pipeline, so category means are learned from the training fold only.
pipe_te = Pipeline([
    ("encoder", ce.TargetEncoder(cols=cat_cols, smoothing=0.3)),
    ("model", LogisticRegression(max_iter=200, n_jobs=-1))
])
pipe_te.fit(X_train, y_train)
res_te = evaluate_model("TargetEnc_LogReg", pipe_te, X_val, y_val)

# ---- Show results
results = pd.DataFrame([res_ohe, res_te])
display(results)

# ---- Save
bench_path = f"{PROJ}/outputs/benchmarks_adult.csv"
# Create the outputs folder on first run instead of failing inside to_csv.
os.makedirs(os.path.dirname(bench_path), exist_ok=True)
results.to_csv(bench_path, index=False)
print(f" Results saved to {bench_path}")


Mounted at /content/drive
 Dataset ready
Categorical columns: ['occupation', 'workclass', 'native_country']
Train: (29305, 8) Val: (9768, 8) Test: (9769, 8)


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,OneHot_LogReg,0.815894,0.464101,0.808968,0.705677,0.345742
1,TargetEnc_LogReg,0.842895,0.532359,0.823198,0.724926,0.420625


 Results saved to /content/drive/MyDrive/dissertation/outputs/benchmarks_adult.csv


In [None]:
# ==================================================
# Save Encoding Benchmark Notes for Adult Dataset
# ==================================================
# Relies on cat_cols, target_col, path and bench_path still holding the Adult values
# set by the Adult benchmark cell directly above.
notes_text = f"""
Encoding Benchmark Notes — Adult Dataset
----------------------------------------
Purpose:
Establish baseline performance for classical encoders (One-Hot, Target)
on high-cardinality categorical variables prior to embedding experiments.

Encoded categorical columns: {', '.join(cat_cols)}
Target column: {target_col}
Model: Logistic Regression
Metrics computed: AUC, F1, Accuracy, Precision, Recall

Methodology Summary:
- Loaded cleaned dataset from: {path}
- Retained only numeric columns and the three high-cardinality categorical columns ({', '.join(cat_cols)}).
- Dropped other categorical columns (e.g., sex, race, relationship) to isolate
  the impact of encoding strategy on complex categorical structure.
- Applied two encoders:
    1. One-Hot Encoding (expands each category into binary dummy variables)
    2. Target Encoding (replaces each category with its mean target value)
- Evaluated both models using a 60/20/20 train/validation/test split
  with stratification to maintain class balance.
- Model used: Logistic Regression (max_iter=200)
- Metrics calculated: AUC, F1, Accuracy, Precision, Recall

Rationale:
This benchmark provides a baseline to compare with learned embeddings
(Entity Embedding, Word2Vec, Node2Vec) in later notebooks.
It isolates high-cardinality effects while minimizing compute load.

Results saved to: {bench_path}
"""

notes_path = f"{PROJ}/outputs/encoding_notes.txt"
os.makedirs(os.path.dirname(notes_path), exist_ok=True)

# Append mode: every run adds another copy of the note. The text contains non-ASCII
# characters (em dash), so the encoding is fixed explicitly rather than left to the platform.
with open(notes_path, "a", encoding="utf-8") as f:
    f.write(notes_text.strip() + "\n\n")

print(f" Notes saved to {notes_path}")


 Notes saved to /content/drive/MyDrive/dissertation/outputs/encoding_notes.txt


In [None]:
# ==================================================
# 02_encoding_benchmarks.ipynb — PetFinder (fixed, leak-proof single cell)
# ==================================================

# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
# No install step in this cell: category_encoders is presumably already installed by the
# Adult cell earlier in the notebook — confirm when running this cell on a fresh runtime.
import category_encoders as ce

# ---- Paths
PROJ = "/content/drive/MyDrive/dissertation"
DATA_PATH = f"{PROJ}/data/Petfinder_clean.csv"
OUT_PATH  = f"{PROJ}/outputs/benchmarks_petfinder.csv"
NOTES_PATH = f"{PROJ}/outputs/encoding_notes.txt"

# ---- Load
df = pd.read_csv(DATA_PATH, low_memory=False)

target_col = "AdoptionSpeed_bin"
cat_cols = ["Breed1", "Color1", "MaturitySize"]  # high-cardinality trio

# ==== LEAKAGE FIX: exclude target from numeric features ====
# The encoded categoricals are excluded as well (same rule as the Breast cell), so a
# numerically coded category can never land in both ColumnTransformer branches.
num_cols_all = df.select_dtypes(include=["int64","float64"]).columns.tolist()
num_cols = [c for c in num_cols_all if c != target_col and c not in cat_cols]

# ---- Pre-impute at source (robustness)
# Only feature columns are imputed. The target is deliberately left alone: a missing label
# should fail loudly at astype(int) below rather than be replaced by a median.
# NOTE(review): these medians are computed before the split, so they include validation/test
# rows; the in-pipeline SimpleImputer already provides a train-only median.
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df.replace([np.inf, -np.inf], np.nan, inplace=True)
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

for c in cat_cols:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so the "Unknown" fill below actually applies to missing values.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[cat_cols] = df[cat_cols].fillna("Unknown")

# Build X, y cleanly
X = df[cat_cols + num_cols].copy()
y = df[target_col].astype(int).copy()

# ---- Split (60/20/20 stratified)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

print(f" Dataset: {df.shape} | Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")
print("Categorical columns:", cat_cols)

# ---- Pipelines (unknown-safe + final safety imputer)
# Numeric branch: median imputation fitted on the training fold.
num_pipe = SimpleImputer(strategy="median")

# One-hot branch. category_encoders documents handle_unknown as
# 'error' / 'return_nan' / 'value' / 'indicator'; "ignore" is the scikit-learn spelling.
# "value" encodes an unseen category as an all-zero row.
cat_pipe_ohe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Unknown"),
    ce.OneHotEncoder(handle_unknown="value", use_cat_names=True)
)

# Target-encoding branch: unknown and missing categories both use the encoder's "value" strategy.
cat_pipe_te = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Unknown"),
    ce.TargetEncoder(
        handle_unknown="value",
        handle_missing="value",
        min_samples_leaf=1,
        smoothing=0.3
    )
)

def make_pipeline_with(encoder_pipe):
    """Assemble the full benchmark pipeline around one categorical encoder.

    Args:
        encoder_pipe: Transformer applied to the ``cat_cols`` columns.

    Returns:
        An unfitted Pipeline: column-wise preprocessing (``num_pipe`` on ``num_cols``,
        ``encoder_pipe`` on ``cat_cols``), a median imputer over the combined matrix,
        then logistic regression.
    """
    branches = [
        ("num", num_pipe, num_cols),
        ("cat", encoder_pipe, cat_cols),
    ]
    steps = [
        ("preprocess", ColumnTransformer(branches)),
        # last line of defence against NaNs coming out of either branch
        ("final_impute", SimpleImputer(strategy="median")),
        ("model", LogisticRegression(max_iter=200, n_jobs=-1)),
    ]
    return Pipeline(steps)

def evaluate_model(name, model, Xv, yv):
    """Compute hold-out metrics for a fitted binary classifier.

    Args:
        name: Label stored under "Model".
        model: Fitted estimator exposing ``predict_proba``.
        Xv: Features to score.
        yv: True binary labels for ``Xv``.

    Returns:
        dict with "Model", "AUC" (from positive-class probabilities) and
        "F1", "ACC", "PREC", "REC" (from labels cut at 0.5).
    """
    scores = model.predict_proba(Xv)[:, 1]
    labels = (scores > 0.5).astype(int)

    report = {"Model": name}
    report["AUC"] = roc_auc_score(yv, scores)
    report["F1"] = f1_score(yv, labels)
    report["ACC"] = accuracy_score(yv, labels)
    report["PREC"] = precision_score(yv, labels)
    report["REC"] = recall_score(yv, labels)
    return report

# ---- Fit/eval
# Both pipelines are fitted on the training fold and scored on the validation fold only.
pipe_ohe = make_pipeline_with(cat_pipe_ohe)
pipe_ohe.fit(X_train, y_train)
res_ohe = evaluate_model("OneHot_LogReg", pipe_ohe, X_val, y_val)

pipe_te = make_pipeline_with(cat_pipe_te)
pipe_te.fit(X_train, y_train)
res_te = evaluate_model("TargetEnc_LogReg", pipe_te, X_val, y_val)

# pandas is already imported at the top of this cell; the mid-cell re-import was dropped.
results = pd.DataFrame([res_ohe, res_te])
display(results)
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
results.to_csv(OUT_PATH, index=False)
print(f" Results saved to: {OUT_PATH}")

# ---- Notes append
notes_text = f"""
Encoding Benchmark Notes — PetFinder Dataset
--------------------------------------------
Encoded categorical columns: {', '.join(cat_cols)}
Target column: {target_col}
Model: Logistic Regression
Metrics: AUC, F1, Accuracy, Precision, Recall

Leakage prevention:
- The target is numeric; we explicitly removed it from num_cols before building X.

Pre-imputation (before split):
- Numeric: coerced to numeric, Inf→NaN, then median fill (per column)
- Categorical (3 high-card): stripped and filled with 'Unknown'

Pipelines include:
- One-Hot (unknown ignored)
- Target Encoding (unknown/missing → global prior)
- Final imputer after ColumnTransformer to guarantee no NaNs reach the model.

Split: 60/20/20 (stratified)
Data file: {DATA_PATH}
Results: {OUT_PATH}
"""
os.makedirs(os.path.dirname(NOTES_PATH), exist_ok=True)
# The note contains "—" and "→"; write it as UTF-8 explicitly so the append does not
# depend on the platform's default encoding.
with open(NOTES_PATH, "a", encoding="utf-8") as f:
    f.write(notes_text.strip() + "\n\n")
print(f" Notes appended to: {NOTES_PATH}")


Mounted at /content/drive
 Dataset: (11537, 14) | Train: (6922, 6), Val: (2307, 6), Test: (2308, 6)
Categorical columns: ['Breed1', 'Color1', 'MaturitySize']


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,OneHot_LogReg,0.650138,0.048359,0.761162,0.518519,0.025362
1,TargetEnc_LogReg,0.635691,0.076667,0.759861,0.479167,0.041667


 Results saved to: /content/drive/MyDrive/dissertation/outputs/benchmarks_petfinder.csv
 Notes appended to: /content/drive/MyDrive/dissertation/outputs/encoding_notes.txt


In [None]:
# Second PetFinder note (imputation/encoder details), appended to the same notes file.
# NOTE(review): the text below says all-NaN training columns were prefilled with 0, but no
# cell visible in this notebook does that — confirm or correct the note before citing it.
notes_text = f"""
Encoding Benchmark Notes — PetFinder Dataset
--------------------------------------------
Encoded categorical columns: {', '.join(cat_cols)}
Target column: AdoptionSpeed_bin
Model: Logistic Regression
Metrics: AUC, F1, Accuracy, Precision, Recall

Imputation details:
- Numeric: SimpleImputer(median); if a column was all-NaN in TRAIN, we prefilled with 0 before fitting.
- Categorical: SimpleImputer(constant='Unknown') so even all-NaN columns impute safely.

Encoders:
- One-Hot (handle_unknown='ignore')
- Target Encoding (smoothing=0.3)

Splits: 60/20/20 with stratification.
Results CSV saved in /outputs.
"""

# The note contains an em dash; fix the encoding explicitly instead of relying on the
# platform default.
with open(f"{PROJ}/outputs/encoding_notes.txt", "a", encoding="utf-8") as f:
    f.write(notes_text.strip() + "\n\n")
print(" Notes updated.")


 Notes updated.


In [None]:
# ==================================================
# Save Encoding Benchmark Notes for Adult Dataset
# ==================================================
# By the time this cell runs, the PetFinder cell has overwritten cat_cols and target_col,
# so the Adult values are re-read from the config loaded by the Adult benchmark cell.
# `path` and `bench_path` are only assigned by that cell and still point at Adult files.
# NOTE(review): this cell repeats the earlier Adult notes cell, so the note is appended twice.
adult_meta = cfg["datasets"]["adult"]
adult_cat_cols = adult_meta["embed_candidates"]
adult_target_col = adult_meta["target"]

notes_text = f"""
Encoding Benchmark Notes — Adult Dataset
----------------------------------------
Purpose:
Establish baseline performance for classical encoders (One-Hot, Target)
on high-cardinality categorical variables prior to embedding experiments.

Encoded categorical columns: {', '.join(adult_cat_cols)}
Target column: {adult_target_col}
Model: Logistic Regression
Metrics computed: AUC, F1, Accuracy, Precision, Recall

Methodology Summary:
- Loaded cleaned dataset from: {path}
- Retained only numeric columns and the three high-cardinality categorical columns ({', '.join(adult_cat_cols)}).
- Dropped other categorical columns (e.g., sex, race, relationship) to isolate
  the impact of encoding strategy on complex categorical structure.
- Applied two encoders:
    1. One-Hot Encoding (expands each category into binary dummy variables)
    2. Target Encoding (replaces each category with its mean target value)
- Evaluated both models using a 60/20/20 train/validation/test split
  with stratification to maintain class balance.
- Model used: Logistic Regression (max_iter=200)
- Metrics calculated: AUC, F1, Accuracy, Precision, Recall

Rationale:
This benchmark provides a baseline to compare with learned embeddings
(Entity Embedding, Word2Vec, Node2Vec) in later notebooks.
It isolates high-cardinality effects while minimizing compute load.

Results saved to: {bench_path}
"""

notes_path = f"{PROJ}/outputs/encoding_notes.txt"
os.makedirs(os.path.dirname(notes_path), exist_ok=True)

# The note contains an em dash; write it as UTF-8 explicitly.
with open(notes_path, "a", encoding="utf-8") as f:
    f.write(notes_text.strip() + "\n\n")

print(f" Notes saved to {notes_path}")


 Notes saved to /content/drive/MyDrive/dissertation/outputs/encoding_notes.txt


In [None]:
# ==================================================
# Leakage Guard + Refit — Breast Benchmark
# ==================================================
# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_score, recall_score
# No install step in this cell: category_encoders is presumably already installed by the
# Adult cell earlier in the notebook — confirm when running this cell on a fresh runtime.
import category_encoders as ce

# ---- Paths
PROJ = "/content/drive/MyDrive/dissertation"
DATA_PATH = f"{PROJ}/data/Breast_clean.csv"
OUT_PATH  = f"{PROJ}/outputs/benchmarks_breast.csv"

# ---- Load
df = pd.read_csv(DATA_PATH, low_memory=False)
# Keep only the first occurrence of any repeated column name.
df = df.loc[:, ~df.columns.duplicated()]

target_col = "OS5yr_bin"
cat_cols   = ["TNM_PATH_T", "TNM_PATH_N", "hospid"]

# ---- Define leakage blacklist
# Columns never offered to the model (the target itself is listed here too).
leak_blacklist = {
    "OS5yr","OS5yr_bin","five_yr_surv","died","PUF_VITAL_STATUS",
    "DX_LASTCONTACT_DEATH_MONTHS","PUF_30_DAY_MORT_CD","PUF_90_DAY_MORT_CD",
    "READM_HOSP_30_DAYS","RX_SUMM_SURGRAD_SEQ","RX_SUMM_SYSTEMIC_SUR_SEQ","followup_months"
}

# ---- Numeric list (exclude blacklist, target, and forced cats)
num_cols_all = df.select_dtypes(include=["int64","float64"]).columns.tolist()
force_cats = set(cat_cols)
num_cols = [c for c in num_cols_all if c not in leak_blacklist and c != target_col and c not in force_cats]

# ---- Clean + impute
for c in cat_cols:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so the "Unknown" fill below actually applies to missing values.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[cat_cols] = df[cat_cols].fillna("Unknown")

# NOTE(review): these medians are computed before the split, so they include validation/test
# rows; the in-pipeline SimpleImputer already provides a train-only median.
df.replace([np.inf, -np.inf], np.nan, inplace=True)
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce")
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# ---- Final dataset
X = df[cat_cols + num_cols].copy()
y = df[target_col].astype(int).copy()

print(f" Features after leakage guard → cats={len(cat_cols)}, nums={len(num_cols)}, total={X.shape[1]}")
print("Dropped due to leakage:", sorted(set(leak_blacklist) & set(df.columns)))

# ---- Split
# 60/20/20: hold out 40 %, then halve the hold-out into validation and test (both stratified).
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

# ---- Pipelines
# Numeric branch: median imputation fitted on the training fold.
num_pipe = SimpleImputer(strategy="median")

# One-hot branch. category_encoders documents handle_unknown as
# 'error' / 'return_nan' / 'value' / 'indicator'; "ignore" is the scikit-learn spelling.
# "value" encodes an unseen category (e.g. a hospid absent from train) as an all-zero row.
cat_pipe_ohe = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Unknown"),
    ce.OneHotEncoder(handle_unknown="value", use_cat_names=True)
)

# Target-encoding branch: unknown and missing categories both use the encoder's "value" strategy.
cat_pipe_te = make_pipeline(
    SimpleImputer(strategy="constant", fill_value="Unknown"),
    ce.TargetEncoder(handle_unknown="value", handle_missing="value", smoothing=0.3)
)

def make_pipeline_with(encoder_pipe):
    """Wrap a categorical encoder in the shared preprocessing + logistic-regression pipeline.

    Args:
        encoder_pipe: Transformer applied to the ``cat_cols`` columns.

    Returns:
        An unfitted Pipeline: ColumnTransformer (``num_pipe`` on ``num_cols``,
        ``encoder_pipe`` on ``cat_cols``), then a median imputer, then the classifier.
    """
    column_branches = [
        ("num", num_pipe, num_cols),
        ("cat", encoder_pipe, cat_cols),
    ]
    preprocess = ColumnTransformer(column_branches)
    classifier = LogisticRegression(max_iter=200, n_jobs=-1)
    return Pipeline([
        ("preprocess", preprocess),
        ("final_impute", SimpleImputer(strategy="median")),
        ("model", classifier),
    ])

def evaluate_model(name, model, Xv, yv):
    """Evaluate a fitted binary classifier on a hold-out split.

    Args:
        name: Label stored under "Model".
        model: Fitted estimator exposing ``predict_proba``.
        Xv: Features to score.
        yv: True binary labels for ``Xv``.

    Returns:
        dict with "Model", "AUC" (ranking metric on positive-class probabilities) and
        "F1", "ACC", "PREC", "REC" computed on labels cut at 0.5.
    """
    positive_prob = model.predict_proba(Xv)[:, 1]
    predicted = (positive_prob > 0.5).astype(int)

    auc_value = roc_auc_score(yv, positive_prob)
    f1_value = f1_score(yv, predicted)
    acc_value = accuracy_score(yv, predicted)
    prec_value = precision_score(yv, predicted)
    rec_value = recall_score(yv, predicted)

    return dict(Model=name, AUC=auc_value, F1=f1_value, ACC=acc_value, PREC=prec_value, REC=rec_value)

# ---- Run both encoders
# Each pipeline is fitted on the training fold and scored on the validation fold only.
pipe_ohe = make_pipeline_with(cat_pipe_ohe)
pipe_ohe.fit(X_train, y_train)
res_ohe = evaluate_model("OneHot_LogReg", pipe_ohe, X_val, y_val)

pipe_te = make_pipeline_with(cat_pipe_te)
pipe_te.fit(X_train, y_train)
res_te = evaluate_model("TargetEnc_LogReg", pipe_te, X_val, y_val)

results = pd.DataFrame([res_ohe, res_te])
display(results)

# Create the outputs folder if needed, as the PetFinder cell does, instead of failing in to_csv.
os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)
results.to_csv(OUT_PATH, index=False)
print(f" Results saved to: {OUT_PATH}")


Mounted at /content/drive
 Features after leakage guard → cats=3, nums=62, total=65
Dropped due to leakage: ['DX_LASTCONTACT_DEATH_MONTHS', 'OS5yr', 'OS5yr_bin', 'PUF_30_DAY_MORT_CD', 'PUF_90_DAY_MORT_CD', 'PUF_VITAL_STATUS', 'READM_HOSP_30_DAYS', 'RX_SUMM_SURGRAD_SEQ', 'RX_SUMM_SYSTEMIC_SUR_SEQ', 'died', 'five_yr_surv', 'followup_months']


Unnamed: 0,Model,AUC,F1,ACC,PREC,REC
0,OneHot_LogReg,0.648035,0.106098,0.698419,0.486364,0.059544
1,TargetEnc_LogReg,0.64796,0.103799,0.698168,0.482679,0.058152


 Results saved to: /content/drive/MyDrive/dissertation/outputs/benchmarks_breast.csv


In [None]:
# Quick sanity check of the Breast feature matrix: overall shape, the encoded columns,
# how many distinct values each one has, and their dtypes.
unique_counts = {}
for col in cat_cols:
    unique_counts[col] = df[col].nunique()

print("X shape:", X.shape)
print("Cats:", cat_cols)
print(unique_counts)
print(df[cat_cols].dtypes)


X shape: (59784, 65)
Cats: ['TNM_PATH_T', 'TNM_PATH_N', 'hospid']
{'TNM_PATH_T': 25, 'TNM_PATH_N': 31, 'hospid': 1318}
TNM_PATH_T    object
TNM_PATH_N    object
hospid        object
dtype: object


In [None]:
# ==================================================
# Append Leakage Guard Notes — Breast
# ==================================================
import os

PROJ = "/content/drive/MyDrive/dissertation"
NOTES_PATH = f"{PROJ}/outputs/encoding_notes.txt"
DATA_PATH = f"{PROJ}/data/Breast_clean.csv"
OUT_PATH  = f"{PROJ}/outputs/benchmarks_breast.csv"

cat_cols = ["TNM_PATH_T", "TNM_PATH_N", "hospid"]
leak_dropped = [
    "DX_LASTCONTACT_DEATH_MONTHS","OS5yr","OS5yr_bin","PUF_30_DAY_MORT_CD","PUF_90_DAY_MORT_CD",
    "PUF_VITAL_STATUS","READM_HOSP_30_DAYS","RX_SUMM_SURGRAD_SEQ","RX_SUMM_SYSTEMIC_SUR_SEQ",
    "died","five_yr_surv","followup_months"
]

# Feature counts come from the Breast benchmark cell's num_cols rather than being typed in
# by hand: the previously hard-coded 61/64 disagreed with the recorded run (nums=62, total=65).
n_numeric = len(num_cols)
n_total = len(cat_cols) + n_numeric

notes_text = f"""
Leakage Guard — Breast Dataset
------------------------------
We detected near-perfect metrics and removed variables that encode survival or
status post-diagnosis to avoid target leakage.

Dropped columns:
{', '.join(leak_dropped)}

Kept categorical: {', '.join(cat_cols)}
- 'hospid' treated as categorical for alignment with embedding experiments.

Numeric features retained: {n_numeric} (post-filter)
Total model features: {n_total}

Controls:
- Duplicated names dropped (first kept)
- Target excluded from numeric features
- No feature appears in both branches
- Median and constant imputers for robustness
- Final imputer after preprocessing
- Safe encoders (ignore unknowns / smooth targets)

Files:
- Data: {DATA_PATH}
- Results: {OUT_PATH}
"""

os.makedirs(os.path.dirname(NOTES_PATH), exist_ok=True)
# The note contains an em dash; write it as UTF-8 explicitly.
with open(NOTES_PATH, "a", encoding="utf-8") as f:
    f.write(notes_text.strip() + "\n\n")
print(f" Leakage-guard notes appended to: {NOTES_PATH}")


 Leakage-guard notes appended to: /content/drive/MyDrive/dissertation/outputs/encoding_notes.txt


In [None]:
import pandas as pd, os

PROJ = "/content/drive/MyDrive/dissertation"
paths = {
    "adult": f"{PROJ}/outputs/benchmarks_adult.csv",
    "petfinder": f"{PROJ}/outputs/benchmarks_petfinder.csv",
    "breast": f"{PROJ}/outputs/benchmarks_breast.csv",
}

# Load whichever benchmark files exist and tag every row with its dataset name.
# A dedicated loop variable is used so the notebook-wide `df` is not overwritten.
frames = []
for name, p in paths.items():
    if not os.path.exists(p):
        print(f"Skipping {name}: {p} not found")
        continue
    bench = pd.read_csv(p)
    bench.insert(0, "Dataset", name)
    frames.append(bench)

# pd.concat raises on an empty list, so only build and save the summary when something was loaded.
if frames:
    summary = pd.concat(frames, ignore_index=True)
    display(summary)
    summary.to_csv(f"{PROJ}/outputs/benchmarks_summary.csv", index=False)
    print("Saved:", f"{PROJ}/outputs/benchmarks_summary.csv")
else:
    print("No benchmark files found; summary not written.")


Unnamed: 0,Dataset,Model,AUC,F1,ACC,PREC,REC
0,adult,OneHot_LogReg,0.815894,0.464101,0.808968,0.705677,0.345742
1,adult,TargetEnc_LogReg,0.842895,0.532359,0.823198,0.724926,0.420625
2,petfinder,OneHot_LogReg,0.650138,0.048359,0.761162,0.518519,0.025362
3,petfinder,TargetEnc_LogReg,0.635691,0.076667,0.759861,0.479167,0.041667
4,breast,OneHot_LogReg,0.648035,0.106098,0.698419,0.486364,0.059544
5,breast,TargetEnc_LogReg,0.64796,0.103799,0.698168,0.482679,0.058152


✅ Saved: /content/drive/MyDrive/dissertation/outputs/benchmarks_summary.csv


In [None]:
# --- Restart runtime first (Runtime → Restart runtime), then run this cell ---
# Stable combo for gensim on Colab Python 3.12
# NOTE(review): gensim 4.3.3 is believed to require numpy<2.0 — confirm that these three pins
# actually resolve together before relying on this cell.
!pip -q install --upgrade "numpy==2.0.2" "scipy==1.11.4" "gensim==4.3.3"

# Mount Google Drive at /content/drive; force_remount=True re-mounts even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, pandas as pd, numpy as np
# Project locations: cleaned datasets are read from DATA_DIR, embedding CSVs are written to OUT_DIR.
PROJ = "/content/drive/MyDrive/dissertation"
DATA_DIR = f"{PROJ}/data"
OUT_DIR  = f"{PROJ}/outputs/embeddings"
os.makedirs(OUT_DIR, exist_ok=True)

def make_sentences(frame: pd.DataFrame, cols):
    """Turn each row into one "sentence" of column-prefixed category tokens.

    Args:
        frame: Source data; must contain every column named in ``cols``.
        cols: Column names to include, in token order.

    Returns:
        A list with one entry per row; each entry is a list of ``"<column>:<value>"``
        strings (values are stringified with ``astype(str)``).
    """
    cols = list(cols)
    # Plain lists are far cheaper to walk than DataFrame.iterrows(), which builds a Series per row.
    rows = frame[cols].astype(str).to_numpy().tolist()
    return [[f"{c}:{v}" for c, v in zip(cols, row)] for row in rows]  # keep prefixes

def save_vectors(keys, matrix, path_csv):
    """Write an embedding matrix to CSV, one row per token.

    Args:
        keys: Row labels, written as the index column named "token".
        matrix: 2-D array-like of vectors, one row per key.
        path_csv: Destination CSV path.

    Prints the destination path and the (rows, dims) shape after writing.
    """
    table = pd.DataFrame(matrix, index=keys).rename_axis("token")
    table.to_csv(path_csv)
    print("Saved:", path_csv, table.shape)


In [None]:
# ===== Adult: W2V & FastText =====
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

DATASET = "adult"
DATA_PATH = f"{DATA_DIR}/Adult_clean.csv"
CATS = ["occupation","workclass","native_country"]

df = pd.read_csv(DATA_PATH, low_memory=False)
for c in CATS:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so missing values become the "Unknown" token rather than a "<col>:nan" token.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[CATS] = df[CATS].fillna("Unknown")

print("Adult uniques:", {c: df[c].nunique() for c in CATS}, "| rows:", len(df))
# One sentence per row; each token is "<column>:<value>".
sentences = make_sentences(df, CATS)

# --- Word2Vec
# Skip-gram (sg=1) with negative sampling; min_count=1 keeps every category in the vocabulary.
w2v = Word2Vec(sentences=sentences, vector_size=64, window=5, min_count=1,
               workers=2, sg=1, negative=5, epochs=10)
w_keys = w2v.wv.index_to_key
w_mat  = np.vstack([w2v.wv[k] for k in w_keys])
w2v_path = f"{OUT_DIR}/{DATASET}_w2v_64d_prefixed.csv"
save_vectors(w_keys, w_mat, w2v_path)

# --- FastText
# Same hyper-parameters as the Word2Vec model above.
ft = FastText(sentences=sentences, vector_size=64, window=5, min_count=1,
              workers=2, sg=1, negative=5, epochs=10)
f_keys = ft.wv.index_to_key
f_mat  = np.vstack([ft.wv[k] for k in f_keys])
ft_path = f"{OUT_DIR}/{DATASET}_fasttext_64d_prefixed.csv"
save_vectors(f_keys, f_mat, ft_path)


In [None]:
# ===== PetFinder: W2V & FastText =====
from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

DATASET = "petfinder"
DATA_PATH = f"{DATA_DIR}/Petfinder_clean.csv"
CATS = ["Breed1","Color1","MaturitySize"]

df = pd.read_csv(DATA_PATH, low_memory=False)
for c in CATS:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so missing values become the "Unknown" token rather than a "<col>:nan" token.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[CATS] = df[CATS].fillna("Unknown")

print("PetFinder uniques:", {c: df[c].nunique() for c in CATS}, "| rows:", len(df))
# One sentence per row; each token is "<column>:<value>".
sentences = make_sentences(df, CATS)

# --- Word2Vec
# Skip-gram (sg=1) with negative sampling; min_count=1 keeps every category in the vocabulary.
w2v = Word2Vec(sentences=sentences, vector_size=64, window=5, min_count=1,
               workers=2, sg=1, negative=5, epochs=10)
w_keys = w2v.wv.index_to_key
w_mat  = np.vstack([w2v.wv[k] for k in w_keys])
w2v_path = f"{OUT_DIR}/{DATASET}_w2v_64d_prefixed.csv"
save_vectors(w_keys, w_mat, w2v_path)

# --- FastText
# Same hyper-parameters as the Word2Vec model above.
ft = FastText(sentences=sentences, vector_size=64, window=5, min_count=1,
              workers=2, sg=1, negative=5, epochs=10)
f_keys = ft.wv.index_to_key
f_mat  = np.vstack([ft.wv[k] for k in f_keys])
ft_path = f"{OUT_DIR}/{DATASET}_fasttext_64d_prefixed.csv"
save_vectors(f_keys, f_mat, ft_path)


In [None]:
# Install Node2Vec with a compatible networkx; won’t touch NumPy/Gensim
# Any existing node2vec/networkx is removed first (output discarded), then both are
# installed at pinned versions.
# NOTE(review): pip may still adjust dependencies of node2vec; confirm the numpy/gensim
# versions are unchanged after this cell runs.
!pip -q uninstall -y node2vec networkx > /dev/null
!pip -q install "networkx==2.8.8" "node2vec==0.4.3"

import networkx as nx
from node2vec import Node2Vec

def build_graph_from_cats(frame: pd.DataFrame, cols):
    """Build a weighted co-occurrence graph over column-prefixed category tokens.

    Every row contributes one token per column ("<column>:<value>", values stringified).
    Each pair of tokens from the same row is linked by an undirected edge whose "weight"
    counts how many rows contain that pair.

    Args:
        frame: Source data; must contain every column named in ``cols``.
        cols: Column names to turn into graph nodes.

    Returns:
        networkx.Graph with float "weight" attributes on its edges.
    """
    graph = nx.Graph()
    for values in frame[cols].astype(str).values.tolist():
        tokens = [f"{col}:{val}" for col, val in zip(cols, values)]
        # visit every unordered pair of tokens within this row exactly once
        for pos, left in enumerate(tokens):
            for right in tokens[pos + 1:]:
                if not graph.has_edge(left, right):
                    graph.add_edge(left, right, weight=1.0)
                else:
                    graph[left][right]["weight"] += 1.0
    return graph


In [None]:
# ===== Adult: Node2Vec =====
DATASET = "adult"
DATA_PATH = f"{DATA_DIR}/Adult_clean.csv"
CATS = ["occupation","workclass","native_country"]

df = pd.read_csv(DATA_PATH, low_memory=False)
for c in CATS:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so missing values become the "Unknown" node rather than a "<col>:nan" node.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[CATS] = df[CATS].fillna("Unknown")

# Nodes are "<column>:<value>" tokens; edge weights count row-level co-occurrences.
G = build_graph_from_cats(df, CATS)
print("Adult graph:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# p=q=1 with weighted walks over the co-occurrence counts, then skip-gram on the walks.
n2v = Node2Vec(G, dimensions=64, walk_length=10, num_walks=50,
               workers=2, p=1, q=1, weight_key="weight")
n2v_model = n2v.fit(window=5, min_count=1, sg=1, epochs=5)

# Export one vector per graph node, in graph node order.
keys = list(G.nodes())
mat  = np.vstack([n2v_model.wv[k] for k in keys])
n2v_path = f"{OUT_DIR}/{DATASET}_node2vec_64d_prefixed.csv"
save_vectors(keys, mat, n2v_path)


In [None]:
# ===== PetFinder: Node2Vec =====
DATASET = "petfinder"
DATA_PATH = f"{DATA_DIR}/Petfinder_clean.csv"
CATS = ["Breed1","Color1","MaturitySize"]

df = pd.read_csv(DATA_PATH, low_memory=False)
for c in CATS:
    raw = df[c]
    # astype(str) turns a real NaN into the literal text "nan"; mask those rows back to NaN
    # so missing values become the "Unknown" node rather than a "<col>:nan" node.
    df[c] = raw.astype(str).str.strip().replace({"": np.nan}).where(raw.notna())
df[CATS] = df[CATS].fillna("Unknown")

# Nodes are "<column>:<value>" tokens; edge weights count row-level co-occurrences.
G = build_graph_from_cats(df, CATS)
print("PetFinder graph:", G.number_of_nodes(), "nodes,", G.number_of_edges(), "edges")

# p=q=1 with weighted walks over the co-occurrence counts, then skip-gram on the walks.
n2v = Node2Vec(G, dimensions=64, walk_length=10, num_walks=50,
               workers=2, p=1, q=1, weight_key="weight")
n2v_model = n2v.fit(window=5, min_count=1, sg=1, epochs=5)

# Export one vector per graph node, in graph node order.
keys = list(G.nodes())
mat  = np.vstack([n2v_model.wv[k] for k in keys])
n2v_path = f"{OUT_DIR}/{DATASET}_node2vec_64d_prefixed.csv"
save_vectors(keys, mat, n2v_path)
