In [None]:
# 04b: Embedding Structure Evaluation (PCA, NN-overlap, IDPE-style)
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import os, re, glob, numpy as np, pandas as pd
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

# Directory layout. Every path below is derived from PROJ, which lives under the
# Drive mount point used by drive.mount() above.
PROJ = "/content/drive/MyDrive/dissertation"
OUT = f"{PROJ}/outputs"
EMB = f"{OUT}/embeddings"                  # W2V/FastText/Node2Vec CSVs (prefixed)
ENT = f"{OUT}/entity_embeddings_merged"    # merged per-dataset entity CSVs
TFIDF = f"{OUT}/tfidf"                     # tfidf_svd*.csv
FIGS = f"{OUT}/figs"                       # destination for figures saved by this notebook
# Create the figure directory up front; exist_ok makes re-running the cell harmless.
os.makedirs(FIGS, exist_ok=True)

# keep dimensions consistent: the entity width is derived from EMB_DIM so the
# two values cannot drift apart if the base dimension is ever changed
EMB_DIM = 64
ENT_DIM = 3 * EMB_DIM     # 192 (three features × 64d)
# TF-IDF SVD dims vary per dataset; we'll detect from file


Mounted at /content/drive


In [None]:
def load_token_emb(path, token_col="token"):
    """Load a token→vector table from CSV.

    Parameters
    ----------
    path : str or file-like
        Anything accepted by ``pandas.read_csv``.
    token_col : str, default "token"
        Name of the column holding the tokens. If no column with that name
        exists, the first column of the file is used instead.

    Returns
    -------
    tokens : list of str
        Token column cast to ``str``, in file order.
    X : numpy.ndarray
        All remaining columns, one row per token, in file order.
    """
    df = pd.read_csv(path)
    if token_col not in df.columns:
        # Fall back to the first column when the requested name is absent.
        token_col = df.columns[0]
    # Use the resolved column name throughout; hard-coding "token" here would
    # raise KeyError whenever the caller passes a different, existing column.
    tokens = df[token_col].astype(str).tolist()
    X = df.drop(columns=[token_col]).to_numpy()
    return tokens, X

def pca_scatter(X, labels, title, save_as):
    """Project X onto its first two principal components and save a scatter plot.

    Parameters
    ----------
    X : array-like
        Row vectors to project; passed straight to ``PCA.fit_transform``.
    labels : sequence or None
        Accepted for call-site compatibility; not used when drawing.
    title : str
        First line of the figure title. A second line with the explained
        variance of PC1 and PC2 is appended.
    save_as : str
        File name of the image, written inside ``FIGS``.

    Returns
    -------
    None
        The figure is written to disk and then closed.
    """
    pca = PCA(n_components=2, random_state=42)
    X2 = pca.fit_transform(X)

    # Explicit figure/axes handles so that exactly this figure is closed below,
    # rather than whatever pyplot currently considers the "current" figure.
    fig, ax = plt.subplots(figsize=(6, 4))
    try:
        ax.scatter(X2[:, 0], X2[:, 1], s=10, alpha=0.6)
        ax.set_title(f"{title}\nPC1 {pca.explained_variance_ratio_[0]*100:.1f}% | PC2 {pca.explained_variance_ratio_[1]*100:.1f}%")
        ax.set_xlabel("PC1")
        ax.set_ylabel("PC2")
        fig.tight_layout()
        fig.savefig(os.path.join(FIGS, save_as), dpi=140)
    finally:
        # Always release the figure, even if saving fails, so figures do not
        # accumulate when this is called in a loop.
        plt.close(fig)

def mean_nn_overlap(Xa, Xb, topk=10):
    """Average top-k neighbour overlap (by cosine) between two aligned embeddings.

    Row ``i`` of ``Xa`` and row ``i`` of ``Xb`` must describe the same token.
    For every row, the ``k`` most cosine-similar *other* rows are found in each
    space, and the fraction of neighbours shared by both spaces is recorded.

    Parameters
    ----------
    Xa, Xb : array-like, same number of rows
        Aligned embedding matrices (widths may differ).
    topk : int, default 10
        Requested neighbourhood size. It is clamped to ``n - 1`` when fewer
        than ``topk`` other rows exist, so identical spaces always score 1.0.

    Returns
    -------
    float
        Mean overlap in [0, 1], or NaN when there are fewer than two rows.

    Raises
    ------
    ValueError
        If the two matrices have a different number of rows.
    """
    Sa = cosine_similarity(Xa)
    Sb = cosine_similarity(Xb)
    n = Sa.shape[0]
    if Sb.shape[0] != n:
        raise ValueError(
            f"Xa and Xb must have the same number of rows, got {n} and {Sb.shape[0]}"
        )

    k = min(topk, n - 1)
    if k < 1:
        # No other row can be a neighbour.
        return float("nan")

    # Exclude each row from its own neighbour list explicitly. Relying on
    # "self sorts first" breaks for duplicate rows (ties at 1.0) and for
    # all-zero rows (self-similarity 0).
    np.fill_diagonal(Sa, -np.inf)
    np.fill_diagonal(Sb, -np.inf)

    # Stable sort keeps tie-breaking deterministic (lowest index first).
    nn_a = np.argsort(-Sa, axis=1, kind="stable")[:, :k]
    nn_b = np.argsort(-Sb, axis=1, kind="stable")[:, :k]

    overlaps = [
        len(set(row_a.tolist()) & set(row_b.tolist())) / k
        for row_a, row_b in zip(nn_a, nn_b)
    ]
    return float(np.mean(overlaps))

def align_on_tokens(tokens_a, Xa, tokens_b, Xb):
    """Restrict two embeddings to their shared tokens, row-aligned.

    Parameters
    ----------
    tokens_a, tokens_b : sequence of hashable
        Token for each row of ``Xa`` / ``Xb``.
    Xa, Xb : array-like
        Embedding matrices whose first axis matches the token sequences.

    Returns
    -------
    common : list
        Tokens present in both vocabularies, in the order they occur in
        ``tokens_a``.
    A, B : numpy.ndarray
        Rows of ``Xa`` and ``Xb`` for ``common``, so ``A[i]`` and ``B[i]``
        describe the same token. When nothing is shared, both have zero rows
        (the caller can then check ``len(common)`` instead of handling an
        exception).

    Notes
    -----
    If a token is repeated within one vocabulary, the row of its last
    occurrence is used (later entries overwrite earlier ones in the lookup).
    """
    Xa = np.asarray(Xa)
    Xb = np.asarray(Xb)
    idx_a = {t: i for i, t in enumerate(tokens_a)}
    idx_b = {t: i for i, t in enumerate(tokens_b)}
    common = [t for t in tokens_a if t in idx_b]

    # Integer-array indexing copes with an empty selection, unlike np.stack,
    # which raises on an empty list.
    rows_a = np.array([idx_a[t] for t in common], dtype=np.intp)
    rows_b = np.array([idx_b[t] for t in common], dtype=np.intp)
    return common, Xa[rows_a], Xb[rows_b]


In [None]:
def _dataset_paths(ds, tfidf_dim):
    """Return the embedding CSV paths for dataset `ds`, keyed by method name.

    `tfidf_dim` is the number that appears in the TF-IDF/SVD file name
    (``<ds>_tfidf_svd<tfidf_dim>.csv``).
    """
    return {
        "w2v":     f"{EMB}/{ds}_w2v_64d_prefixed.csv",
        "ft":      f"{EMB}/{ds}_fasttext_64d_prefixed.csv",
        "n2v":     f"{EMB}/{ds}_node2vec_64d_prefixed.csv",
        "entity":  f"{ENT}/{ds}_entity_64d_merged.csv",
        "tfidf":   f"{TFIDF}/{ds}_tfidf_svd{tfidf_dim}.csv",
    }


# dataset name -> {method -> CSV path}; insertion order is the processing order
files = {
    "adult":     _dataset_paths("adult", 65),        # tfidf file created in 03c
    "petfinder": _dataset_paths("petfinder", 192),
    "breast":    _dataset_paths("breast", 192),
}


In [None]:
# One PCA scatter per (dataset, embedding space), written to FIGS.
for ds, mp in files.items():
    ds_title = ds.upper()

    # Token-level spaces: one row per token.
    for meth in ("w2v", "ft", "n2v"):
        tok, X = load_token_emb(mp[meth])
        fig_title = f"{ds_title} • {meth.upper()}"
        fig_name = f"{ds}_{meth}_pca.png"
        pca_scatter(X, tok, fig_title, fig_name)

    # Merged entity table, loaded through the same token/vector reader.
    tok_e, Xe = load_token_emb(mp["entity"])
    pca_scatter(
        Xe,
        tok_e,
        f"{ds_title} • ENTITY (merged)",
        f"{ds}_entity_pca.png",
    )

    # TF-IDF + SVD: the whole CSV is used as the matrix (row-level vectors),
    # so there are no tokens to pass along.
    tfidf_frame = pd.read_csv(mp["tfidf"])
    Xt = tfidf_frame.values
    pca_scatter(
        Xt,
        None,
        f"{ds_title} • TFIDF+SVD rows",
        f"{ds}_tfidf_pca.png",
    )

print("PCA figures saved to:", FIGS)


✅ PCA figures saved to: /content/drive/MyDrive/dissertation/outputs/figs


In [None]:
# Method pairs whose neighbourhood structure is compared on shared tokens.
pairs = [
    ("w2v", "ft"),
    ("w2v", "n2v"),
    ("ft", "n2v"),
    ("w2v", "entity"),
    ("ft", "entity"),
    ("n2v", "entity"),
]
records = []

for ds, mp in files.items():
    # Read every token-level table for this dataset once, keyed by method.
    tok, X = {}, {}
    for m in ("w2v", "ft", "n2v", "entity"):
        tok[m], X[m] = load_token_emb(mp[m])

    for a, b in pairs:
        common, A, B = align_on_tokens(tok[a], X[a], tok[b], X[b])
        n_common = len(common)
        # Fewer than 5 shared tokens: record NaN instead of computing an overlap.
        ov = mean_nn_overlap(A, B, topk=10) if n_common >= 5 else np.nan
        records.append(
            {
                "Dataset": ds,
                "Metric": "NN_Overlap@10",
                "MethodPair": f"{a}-{b}",
                "Value": ov,
                "N_tokens": n_common,
            }
        )

nn_df = pd.DataFrame(records)
display(nn_df.head())


Unnamed: 0,Dataset,Metric,MethodPair,Value,N_tokens
0,adult,NN_Overlap@10,w2v-ft,0.487879,66
1,adult,NN_Overlap@10,w2v-n2v,0.29697,66
2,adult,NN_Overlap@10,ft-n2v,0.290909,66
3,adult,NN_Overlap@10,w2v-entity,0.159091,66
4,adult,NN_Overlap@10,ft-entity,0.177273,66


In [None]:
def stage_index_T(s):
    """Map a T-stage code to an integer position.

    Codes in the fixed list below map to their position in that list.
    Any other code falls back to the first run of digits following a "p";
    if there is none, 99 is returned.
    """
    known = ("pIS", "p0", "p1", "p1MI", "p1A", "p1B", "p1C", "p2", "p3", "p4")
    try:
        return known.index(s)
    except ValueError:
        pass  # not a listed code; try the numeric fallback below

    digits = re.search(r"p(\d+)", s)
    if digits is None:
        return 99
    return int(digits.group(1))

def stage_index_N(s):
    """Map an N-stage code to an integer position.

    Codes in the fixed list below map to their position in that list.
    Any other code falls back to the first run of digits following a "p";
    if there is none, 99 is returned.
    """
    known = ("p0", "p0I-", "p0I+", "p1", "p1MI", "p1A", "p1B", "p1C", "p2", "p3")
    try:
        return known.index(s)
    except ValueError:
        pass  # not a listed code; try the numeric fallback below

    digits = re.search(r"p(\d+)", s)
    if digits is None:
        return 99
    return int(digits.group(1))

def idpe_like_for_space(tokens, X, prefix, indexer):
    # select only tokens with given prefix
    idx = [i for i,t in enumerate(tokens) if t.startswith(prefix)]
    if len(idx) < 5:
        return np.nan
    sub_tokens = [tokens[i] for i in idx]
    sub_X = X[idx]
    # stage index per token (drop unknowns)
    labels = []
    for t in sub_tokens:
        code = t.split(":",1)[1]
        labels.append(indexer(code))
    labels = np.array(labels)
    # pairwise “true” distance (|idx_i - idx_j|)
    D_true = np.abs(labels[:,None] - labels[None,:]).astype(float)
    # embedding distance (1 - cosine)
    S = cosine_similarity(sub_X)
    D_emb = 1.0 - S
    # correlate upper triangle
    iu = np.triu_indices_from(D_true, k=1)
    rho, _ = spearmanr(D_true[iu], D_emb[iu])
    return float(rho)

# IDPE-style check on the breast dataset: one rho per method for the T space
# and one for the N space.
ds = "breast"
rows = []
for meth in ("w2v", "ft", "n2v", "entity"):
    tokens, X = load_token_emb(files[ds][meth])
    rho_T = idpe_like_for_space(tokens, X, "TNM_PATH_T:", stage_index_T)
    rho_N = idpe_like_for_space(tokens, X, "TNM_PATH_N:", stage_index_N)

    # Fields shared by both result rows of this method.
    shared = {"Dataset": ds, "Metric": "IDPE_Spearman", "Method": meth}
    rows.extend(
        [
            {**shared, "Space": "TNM_T", "Value": rho_T},
            {**shared, "Space": "TNM_N", "Value": rho_N},
        ]
    )

idpe_df = pd.DataFrame(rows)
display(idpe_df)


Unnamed: 0,Dataset,Metric,Method,Space,Value
0,breast,IDPE_Spearman,w2v,TNM_T,0.357004
1,breast,IDPE_Spearman,w2v,TNM_N,0.264846
2,breast,IDPE_Spearman,ft,TNM_T,0.517789
3,breast,IDPE_Spearman,ft,TNM_N,0.396083
4,breast,IDPE_Spearman,n2v,TNM_T,0.325697
5,breast,IDPE_Spearman,n2v,TNM_N,0.133088
6,breast,IDPE_Spearman,entity,TNM_T,0.044696
7,breast,IDPE_Spearman,entity,TNM_N,0.025259


In [None]:
# Combine both metric tables into one long-format list of records with a
# common set of keys, NN-overlap rows first.
summary = [
    {
        "Dataset": r["Dataset"],
        "Metric": r["Metric"],
        "Method": r["MethodPair"],
        "Space": "tokens_shared",
        "Value": r["Value"],
        "N_tokens": r["N_tokens"],
    }
    for _, r in nn_df.iterrows()
]

# IDPE rows carry no token count, so N_tokens is NaN for them.
summary += [
    {
        "Dataset": r["Dataset"],
        "Metric": r["Metric"],
        "Method": r["Method"],
        "Space": r["Space"],
        "Value": r["Value"],
        "N_tokens": np.nan,
    }
    for _, r in idpe_df.iterrows()
]

summary_df = pd.DataFrame(summary)
out_csv = os.path.join(OUT, "embedding_structure_metrics.csv")
summary_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)
display(summary_df.head(12))


✅ Saved: /content/drive/MyDrive/dissertation/outputs/embedding_structure_metrics.csv


Unnamed: 0,Dataset,Metric,Method,Space,Value,N_tokens
0,adult,NN_Overlap@10,w2v-ft,tokens_shared,0.487879,66.0
1,adult,NN_Overlap@10,w2v-n2v,tokens_shared,0.29697,66.0
2,adult,NN_Overlap@10,ft-n2v,tokens_shared,0.290909,66.0
3,adult,NN_Overlap@10,w2v-entity,tokens_shared,0.159091,66.0
4,adult,NN_Overlap@10,ft-entity,tokens_shared,0.177273,66.0
5,adult,NN_Overlap@10,n2v-entity,tokens_shared,0.187879,66.0
6,petfinder,NN_Overlap@10,w2v-ft,tokens_shared,0.084659,176.0
7,petfinder,NN_Overlap@10,w2v-n2v,tokens_shared,0.084091,176.0
8,petfinder,NN_Overlap@10,ft-n2v,tokens_shared,0.096591,176.0
9,petfinder,NN_Overlap@10,w2v-entity,tokens_shared,0.05625,176.0
