### 🧭 Overview

This notebook demonstrates how to use the edu-alias-mapper-e5 model — a fine-tuned Sentence Transformer that maps university aliases (e.g., “UdeG”, “montana.edu”) to their canonical names (e.g., “University of Guadalajara”, “Montana State University”).

To speed up loading in environments like Jupyter, download the model locally once before running:

```shell
huggingface-cli download craigtrim/edu-alias-mapper-e5
```

After this one-time setup, the notebook loads the model from your local cache for fast, offline lookups.

In [None]:
# %%
# 🧠 Example alias→label lookup using craigtrim/edu-alias-mapper-e5 (no FAISS needed)

import os
import time
import json
import tempfile
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer, util

# ---------------------------------------------------------------------
# 🌐 Environment setup
# ---------------------------------------------------------------------
# NOTE(review): huggingface_hub appears to read this variable when it is first
# imported, so setting it after the sentence_transformers import above may have
# no effect — confirm, and move it above the imports if so.
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"  # hide HF CLI progress bars

# ---------------------------------------------------------------------
# ⚙️ Config: model + data
# ---------------------------------------------------------------------
MODEL_ID = "craigtrim/edu-alias-mapper-e5"

# `__file__` exists only when this code runs as a script/module; a Jupyter
# kernel does not define it. Fall back to the working directory, which Jupyter
# normally sets to the folder containing the notebook, so BASE_DIR still
# resolves to that folder's parent.
try:
    _THIS_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _THIS_DIR = os.getcwd()

BASE_DIR = os.path.dirname(_THIS_DIR)
DATA_PATH = os.path.join(BASE_DIR, "data", "raw", "dbpedia_schools.parquet")

print(f"📁 Using dataset at → {DATA_PATH}")

# ---------------------------------------------------------------------
# 📁 Cache paths (portable temp directory)
# ---------------------------------------------------------------------
# The OS temp directory keeps the cache out of the repository; it may be
# cleared by the system at any time, so treat these files as disposable.
TMP_DIR = tempfile.gettempdir()
EMB_PATH = os.path.join(TMP_DIR, "edu_alias_label_embs.npy")
LABELS_PATH = os.path.join(TMP_DIR, "edu_alias_labels.json")

print(f"📦 Using cache directory → {TMP_DIR}")
print(f"💾 Embeddings cache → {os.path.basename(EMB_PATH)}")
print(f"💾 Labels cache → {os.path.basename(LABELS_PATH)}\n")

📁 Temp directory in use → /var/folders/53/k1mcwcc57qzd7rbr7nxb2w7m0000gn/T


In [None]:
# %%
# 🕒 Stopwatch helper
def timer(label: str = ""):
    """Start a stopwatch and return a callable that reports the elapsed time.

    Args:
        label: Text placed at the start of the printed report.

    Returns:
        A zero-argument function. Each call prints the time elapsed since
        ``timer`` was invoked, in seconds and in minutes, and returns None.
    """
    began_at = time.perf_counter()

    def stop():
        seconds = time.perf_counter() - began_at
        minutes = seconds / 60
        print(f"⏱️  {label} completed in {seconds:,.2f}s ({minutes:.2f} min)")

    return stop


# %%
# 🚀 Model: instantiate the Sentence Transformer named by MODEL_ID
t = timer("Model load")
print(f"🚀 Loading model → {MODEL_ID}")
model = SentenceTransformer(MODEL_ID)
t()
print("✅ Model ready for inference\n")


# %%
# 📚 Data: read the parquet file, then collect the distinct non-null values of
# the "alias" and "label" columns (first-seen order, as returned by unique()).
t = timer("Data load")
df = pd.read_parquet(DATA_PATH)
aliases, labels = (
    df[column].dropna().unique().tolist() for column in ("alias", "label")
)
t()
print(f"📘 Loaded {len(aliases):,} aliases and {len(labels):,} labels\n")


# %%
# 📦 Reuse cached label embeddings when they match the current dataset
# EMB_PATH / LABELS_PATH are defined in the config cell at the top.
# Checking the cache BEFORE encoding is what makes the cache worth having:
# encoding every label is the slow step of this notebook.
label_embs = None

if os.path.exists(EMB_PATH) and os.path.exists(LABELS_PATH):
    t = timer("Embedding cache load")
    print(f"📦 Loading cached embeddings from {EMB_PATH}")
    cached_embs = np.load(EMB_PATH)

    print(f"📦 Loading cached labels from {LABELS_PATH}")
    with open(LABELS_PATH, encoding="utf-8") as f:
        cached_labels = json.load(f)
    t()

    # Trust the cache only when it lines up row-for-row with the labels just
    # read from the dataset; otherwise scores would be paired with the wrong
    # names. The cache is not keyed by MODEL_ID — delete the files by hand
    # after switching models.
    if cached_labels == labels and len(cached_embs) == len(labels):
        label_embs = cached_embs
    else:
        print("♻️  Cached embeddings do not match the current labels; regenerating")


# %%
# 🧮 Generate label embeddings (cache miss only) and persist them
if label_embs is None:
    t = timer("Embedding generation")
    print(f"🧩 Encoding {len(labels):,} labels (batch_size=32)…")

    label_embs = model.encode(
        labels,                 # for testing: use labels[:10]
        normalize_embeddings=True,
        show_progress_bar=True,
        batch_size=32,
    )

    t()
    print("✅ Embeddings generated successfully\n")

    print(f"💾 Saving embeddings → {EMB_PATH}")
    np.save(EMB_PATH, label_embs)

    print(f"💾 Saving label list → {LABELS_PATH}")
    with open(LABELS_PATH, "w", encoding="utf-8") as f:
        json.dump(labels, f)

print(f"✅ Embeddings ready for lookup ({len(labels):,} labels)\n")

🚀 Loading model → craigtrim/edu-alias-mapper-e5
⏱️  Model load completed in 1.95s (0.03 min)
✅ Model ready for inference

⏱️  Data load completed in 0.03s (0.00 min)
📘 Loaded 20,202 aliases and 20,272 labels

🧩 Encoding 20,272 labels (batch_size=32)…


Batches:   0%|          | 0/634 [00:00<?, ?it/s]

In [4]:
# %%
# 🔍 Lookup helper
def lookup(query: str, top_k: int = 5):
    """Print the labels most similar to ``query``, best match first.

    The query is embedded with the notebook-level ``model`` and scored by
    cosine similarity against the precomputed ``label_embs``; row ``i`` of
    ``label_embs`` is taken to be the embedding of ``labels[i]``.

    Args:
        query: Free-text alias or name to resolve.
        top_k: Maximum number of matches to print. Values below 1 print no
            matches; values larger than the number of scored labels print
            all of them.

    Returns:
        None. The ranking is written to stdout.
    """
    print(f"\n🔎 Query → {query}")
    t = timer("Lookup")

    q_emb = model.encode([query], normalize_embeddings=True)
    # NOTE(review): relies on util.cos_sim returning a torch tensor, as the
    # sentence-transformers docs describe; .cpu() is a no-op on CPU tensors.
    scores = util.cos_sim(q_emb, label_embs)[0].cpu().numpy()

    # Same effect as zip() truncation: only rows that have a label can be
    # reported (e.g. when a subset of labels was encoded for testing).
    scores = scores[: len(labels)]
    k = max(0, min(top_k, len(scores)))

    # Stable argsort on negated scores gives descending order with ties kept
    # in label order, without sorting thousands of tensor scalars in Python.
    order = np.argsort(-scores, kind="stable")[:k]
    top = [(labels[i], scores[i]) for i in order]

    t()
    print(f"📈 Top {k} matches:")
    for rank, (lbl, score) in enumerate(top, 1):
        print(f"  {rank:>2}. {lbl:<60} ({float(score):.4f})")
    print("🏁 Lookup complete.\n")


# %%
# 🧪 Example query
# A loosely worded name (no "of", campus city appended) run through lookup()
# with the default top_k of 5; the ranked matches are printed, not returned.
lookup("University Georgia Athens")


🔎 Query → University Georgia Athens
⏱️  Lookup completed in 0.70 seconds (0.01 min)
📈 Top 5 matches:
   1. University of Georgia                                        (0.8899)
   2. The University of Georgia                                    (0.8422)
   3. University of Athens                                         (0.7754)
   4. David Aghmashenebeli University of Georgia                   (0.7618)
   5. Agricultural University of Georgia                           (0.7483)
🏁 Lookup complete.

