# Expert Finder: Search & Visualization

Search for Canadian university professors by expertise using semantic similarity,
and explore clusters of similar researchers visually.

In [None]:
import json
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
import plotly.express as px
import umap
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

from db import get_all_professors

# Location of the professor database and the sentence-transformer used for queries
DB_PATH = Path("professors.db")
MODEL_NAME = "all-MiniLM-L6-v2"

# Read every professor record into a DataFrame
profs = get_all_professors(DB_PATH)
df = pd.DataFrame(profs)

# Keywords are stored as JSON text; falsy values become an empty list
df["expertise_keywords"] = df["expertise_keywords"].map(
    lambda raw: json.loads(raw) if raw else []
)
# Comma-separated form of the keywords, used for display
df["keywords_str"] = df["expertise_keywords"].map(", ".join)

# Decode the stored embedding blobs (float32 buffers) for rows that have one
has_embedding = df["embedding"].notna()
embeddings = np.array([
    np.frombuffer(blob, dtype=np.float32)
    for blob in df.loc[has_embedding, "embedding"]
])
# Keep only those rows so that df row i lines up with embeddings row i
df = df[has_embedding].reset_index(drop=True)

# Model used to embed search queries
model = SentenceTransformer(MODEL_NAME)

print(f"Loaded {len(df)} professors with {embeddings.shape[0]} embeddings")

In [None]:
def search_experts(
    query: str,
    top_n: int = 20,
    school: Optional[str] = None,
    faculty: Optional[str] = None,
) -> pd.DataFrame:
    """Search for professors by natural language query.

    The query is encoded with the module-level ``model`` and scored by cosine
    similarity against the rows of the module-level ``embeddings`` that pass
    the optional filters.

    Args:
        query: Free-text description of the expertise to look for.
        top_n: Maximum number of rows to return.
        school: If truthy, keep only rows whose ``school`` equals this value.
        faculty: If truthy, keep only rows whose ``faculty`` equals this value.

    Returns:
        A DataFrame with the columns ``name``, ``school``, ``faculty``,
        ``department``, ``keywords_str``, ``email`` and ``similarity``,
        holding the ``top_n`` highest-similarity rows. When no professor
        matches the filters, a message is printed and an empty DataFrame
        with those same columns is returned.
    """
    columns = [
        "name", "school", "faculty", "department",
        "keywords_str", "email", "similarity",
    ]

    # Row i of ``df`` corresponds to row i of ``embeddings``, so the mask is
    # positional (a NumPy bool array) rather than aligned on index labels.
    mask = np.ones(len(df), dtype=bool)
    if school:
        mask &= (df["school"] == school).to_numpy()
    if faculty:
        mask &= (df["faculty"] == faculty).to_numpy()

    if not mask.any():
        print("No professors match the filters.")
        return pd.DataFrame(columns=columns)

    # reset_index returns a new frame, so adding a column below does not
    # modify the module-level ``df``.
    filtered_df = df[mask].reset_index(drop=True)
    filtered_emb = embeddings[mask]

    query_emb = model.encode([query])
    filtered_df["similarity"] = cosine_similarity(query_emb, filtered_emb)[0]

    return filtered_df.nlargest(top_n, "similarity")[columns]

In [None]:
# Demo query; the resulting table is the cell's output
search_experts(query="climate change policy and economics")

In [None]:
# Reduce the embeddings with UMAP (fixed seed) and store the first two
# output coordinates on the DataFrame for plotting
reducer = umap.UMAP(n_neighbors=15, min_dist=0.1, random_state=42)
coords = reducer.fit_transform(embeddings)
df["umap_x"], df["umap_y"] = coords[:, 0], coords[:, 1]

In [None]:
# UMAP scatter of all professors, one color per school
fig = px.scatter(
    df,
    x="umap_x",
    y="umap_y",
    color="school",
    hover_data=["name", "faculty", "department", "keywords_str"],
    title="Canadian University Professors by Expertise",
    width=1200,
    height=800,
).update_traces(marker={"size": 4, "opacity": 0.7})  # small, semi-transparent points
fig.show()

In [None]:
# Same UMAP layout, this time with one color per faculty
fig2 = px.scatter(
    df,
    x="umap_x",
    y="umap_y",
    color="faculty",
    hover_data=["name", "school", "department", "keywords_str"],
    title="Canadian University Professors by Faculty",
    width=1200,
    height=800,
).update_traces(marker={"size": 4, "opacity": 0.7})  # small, semi-transparent points
fig2.show()