<a href="https://colab.research.google.com/github/cs-iuu/word-sense-2025-fall-ai/blob/main/notebooks/14.1.cross_lingual_wsi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cross-Lingual Word Sense Induction (WSI)


POS tagging using A1: "lincoln/multilingual-xlm-roberta-base-ud-pos"

Features:
- POS tagging (A1 multilingual XLM-R UD POS)
- Context extraction filtered by POS
- Contextual token embeddings using xlm-roberta-base
- Clustering (KMeans) for Word Sense Induction (WSI)
- Automatic sense descriptions & LLM (optional) glosses
- Cross-lingual gloss alignment (Hungarian matching + many-to-one)
- Visualizations (UMAP)
- Polysemy metrics and paired comparisons
- Mongolian ↔ English using XLM-R contextual embeddings

## Setup

In [None]:
!pip install -q transformers sentencepiece sentence-transformers umap-learn scikit-learn matplotlib scipy tqdm stanza

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.7 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.6/1.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.7/1.7 MB[0m [31m32.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/608.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m608.4/608.4 kB[0m [31m34.3 MB/s[0m eta [36m0:00:00[0m
[?25h

- Optional: OpenAI for high-quality glosses (set use_openai=True and provide API key via env)

In [None]:
!pip install -q openai

## Imports

In [None]:
import os
import re
import json
import math
import random
from collections import Counter, defaultdict
from tqdm.auto import tqdm


import numpy as np
import matplotlib.pyplot as plt


import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import umap
from scipy.optimize import linear_sum_assignment


# For optional stanza-based fallback
import stanza


# Optional OpenAI
import openai

## Settings

In [None]:
# One seed drives every random number generator used in this notebook.
RANDOM_SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(RANDOM_SEED)
del _seed_fn


# Token-classification model used for POS tagging.
# Alternative tagger (the "A1" option named in the notebook header):
#   "lincoln/multilingual-xlm-roberta-base-ud-pos"
POS_MODEL = "jordigonzm/mdeberta-v3-base-multilingual-pos-tagger"
# Encoder that produces the contextual token embeddings.
EMB_MODEL = "xlm-roberta-base"


# Gloss generation: heuristic by default, OpenAI when switched on.
USE_OPENAI = False
OPENAI_MODEL = "gpt-4o-mini"  # example model name


# Mongolian / English target words, kept as translation pairs so the two
# per-language lists can never drift out of step.
TARGET_PAIRS = [
    ('зам', 'road'),
    ('гэр', 'home'),
    ('амар', 'rest'),
    ('сайн', 'good'),
    ('хүн', 'person'),
    ('ам', 'mouth'),
]
MONGOLIAN_TARGETS = [mo for mo, _ in TARGET_PAIRS]
ENGLISH_TARGETS = [en for _, en in TARGET_PAIRS]


# Clustering / alignment defaults.
DEFAULT_K = 2
SIMILARITY_THRESHOLD = 0.35

## Load models

In [None]:
import os

# Resolve the Hugging Face token.
# Inside Colab the token is read from the notebook secrets; anywhere else
# (or when the secret is missing / access was not granted) fall back to the
# HF_TOKEN environment variable. A value of None is passed through to
# from_pretrained unchanged.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running inside Colab

hf_token = None
if userdata is not None:
    try:
        hf_token = userdata.get('HF_TOKEN')
    except Exception as err:
        print("Could not read HF_TOKEN from Colab secrets:", err)
if not hf_token:
    hf_token = os.environ.get('HF_TOKEN')

print("Loading POS model (A1):", POS_MODEL)
pos_tokenizer = AutoTokenizer.from_pretrained(POS_MODEL, token=hf_token)
pos_model = AutoModelForTokenClassification.from_pretrained(POS_MODEL, token=hf_token)
pos_model.eval()  # inference only, same as the embedding encoder below
label_map = pos_model.config.id2label # mapping id->label


print("Loading embedding encoder:", EMB_MODEL)
emb_tokenizer = AutoTokenizer.from_pretrained(EMB_MODEL, token=hf_token)
emb_model = AutoModel.from_pretrained(EMB_MODEL, token=hf_token)
emb_model.eval()


# Optional: small sentence-transformers model for sentence-level fallback (not required)
from sentence_transformers import SentenceTransformer
sbert_model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')

Loading POS model (A1): jordigonzm/mdeberta-v3-base-multilingual-pos-tagger
Loading embedding encoder: xlm-roberta-base


## Define target words and load the corpus

In [None]:
import os
import re
import json
import math
import random
from collections import Counter, defaultdict
from tqdm.auto import tqdm


import numpy as np
import matplotlib.pyplot as plt


import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForTokenClassification
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import umap
from scipy.optimize import linear_sum_assignment


# For optional stanza-based fallback
import stanza


# Optional OpenAI
import openai
# ---------------------------
# UTILITY: POS tagging with token-classification model
# We'll align wordpieces back to textual words using the SentencePiece word-start marker '▁' (U+2581).
# This assumes the tokenizer is a SentencePiece-based tokenizer with '▁' markers (true for XLM-R).
# ---------------------------

def pos_tag_sentence(sentence):
    """
    Tag one sentence with the token-classification POS model.

    Subword pieces are merged back into words using the SentencePiece
    word-start marker, and every word receives the majority label of its
    pieces (on a tie, the label seen first wins).

    Parameters
    ----------
    sentence : str
        Raw sentence text.

    Returns
    -------
    list of (str, str)
        ``(word, label)`` pairs in sentence order. Pieces that carry no
        word-start marker (typically trailing punctuation) stay attached to
        the preceding word, so a word may come back as e.g. ``"road."``.
    """
    # SentencePiece marks the first piece of a word with U+2581 ("▁"), not with
    # an ASCII space. Testing for " " never fires, which glues the whole
    # sentence into a single "word" and makes every lookup fail.
    word_start = "\u2581"

    encoded = pos_tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = pos_model(**encoded).logits
    # Index the batch dimension explicitly: squeeze() would also collapse a
    # length-1 sequence into a scalar.
    pred_ids = logits.argmax(-1)[0].tolist()
    token_strs = pos_tokenizer.convert_ids_to_tokens(encoded.input_ids[0].tolist())
    special_tokens = set(pos_tokenizer.all_special_tokens)

    results = []
    current_word = None
    current_labels = []

    def flush():
        # Emit the word collected so far; a bare marker piece yields no word.
        if current_word:
            majority = Counter(current_labels).most_common(1)[0][0]
            results.append((current_word, majority))

    for tok, lab_id in zip(token_strs, pred_ids):
        if tok in special_tokens:
            continue
        label = label_map[lab_id]
        if tok.startswith(word_start):
            # First piece of a new word: close the previous one.
            flush()
            current_word = tok[len(word_start):]
            # A bare marker carries no text, so it gets no vote either.
            current_labels = [label] if current_word else []
        elif current_word is None:
            # Sentence starts with a piece that has no marker.
            current_word = tok
            current_labels = [label]
        else:
            # Continuation piece of the current word.
            current_word += tok.replace(word_start, '')
            current_labels.append(label)

    flush()
    return results

# Quick test (comment/uncomment for quick run)
# print(pos_tag_sentence("Тэр хүн гэртээ очсон."))

# ---------------------------
# CONTEXT EXTRACTION FILTERED BY POS
# ---------------------------

def extract_pos_filtered_contexts(corpus, target_word, desired_pos):
    """
    Return the sentences of ``corpus`` in which ``target_word`` is tagged ``desired_pos``.

    Parameters
    ----------
    corpus : iterable of str
        Sentences to scan.
    target_word : str
        Surface form to look for (compared case-sensitively).
    desired_pos : str
        POS label the word must carry, e.g. ``'NOUN'``.

    Returns
    -------
    list of str
        Matching sentences in corpus order, each at most once.

    Notes
    -----
    If tagging a sentence raises, a warning is issued and the sentence is kept
    when it merely contains ``target_word`` as a substring (best-effort
    fallback, no POS check).
    """
    import warnings

    # Tagged words can carry attached punctuation ("road."); compare against
    # the word with non-word characters stripped from both ends as well.
    edge_punct = re.compile(r"^\W+|\W+$")

    out = []
    for sent in corpus:
        try:
            tags = pos_tag_sentence(sent)
        except Exception as err:
            warnings.warn(
                f"POS tagging failed ({err!r}); using plain substring match for this sentence."
            )
            if target_word in sent:
                out.append(sent)
            continue
        for tok, pos in tags:
            if pos != desired_pos:
                continue
            if tok == target_word or edge_punct.sub("", tok) == target_word:
                out.append(sent)
                break
    return out

# ---------------------------
# CONTEXTUAL TOKEN EMBEDDING EXTRACTION
# We'll extract token-level embeddings by matching token ids of target word.
# ---------------------------

def get_contextual_embedding(sentence, target_word):
    """
    Return the averaged contextual embedding of ``target_word`` in ``sentence``.

    The target is tokenized on its own and its subword-id sequence is searched
    for in the tokenized sentence. Each occurrence is represented by the mean
    of its subword vectors, and the occurrences are then averaged.

    Parameters
    ----------
    sentence : str
        Sentence that provides the context.
    target_word : str
        Word whose in-context vector is wanted.

    Returns
    -------
    numpy.ndarray or None
        One vector of the encoder's hidden size, or ``None`` when the target's
        subword ids do not occur in the (possibly truncated) sentence.

    Notes
    -----
    Matching is done purely on subword ids, so a hit is not guaranteed to end
    at a word boundary (the target may be the prefix of a longer word).
    """
    target_ids = emb_tokenizer.convert_tokens_to_ids(emb_tokenizer.tokenize(target_word))
    if not target_ids:
        # An empty id list would "match" at every position and average empty
        # slices into NaN.
        return None

    # Truncate to the encoder's maximum length so long sentences do not crash.
    inputs = emb_tokenizer(sentence, return_tensors="pt", truncation=True)
    ids = inputs['input_ids'][0].tolist()

    span = len(target_ids)
    starts = [i for i in range(len(ids) - span + 1) if ids[i:i + span] == target_ids]
    if not starts:
        # Skip the forward pass entirely when there is nothing to extract.
        return None

    with torch.no_grad():
        last_hidden = emb_model(**inputs).last_hidden_state[0]  # seq_len x dim

    span_means = [last_hidden[s:s + span].mean(dim=0) for s in starts]
    return torch.stack(span_means).mean(dim=0).numpy()

# ---------------------------
# CLUSTERING / WSI
# ---------------------------

def cluster_embeddings(embeddings, k=2):
    """
    Cluster occurrence embeddings into at most ``k`` groups with KMeans.

    Parameters
    ----------
    embeddings : array-like
        One vector per row; any sequence of equal-length vectors is accepted.
    k : int
        Requested number of clusters; capped at the number of rows.

    Returns
    -------
    (labels, centers)
        ``(None, None)`` for empty input. With a single effective cluster the
        labels are all zero and the only center is the mean vector; otherwise
        the labels and centers come from the fitted KMeans model.
    """
    # Accept plain lists as well: the single-cluster path needs ndarray.mean.
    embeddings = np.asarray(embeddings)
    n_rows = len(embeddings)
    if n_rows == 0:
        return None, None

    k = min(k, n_rows)
    if k <= 1:
        labels = np.zeros(n_rows, dtype=int)
        centers = embeddings.mean(axis=0, keepdims=True)
        return labels, centers

    km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(embeddings)
    return km.labels_, km.cluster_centers_

# ---------------------------
# SENSE DESCRIPTION: keywords + representative sentence
# ---------------------------

def describe_sense(sentences, embeddings, labels, cluster_id, top_k=6):
    """
    Summarise one sense cluster.

    Parameters
    ----------
    sentences : list of str
        Context sentences, aligned index-by-index with ``embeddings`` and ``labels``.
    embeddings : numpy.ndarray
        One row per sentence.
    labels : sequence
        Cluster label of every sentence.
    cluster_id
        Label of the cluster to describe.
    top_k : int
        Number of keywords to keep.

    Returns
    -------
    dict
        ``cluster_id``, ``keywords`` (most frequent lower-cased tokens longer
        than one character; no stopword list is applied), ``example`` (the
        member sentence closest to the centroid), ``sentences`` and ``centroid``.
    """
    member_idx = [pos for pos, assigned in enumerate(labels) if assigned == cluster_id]
    members = [sentences[pos] for pos in member_idx]
    member_vecs = embeddings[member_idx]

    # The representative sentence is the member most similar to the centroid.
    centroid = member_vecs.mean(axis=0)
    closeness = cosine_similarity([centroid], member_vecs)[0]
    rep_sentence = members[np.argmax(closeness)]

    # Keyword extraction: punctuation becomes whitespace, then count tokens.
    counts = Counter()
    for sent in members:
        stripped = re.sub(r"[^\w\s]", " ", sent)
        counts.update(w.lower() for w in stripped.split() if len(w) > 1)
    keywords = [word for word, _ in counts.most_common(top_k)]

    return {
        'cluster_id': cluster_id,
        'keywords': keywords,
        'example': rep_sentence,
        'sentences': members,
        'centroid': centroid
    }

# ---------------------------
# GLOSS GENERATION (heuristic or OpenAI)
# ---------------------------

def heuristic_gloss(keywords, example):
    """Build a template gloss from up to six keywords and one example sentence."""
    listed = ", ".join(keywords[:6])
    return 'A sense related to: {}. Example: "{}"'.format(listed, example)


def generate_gloss_openai(keywords, example, language='en'):
    """
    Produce a short gloss for a sense, via OpenAI when enabled.

    Parameters
    ----------
    keywords : list of str
        Keywords describing the sense.
    example : str
        Representative sentence for the sense.
    language : str
        Language code inserted into the prompt.

    Returns
    -------
    str
        The model's gloss, or the heuristic gloss when ``USE_OPENAI`` is off,
        the request fails, or the model returns no text.
    """
    if not USE_OPENAI:
        return heuristic_gloss(keywords, example)

    prompt = f"Write a one-sentence dictionary-style gloss in {language}.\nKeywords: {keywords}\nExample: \"{example}\"\nReturn only a short gloss."
    request = dict(
        model=OPENAI_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,
        max_tokens=60,
    )
    try:
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: ChatCompletion.create was removed; requests go
            # through a client object and the message is attribute-accessed.
            resp = openai.OpenAI().chat.completions.create(**request)
            content = resp.choices[0].message.content
        else:
            # Legacy (<1.0) module-level API.
            resp = openai.ChatCompletion.create(**request)
            content = resp.choices[0].message['content']
    except Exception as e:
        # Best-effort: any API problem degrades to the heuristic gloss.
        print("OpenAI error:", e)
        return heuristic_gloss(keywords, example)

    gloss = (content or "").strip()
    return gloss or heuristic_gloss(keywords, example)

# ---------------------------
# EMBED GLOSS TEXT (using embedding encoder by averaging token vectors)
# ---------------------------

def embed_text_with_xlm(text):
    """
    Embed a text as the mean of its token vectors from the embedding encoder.

    The input is truncated to 128 tokens; the mean runs over every position of
    the encoded sequence. Returns a 1-D numpy array.
    """
    encoded = emb_tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        hidden = emb_model(**encoded).last_hidden_state
    pooled = hidden.mean(dim=1)
    return pooled.squeeze(0).numpy()

# ---------------------------
# ALIGNMENT between MO and EN clusters
# ---------------------------

def align_clusters(mo_descs, en_descs, mode='gloss', similarity_threshold=SIMILARITY_THRESHOLD):
    """
    Align Mongolian sense clusters with English sense clusters.

    Parameters
    ----------
    mo_descs, en_descs : dict
        Cluster id -> description dict. Each description must hold a
        ``'gloss_emb'`` vector (``mode='gloss'``) or a ``'centroid'`` vector
        (any other mode).
    mode : str
        ``'gloss'`` compares gloss embeddings; anything else compares centroids.
    similarity_threshold : float
        Minimum cosine similarity for a pair to be listed in ``many_to_one``.

    Returns
    -------
    dict
        ``sim_matrix`` (MO x EN cosine similarities), ``one_to_one`` (optimal
        assignment maximising total similarity), ``many_to_one`` (every pair at
        or above the threshold), plus ``mo_ids`` and ``en_ids`` giving the
        row / column order.
    """
    mo_ids = list(mo_descs)
    en_ids = list(en_descs)

    if not mo_ids or not en_ids:
        # np.vstack raises on an empty list; return an empty alignment instead.
        return {
            'sim_matrix': np.zeros((len(mo_ids), len(en_ids))),
            'one_to_one': [],
            'many_to_one': [],
            'mo_ids': mo_ids,
            'en_ids': en_ids,
        }

    vec_key = 'gloss_emb' if mode == 'gloss' else 'centroid'
    mo_mat = np.vstack([mo_descs[i][vec_key] for i in mo_ids])
    en_mat = np.vstack([en_descs[j][vec_key] for j in en_ids])

    sim = cosine_similarity(mo_mat, en_mat)

    # Hungarian algorithm, asked to maximise similarity directly.
    row_ind, col_ind = linear_sum_assignment(sim, maximize=True)
    one_to_one = [
        {'mo_cluster': mo_ids[r], 'en_cluster': en_ids[c], 'similarity': float(sim[r, c])}
        for r, c in zip(row_ind, col_ind)
    ]

    # Threshold-based links: a cluster may appear in several pairs.
    many_to_one = [
        {'mo_cluster': mo_ids[i], 'en_cluster': en_ids[j], 'similarity': float(sim[i, j])}
        for i, j in np.argwhere(sim >= similarity_threshold)
    ]

    return {'sim_matrix': sim, 'one_to_one': one_to_one, 'many_to_one': many_to_one, 'mo_ids': mo_ids, 'en_ids': en_ids}

# ---------------------------
# VISUALIZATION
# ---------------------------

def visualize_crosslingual(mo_word, en_word, mo_embs, en_embs, mo_labels, en_labels):
    """
    Plot Mongolian and English occurrence embeddings in one 2-D UMAP projection.

    Mongolian points are drawn as circles and English points as triangles, one
    scatter series per induced sense. Nothing is drawn (a message is printed
    instead) when fewer than three points are available or UMAP fails.

    Parameters
    ----------
    mo_word, en_word : str
        Target words, used in the title and legend.
    mo_embs, en_embs : numpy.ndarray
        Occurrence embeddings, one row per occurrence.
    mo_labels, en_labels : sequence
        Cluster label per row of the matching embedding matrix.
    """
    combined = np.vstack([mo_embs, en_embs])
    n_points = combined.shape[0]

    if n_points < 3: # Require at least 3 points for meaningful UMAP and to avoid scipy.linalg.eigh error
        print(f"Not enough points to visualize {mo_word} ↔ {en_word}. Need at least 3, got {combined.shape[0]}.")
        return

    # n_points >= 3 here, so this is always between 2 and 15; no further
    # special-casing of tiny inputs is needed.
    n_neighbors_val = min(15, n_points - 1)

    try:
        reducer = umap.UMAP(random_state=RANDOM_SEED, n_neighbors=n_neighbors_val)
        reduced = reducer.fit_transform(combined)
    except Exception as e:
        print(f"UMAP visualization failed for {mo_word} ↔ {en_word} with {combined.shape[0]} points. Error: {e}. Skipping visualization.")
        return

    n_mo = len(mo_embs)

    plt.figure(figsize=(8,6))
    # Mongolian (circles): rows 0 .. n_mo-1 of the combined matrix
    for lbl in sorted(set(mo_labels)):
        idxs = [i for i in range(n_mo) if mo_labels[i] == lbl]
        if not idxs:
            continue
        pts = reduced[idxs]
        plt.scatter(pts[:,0], pts[:,1], marker='o', label=f'MO {mo_word} S{lbl}')
    # English (triangles): rows n_mo .. end of the combined matrix
    for lbl in sorted(set(en_labels)):
        idxs = [n_mo + i for i in range(len(en_embs)) if en_labels[i] == lbl]
        if not idxs:
            continue
        pts = reduced[idxs]
        plt.scatter(pts[:,0], pts[:,1], marker='^', label=f'EN {en_word} S{lbl}')

    plt.title(f"WSI: {mo_word} ↔ {en_word}")
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.tight_layout()
    plt.show()

# ---------------------------
# POLYSEMY METRICS
# ---------------------------

def induce_k_for_word(embs, k_max=6):
    """
    Choose the number of senses for a word by maximising the silhouette score.

    Parameters
    ----------
    embs : array-like
        Occurrence embeddings, one row per occurrence.
    k_max : int
        Largest number of clusters to try.

    Returns
    -------
    int
        The ``k`` in ``2 .. min(k_max, n - 1)`` with the highest silhouette
        score, or ``1`` when no such ``k`` exists or none can be scored.
    """
    n_samples = len(embs)
    # silhouette_score needs 2 <= n_clusters <= n_samples - 1, so k == n_samples
    # can never be selected; do not spend a KMeans fit on it.
    k_upper = min(k_max, n_samples - 1)
    if k_upper < 2:
        return 1

    from sklearn.metrics import silhouette_score

    best_k = 1
    best_score = -1.0
    for k in range(2, k_upper + 1):
        km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(embs)
        try:
            score = silhouette_score(embs, km.labels_)
        except ValueError:
            # e.g. KMeans collapsed to a single distinct label
            continue
        if score > best_score:
            best_score = score
            best_k = k
    return best_k

def compute_polysemy_for_word(embs):
    """
    Compute polysemy statistics for one word from its occurrence embeddings.

    Parameters
    ----------
    embs : array-like
        Occurrence embeddings, one row per occurrence.

    Returns
    -------
    dict
        Always holds the same keys:

        - ``S`` / ``k``: induced number of senses (two names, same value)
        - ``sizes``: occurrences per sense
        - ``centroid_sim``: mean cosine similarity between *different* sense
          centroids; ``None`` when there is a single sense
        - ``dominance``: share of occurrences in the largest sense; ``None``
          when there are no occurrences
    """
    k = induce_k_for_word(embs, k_max=6)
    if k <= 1:
        n_occ = len(embs)
        return {
            'S': 1,
            'k': 1,
            'sizes': [n_occ],
            'centroid_sim': None,  # no pair of centroids to compare
            'dominance': 1.0 if n_occ else None,
        }

    km = KMeans(n_clusters=k, random_state=RANDOM_SEED).fit(embs)
    sizes = np.bincount(km.labels_, minlength=k)

    # Average only the off-diagonal entries: the diagonal is each centroid's
    # similarity with itself and would inflate the mean.
    sim = cosine_similarity(km.cluster_centers_)
    off_diagonal = ~np.eye(k, dtype=bool)
    centroid_sim = float(sim[off_diagonal].mean())

    dom_prop = float(sizes.max() / sizes.sum())
    return {'S': k, 'k': k, 'sizes': sizes.tolist(), 'centroid_sim': centroid_sim, 'dominance': dom_prop}

# ---------------------------
# HIGH-LEVEL PER-WORD PIPELINE
# ---------------------------

def _embed_contexts(contexts, word):
    """Embed ``word`` in every context; return only the contexts that yielded a vector, with their vectors."""
    kept, vecs = [], []
    for sent in contexts:
        vec = get_contextual_embedding(sent, word)
        if vec is not None:
            kept.append(sent)
            vecs.append(vec)
    embs = np.vstack(vecs) if vecs else np.zeros((0, emb_model.config.hidden_size))
    return kept, embs


def _describe_clusters(contexts, embs, labels, language):
    """Describe every cluster and attach a gloss and its embedding; returns {cluster_id: description}."""
    descs = {}
    for cid in sorted(set(labels)):
        d = describe_sense(contexts, embs, labels, cid)
        d['gloss'] = generate_gloss_openai(d['keywords'], d['example'], language=language)
        d['gloss_emb'] = embed_text_with_xlm(d['gloss'])
        descs[cid] = d
    return descs


def run_wsi_for_pair(mo_word, en_word, mongolian_corpus, english_corpus, desired_pos='NOUN', k=DEFAULT_K):
    """
    Run the full word-sense-induction pipeline for one Mongolian/English word pair.

    Steps: POS-filtered context extraction, contextual embedding, per-language
    clustering (number of clusters chosen by silhouette), sense description and
    glossing, cross-lingual alignment, visualization, polysemy metrics.

    Parameters
    ----------
    mo_word, en_word : str
        Target words.
    mongolian_corpus, english_corpus : iterable of str
        Sentences to search.
    desired_pos : str
        POS label the target must carry in a context.
    k : int
        Accepted for interface compatibility; currently unused because the
        number of clusters is induced from the data.

    Returns
    -------
    dict or None
        Per-pair results, or ``None`` when either language has no usable
        context. ``mo_contexts`` / ``en_contexts`` hold only the sentences that
        produced an embedding, so they are row-aligned with ``mo_embs`` /
        ``en_embs`` and the label arrays.
    """
    print('\n' + '='*60)
    print(f"Processing pair: {mo_word} ↔ {en_word} (POS={desired_pos})")

    # 1) contexts filtered by POS
    mo_contexts = extract_pos_filtered_contexts(mongolian_corpus, mo_word, desired_pos)
    en_contexts = extract_pos_filtered_contexts(english_corpus, en_word, desired_pos)

    print(f"Found {len(mo_contexts)} Mongolian contexts, {len(en_contexts)} English contexts")
    if len(mo_contexts) < 1 or len(en_contexts) < 1:
        print("Not enough contexts; skipping")
        return None

    # 2) embeddings for each occurrence.
    # Contexts without an embedding are dropped together with their sentence:
    # the cluster labels index the embedding rows, so a longer sentence list
    # would make describe_sense pick the wrong sentences.
    mo_contexts, mo_embs = _embed_contexts(mo_contexts, mo_word)
    en_contexts, en_embs = _embed_contexts(en_contexts, en_word)

    if len(mo_embs) == 0 or len(en_embs) == 0:
        print("No valid embeddings found; skipping")
        return None

    # 3) cluster separately (k chosen by silhouette)
    mo_k = induce_k_for_word(mo_embs, k_max=6)
    en_k = induce_k_for_word(en_embs, k_max=6)
    mo_labels, _ = cluster_embeddings(mo_embs, k=mo_k)
    en_labels, _ = cluster_embeddings(en_embs, k=en_k)

    print(f"Mongolian clusters: {mo_k}, English clusters: {en_k}")

    # 4) describe clusters + gloss generation
    mo_descs = _describe_clusters(mo_contexts, mo_embs, mo_labels, language='mn')
    en_descs = _describe_clusters(en_contexts, en_embs, en_labels, language='en')

    # 5) alignment
    alignment = align_clusters(mo_descs, en_descs, mode='gloss', similarity_threshold=SIMILARITY_THRESHOLD)

    # 6) visualization
    visualize_crosslingual(mo_word, en_word, mo_embs, en_embs, mo_labels, en_labels)

    # 7) polysemy metrics
    mo_poly = compute_polysemy_for_word(mo_embs)
    en_poly = compute_polysemy_for_word(en_embs)

    return {
        'mo_word': mo_word,
        'en_word': en_word,
        'mo_contexts': mo_contexts,
        'en_contexts': en_contexts,
        'mo_embs': mo_embs,
        'en_embs': en_embs,
        'mo_labels': mo_labels,
        'en_labels': en_labels,
        'mo_descs': mo_descs,
        'en_descs': en_descs,
        'alignment': alignment,
        'mo_poly': mo_poly,
        'en_poly': en_poly
    }

## Execute

In [None]:

# ---------------------------
# RUN EXAMPLE on toy corpora (replace with your real corpora)
# ---------------------------
if __name__ == '__main__':
    # Toy corpora for a smoke run; swap in real data read from files.
    mongolian_corpus = [
        "Би зам дагуу алхаж байна.",
        "Энэ зам урт.",
        "Манай гэр уулын орой дээр.",
        "Тэр сайхан гэр барьсан.",
        "Би өнөөдөр амарсан.",
        "Амар амгалан орчин.",
        "Тэр сайн хүн.",
        "Хүн өөрөө үнэ цэнтэй.",
        "Намайг ам гэдэг.",
        "Ам нь жижиг.",
    ]

    english_corpus = [
        "The road is long.",
        "I walked along the road.",
        "Our home is warm.",
        "He built a beautiful home.",
        "I rested today.",
        "A peaceful rest is important.",
        "She is a good person.",
        "Every person has value.",
        "His mouth is small.",
        "Open your mouth.",
    ]

    # One pipeline run per word pair; pairs that yield nothing are left out.
    # POS is fixed to NOUN here - pass e.g. 'VERB' to study other readings.
    results = {}
    for mo_word, en_word in TARGET_PAIRS:
        res = run_wsi_for_pair(mo_word, en_word, mongolian_corpus, english_corpus, desired_pos='NOUN')
        if not res:
            continue
        results[f"{mo_word}_{en_word}"] = res

    # Persist only the JSON-friendly part of every result.
    with open('wsi_results_sample.json', 'w', encoding='utf8') as f:
        summary = {
            pair_key: {
                'mo_word': pair_res['mo_word'],
                'en_word': pair_res['en_word'],
                'mo_poly': pair_res['mo_poly'],
                'en_poly': pair_res['en_poly'],
                'alignment_one_to_one': pair_res['alignment']['one_to_one'],
            }
            for pair_key, pair_res in results.items()
        }
        json.dump(summary, f, ensure_ascii=False, indent=2)

    print('\nSaved sample results to wsi_results_sample.json')

# ---------------------------
# NOTES / NEXT STEPS
# ---------------------------
# - Replace small example corpora with your real parallel Bible files (aligned by line).
# - If you want to enable OpenAI glosses, set USE_OPENAI=True and set OPENAI_API_KEY in the environment.
# - Consider adding lemmatization (stanza) if you want to extract lemma-level contexts rather than surface forms.
# - For large corpora, consider batching calls to embedding model and pos model for speed.
# - To perform full-scale experiments, save per-word results (embeddings, labels, glosses) and run the polysemy statistics module on the sampled word pairs.



Processing pair: зам ↔ road (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Processing pair: гэр ↔ home (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Processing pair: амар ↔ rest (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Processing pair: сайн ↔ good (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Processing pair: хүн ↔ person (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Processing pair: ам ↔ mouth (POS=NOUN)
Found 0 Mongolian contexts, 0 English contexts
Not enough contexts; skipping

Saved sample results to wsi_results_sample.json


In [None]:
# ==============================================
# Generate automatic “sense descriptions”
# ==============================================

from collections import Counter
import numpy as np

def describe_sense(sentences, embeddings, labels, cluster_id, top_k=5):
    """
    Returns keywords and representative sentence for a sense cluster.

    Parameters
    ----------
    sentences : list of str
        Context sentences, aligned index-by-index with ``embeddings`` and ``labels``.
    embeddings : numpy.ndarray
        One row per sentence.
    labels : sequence
        Cluster label of every sentence.
    cluster_id
        Label of the cluster to describe.
    top_k : int
        Number of keywords to keep.

    Returns
    -------
    dict
        ``cluster_id``, ``keywords``, ``example``, ``sentences`` and
        ``centroid``. The centroid is included because this definition
        replaces any earlier ``describe_sense`` in the session, and
        ``align_clusters`` reads ``'centroid'`` when it is not in gloss mode.
    """
    # Extract cluster sentences
    idx = [i for i, lbl in enumerate(labels) if lbl == cluster_id]
    cluster_sents = [sentences[i] for i in idx]
    cluster_embs  = embeddings[idx]

    # 1. Compute centroid
    centroid = cluster_embs.mean(axis=0)

    # 2. Find most typical (closest) sentence
    sims = cosine_similarity([centroid], cluster_embs)[0]
    representative_sentence = cluster_sents[int(np.argmax(sims))]

    # 3. Extract keywords (very simple tokenizer: drop punctuation, split on whitespace)
    word_counts = Counter()
    for s in cluster_sents:
        clean = re.sub(r"[^\w\s]", "", s)
        word_counts.update(clean.lower().split())

    most_common = [w for w, _ in word_counts.most_common(top_k)]

    return {
        "cluster_id": cluster_id,
        "keywords": most_common,
        "example": representative_sentence,
        "sentences": cluster_sents,
        "centroid": centroid
    }


# -------------------------------
# Visualization (combined)
# -------------------------------
def visualize_crosslingual(word_mo, word_en,
                           mo_embeddings, en_embeddings,
                           mo_labels, en_labels):
    """
    Plot Mongolian and English occurrence embeddings in one 2-D UMAP projection.

    Mongolian points are circles and English points are triangles, one scatter
    series per sense label. With fewer than three points, or when UMAP fails,
    a message is printed and nothing is drawn.

    Parameters
    ----------
    word_mo, word_en : str
        Target words, used in the title and legend.
    mo_embeddings, en_embeddings : numpy.ndarray
        Occurrence embeddings, one row per occurrence.
    mo_labels, en_labels : sequence
        Cluster label per row of the matching embedding matrix.
    """
    combined = np.vstack([mo_embeddings, en_embeddings])
    n_points = combined.shape[0]

    # This definition replaces any earlier visualize_crosslingual in the
    # session, so it must cope with tiny inputs on its own.
    if n_points < 3:
        print(f"Not enough points to visualize {word_mo} ↔ {word_en}. Need at least 3, got {n_points}.")
        return

    try:
        # Keep UMAP's default of 15 neighbours, capped for small samples.
        reducer = umap.UMAP(random_state=42, n_neighbors=min(15, n_points - 1))
        reduced = reducer.fit_transform(combined)
    except Exception as err:
        print(f"UMAP visualization failed for {word_mo} ↔ {word_en} with {n_points} points. Error: {err}. Skipping visualization.")
        return

    n_mo = len(mo_embeddings)

    plt.figure(figsize=(7,6))

    # Mongolian points (circles): rows 0 .. n_mo-1 of the combined matrix
    for lbl in sorted(set(mo_labels)):
        idx = [i for i in range(n_mo) if mo_labels[i] == lbl]
        plt.scatter(reduced[idx,0], reduced[idx,1],
                    marker='o', label=f"MO {word_mo} - Sense {lbl}")

    # English points (triangles): rows n_mo .. end of the combined matrix
    for lbl in sorted(set(en_labels)):
        idx = [i+n_mo for i in range(len(en_embeddings)) if en_labels[i] == lbl]
        plt.scatter(reduced[idx,0], reduced[idx,1],
                    marker='^', label=f"EN {word_en} - Sense {lbl}")

    plt.title(f"Cross-Lingual WSI: '{word_mo}' ↔ '{word_en}'")
    plt.legend()
    plt.show()


# -------------------------------
# RUN CROSS-LINGUAL WSI
# -------------------------------
# Runs a fixed two-cluster WSI for every word pair and prints a greedy alignment.
# NOTE(review): `targets`, `collect_embeddings`, `cluster_senses` and
# `compare_clusters` are not defined in the visible part of this notebook -
# confirm they exist in the session before running this cell.
# `mongolian_corpus` / `english_corpus` are the lists built in the example-run cell.
for mo_word, en_word in targets:

    print("\n=======================================")
    print(f"WORD PAIR: {mo_word} ↔ {en_word}")
    print("=======================================")

    # 1. Collect embeddings
    # presumably returns (sentences, embeddings) for the given word - TODO confirm
    mo_sents, mo_embs = collect_embeddings(mongolian_corpus, mo_word)
    en_sents, en_embs = collect_embeddings(english_corpus, en_word)

    # Skip the pair unless both languages produced at least one embedding.
    if len(mo_embs) == 0 or len(en_embs) == 0:
        print("Not enough contexts. Skipping.")
        continue

    # 2. Cluster separately
    # k is hard-coded to 2 for both languages.
    # NOTE(review): a word with a single embedding passes the check above;
    # verify cluster_senses copes with fewer samples than clusters.
    mo_kmeans, mo_labels = cluster_senses(mo_embs, k=2)
    en_kmeans, en_labels = cluster_senses(en_embs, k=2)

    # 3. Compute cluster similarity
    # presumably a (MO clusters x EN clusters) similarity matrix - TODO confirm
    sim = compare_clusters(mo_kmeans.cluster_centers_,
                           en_kmeans.cluster_centers_)

    print("\nCosine similarity between senses (MO x EN):")
    print(sim)

    # 4. Show which Mongolian sense matches which English sense
    # Row-wise argmax: each Mongolian sense takes its best English sense
    # independently, so two Mongolian senses may pick the same English sense.
    best_align = sim.argmax(axis=1)
    for i, j in enumerate(best_align):
        print(f" Mongolian Sense {i} ↔ English Sense {j} (similarity={sim[i,j]:.3f})")

    # 5. Visualize combined embeddings
    visualize_crosslingual(mo_word, en_word, mo_embs, en_embs, mo_labels, en_labels)
