## EmbeddingLoader

In [1]:
from vector_mapping import EmbeddingLoader

# Locations of the pretrained embedding files, relative to the notebook.
EMBEDDING_PATH_ROOT = "./pretrained_word2vec/"
EN_EMBEDDING = "en/GoogleNews-vectors-negative300.bin.gz"
ZH_EMBEDDING = "zh/sgns.merge.word.bz2"

# Vocabulary cap passed to every load_word2vec call below.
MAX_VOCAB = 10000

en_embedding_path = EMBEDDING_PATH_ROOT + EN_EMBEDDING
zh_embedding_path = EMBEDDING_PATH_ROOT + ZH_EMBEDDING

# English: a dedicated loader instance, since the loader API is stateful.
en_loader = EmbeddingLoader()
en_loader.load_word2vec(filepath=en_embedding_path, max_vocab=MAX_VOCAB)
en_emb = en_loader.get_embeddings()

# Chinese: a second, separate loader instance.
zh_loader = EmbeddingLoader()
zh_loader.load_word2vec(filepath=zh_embedding_path, max_vocab=MAX_VOCAB)
zh_emb = zh_loader.get_embeddings()

# en_emb / zh_emb are the embedding mappings consumed by the helper functions
# defined later in the notebook; report how many words each loader holds.
print(f"English vocabulary: {len(en_loader)} words")
print(f"Chinese vocabulary: {len(zh_loader)} words")

English vocabulary: 10000 words
Chinese vocabulary: 10000 words


In [2]:
import numpy as np

def get_word_embedding(word, embeddings):
    """
    Look up the vector stored for `word` in `embeddings`.

    Args:
        word: Key to look up.
        embeddings: Mapping of {word: vector}.

    Returns:
        The value stored under `word`.

    Raises:
        KeyError: If `word` is not a key of `embeddings`.
    """
    if word in embeddings:
        return embeddings[word]
    raise KeyError(f"Word '{word}' not found in embeddings")


def cosine_similarity(vec1, vec2):
    """
    Cosine of the angle between two vectors.

    Args:
        vec1: First vector (array-like accepted by numpy).
        vec2: Second vector (array-like accepted by numpy).

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||), or 0.0 when either vector
        has zero L2 norm.
    """
    numerator = np.dot(vec1, vec2)
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)

    # Only divide when both norms are non-zero; otherwise the similarity is
    # defined here as 0.0 to avoid a division by zero.
    if length_a != 0 and length_b != 0:
        return numerator / (length_a * length_b)
    return 0.0

In [3]:
# Cross-lingual check on the unaligned spaces: Chinese "一" vs. English "one".
zh_one_vec = get_word_embedding("一", zh_emb)
en_one_vec = get_word_embedding("one", en_emb)
similarity = cosine_similarity(zh_one_vec, en_one_vec)

print(similarity)

0.11028192


In [4]:
# Monolingual check inside the Chinese space: "一" vs. "一个".
zh_yi_vec = get_word_embedding("一", zh_emb)
zh_yige_vec = get_word_embedding("一个", zh_emb)
similarity = cosine_similarity(zh_yi_vec, zh_yige_vec)

print(similarity)

0.684622


In [5]:
# Monolingual check inside the English space: "one" vs. "single".
en_one_vec = get_word_embedding("one", en_emb)
en_single_vec = get_word_embedding("single", en_emb)
similarity = cosine_similarity(en_one_vec, en_single_vec)

print(similarity)

0.4903487


In [6]:
# Membership tests use the loader's support for the `in` operator.
print("Word existence checks:")
for probe in ('cat', 'dog', 'asdfghjkl'):
    print(f"  '{probe}' in en_loader: {probe in en_loader}")
print()

# Fetch one vector directly from the loader.
print("Get single word embedding:")
cat_vec = en_loader.get_embedding('cat')
print(f"  'cat' embedding shape: {cat_vec.shape}")
print(f"  'cat' embedding (first 10 dims): {cat_vec[:10]}")
print()

# en_vocab / zh_vocab are reused later to filter the dictionary pairs.
print("Vocabulary operations:")
en_vocab = en_loader.get_vocabulary()
zh_vocab = zh_loader.get_vocabulary()
for language, vocab in (("English", en_vocab), ("Chinese", zh_vocab)):
    print(f"  {language} vocab sample: {list(vocab)[:10]}")
print()

# load_word2vec returns an object that filter_vocabulary can be called on;
# the result is then inspected through test_loader.
print("Method chaining example:")
test_loader = EmbeddingLoader()
words_to_keep = {'cat', 'dog', 'house', 'tree', 'car'}
loaded = test_loader.load_word2vec(
    filepath=EMBEDDING_PATH_ROOT+EN_EMBEDDING,
    max_vocab=1000
)
loaded.filter_vocabulary(words_to_keep)
print(f"  Filtered vocab size: {len(test_loader)}")
print(f"  Filtered words: {test_loader.get_vocabulary()}")

Word existence checks:
  'cat' in en_loader: True
  'dog' in en_loader: True
  'asdfghjkl' in en_loader: False

Get single word embedding:
  'cat' embedding shape: (300,)
  'cat' embedding (first 10 dims): [ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656]

Vocabulary operations:
  English vocab sample: ['delivers', 'Jerry', 'scheduled', 'South', 'Venezuelan', 'detection', 'Wi_Fi', 'Elsewhere', 'colorful', 'readers']
  Chinese vocab sample: ['花钱', '修正主义', '日中', '未经', '大奖赛', '骑兵', '记者', '率领', '经济体', '均价']

Method chaining example:
  Filtered vocab size: 2
  Filtered words: {'car', 'house'}


## DictionaryParser

In [7]:
from vector_mapping import DictionaryParser

# Parse the MUSE-format dictionary, simplified Chinese only
# (include_traditional=False).
parser = DictionaryParser()
parser.parse_muse_format('./dictionaries/cedict_processed.txt', include_traditional=False)
total_pair_count = len(parser.get_pairs())
print(f"Total dictionary pairs: {total_pair_count}")

# Narrow the pairs using the Chinese and English vocabularies obtained from
# the embedding loaders.
parser.filter_by_vocabulary(zh_vocab, en_vocab)
usable_pair_count = len(parser.get_pairs())
print(f"Filtered pairs (words with embeddings): {usable_pair_count}")

# Preview the first ten remaining pairs.
preview_pairs = parser.get_pairs()[:10]
print(f"Sample pairs: {preview_pairs}")

Total dictionary pairs: 9197
Filtered pairs (words with embeddings): 516
Sample pairs: [('一些', 'some'), ('一代', 'generation'), ('一共', 'altogether'), ('一再', 'repeatedly'), ('一半', 'half'), ('一对', 'couple'), ('一带', 'region'), ('丈夫', 'husband'), ('下列', 'following'), ('不想', 'unexpectedly')]


  parser.filter_by_vocabulary(zh_vocab, en_vocab)


## Cross-Lingual Translation Pair Similarities (Unmapped Embeddings)

In [8]:
# Translation pairs come from the dictionary parser loaded earlier.
pairs = parser.get_pairs()

# Evaluate only the first few pairs. This is a deterministic prefix of the
# pair list, not a random sample.
sample_size = min(10, len(pairs))
sampled_pairs = pairs[:sample_size]

print(f"Evaluating {sample_size} translation pairs from dictionary:\n")

# `similarities` is kept as a notebook-level list so later cells can compare
# against these baseline (unaligned) scores.
similarities = []
found_count = 0

for zh_word, en_word in sampled_pairs:
    # Only the lookups can raise KeyError, so keep the try block that narrow.
    try:
        zh_vec = get_word_embedding(zh_word, zh_emb)
        en_vec = get_word_embedding(en_word, en_emb)
    except KeyError:
        # Skip pairs where either word has no embedding.
        continue

    sim = cosine_similarity(zh_vec, en_vec)
    similarities.append(sim)
    found_count += 1

    print(f"{zh_word:8s} <-> {en_word:20s} : {sim:.4f}")

print("\nResults:")
print(f"  Pairs evaluated: {found_count}/{sample_size}")
if similarities:
    print(f"  Average similarity: {np.mean(similarities):.4f}")
    print(f"  Std deviation: {np.std(similarities):.4f}")
    print(f"  Min similarity: {np.min(similarities):.4f}")
    print(f"  Max similarity: {np.max(similarities):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence and np.mean /
    # np.std return nan with a warning, so report the situation explicitly.
    print("  No pairs could be evaluated; similarity statistics are unavailable.")

Evaluating 10 translation pairs from dictionary:

一些       <-> some                 : -0.0681
一代       <-> generation           : 0.1055
一共       <-> altogether           : -0.0749
一再       <-> repeatedly           : -0.0315
一半       <-> half                 : -0.0861
一对       <-> couple               : -0.0835
一带       <-> region               : 0.0474
丈夫       <-> husband              : 0.0450
下列       <-> following            : 0.1474
不想       <-> unexpectedly         : 0.0292

Results:
  Pairs evaluated: 10/10
  Average similarity: 0.0030
  Std deviation: 0.0797
  Min similarity: -0.0861
  Max similarity: 0.1474


## Cross-Lingual Translation Pair Similarities (Mapped Embeddings)

In [9]:
# Path (under EMBEDDING_PATH_ROOT) of the aligned Chinese embedding file.
ALIGNED_ZH_EMBEDDING = "zh-aligned/merge-google/20251023-023520/sgns.merge.bin.gz"
aligned_zh_path = EMBEDDING_PATH_ROOT + ALIGNED_ZH_EMBEDDING

# A fresh loader instance, capped at the same MAX_VOCAB as the other loaders.
zh_aligned_loader = EmbeddingLoader()
zh_aligned_loader.load_word2vec(filepath=aligned_zh_path, max_vocab=MAX_VOCAB)

# Embedding mapping used by the comparison cells that follow.
zh_aligned_emb = zh_aligned_loader.get_embeddings()

aligned_vocab_size = len(zh_aligned_loader)
first_aligned_words = list(zh_aligned_emb.keys())[:10]
print(f"Aligned Chinese vocabulary: {aligned_vocab_size} words")
print(f"Sample words: {first_aligned_words}")

Aligned Chinese vocabulary: 10000 words
Sample words: ['，', '的', '。', '、', '在', '和', '：', '了', '是', '”']


### Direct Comparison: Original vs Aligned Chinese Embeddings

Now let's compare how well Chinese words match their English translations **before** and **after** alignment.

In [10]:
# Hand-picked (Chinese, English) pairs to compare before and after alignment.
test_pairs = [
    ('一', 'one'),
    ('猫', 'cat'),
    ('狗', 'dog'),
    ('中国', 'China'),
    ('美国', 'USA')
]

table_header = f"{'Chinese':<10} {'English':<15} {'Original':<12} {'Aligned':<12} {'Improvement':<12}"

print("Comparison: Original vs Aligned Chinese Embeddings\n")
print(table_header)
print("-" * 70)

for zh_word, en_word in test_pairs:
    try:
        # Look up the Chinese word in both spaces and the English word once.
        zh_orig_vec = get_word_embedding(zh_word, zh_emb)
        zh_aligned_vec = get_word_embedding(zh_word, zh_aligned_emb)
        en_vec = get_word_embedding(en_word, en_emb)

        # Score each Chinese vector against the same English vector.
        score_before = cosine_similarity(zh_orig_vec, en_vec)
        score_after = cosine_similarity(zh_aligned_vec, en_vec)
        gain = score_after - score_before

        row = f"{zh_word:<10} {en_word:<15} {score_before:>11.4f} {score_after:>11.4f} {gain:>+11.4f}"
        print(row)

    except KeyError:
        # At least one of the three lookups failed; show a placeholder row.
        print(f"{zh_word:<10} {en_word:<15} {'NOT FOUND':^36}")

Comparison: Original vs Aligned Chinese Embeddings

Chinese    English         Original     Aligned      Improvement 
----------------------------------------------------------------------
一          one                  0.1103      0.3351     +0.2248
猫          cat                 -0.0078      0.6396     +0.6474
狗          dog                 -0.0301      0.5386     +0.5687
中国         China               -0.0480      0.4635     +0.5115
美国         USA                  0.0400      0.3502     +0.3102


### Translation Pairs Evaluation (Aligned Embeddings)

Let's evaluate the same 10 dictionary translation pairs, but now using the **aligned** Chinese embeddings.

In [11]:
# Re-evaluate the same dictionary pairs as the unaligned run, now looking the
# Chinese word up in the aligned embeddings.
sampled_pairs = pairs[:sample_size]

# A single backslash is required here: "\\n" would print a literal "\n".
print(f"Evaluating {sample_size} translation pairs with ALIGNED Chinese embeddings:\n")

aligned_similarities = []
found_count = 0

for zh_word, en_word in sampled_pairs:
    # Only the lookups can raise KeyError, so keep the try block that narrow.
    try:
        zh_aligned_vec = get_word_embedding(zh_word, zh_aligned_emb)
        en_vec = get_word_embedding(en_word, en_emb)
    except KeyError:
        # Skip pairs where either word has no embedding.
        continue

    sim = cosine_similarity(zh_aligned_vec, en_vec)
    aligned_similarities.append(sim)
    found_count += 1

    print(f"{zh_word:8s} <-> {en_word:20s} : {sim:.4f}")

print("\nResults (Aligned):")
print(f"  Pairs evaluated: {found_count}/{sample_size}")
if aligned_similarities:
    print(f"  Average similarity: {np.mean(aligned_similarities):.4f}")
    print(f"  Std deviation: {np.std(aligned_similarities):.4f}")
    print(f"  Min similarity: {np.min(aligned_similarities):.4f}")
    print(f"  Max similarity: {np.max(aligned_similarities):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence and np.mean /
    # np.std return nan with a warning, so report the situation explicitly.
    print("  No pairs could be evaluated; similarity statistics are unavailable.")

print(f"\n{'='*60}")
print("COMPARISON: Original vs Aligned")
print(f"{'='*60}")
if similarities and aligned_similarities:
    # Compute each mean once and reuse it for every summary line.
    original_mean = np.mean(similarities)
    aligned_mean = np.mean(aligned_similarities)
    # NOTE: the ratio is unstable when the baseline mean is close to zero (or
    # negative); the absolute improvement is the more reliable figure.
    improvement_ratio = aligned_mean / original_mean if original_mean != 0 else float('inf')

    print(f"Original average similarity: {original_mean:.4f}")
    print(f"Aligned average similarity:  {aligned_mean:.4f}")
    print(f"Improvement:                 {aligned_mean - original_mean:+.4f}")
    print(f"Improvement ratio:           {improvement_ratio:.1f}x")
else:
    print("Comparison unavailable: at least one run evaluated no pairs.")

Evaluating 10 translation pairs with ALIGNED Chinese embeddings:\n
一些       <-> some                 : 0.5504
一代       <-> generation           : 0.5193
一共       <-> altogether           : 0.0627
一再       <-> repeatedly           : 0.4567
一半       <-> half                 : 0.3925
一对       <-> couple               : 0.3132
一带       <-> region               : 0.4354
丈夫       <-> husband              : 0.6911
下列       <-> following            : 0.1347
不想       <-> unexpectedly         : 0.0919

Results (Aligned):
  Pairs evaluated: 10/10
  Average similarity: 0.3648
  Std deviation: 0.2001
  Min similarity: 0.0627
  Max similarity: 0.6911

COMPARISON: Original vs Aligned
Original average similarity: 0.0030
Aligned average similarity:  0.3648
Improvement:                 +0.3618
Improvement ratio:           119.8x


### Cross-Lingual Nearest Neighbors

Now let's find the nearest neighbors across languages in the shared embedding space. We'll search for:
1. **Chinese → English**: Find English words similar to Chinese concepts
2. **English → Chinese**: Find Chinese words similar to English concepts

In [12]:
def find_nearest_neighbors(query_vec, target_embeddings, top_k=5):
    """
    Rank every entry of `target_embeddings` by cosine similarity to
    `query_vec` and return the best `top_k`.

    Args:
        query_vec: Query embedding vector
        target_embeddings: Dict of {word: vector} to search in
        top_k: Number of neighbors to return

    Returns:
        List of (word, similarity) tuples, highest similarity first
    """
    # Score every candidate against the query (exhaustive scan).
    scored = [
        (candidate, cosine_similarity(query_vec, candidate_vec))
        for candidate, candidate_vec in target_embeddings.items()
    ]

    # Highest similarity first, then keep the leading top_k entries.
    ranked = sorted(scored, key=lambda entry: entry[1], reverse=True)
    return ranked[:top_k]


banner_rule = "=" * 70

# Chinese → English: query with aligned Chinese vectors, search the English space.
print(banner_rule)
print("CHINESE → ENGLISH (Find English words similar to Chinese concepts)")
print(banner_rule)

zh_test_words = ['猫', '狗', '中国', '学习', '美国']

for zh_word in zh_test_words:
    if zh_word not in zh_aligned_emb:
        print(f"\n'{zh_word}' not found in aligned embeddings")
        continue

    zh_vec = zh_aligned_emb[zh_word]
    neighbors = find_nearest_neighbors(zh_vec, en_emb, top_k=5)

    print(f"\n'{zh_word}' → English neighbors:")
    for neighbor, score in neighbors:
        print(f"  {neighbor:20s} ({score:.4f})")


# English → Chinese: query with English vectors, search the aligned Chinese space.
print("\n" + banner_rule)
print("ENGLISH → CHINESE (Find Chinese words similar to English concepts)")
print(banner_rule)

en_test_words = ['cat', 'dog', 'China', 'learning', 'USA']

for en_word in en_test_words:
    if en_word not in en_emb:
        print(f"\n'{en_word}' not found in English embeddings")
        continue

    en_vec = en_emb[en_word]
    neighbors = find_nearest_neighbors(en_vec, zh_aligned_emb, top_k=5)

    print(f"\n'{en_word}' → Chinese neighbors:")
    for neighbor, score in neighbors:
        print(f"  {neighbor:20s} ({score:.4f})")

CHINESE → ENGLISH (Find English words similar to Chinese concepts)

'猫' → English neighbors:
  cat                  (0.6396)
  dog                  (0.5551)
  cats                 (0.5527)
  pet                  (0.5153)
  dogs                 (0.4624)

'狗' → English neighbors:
  dog                  (0.5386)
  cat                  (0.5008)
  dogs                 (0.4504)
  pet                  (0.4393)
  animal               (0.4348)

'中国' → English neighbors:
  China                (0.4635)
  United_States        (0.4589)
  Chinese              (0.3998)
  India                (0.3960)
  world                (0.3905)

'学习' → English neighbors:
  learning             (0.4125)
  teaching             (0.4047)
  teach                (0.3978)
  math                 (0.3599)
  taught               (0.3501)

'美国' → English neighbors:
  United_States        (0.5104)
  U.S.                 (0.4505)
  America              (0.4227)
  Canada               (0.3988)
  California           (0.3806)
