## EmbeddingLoader

In [2]:
from vector_mapping import EmbeddingLoader

# Where the pretrained word2vec files live, relative to the notebook.
EMBEDDING_PATH_ROOT = "./pretrained_word2vec/"
EN_EMBEDDING = "en/GoogleNews-vectors-negative300.bin.gz"
ZH_EMBEDDING = "zh/sgns.merge.word.bz2"

# Passed as `max_vocab` to both loads below.
MAX_VOCAB = 10000

# The loader API is stateful, so each language gets its own instance.
# English: build the loader, read the file, then expose the embeddings dict
# consumed by the helper functions defined later in the notebook.
en_loader = EmbeddingLoader()
en_loader.load_word2vec(
    filepath=EMBEDDING_PATH_ROOT + EN_EMBEDDING,
    max_vocab=MAX_VOCAB,
)
en_emb = en_loader.get_embeddings()

# Chinese: same three steps on a separate loader.
zh_loader = EmbeddingLoader()
zh_loader.load_word2vec(
    filepath=EMBEDDING_PATH_ROOT + ZH_EMBEDDING,
    max_vocab=MAX_VOCAB,
)
zh_emb = zh_loader.get_embeddings()

# Report how many words each loader ended up with.
print(f"English vocabulary: {len(en_loader)} words")
print(f"Chinese vocabulary: {len(zh_loader)} words")

English vocabulary: 10000 words
Chinese vocabulary: 10000 words


In [3]:
import numpy as np

def get_word_embedding(word, embeddings):
    """Look up the vector stored for ``word``.

    Args:
        word: Key to look up.
        embeddings: Mapping from words to their embedding vectors.

    Returns:
        The value stored under ``word`` in ``embeddings``.

    Raises:
        KeyError: If ``word`` is not a key of ``embeddings``.
    """
    if word in embeddings:
        return embeddings[word]
    raise KeyError(f"Word '{word}' not found in embeddings")


def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two vectors.

    Args:
        vec1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        vec2: Second vector, compatible with ``vec1`` for ``np.dot``.

    Returns:
        ``dot(vec1, vec2) / (||vec1|| * ||vec2||)``, or the Python float
        ``0.0`` when either vector has zero L2 norm.
    """
    numerator = np.dot(vec1, vec2)
    length_a = np.linalg.norm(vec1)
    length_b = np.linalg.norm(vec2)

    # Only divide when both lengths are non-zero; a zero-length vector has
    # no direction, so the similarity is reported as 0.0 instead.
    if length_a != 0 and length_b != 0:
        return numerator / (length_a * length_b)
    return 0.0

In [4]:
# Cross-lingual probe: one vector from the Chinese embeddings, one from the
# English embeddings.
vec_zh_one = get_word_embedding("一", zh_emb)
vec_en_one = get_word_embedding("one", en_emb)
similarity = cosine_similarity(vec_zh_one, vec_en_one)

print(similarity)

0.11028192


In [5]:
# Same-language probe: both vectors come from the Chinese embeddings.
vec_zh_first = get_word_embedding("一", zh_emb)
vec_zh_second = get_word_embedding("一个", zh_emb)
similarity = cosine_similarity(vec_zh_first, vec_zh_second)

print(similarity)

0.684622


In [6]:
# Same-language probe: both vectors come from the English embeddings.
vec_en_first = get_word_embedding("one", en_emb)
vec_en_second = get_word_embedding("single", en_emb)
similarity = cosine_similarity(vec_en_first, vec_en_second)

print(similarity)

0.4903487


In [7]:
# --- Membership tests via the `in` operator on a loader ---
print("Word existence checks:")
for probe_word in ('cat', 'dog', 'asdfghjkl'):
    print(f"  '{probe_word}' in en_loader: {probe_word in en_loader}")
print()

# --- Fetch one vector directly from the loader ---
print("Get single word embedding:")
cat_vec = en_loader.get_embedding('cat')
print(f"  'cat' embedding shape: {cat_vec.shape}")
print(f"  'cat' embedding (first 10 dims): {cat_vec[:10]}")
print()

# --- Vocabulary access; en_vocab / zh_vocab are reused by later cells ---
print("Vocabulary operations:")
en_vocab = en_loader.get_vocabulary()
zh_vocab = zh_loader.get_vocabulary()
en_vocab_head = list(en_vocab)[:10]
zh_vocab_head = list(zh_vocab)[:10]
print(f"  English vocab sample: {en_vocab_head}")
print(f"  Chinese vocab sample: {zh_vocab_head}")
print()

# --- load_word2vec(...) is chained straight into filter_vocabulary(...) ---
print("Method chaining example:")
words_to_keep = {'cat', 'dog', 'house', 'tree', 'car'}
test_loader = EmbeddingLoader()
test_loader.load_word2vec(
    filepath=EMBEDDING_PATH_ROOT + EN_EMBEDDING,
    max_vocab=1000,
).filter_vocabulary(words_to_keep)
print(f"  Filtered vocab size: {len(test_loader)}")
print(f"  Filtered words: {test_loader.get_vocabulary()}")

Word existence checks:
  'cat' in en_loader: True
  'dog' in en_loader: True
  'asdfghjkl' in en_loader: False

Get single word embedding:
  'cat' embedding shape: (300,)
  'cat' embedding (first 10 dims): [ 0.0123291   0.20410156 -0.28515625  0.21679688  0.11816406  0.08300781
  0.04980469 -0.00952148  0.22070312 -0.12597656]

Vocabulary operations:
  English vocab sample: ['misconduct', 'ancient', 'fluid', 'speeding', 'Huskies', 'Iraqis', 'road', 'disappeared', 'hearing', 'announce']
  Chinese vocab sample: ['想到', '旋律', '2009', '手动', '兑现', '恶化', '火电', '鼓励', '加深', '保']

Method chaining example:
  Filtered vocab size: 2
  Filtered words: {'house', 'car'}


## DictionaryParser

In [None]:
from vector_mapping import DictionaryParser

# Read the MUSE zh-en dictionary, leaving out traditional-Chinese entries.
parser = DictionaryParser()
parser.parse_muse_format(
    './dictionaries/muse-zh-en.txt',
    include_traditional=False,
)
total_pair_count = len(parser.get_pairs())
print(f"Total dictionary pairs: {total_pair_count}")

# Restrict the dictionary using the two loaded vocabularies (Chinese first,
# English second), then report what is left.
parser.filter_by_vocabulary(zh_vocab, en_vocab)
remaining_pairs = parser.get_pairs()
print(f"Filtered pairs (words with embeddings): {len(remaining_pairs)}")

# Peek at the first few remaining pairs.
print(f"Sample pairs: {remaining_pairs[:10]}")

Total dictionary pairs: 21597
Filtered pairs (words with embeddings): 5345
Sample pairs: [('年', 'year'), ('月', 'moon'), ('月', 'months'), ('月', 'month'), ('日', 'day'), ('村', 'village'), ('人', 'man'), ('人', 'people'), ('%', '%'), ('大', 'big')]


## Cross-Lingual Translation Pair Similarities (Unmapped Embeddings)

In [14]:
# Use translation pairs from the loaded dictionary parser
pairs = parser.get_pairs()

# Keep the printed table short: evaluate only the first `sample_size` pairs
# as returned by the parser (a head slice, not a random sample).
sample_size = min(10, len(pairs))
sampled_pairs = pairs[:sample_size]

print(f"Evaluating {sample_size} translation pairs from dictionary:\n")

similarities = []

for zh_word, en_word in sampled_pairs:
    # Only the two lookups can raise the KeyError we intend to skip on, so
    # the try block is limited to them.
    try:
        zh_vec = get_word_embedding(zh_word, zh_emb)
        en_vec = get_word_embedding(en_word, en_emb)
    except KeyError:
        # One side of the pair has no embedding; leave it out of the stats.
        continue

    sim = cosine_similarity(zh_vec, en_vec)
    similarities.append(sim)
    print(f"{zh_word:8s} <-> {en_word:20s} : {sim:.4f}")

# Every evaluated pair contributes exactly one similarity value.
found_count = len(similarities)

print("\nResults:")
print(f"  Pairs evaluated: {found_count}/{sample_size}")
if similarities:
    print(f"  Average similarity: {np.mean(similarities):.4f}")
    print(f"  Std deviation: {np.std(similarities):.4f}")
    print(f"  Min similarity: {np.min(similarities):.4f}")
    print(f"  Max similarity: {np.max(similarities):.4f}")
else:
    # np.min / np.max raise ValueError on an empty sequence and np.mean
    # yields nan with a warning, so report the empty case explicitly.
    print("  No pairs could be evaluated; similarity statistics unavailable.")

Evaluating 10 translation pairs from dictionary:

年        <-> year                 : -0.0347
月        <-> moon                 : 0.0117
月        <-> months               : -0.0639
月        <-> month                : 0.0526
日        <-> day                  : 0.0688
村        <-> village              : -0.0393
人        <-> man                  : -0.0390
人        <-> people               : 0.0672
%        <-> %                    : -0.0239
大        <-> big                  : -0.0061

Results:
  Pairs evaluated: 10/10
  Average similarity: -0.0007
  Std deviation: 0.0459
  Min similarity: -0.0639
  Max similarity: 0.0688
