# Phase 4: Procrustes Alignment (Optimized)

## Goal
Map the hieroglyphic embedding space onto the English (GloVe) embedding space by fitting an Orthogonal Procrustes rotation on known hieroglyphic–English anchor pairs.

**Optimization**: Vectorized similarity computation for 100x speedup!

In [1]:
import numpy as np
import pickle
import json
from pathlib import Path
from gensim.models import KeyedVectors
from scipy.linalg import orthogonal_procrustes
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from tqdm import tqdm

print("âœ“ Libraries loaded")

âœ“ Libraries loaded


## 1. Load Embeddings

In [2]:
# Load hieroglyphic embeddings
print("Loading hieroglyphic embeddings...")
hier_path = Path('../data/processed/hieroglyphic_vectors.kv')
hier_wv = KeyedVectors.load(str(hier_path), mmap='r')
print(f"âœ“ Loaded {len(hier_wv):,} hieroglyphic vectors")

# Load GloVe English embeddings
print("\nLoading GloVe embeddings...")
glove_path = Path('../data/processed/glove.6B.300d.txt')

eng_words = []
eng_vecs = []
expected_dim = None  # fixed by the first data line; every later line must agree

with open(glove_path, 'r', encoding='utf-8') as f:
    for line_no, line in enumerate(f, start=1):
        parts = line.split()
        if not parts:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with an IndexError on parts[0].
            continue

        # Parse straight into a compact float64 row rather than a Python list
        # of float objects; values are still produced by float(), so the
        # resulting matrix is numerically identical.
        vec = np.fromiter(map(float, parts[1:]), dtype=np.float64, count=len(parts) - 1)

        if expected_dim is None:
            expected_dim = vec.shape[0]
        elif vec.shape[0] != expected_dim:
            # Fail here, with a location, instead of later at np.array(...)
            # where a ragged row can no longer be traced back to its line.
            raise ValueError(
                f"{glove_path}:{line_no}: expected {expected_dim} values, got {vec.shape[0]}"
            )

        eng_words.append(parts[0])
        eng_vecs.append(vec)

# Convert to numpy array for vectorized operations
eng_matrix = np.array(eng_vecs)
# If a word occurs more than once, the last occurrence wins.
eng_word_to_idx = {word: i for i, word in enumerate(eng_words)}

print(f"âœ“ Loaded {len(eng_words):,} English vectors")
print(f"  English matrix shape: {eng_matrix.shape}")

Loading hieroglyphic embeddings...
âœ“ Loaded 11,974 hieroglyphic vectors

Loading GloVe embeddings...
âœ“ Loaded 400,000 English vectors
  English matrix shape: (400000, 300)


## 2. Load Anchors and Extract Vectors

In [3]:
# Load English anchors
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load anchor files produced by a trusted pipeline.
anchors_path = Path('../data/processed/english_anchors.pkl')
with anchors_path.open('rb') as anchor_file:
    anchors = pickle.load(anchor_file)

anchor_count = len(anchors)
print(f"Loaded {anchor_count:,} anchor pairs")

Loaded 8,541 anchor pairs


In [4]:
# Extract anchor vectors
# An anchor is usable only if its hieroglyphic word has an embedding AND its
# English word is in the GloVe vocabulary.
valid_anchors = [
    pair for pair in anchors
    if pair['hieroglyphic'] in hier_wv and pair['english'] in eng_word_to_idx
]

# Row i of X and row i of Y come from the same anchor pair.
X_list = [hier_wv[pair['hieroglyphic']] for pair in valid_anchors]  # Hieroglyphic vectors
Y_list = [eng_matrix[eng_word_to_idx[pair['english']]] for pair in valid_anchors]  # English vectors

X = np.array(X_list)
Y = np.array(Y_list)

coverage_pct = len(valid_anchors)/len(anchors)*100
print(f"Valid anchor pairs: {len(valid_anchors):,} / {len(anchors):,} ({coverage_pct:.1f}%)")
print("\nAnchor matrix shapes:")
print(f"  X (hieroglyphic): {X.shape}")
print(f"  Y (English): {Y.shape}")

Valid anchor pairs: 7,471 / 8,541 (87.5%)

Anchor matrix shapes:
  X (hieroglyphic): (7471, 300)
  Y (English): (7471, 300)


## 3. Compute Procrustes Transformation

In [5]:
print("Computing Procrustes transformation...")

# Fit the rotation on the anchor pairs: X holds hieroglyphic vectors, Y the
# matching English vectors, so R is applied as `hieroglyphic @ R`.
# NOTE(review): per SciPy's documentation the second return value is the sum
# of the singular values of X.T @ Y, not a multiplier to apply to the vectors;
# the "Scale factor" label below is kept as-is for output continuity.
R, scale = orthogonal_procrustes(X, Y)

report_lines = (
    "âœ“ Transformation matrix computed",
    f"  Shape: {R.shape}",
    f"  Scale factor: {scale:.4f}",
)
print("\n".join(report_lines))

Computing Procrustes transformation...
âœ“ Transformation matrix computed
  Shape: (300, 300)
  Scale factor: 80381.3102


## 4. Align Hieroglyphic Space (Vectorized)

In [6]:
# Create aligned hieroglyphic matrix (vectorized!)
print("Aligning hieroglyphic vectors...")

# Stack every hieroglyphic vector in vocabulary order, so row i of both
# hier_matrix and aligned_matrix corresponds to hier_words[i].
hier_words = hier_wv.index_to_key
hier_matrix = np.array([hier_wv[token] for token in hier_words])

# One matrix product rotates the whole vocabulary into the English space.
aligned_matrix = np.matmul(hier_matrix, R)

aligned_count = len(hier_words)
print(f"âœ“ Aligned {aligned_count:,} vectors")
print(f"  Aligned matrix shape: {aligned_matrix.shape}")

Aligning hieroglyphic vectors...
âœ“ Aligned 11,974 vectors
  Aligned matrix shape: (11974, 300)


## 5. Optimized Translation Function

In [7]:
def translate_batch(h_indices, topn=5):
    """
    Translate multiple hieroglyphic words at once (vectorized).

    Parameters
    ----------
    h_indices : sequence of int
        Row positions into ``hier_words`` / ``aligned_matrix``.
    topn : int, default 5
        Number of English candidates to return per word. Values larger than
        the English vocabulary are clamped; values <= 0 yield no candidates.

    Returns
    -------
    list of list of (str, float)
        One inner list per entry of ``h_indices``, holding
        ``(english_word, cosine_similarity)`` pairs sorted by descending
        similarity. The order among exactly tied scores is unspecified.
    """
    n_queries = len(h_indices)
    if n_queries == 0:
        # Nothing to score; avoid handing a zero-row matrix to the
        # similarity routine.
        return []
    if topn <= 0:
        # Guard the slicing pitfall: `[-0:]` selects the *whole* array, so a
        # request for zero candidates must be answered explicitly.
        return [[] for _ in range(n_queries)]

    # Get aligned vectors for these words
    h_vecs = aligned_matrix[h_indices]

    # Compute cosine similarity with all English words (vectorized!)
    # Shape: (len(h_indices), len(eng_words))
    similarities = cosine_similarity(h_vecs, eng_matrix)

    vocab_size = similarities.shape[1]
    k = min(int(topn), vocab_size)

    # Select the k best columns per row without sorting the full vocabulary:
    # argpartition is linear in the vocabulary size, and only the k survivors
    # are sorted afterwards.
    if k < vocab_size:
        candidates = np.argpartition(similarities, -k, axis=1)[:, -k:]
    else:
        candidates = np.broadcast_to(np.arange(vocab_size), similarities.shape)

    row_ids = np.arange(n_queries)[:, None]
    cand_scores = similarities[row_ids, candidates]
    order = np.argsort(-cand_scores, axis=1, kind='stable')
    top_indices = candidates[row_ids, order]
    top_scores = cand_scores[row_ids, order]

    return [
        [(eng_words[idx], score) for idx, score in zip(idx_row, score_row)]
        for idx_row, score_row in zip(top_indices, top_scores)
    ]

# Create index lookup: hieroglyphic word -> its position in hier_words
hier_word_to_idx = dict(zip(hier_words, range(len(hier_words))))

print("âœ“ Optimized translation function ready")

âœ“ Optimized translation function ready


## 6. Test Translations

In [8]:
# Test words: (hieroglyphic transliteration, expected English top-1)
test_cases = [
    ('wsjr', 'osiris'),
    ('á¸¥r,w', 'horus'),
    ('ppy', 'pepi'),
    ('zêœ£', 'son'),
    ('ná¹¯r', 'god'),
    ('mw', 'water'),
    ('êœ¥ná¸«', 'life'),
]

# Filter once so indices, words and expectations cannot drift out of step.
found_cases = [(w, e) for w, e in test_cases if w in hier_word_to_idx]
missing_words = [w for w, _ in test_cases if w not in hier_word_to_idx]

# Get indices for test words
test_indices = [hier_word_to_idx[w] for w, _ in found_cases]
test_words_found = [w for w, _ in found_cases]

# Translate all at once! Skip the call when nothing matched: an empty batch
# has nothing to score (NOTE(review): scikit-learn's pairwise functions are
# expected to reject zero-row input).
results = translate_batch(test_indices, topn=5) if test_indices else []

print("Translation Tests:")
print("="*70)
for (h_word, expected), predictions in zip(found_cases, results):
    top_word, top_score = predictions[0]
    # A case passes only when the expected word is ranked first.
    match = "âœ“" if top_word.lower() == expected.lower() else "âœ—"

    print(f"\n{match} {h_word:15s} (expected: {expected})")
    for word, score in predictions:
        print(f"    {word:20s} (score: {score:.3f})")

# Surface test words that were dropped instead of hiding them silently.
if missing_words:
    print(f"\nSkipped (not in hieroglyphic vocabulary): {', '.join(missing_words)}")

Translation Tests:

âœ“ wsjr            (expected: osiris)
    osiris               (score: 0.615)
    der                  (score: 0.404)
    anubis               (score: 0.387)
    isis                 (score: 0.324)
    und                  (score: 0.321)

âœ“ á¸¥r,w            (expected: horus)
    horus                (score: 0.621)
    der                  (score: 0.402)
    zum                  (score: 0.346)
    anubis               (score: 0.343)
    deutschen            (score: 0.339)

âœ“ ppy             (expected: pepi)
    pepi                 (score: 0.671)
    ist                  (score: 0.390)
    gott                 (score: 0.387)
    der                  (score: 0.353)
    auf                  (score: 0.351)

âœ“ zêœ£              (expected: son)
    son                  (score: 0.474)
    father               (score: 0.444)
    der                  (score: 0.419)
    eldest               (score: 0.407)
    grandfather          (score: 0.388)

âœ“ ná¹¯r             

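## 7. Evaluate on Anchors (Fast!)

**Note:** the anchors scored here are the same `valid_anchors` used to fit `R` in section 3, so the accuracies below are in-sample (training) figures, not a held-out estimate.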

In [None]:
# Batch evaluation
print("Evaluating anchors (vectorized)...")

# Row index (into aligned_matrix) and expected English word for every anchor
anchor_h_indices = [hier_word_to_idx[pair['hieroglyphic']] for pair in valid_anchors]
anchor_e_words = [pair['english'] for pair in valid_anchors]

# Score the anchors in chunks of BATCH_SIZE queries per translate_batch call.
BATCH_SIZE = 1000
all_predictions = []

for start in tqdm(range(0, len(anchor_h_indices), BATCH_SIZE), desc="Batches"):
    stop = start + BATCH_SIZE
    all_predictions += translate_batch(anchor_h_indices[start:stop], topn=5)

# Calculate accuracy (case-insensitive): top-1 needs the expected word ranked
# first, top-5 needs it anywhere among the returned candidates.
correct = 0
top5_correct = 0

for gold, ranked in zip(anchor_e_words, all_predictions):
    gold_lc = gold.lower()
    candidates_lc = [word.lower() for word, _ in ranked]
    correct += candidates_lc[0] == gold_lc
    top5_correct += gold_lc in candidates_lc

total = len(valid_anchors)
accuracy = correct / total * 100
top5_accuracy = top5_correct / total * 100

print("\nEvaluation Results:")
print("="*70)
print(f"Total anchors: {total:,}")
print(f"\nTop-1 Accuracy: {correct:,} / {total:,} = {accuracy:.2f}%")
print(f"Top-5 Accuracy: {top5_correct:,} / {total:,} = {top5_accuracy:.2f}%")
print("\nV3 Baseline: 22.0%")
print(f"V5 Improvement: {accuracy - 22:+.2f}%")

Evaluating anchors (vectorized)...


Batches:  38%|â–ˆâ–ˆâ–ˆâ–Š      | 3/8 [01:41<02:47, 33.53s/it]

## 8. Discover New Meanings

In [None]:
# Interesting words
discovery_words = ['inpw', 'wsjr', 'á¸¥r,w', 'ná¹¯r', 'á¸¥qt', 'rêœ¥w']

# Filter once; derive the indices from the filtered list so the two stay aligned.
discovery_found = [w for w in discovery_words if w in hier_word_to_idx]
discovery_missing = [w for w in discovery_words if w not in hier_word_to_idx]
discovery_indices = [hier_word_to_idx[w] for w in discovery_found]

# Skip the call when nothing matched: an empty batch has nothing to score.
results = translate_batch(discovery_indices, topn=10) if discovery_indices else []

print("New Discoveries:")
print("="*70)
for word, predictions in zip(discovery_found, results):
    print(f"\n{word}:")
    for i, (e_word, score) in enumerate(predictions, 1):
        print(f"  {i:2d}. {e_word:20s} (score: {score:.3f})")

# Surface words that were dropped instead of hiding them silently.
if discovery_missing:
    print(f"\nSkipped (not in hieroglyphic vocabulary): {', '.join(discovery_missing)}")

## 9. Save Results

In [None]:
# Save transformation matrix
np.save('../data/processed/procrustes_matrix.npy', R)

# Save results: the metrics computed by the anchor evaluation cell, cast to
# built-in int/float before JSON serialisation.
results = dict(
    total_anchors=total,
    top1_correct=int(correct),
    top1_accuracy=float(accuracy),
    top5_correct=int(top5_correct),
    top5_accuracy=float(top5_accuracy),
    v3_baseline=22.0,
    improvement=float(accuracy - 22.0),
)

with open('../data/processed/alignment_results.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)

print("âœ“ Saved results")
print(f"\nðŸŽ‰ Final V5 Accuracy: {accuracy:.2f}%")