# Let's see if we can get some terms

In [5]:

# --- Setup: Import and Load Models ---
from heiro import Config, EmbeddingManager, MultiSpaceVec2VecModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Project configuration, plus the manager that reads the cached embeddings
config = Config()
embedding_manager = EmbeddingManager(config)

# Pull in the three embedding sets used by the cells below
hier_data = embedding_manager.load_embeddings("hieroglyphic")
tla_english_data = embedding_manager.load_embeddings("tla_english")
wikipedia_data = embedding_manager.load_embeddings("wikipedia")

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Restore the trained vec2vec model. The checkpoint carries both the
# constructor arguments and the learned weights.
model_path = config.MODELS_DIR / "multi_space_vec2vec_model.pt"
checkpoint = torch.load(model_path, map_location=device)
model = MultiSpaceVec2VecModel(
    *(checkpoint[key] for key in ("input_dims", "latent_dim", "num_spaces"))
)
model.load_state_dict(checkpoint["model_state_dict"])

# Inference only: switch to eval mode before using the encoders
model.eval()
# Kept as the final bare expression so the notebook echoes the module repr
model.to(device)



INFO:heiro:Loaded hieroglyphic embeddings: (12773, 300) from cache/embeddings/hieroglyphic_embeddings.pkl
INFO:heiro:Loaded tla_english embeddings: (12773, 768) from cache/embeddings/tla_english_embeddings.pkl
INFO:heiro:Loaded wikipedia embeddings: (101000, 768) from cache/embeddings/wikipedia_embeddings.pkl


MultiSpaceVec2VecModel(
  (encoders): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=768, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=512, out_features=256, bias=True)
      (4): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=300, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=512, out_features=256, bias=True)
      (4): ReLU()
    )
  )
  (decoders): ModuleList(
    (0-1): 2 x Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=512, out_features=768, bias=True)
    )
    (2): Sequential(
      (0): Linear(in_features=256, out_features=512, bias=True)
      (1): ReLU()
      (2): Dropout(p=0.2, inplace=False)
      (3): Linear(in_features=512, out_features=300, bias=True)
    )
  )
)

In [6]:
# Choose a term and its expert translation.
# The same row index is used for the hieroglyphic and the TLA English data,
# so the two datasets are assumed to be row-aligned -- TODO confirm.
# Change idx to inspect a different term.
idx = 3
hier_word = hier_data['texts'][idx]
eng_word = tla_english_data['texts'][idx]
print(f"Hieroglyphic: {hier_word} | Expert English: {eng_word}")

# Get embeddings and move to device.
# The idx:idx+1 slice keeps a leading batch axis on the single-row inputs.
hier_emb = torch.FloatTensor(hier_data['embeddings'][idx:idx+1]).to(device)
eng_emb = torch.FloatTensor(tla_english_data['embeddings'][idx:idx+1]).to(device)
wiki_embs = torch.FloatTensor(wikipedia_data['embeddings']).to(device)
wiki_texts = wikipedia_data['texts']

# Project every input into the shared latent space with its own encoder.
# NOTE(review): the index -> space mapping below comes from the original
# author's comments; encoders 0 and 1 take same-sized inputs, so confirm
# the order against the training code.
with torch.no_grad():
    hier_latent = model.encoders[2](hier_emb)   # 2 = hieroglyphic encoder
    eng_latent = model.encoders[1](eng_emb)     # 1 = TLA English encoder
    wiki_latents = model.encoders[0](wiki_embs) # 0 = Wikipedia encoder

# sklearn works on host-side NumPy arrays; convert each latent exactly once
hier_latent_np = hier_latent.cpu().numpy()
eng_latent_np = eng_latent.cpu().numpy()
wiki_latents_np = wiki_latents.cpu().numpy()

# Compute similarity to expert translation.
# NOTE(review): the printed model repr shows each encoder ending in ReLU, so
# latent vectors are non-negative and cosine scores cannot go below 0. Read
# the values relative to other candidates rather than as absolute quality.
sim = cosine_similarity(hier_latent_np, eng_latent_np)[0][0]
print(f"Latent space similarity (to expert translation): {sim:.4f}")

# Find closest Wikipedia embedding in latent space
wiki_sims = cosine_similarity(hier_latent_np, wiki_latents_np)[0]
best_idx = int(np.argmax(wiki_sims))  # plain int for indexing wiki_texts
best_sim = wiki_sims[best_idx]
best_wiki_text = wiki_texts[best_idx]

print("\nClosest Wikipedia text in latent space:")
print(f"  Text: {best_wiki_text}")
print(f"  Similarity: {best_sim:.4f}")
print(f"  Distance: {1 - best_sim:.4f}")

Hieroglyphic: ꜥḥꜥ | Expert English: Get up!
Latent space similarity (to expert translation): 0.8207

Closest Wikipedia text in latent space:
  Text: A computer program is a sequence or set of instructions in a programming language for a computer to execute. It is one component of software, which also includes documentation and other intangible components.

A computer program in its human-readable form is called source code. Source code needs another computer program to execute because computers can only execute their native machine instructions. Therefore, source code may be translated to machine instructions using the language's compiler. (Assembly language programs are translated using an assembler.) The resulting file is called an executable. Alternatively, source code may execute within the language's interpreter.

If the executable is requested for execution, then the operating system loads it into memory and starts a process. The central processing unit will soon switch to this p