# V6 Phase 4: Text-Only Alignment

## Goal
Fine-tune **HieroBERT** (Text Only) to align with **English BERT** using **Contrastive Learning** (InfoNCE).

## Strategy
1. **Encoders**:
    - **Source**: HieroBERT (Trainable). No visual fusion.
    - **Target**: English BERT (Frozen).
2. **Objective**: Maximize the cosine similarity of each correct translation pair $(h_i, e_i)$ relative to the other in-batch pairings $(h_i, e_j)$, $j \neq i$, which act as negatives.
3. **Data**: 8,541 Anchor Pairs.
4. **Output**: Save fine-tuned model as `heirobert_small_2`.

## Inputs
- `models/hierobert_small`: Pre-trained HieroBERT.
- `data/processed/anchors.json`: Anchor pairs.

In [1]:
!pip install transformers torch scikit-learn numpy pandas tqdm


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
import json
import numpy as np
from pathlib import Path
from transformers import BertModel, BertTokenizerFast, BertTokenizer
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Paths
# NOTE(review): SAVE_PATH spells "heirobert" while MODEL_PATH spells
# "hierobert" — presumably other artifacts rely on this name; confirm before
# renaming.
MODEL_PATH = Path("../models/hierobert_small")
ANCHORS_PATH = Path("../data/processed/anchors.json")
SAVE_PATH = Path("../models/heirobert_small_2")

# Pick the best available accelerator. MPS is checked first so the choice on
# Apple-silicon machines is unchanged; CUDA is used when present instead of
# silently falling back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: mps


## 1. Data Loading & Dataset Class

In [3]:
# Load Anchors
# The encoding is pinned to UTF-8: open() otherwise uses the platform's locale
# encoding, which can fail or mis-decode non-ASCII text in the JSON file.
with open(ANCHORS_PATH, 'r', encoding='utf-8') as f:
    anchors = json.load(f)

# Tokenizers
# The HieroBERT tokenizer is read from the local checkpoint directory; the
# English tokenizer is the stock 'bert-base-uncased' one.
hiero_tokenizer = BertTokenizerFast.from_pretrained(str(MODEL_PATH))
en_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class TextDataset(Dataset):
    """Dataset of paired anchors, tokenized lazily on item access.

    Each anchor is a mapping with a 'hieroglyphic' and an 'english' entry.
    Both texts are truncated/padded to ``max_len`` tokens by their respective
    tokenizer.
    """

    def __init__(self, anchors, h_tokenizer, e_tokenizer, max_len=32):
        """Store the anchor list, the two tokenizers and the fixed sequence length."""
        self.anchors = anchors
        self.h_tokenizer = h_tokenizer
        self.e_tokenizer = e_tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of anchor pairs."""
        return len(self.anchors)

    def __getitem__(self, idx):
        """Return the tokenized pair at ``idx`` as a dict of tensors.

        Keys are 'h_input_ids', 'h_attention_mask', 'e_input_ids' and
        'e_attention_mask'; the leading batch dimension added by
        ``return_tensors='pt'`` is squeezed away.
        """
        pair = self.anchors[idx]
        hiero_text, english_text = pair['hieroglyphic'], pair['english']

        # Identical tokenization settings for both languages
        tok_kwargs = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors='pt',
        )
        encoded = {
            'h': self.h_tokenizer(hiero_text, **tok_kwargs),
            'e': self.e_tokenizer(english_text, **tok_kwargs),
        }

        return {
            f'{side}_{field}': encoded[side][field].squeeze(0)
            for side in ('h', 'e')
            for field in ('input_ids', 'attention_mask')
        }

# Hold out 10% of the anchors for evaluation (fixed seed for reproducibility)
train_anchors, test_anchors = train_test_split(anchors, test_size=0.1, random_state=42)

# Wrap each split in a dataset that tokenizes on access
train_dataset, test_dataset = (
    TextDataset(split, hiero_tokenizer, en_tokenizer)
    for split in (train_anchors, test_anchors)
)

# Only the training batches are shuffled; test order stays fixed
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

## 2. Model Setup

In [4]:
# Load both encoders and place them on the selected device
hiero_model = BertModel.from_pretrained(str(MODEL_PATH))
hiero_model = hiero_model.to(device)
en_model = BertModel.from_pretrained('bert-base-uncased')
en_model = en_model.to(device)

# English BERT is the fixed target: no gradients, inference mode
for param in en_model.parameters():
    param.requires_grad_(False)
en_model.eval()

# Only HieroBERT's weights are handed to the optimizer
optimizer = optim.AdamW(params=hiero_model.parameters(), lr=2e-5)

# Contrastive Loss (InfoNCE)
def contrastive_loss(h_emb, e_emb, temperature=0.1):
    """Return the InfoNCE loss that matches each source row to its own target row.

    Args:
        h_emb: Source embeddings, one row per example (2-D: batch x dim).
        e_emb: Target embeddings with the same shape; row ``i`` is the
            positive for ``h_emb[i]`` and every other row is a negative.
        temperature: Divisor applied to the cosine similarities before the
            softmax; smaller values sharpen the distribution.

    Returns:
        A scalar tensor: the mean cross-entropy over rows of the
        source-to-target similarity matrix (one direction only).
    """
    # L2-normalize so the dot products below are cosine similarities
    h_emb = torch.nn.functional.normalize(h_emb, dim=1)
    e_emb = torch.nn.functional.normalize(e_emb, dim=1)

    # Cosine similarity matrix: [batch, batch]
    logits = torch.matmul(h_emb, e_emb.T) / temperature

    # Labels: diagonal is the positive pair (0,0), (1,1), etc.
    # They are created on the logits' own device so the function does not
    # depend on any module-level device variable.
    labels = torch.arange(logits.size(0), device=logits.device)

    return nn.CrossEntropyLoss()(logits, labels)

Some weights of BertModel were not initialized from the model checkpoint at ../models/hierobert_small and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## 3. Training Loop

In [5]:
# Fine-tune HieroBERT against the frozen English encoder, then save the
# fine-tuned model and its tokenizer to SAVE_PATH.
EPOCHS = 5

print("Starting Text-Only Fine-tuning...")
for epoch in range(EPOCHS):
    # Training mode for the trainable encoder
    hiero_model.train()
    total_loss = 0
    
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        # Move to device
        h_ids = batch['h_input_ids'].to(device)
        h_mask = batch['h_attention_mask'].to(device)
        e_ids = batch['e_input_ids'].to(device)
        e_mask = batch['e_attention_mask'].to(device)
        
        # Clear gradients left over from the previous step
        optimizer.zero_grad()
        
        # Forward Hiero
        h_out = hiero_model(input_ids=h_ids, attention_mask=h_mask)
        # NOTE(review): the load-time warning reports HieroBERT's pooler
        # weights as newly initialized, so this head starts untrained —
        # confirm pooler_output is the intended sentence embedding.
        h_emb = h_out.pooler_output
        
        # Forward English (Target)
        # no_grad: the target encoder is only used to produce fixed embeddings
        with torch.no_grad():
            e_out = en_model(input_ids=e_ids, attention_mask=e_mask)
            e_emb = e_out.pooler_output
            
        # Loss
        loss = contrastive_loss(h_emb, e_emb)
        loss.backward()
        optimizer.step()
        
        # .item() extracts a Python float so the running sum holds no graph
        total_loss += loss.item()
        
    # Mean of the per-batch losses for this epoch.
    # For reference, uniform guessing over B in-batch candidates gives ln(B).
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} Loss: {avg_loss:.4f}")

# Save Fine-tuned Model
# The tokenizer is saved alongside the weights in the same directory.
SAVE_PATH.mkdir(parents=True, exist_ok=True)
hiero_model.save_pretrained(SAVE_PATH)
hiero_tokenizer.save_pretrained(SAVE_PATH)
print(f"Model saved to {SAVE_PATH}")

Starting Text-Only Fine-tuning...


Epoch 1: 100%|██████████| 241/241 [01:07<00:00,  3.57it/s]


Epoch 1 Loss: 3.3948


Epoch 2: 100%|██████████| 241/241 [00:56<00:00,  4.24it/s]


Epoch 2 Loss: 3.3204


Epoch 3: 100%|██████████| 241/241 [00:57<00:00,  4.20it/s]


Epoch 3 Loss: 3.2586


Epoch 4: 100%|██████████| 241/241 [00:58<00:00,  4.14it/s]


Epoch 4 Loss: 3.2009


Epoch 5: 100%|██████████| 241/241 [00:58<00:00,  4.15it/s]


Epoch 5 Loss: 3.1473
Model saved to ../models/heirobert_small_2


## 4. Evaluation

In [8]:
def evaluate(loader, k_values=(1, 5, 10)):
    """Compute top-k retrieval accuracy of source-to-target matching.

    Every example in ``loader`` is embedded with the module-level
    ``hiero_model`` and ``en_model`` (pooler output, L2-normalized). For each
    source embedding, all target embeddings in the loader are ranked by cosine
    similarity; a hit at ``k`` means the example's own target is among the
    ``k`` most similar.

    Side effect: ``hiero_model`` is switched to eval mode and left there.
    ``en_model``'s mode is not changed here.

    Args:
        loader: Iterable of batches with the keys 'h_input_ids',
            'h_attention_mask', 'e_input_ids' and 'e_attention_mask'.
        k_values: Cut-offs to report. The default is an immutable tuple so it
            cannot be shared and mutated across calls.

    Returns:
        Dict mapping "Top-{k}" to the fraction of examples with a hit at k.
    """
    # Materialize once so any iterable (including a generator) can be reused
    k_values = tuple(k_values)

    hiero_model.eval()
    all_h_embs = []
    all_e_embs = []

    with torch.no_grad():
        for batch in tqdm(loader, desc="Evaluating"):
            h_ids = batch['h_input_ids'].to(device)
            h_mask = batch['h_attention_mask'].to(device)
            e_ids = batch['e_input_ids'].to(device)
            e_mask = batch['e_attention_mask'].to(device)

            h_out = hiero_model(input_ids=h_ids, attention_mask=h_mask)
            e_out = en_model(input_ids=e_ids, attention_mask=e_mask)

            # Collect on the CPU so the full similarity matrix is built there
            all_h_embs.append(h_out.pooler_output.cpu())
            all_e_embs.append(e_out.pooler_output.cpu())

    H = torch.cat(all_h_embs)
    E = torch.cat(all_e_embs)

    # Normalize so the dot products are cosine similarities
    H = torch.nn.functional.normalize(H, dim=1)
    E = torch.nn.functional.normalize(E, dim=1)

    # Similarity Matrix: row i holds source i against every target
    sim_matrix = torch.matmul(H, E.T).numpy()

    top_k_hits = {k: 0 for k in k_values}
    n_test = len(H)

    for i in range(n_test):
        # Target indices ordered from most to least similar to source i
        sorted_indices = np.argsort(-sim_matrix[i])
        for k in k_values:
            if i in sorted_indices[:k]:
                top_k_hits[k] += 1

    results = {f"Top-{k}": hits/n_test for k, hits in top_k_hits.items()}
    return results

# Score the held-out split and print the metrics as indented JSON
print("Evaluating on Test Set...")
scores = evaluate(test_loader)
print("Text-Only Alignment Results:", json.dumps(scores, indent=2), sep="\n")

Evaluating on Test Set...


Evaluating: 100%|██████████| 27/27 [00:03<00:00,  7.50it/s]

Text-Only Alignment Results:
{
  "Top-1": 0.0023391812865497076,
  "Top-5": 0.026900584795321637,
  "Top-10": 0.047953216374269005
}



