Install Packs!

In [None]:
!pip install --upgrade pip
!pip install transformers sentence-transformers torch torchvision torchaudio networkx scikit-learn numpy pandas tqdm
!pip install torch-geometric

Imports!!

In [None]:
import random
import itertools
import math
from collections import defaultdict
from typing import List, Tuple, Dict

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam

from tqdm.auto import tqdm

# PyG
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.nn import GATConv

# Transformers (mBERT)
from transformers import AutoTokenizer, AutoModel
# better than me... blood orange is incredible!


Multiparallel Corpus (synthetic)

In [None]:
# Small synthetic vocabulary with translations for English -> other languages.
synthetic_vocab = [
    ("cat", "Katze", "gato", "γάτα"),
    ("dog", "Hund", "perro", "σκύλος"),
    ("house", "Haus", "casa", "σπίτι"),
    ("blue", "blau", "azul", "μπλε"),
    ("eat", "essen", "comer", "τρώω"),
    ("sleep", "schlafen", "dormir", "κοιμάμαι"),
    ("small", "klein", "pequeño", "μικρό"),
    ("big", "groß", "grande", "μεγάλο"),
    ("man", "Mann", "hombre", "άνδρας"),
    ("woman", "Frau", "mujer", "γυναίκα"),
    ("see", "sehen", "ver", "βλέπω"),
    ("love", "lieben", "amar", "αγαπώ"),
]

# synthetic sentences by sampling small templates
templates = [
    ("The {a} is {adj}", "{a} ist {adj}", "El {a} es {adj}", "Η {a} είναι {adj}"),
    ("A {adj} {a}", "Ein {adj} {a}", "Un {adj} {a}", "Ένα {adj} {a}"),
    ("I {v} the {a}", "Ich {v} die {a}", "Yo {v} al {a}", "Εγώ {v} το {a}"),
    ("The {a} and the {b}", "Die {a} und die {b}", "El {a} y el {b}", "Η {a} και η {b}"),
]

lang_index = {"en": 0, "de": 1, "es": 2, "el": 3}

def build_sentence_pair(template):
    # chose two distinct nouns and one adjective and one verb
    nouns = random.sample(synthetic_vocab, 2)
    adj = random.choice([w for w in synthetic_vocab if w[0] in ("small","big","blue")])
    verb = random.choice([w for w in synthetic_vocab if w[0] in ("eat","sleep","see","love")])
    # Fill template
    en_t, de_t, es_t, el_t = template
    a_en = nouns[0][0]; b_en = nouns[1][0]; adj_en = adj[0]; v_en = verb[0]
    a_de = nouns[0][1]; b_de = nouns[1][1]; adj_de = adj[1]; v_de = verb[1]
    a_es = nouns[0][2]; b_es = nouns[1][2]; adj_es = adj[2]; v_es = verb[2]
    a_el = nouns[0][3]; b_el = nouns[1][3]; adj_el = adj[3]; v_el = verb[3]
    # Simple formatting
    en = en_t.format(a=a_en, b=b_en, adj=adj_en, v=v_en)
    de = de_t.format(a=a_de, b=b_de, adj=adj_de, v=v_de)
    es = es_t.format(a=a_es, b=b_es, adj=adj_es, v=v_es)
    el = el_t.format(a=a_el, b=b_el, adj=adj_el, v=v_el)
    return {"en": en, "de": de, "es": es, "el": el}

# Build a dataset of sentence quadruples
random.seed(42)
num_sentences = 200  # size; increase if you want
corpus = []
for _ in range(num_sentences):
    templ = random.choice(templates)
    corpus.append(build_sentence_pair(templ))

# Quick peek
for i in range(5):
    print(corpus[i])


{'en': 'The cat is blue', 'de': 'Katze ist blau', 'es': 'El gato es azul', 'el': 'Η γάτα είναι μπλε'}
{'en': 'A big love', 'de': 'Ein groß lieben', 'es': 'Un grande amar', 'el': 'Ένα μεγάλο αγαπώ'}
{'en': 'The cat and the love', 'de': 'Die Katze und die lieben', 'es': 'El gato y el amar', 'el': 'Η γάτα και η αγαπώ'}
{'en': 'A blue man', 'de': 'Ein blau Mann', 'es': 'Un azul hombre', 'el': 'Ένα μπλε άνδρας'}
{'en': 'The blue and the big', 'de': 'Die blau und die groß', 'es': 'El azul y el grande', 'el': 'Η μπλε και η μεγάλο'}


Tokenize... token-level objects(nodes)

In [None]:
# Tokenize simply by whitespace
# EMBEDDINGS; mBERT tokenization and map word -> averaged subword embeddings.

def whitespace_tokenize(s: str):
    return s.strip().split()

# ode structure: each token gets a unique node id across the whole corpus + meta
nodes = []
node_id = 0
sent_token_index = []  # per sentence: dict mapping (lang -> list of node ids)
for s_idx, sent in enumerate(corpus):
    mapping = {}
    for lang in ["en","de","es","el"]:
        toks = whitespace_tokenize(sent[lang])
        ids = []
        for pos, tok in enumerate(toks):
            nodes.append({
                "node_id": node_id,
                "sent_idx": s_idx,
                "lang": lang,
                "token": tok,
                "pos": pos,
                "sent_len": len(toks)
            })
            ids.append(node_id)
            node_id += 1
        mapping[lang] = ids
    sent_token_index.append(mapping)

num_nodes = len(nodes)
print("Total nodes (tokens):", num_nodes)


Load multilingual tokenizer/model (mBERT) and compute token embeddings

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
bert = AutoModel.from_pretrained(model_name).to(device)
bert.eval()

# helper: compute embedding for a word (average of subword token embeddings)
# cache results for identical surface forms
from functools import lru_cache

@lru_cache(maxsize=10000)
def word_embedding(word: str) -> np.ndarray:
    # use tokenizer.encode_plus to get input ids and attention mask
    with torch.no_grad():
        encoded = tokenizer(word, return_tensors="pt", add_special_tokens=True)
        input_ids = encoded["input_ids"].to(device)
        attention_mask = encoded["attention_mask"].to(device)
        outputs = bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden = outputs.last_hidden_state.squeeze(0)  # (seq_len, hidden)
        # skip [CLS] and [SEP], average the rest
        if last_hidden.shape[0] <= 2:
            vec = last_hidden.mean(dim=0).cpu().numpy()
        else:
            vec = last_hidden[1:-1].mean(dim=0).cpu().numpy()
    return vec

# Compute embeddings for all nodes
emb_dim = bert.config.hidden_size
X_emb = np.zeros((num_nodes, emb_dim), dtype=np.float32)
for nd in tqdm(nodes, desc="embeddings"):
    X_emb[nd["node_id"], :] = word_embedding(nd["token"])
# normalize embeddings for cosine similarity convenience
X_emb = normalize(X_emb, axis=1)


Initial bilingual alignments via embedding similarity (per sentence)

In [None]:
# For each sentence, compute bilingual alignment edges between language pairs using cosine similarity
# Threshold-based and 1-to-1 greedy assignment
def align_pair(ids_a: List[int], ids_b: List[int], threshold=0.6):
    # compute cosine matrix between embeddings
    ma = X_emb[ids_a]  # (na, d)
    mb = X_emb[ids_b]  # (nb, d)
    sim = cosine_similarity(ma, mb)  # (na, nb)
    edges = []
    # greedy: for eachh a, pick best b if above threshold and not taken; and also allow symmetric bests
    taken_b = set()
    pairs = []
    # flattened sort by sim descending
    idx_pairs = [(i,j,sim[i,j]) for i in range(sim.shape[0]) for j in range(sim.shape[1])]
    idx_pairs.sort(key=lambda x: x[2], reverse=True)
    for i,j,s in idx_pairs:
        if s < threshold:
            break
        if j in taken_b:
            continue
        taken_b.add(j)
        pairs.append((ids_a[i], ids_b[j], float(s)))

    return pairs

# build initial edge set (undirected)
initial_alignment_edges = set()
for s_idx, mapping in enumerate(sent_token_index):
    # align all language pairs (en-de, en-es, en-el, de-es, de-el, es-el)
    langs = ["en","de","es","el"]
    for la, lb in itertools.combinations(langs, 2):
        ids_a = mapping[la]
        ids_b = mapping[lb]
        pairs = align_pair(ids_a, ids_b, threshold=0.62)
        for u,v,score in pairs:
            # store as sorted tuple
            if u == v:
                continue
            edge = tuple(sorted((u,v)))
            initial_alignment_edges.add(edge)

print("Initial bilingual alignment edges:", len(initial_alignment_edges))
# show some
list(initial_alignment_edges)[:10]


multiparallel graph (NetworkX), add intra-sentence adjacency and initial edges

In [None]:
G = nx.Graph()
for nd in nodes:
    G.add_node(nd["node_id"], **nd)

# intra-sentence adjacency edges (connect adjacent tokens within same sentence & language)
for s_idx, mapping in enumerate(sent_token_index):
    for lang, ids in mapping.items():
        for i in range(len(ids)-1):
            G.add_edge(ids[i], ids[i+1], label="intra")

# initial bilingual alignment edges (label them)
for (u,v) in initial_alignment_edges:
    G.add_edge(u, v, label="bilingual_init")

print("Graph summary:", G)
G = nx.Graph()
for nd in nodes:
    G.add_node(nd["node_id"], **nd)

# intra-sentence adjacency edges (connect adjacent tokens within same sentence & language)
for s_idx, mapping in enumerate(sent_token_index):
    for lang, ids in mapping.items():
        for i in range(len(ids)-1):
            G.add_edge(ids[i], ids[i+1], label="intra")

# add initial bilingual alignment edges (label them)
for (u,v) in initial_alignment_edges:
    G.add_edge(u, v, label="bilingual_init")

print("Graph summary:", G)

Compute graph metrics (degree, pagerank) and community detection

In [None]:
# Degree
deg_dict = dict(G.degree())
nx.set_node_attributes(G, deg_dict, "degree")

# PageRank
pr = nx.pagerank(G)
nx.set_node_attributes(G, pr, "pagerank")

# Greedy modularity communities (returns sets)
communities = list(nx.algorithms.community.greedy_modularity_communities(G))
# Map node -> community id (int)
node2comm = {}
for cid, comm in enumerate(communities):
    for n in comm:
        node2comm[n] = cid
nx.set_node_attributes(G, node2comm, "community")

print("Number of communities:", len(communities))


Prepare node feature matrix (concatenate embedding + positional + degree + pagerank + community one-hot)

In [None]:
# base embedding fro earlier X_emb
# positional: pos / sent_len normalized scalar
pos_feat = np.zeros((num_nodes, 1), dtype=np.float32)
degree_feat = np.zeros((num_nodes, 1), dtype=np.float32)
pagerank_feat = np.zeros((num_nodes, 1), dtype=np.float32)
for nd in nodes:
    nid = nd["node_id"]
    pos_feat[nid,0] = nd["pos"] / max(1, nd["sent_len"] - 1)
    degree_feat[nid,0] = deg_dict.get(nid,0)
    pagerank_feat[nid,0] = pr.get(nid,0)

# community one-hot (small number of communities)
num_comms = len(communities)
comm_feat = np.zeros((num_nodes, num_comms), dtype=np.float32)
for nid, cid in node2comm.items():
    comm_feat[nid, cid] = 1.0

# normalize degree/pagerank
degree_feat = (degree_feat - degree_feat.mean()) / (degree_feat.std() + 1e-8)
pagerank_feat = (pagerank_feat - pagerank_feat.mean()) / (pagerank_feat.std() + 1e-8)

X = np.concatenate([X_emb, pos_feat, degree_feat, pagerank_feat, comm_feat], axis=1)
print("Node feature shape:", X.shape)


Convert to PyG Data (edges + features)

In [None]:
# Build edge_index from graph (use undirected symmetric edges)
edge_list = list(G.edges())
edge_index = torch.tensor(edge_list, dtype=torch.long).t().contiguous()  # shape (2, E)

if edge_index.shape[1] == 0:
    raise RuntimeError("No edges in graph.")
edge_index = torch.cat([edge_index, edge_index.flip(0)], dim=1)  # add reverse direction

x = torch.tensor(X, dtype=torch.float)

data = Data(x=x, edge_index=edge_index)
print(data)


Link prediction dataset (positive and negative samples)

In [None]:
# Positive edges: the bilingual_init edges (need the GNN to learn to predict them).
# some of these will be present in graph; use as positives.
pos_edges = [tuple(e) for e in initial_alignment_edges]

# Negative sampling: sample same number of random node pairs that are NOT in graph and not positives
all_pairs_set = set(tuple(sorted((u,v))) for u in range(num_nodes) for v in range(u+1, num_nodes))
forbidden = set(initial_alignment_edges) | set(tuple(sorted(e)) for e in G.edges())
available = list(all_pairs_set - forbidden)
neg_samples = random.sample(available, min(len(pos_edges), len(available)))

# Build training tensors (train in-batch)
def pairs_to_tensor(pairs):
    u = torch.tensor([p[0] for p in pairs], dtype=torch.long)
    v = torch.tensor([p[1] for p in pairs], dtype=torch.long)
    return u, v

pos_u, pos_v = pairs_to_tensor(pos_edges)
neg_u, neg_v = pairs_to_tensor(neg_samples)

print("Positive samples:", pos_u.size(0), "Negative samples:", neg_u.size(0))


Positive samples: 1757 Negative samples: 1757


GAT encoder + dot-product decoder for link prediction...

In [None]:
class GATEncoder(nn.Module):
    def __init__(self, in_dim, hidden_dim=256, n_heads=4, n_layers=2, dropout=0.2):
        super().__init__()
        self.layers = nn.ModuleList()
        self.layers.append(GATConv(in_dim, hidden_dim // n_heads, heads=n_heads, dropout=dropout))
        for _ in range(n_layers-1):
            self.layers.append(GATConv(hidden_dim, hidden_dim // n_heads, heads=n_heads, dropout=dropout))
        self.dropout = dropout
        self.out_dim = hidden_dim

    def forward(self, x, edge_index):
        for i,layer in enumerate(self.layers):
            x = layer(x, edge_index)
            x = F.elu(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
        return x  # node embeddings

class LinkPredictor(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        # use dot product; optionally could add MLP
        self.lin = nn.Linear(in_dim*2, in_dim)
        self.out = nn.Linear(in_dim, 1)

    def forward(self, hu, hv):
        # hu, hv: (batch, in_dim)
        h = torch.cat([hu, hv], dim=1)
        h = F.relu(self.lin(h))
        return self.out(h).squeeze(-1)

# instantiate
in_dim = data.num_node_features
encoder = GATEncoder(in_dim=in_dim, hidden_dim=256, n_heads=4, n_layers=2, dropout=0.2).to(device)
predictor = LinkPredictor(encoder.out_dim).to(device)

optimizer = Adam(list(encoder.parameters()) + list(predictor.parameters()), lr=1e-3, weight_decay=1e-5)
criterion = nn.BCEWithLogitsLoss()


Training loop (mini-batch approach)

In [None]:
data = data.to(device)
x_all = data.x
edge_index_all = data.edge_index

# onvert training pairs to device tensors
pos_u, pos_v = pos_u.to(device), pos_v.to(device)
neg_u, neg_v = neg_u.to(device), neg_v.to(device)

# Combine into a dataset for shuffled training
num_epochs = 30
batch_size = 256
pos_pairs = list(zip(pos_u.tolist(), pos_v.tolist()))
neg_pairs = list(zip(neg_u.tolist(), neg_v.tolist()))
labels_pos = [1]*len(pos_pairs)
labels_neg = [0]*len(neg_pairs)
train_pairs = pos_pairs + neg_pairs
train_labels = labels_pos + labels_neg

# shuffle
perm = list(range(len(train_pairs)))
random.shuffle(perm)
train_pairs = [train_pairs[i] for i in perm]
train_labels = [train_labels[i] for i in perm]

def batchify(pairs_batch):
    u = torch.tensor([p[0] for p in pairs_batch], dtype=torch.long, device=device)
    v = torch.tensor([p[1] for p in pairs_batch], dtype=torch.long, device=device)
    return u, v

for epoch in range(1, num_epochs+1):
    encoder.train(); predictor.train()
    total_loss = 0.0
    for i in range(0, len(train_pairs), batch_size):
        batch_pairs = train_pairs[i:i+batch_size]
        batch_labels = torch.tensor(train_labels[i:i+batch_size], dtype=torch.float, device=device)

        optimizer.zero_grad()
        node_reps = encoder(x_all, edge_index_all)  # (N, D)
        u_idx, v_idx = batchify(batch_pairs)
        hu = node_reps[u_idx]
        hv = node_reps[v_idx]
        logits = predictor(hu, hv)  # (batch,)
        loss = criterion(logits, batch_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * len(batch_pairs)

    avg_loss = total_loss / len(train_pairs)
    # simple evaluation: compute recall on validation (reuse training positives for demo)
    encoder.eval(); predictor.eval()
    with torch.no_grad():
        node_reps = encoder(x_all, edge_index_all)
        # score all candidate pairs in pos_edges for retrieval (simple)
        pos_u_t = torch.tensor([p[0] for p in pos_pairs], device=device)
        pos_v_t = torch.tensor([p[1] for p in pos_pairs], device=device)
        pos_scores = predictor(node_reps[pos_u_t], node_reps[pos_v_t]).sigmoid().cpu().numpy()
        avg_pos_score = pos_scores.mean()

        neg_u_t = torch.tensor([p[0] for p in neg_pairs], device=device)
        neg_v_t = torch.tensor([p[1] for p in neg_pairs], device=device)
        neg_scores = predictor(node_reps[neg_u_t], node_reps[neg_v_t]).sigmoid().cpu().numpy()
        avg_neg_score = neg_scores.mean()

    print(f"Epoch {epoch:02d} | Loss: {avg_loss:.4f} | avg_pos_score: {avg_pos_score:.3f} | avg_neg_score: {avg_neg_score:.3f}")


Used trained model to predict missing alignment edges (link prediction) and add high-confidence edges...

In [None]:
encoder.eval(); predictor.eval()
with torch.no_grad():
    node_reps = encoder(x_all, edge_index_all)

# Candidate generation strategy:
# for each token in English, score all tokens in other languages within same sentence and add top-K above threshold.
predicted_edges = set()
score_threshold = 0.7
top_k = 2

for s_idx, mapping in enumerate(sent_token_index):
    en_ids = mapping["en"]
    for en_id in en_ids:
        # considered tokens in other langs within same sentence
        cand_ids = mapping["de"] + mapping["es"] + mapping["el"]
        if not cand_ids:
            continue
        en_rep = node_reps[en_id].unsqueeze(0).repeat(len(cand_ids),1)
        cand_reps = node_reps[torch.tensor(cand_ids, device=device)]
        scores = predictor(en_rep, cand_reps).sigmoid().cpu().detach().numpy()
        # pick top_k
        top_idxs = np.argsort(scores)[-top_k:][::-1]
        for idx in top_idxs:
            score = float(scores[idx])
            if score >= score_threshold:
                u,v = sorted((en_id, cand_ids[idx]))
                predicted_edges.add((u,v,score))

print("Predicted new edges (count):", len(predicted_edges))
# show a few with token text
for u,v,score in list(predicted_edges)[:20]:
    print(f"{u}({G.nodes[u]['token']},{G.nodes[u]['lang']}) <-> {v}({G.nodes[v]['token']},{G.nodes[v]['lang']})  score={score:.3f}")

Predicted new edges (count): 1536
1106(cat,en) <-> 1121(γάτα,el)  score=0.970
1553(dog,en) <-> 1558(Hund,de)  score=1.000
2460(cat,en) <-> 2469(γάτα,el)  score=1.000
1827(The,en) <-> 1834(El,es)  score=0.999
1437(the,en) <-> 1449(Η,el)  score=0.982
1611(blue,en) <-> 1613(ist,de)  score=1.000
1014(see,en) <-> 1017(sehen,de)  score=0.977
1914(love,en) <-> 1917(Ich,de)  score=0.989
355(The,en) <-> 362(El,es)  score=0.995
1313(the,en) <-> 1316(Frau,de)  score=1.000
662(is,en) <-> 670(azul,es)  score=0.999
2443(see,en) <-> 2456(το,el)  score=0.998
2965(big,en) <-> 2971(grande,es)  score=0.999
723(dog,en) <-> 731(es,es)  score=0.999
3067(the,en) <-> 3071(die,de)  score=1.000
2471(small,en) <-> 2481(Η,el)  score=1.000
2715(the,en) <-> 2728(γυναίκα,el)  score=1.000
1332(and,en) <-> 1345(Η,el)  score=0.999
1667(A,en) <-> 1674(grande,es)  score=0.994
2979(love,en) <-> 2984(lieben,de)  score=1.000


Evaluate recall/precision on withheld gold alignments

In [None]:
gold = set(initial_alignment_edges)
pred_set = set(tuple(sorted((u,v))) for u,v,_ in predicted_edges)

tp = len(gold & pred_set)
precision = tp / len(pred_set) if pred_set else 0.0
recall = tp / len(gold) if gold else 0.0
print(f"Precision: {precision:.3f}, Recall: {recall:.3f}, TP: {tp}, Pred: {len(pred_set)}, Gold: {len(gold)}")
# trash!!

Precision: 0.256, Recall: 0.224, TP: 393, Pred: 1536, Gold: 1757
