<a href="https://colab.research.google.com/github/graehl/awesome-align/blob/master/awesome_align_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# AWESOME: Aligning Word Embedding Spaces of Multilingual Encoders

[``awesome-align``](https://github.com/neulab/awesome-align) is a tool that can extract word alignments from multilingual BERT (mBERT) and allows you to fine-tune mBERT on parallel corpora for better alignment quality (see [our paper](https://arxiv.org/abs/2101.08231) for more details).

This is a simple demo of how `awesome-align` extracts word alignments from mBERT.

First, install and import the following packages. (Note that the original `awesome-align` tool does not require the `transformers` package.)

In [18]:
!pwd
!git clone https://github.com/graehl/awesome-align.git || (cd awesome-align && git pull)
!pip install -r awesome-align/requirements.txt
import sys
sys.path.append('/content/awesome-align')
sys.path.append('/content')

!pip install transformers
!pip install onnx
!pip install skl2onnx
import torch
import itertools
import onnx
from skl2onnx.helpers import onnx_helper


/content
fatal: destination path 'awesome-align' already exists and is not an empty directory.
remote: Enumerating objects: 7, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 4 (delta 3), reused 0 (delta 0), pack-reused 0 (from 0)[K
Unpacking objects: 100% (4/4), 1008 bytes | 504.00 KiB/s, done.
From https://github.com/graehl/awesome-align
   3a7d7e0..68c1ff2  master     -> origin/master
Updating 3a7d7e0..68c1ff2
Fast-forward
 awesome_align/modeling.py | 2 [32m+[m[31m-[m
 1 file changed, 1 insertion(+), 1 deletion(-)


In [19]:
# ANSI escape sequences used to colorize printed output
class color:
    """Namespace of ANSI terminal escape codes (text colors and styles)."""

    # reset all attributes
    END = '\033[0m'

    # text styles
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # foreground colors
    DARKCYAN = '\033[36m'
    RED = '\033[91m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    BLUE = '\033[94m'
    PURPLE = '\033[95m'
    CYAN = '\033[96m'


Load the multilingual BERT model and its tokenizer.

In [25]:
# Name of the pretrained multilingual BERT checkpoint used throughout this demo
model_name_or_path = 'bert-base-multilingual-cased'

import transformers

from awesome_align import modeling
from awesome_align.configuration_bert import BertConfig
from awesome_align.modeling import BertForMaskedLM
from awesome_align.tokenization_bert import BertTokenizer
from awesome_align.tokenization_utils import PreTrainedTokenizer
from awesome_align.modeling_utils import PreTrainedModel

def init_model_and_tokenizer(
    model_name_or_path,
    config_name = None,
    cache_dir = None,
    tokenizer_name = None,
):
  """Build awesome-align's BERT masked-LM model together with its tokenizer.

  Args:
    model_name_or_path: Name or path passed to ``from_pretrained`` for the
      model; also the fallback source for the config and the tokenizer.
      If falsy, the model is constructed from the config alone.
    config_name: Optional config name/path; takes precedence over
      ``model_name_or_path`` when given.
    cache_dir: Forwarded as ``cache_dir`` to every ``from_pretrained`` call.
    tokenizer_name: Optional tokenizer name/path; takes precedence over
      ``model_name_or_path`` when given.

  Returns:
    A ``(model, tokenizer)`` tuple.

  Raises:
    ValueError: If neither ``tokenizer_name`` nor ``model_name_or_path`` is
      given, since a tokenizer cannot be created from scratch here.

  Side effects:
    Sets ``modeling.PAD_ID``, ``modeling.CLS_ID`` and ``modeling.SEP_ID`` from
    the tokenizer's pad/cls/sep token ids.
  """
  config_class, model_class, tokenizer_class = BertConfig, BertForMaskedLM, BertTokenizer

  # An explicit config name wins; otherwise fall back to the model name.
  config_source = config_name or model_name_or_path
  if config_source:
    config = config_class.from_pretrained(config_source, cache_dir=cache_dir)
  else:
    config = config_class()

  # Same precedence for the tokenizer, but there is no from-scratch fallback.
  tokenizer_source = tokenizer_name or model_name_or_path
  if not tokenizer_source:
    raise ValueError(
        "You are instantiating a new {} tokenizer. This is not supported, but you can do it from another script, save it, "
        "and load it from here, using --tokenizer_name".format(tokenizer_class.__name__)
    )
  tokenizer = tokenizer_class.from_pretrained(tokenizer_source, cache_dir=cache_dir)

  # The modeling module reads these special-token ids from module-level globals.
  modeling.PAD_ID = tokenizer.pad_token_id
  modeling.CLS_ID = tokenizer.cls_token_id
  modeling.SEP_ID = tokenizer.sep_token_id

  if model_name_or_path:
    model = model_class.from_pretrained(
        model_name_or_path,
        # Paths containing ".ckpt" are treated as TensorFlow checkpoints.
        from_tf=".ckpt" in model_name_or_path,
        config=config,
        cache_dir=cache_dir,
    )
  else:
    model = model_class(config=config)

  return model, tokenizer

# Alternative: load awesome-align's own model/tokenizer classes instead.
#model, tokenizer = init_model_and_tokenizer(model_name_or_path)
model, tokenizer = (
    transformers.AutoModel.from_pretrained(model_name_or_path),
    transformers.AutoTokenizer.from_pretrained(model_name_or_path),
)


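Input *tokenized* (whitespace-separated) source and target sentences, write them to a file in `source ||| target` format, and run the command-line aligner on that file.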

In [32]:
src = 'I bought a new car because I was going through a midlife crisis .'
tgt = 'Я купил новую тачку , потому что я переживал кризис среднего возраста .'
tgt = 'Compré un auto nuevo porque estaba pasando por una crisis de la mediana edad .'
srctgt = f'{src} ||| {tgt}'
fpar = 'srctgt.txt'
with open(fpar, 'w') as f:
  f.write(srctgt)
!CUDA_VISIBLE_DEVICES=0 PYTHONPATH=/content/awesome-align python /content/awesome-align/run_align.py --output_file=align.txt --model_name_or_path="$model_name_or_path" --data_file="$fpar" --extraction 'softmax' --batch_size=32


2025-04-03 22:07:31.980443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1743718052.014437    7959 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1743718052.025475    7959 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Loading the dataset...
Extracting: 1it [00:00,  1.16it/s]


In [33]:
# Print the file that run_align.py was told to write via --output_file.
# NOTE(review): each `i-j` item presumably links 0-based source word i to
# target word j — confirm against the awesome-align README.
!cat align.txt

13-14 2-1 10-8 0-0 5-4 4-2 12-9 11-12 3-3 8-6 1-0 7-5 9-7


Pre-process the sentences, then run the model and print the resulting alignments.

In [22]:
import pdb
# pre-processing: split into words, then sub-word tokens, then vocabulary ids
sent_src, sent_tgt = src.strip().split(), tgt.strip().split()
token_src = [tokenizer.tokenize(word) for word in sent_src]
token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

def tokenizer_max_len(tokenizer):
  """Return ``max_len_single_sentence`` if the tokenizer has it, else ``model_max_length``."""
  if hasattr(tokenizer, 'max_len_single_sentence'):
    return tokenizer.max_len_single_sentence
  return tokenizer.model_max_length

# Truncation options for prepare_for_model; only set when the tokenizer
# exposes `model_max_length`. The keyword prepare_for_model accepts for the
# limit is `max_length`.
maxlenkw = {}
if hasattr(tokenizer, 'model_max_length'):
  maxlenkw['max_length'] = tokenizer.model_max_length
  maxlenkw['truncation'] = True

def ids_for_model(ids, model, tokenizer):
  """Flatten per-word id lists and prepare them as model input.

  Args:
    ids: List of per-word lists of sub-word ids.
    model: Unused; kept so existing calls keep working.
    tokenizer: Tokenizer whose ``prepare_for_model`` builds the input.

  Returns:
    The ``'input_ids'`` entry of the prepared encoding, as a PyTorch tensor
    (``return_tensors='pt'``).
  """
  # Encode the sentence that was passed in (not a fixed global), so source and
  # target each get their own ids.
  flat_ids = list(itertools.chain.from_iterable(ids))
  return tokenizer.prepare_for_model(flat_ids, return_tensors='pt', **maxlenkw)['input_ids']

ids_src, ids_tgt = ids_for_model(wid_src, model, tokenizer), ids_for_model(wid_tgt, model, tokenizer)

# For every sub-word position, record the index of the word it belongs to.
sub2word_map_src = [word_idx for word_idx, pieces in enumerate(token_src) for _ in pieces]
sub2word_map_tgt = [word_idx for word_idx, pieces in enumerate(token_tgt) for _ in pieces]


In [23]:


# Put the model in evaluation mode before exporting it / running inference.
model.eval() # only switches the module's mode; no forward pass happens here, so torch.no_grad() is not needed

def to_onnx(model, onnx_file_path, inputs=['input_ids', 'attention_mask'], outputs=['output'], dynamic=True, batch=True, opset_version=14, return_tensor_names=True, batch_size=1, sequence_length=128):
  """Export ``model`` to an ONNX file using dummy inputs.

  Args:
    model: Module to export; ``model.config.vocab_size`` bounds the random
      dummy token ids.
    onnx_file_path: Destination path of the ONNX file.
    inputs: Input names, in the positional order the model expects.
      ``'input_ids'`` gets random token ids; every other input gets ones.
    outputs: Names given to the exported outputs.
    dynamic: If true, declare the batch/sequence axes of every input and
      output as dynamic.
    batch: If true, dummy inputs have shape ``(batch_size, sequence_length)``;
      otherwise ``(sequence_length,)``.
    opset_version: ONNX opset version passed to ``torch.onnx.export``.
    return_tensor_names: If true, reload the exported file and return the
      names yielded by ``onnx_helper.enumerate_model_node_outputs``.
    batch_size: Batch dimension of the dummy inputs (used only when ``batch``).
    sequence_length: Sequence dimension of the dummy inputs.

  Returns:
    A list of node output names when ``return_tensor_names`` is true,
    otherwise a status message string.
  """
  axis_names = {0: 'batch_size', 1: 'sequence_length'} if batch else {0: 'sequence_length'}
  dynamic_axes = {}
  if dynamic:
    for name in (*inputs, *outputs):
      dynamic_axes[name] = dict(axis_names)

  # Create dummy input data
  dims = (batch_size, sequence_length) if batch else (sequence_length,)
  # All dummy inputs are integer tensors: masks / segment ids produced by
  # tokenizers are integers, and index-type inputs cannot be float.
  dummy_inputs = tuple(
      torch.randint(0, model.config.vocab_size, dims) if name == 'input_ids'
      else torch.ones(dims, dtype=torch.long)
      for name in inputs
  )

  # Export the model to ONNX
  torch.onnx.export(
      model,
      dummy_inputs,
      onnx_file_path,
      export_params=True,
      opset_version=opset_version,
      do_constant_folding=True,
      input_names=inputs,
      output_names=outputs,
      dynamic_axes=dynamic_axes,
  )

  if return_tensor_names:
    exported = onnx_helper.load_onnx_model(onnx_file_path)
    return list(onnx_helper.enumerate_model_node_outputs(exported))
  return f"Model exported to {onnx_file_path}"

# Export the model and print every name returned for the resulting ONNX file.
for tensor_name in to_onnx(model, "model.onnx"):
  print(str(tensor_name))


ValueError: must specify labels for the self-trianing objective

In [None]:

# alignment

def sent_without_startend(batch, sent=0):
  """Return row ``sent`` of ``batch`` with its first and last positions dropped."""
  inner_positions = slice(1, -1)
  return batch[sent, inner_positions]
def alignvec(batch, align_layer=8, sent=0):
  """Select entry ``align_layer`` of ``batch`` and strip the first/last position of sentence ``sent``."""
  layer_states = batch[align_layer]
  return sent_without_startend(layer_states, sent=sent)
def hidden(model, ids):
  """Run ``model`` on ``ids`` as a batch of one (with ``output_hidden_states=True``) and return element 2 of its output."""
  batched_ids = ids.unsqueeze(0)
  model_output = model(batched_ids, output_hidden_states=True)
  return model_output[2]

# Compare the extracted alignments across several layers and thresholds.
# The forward passes depend on neither the layer nor the threshold, so run
# them once up front instead of once per loop iteration.
with torch.no_grad():
  hidden_src = hidden(model, ids_src)
  hidden_tgt = hidden(model, ids_tgt)

for align_layer in range(7,12):
  # The similarity matrix and both softmaxes depend only on the layer.
  with torch.no_grad():
    out_src = alignvec(hidden_src, align_layer)
    out_tgt = alignvec(hidden_tgt, align_layer)

    dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

    softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
    softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

  last_align = None
  threshold = 1e-1
  for _ in range(6):
    # Thresholds tried per layer: 1e-3, 1e-5, ..., 1e-11.
    threshold = threshold * 1e-2

    # Keep a sub-word pair only if it passes the threshold in both directions.
    softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)

    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    # Collapse sub-word links to word links; the set removes duplicates.
    align_words = sorted(
        {(sub2word_map_src[i], sub2word_map_tgt[j]) for i, j in align_subwords.tolist()}
    )
    # Only print when lowering the threshold actually changed the alignment.
    if align_words != last_align:
      print(f" (layer {align_layer} > {threshold:.3g}) {len(align_words)} links for '{src}' to '{tgt}'")
      for i, j in align_words:
        print(f'{color.BOLD}{color.BLUE}{sent_src[i]}{color.END}==={color.BOLD}{color.RED}{sent_tgt[j]}{color.END}')
    last_align = align_words