## Calculating alignment for Arabic and English

In [1]:
%load_ext autoreload
%autoreload 2
import torch
from offenseval.nn import (
    Tokenizer,
    train, evaluate, train_cycle, save_model, load_model
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# load_model returns the fine-tuned model together with a second object bound
# to TEXT (presumably the text field used at training time — TODO confirm).
model, TEXT = load_model("../../models/bert.en.sample.mean06.ft", device)

# Keep a direct handle on the model's `bert` attribute; it is what produces
# the embeddings compared further down.
bert_model = model.bert


Load the parallel data. Download [this dataset](https://www.kaggle.com/samirmoustafa/arabic-to-english-translation-sentences/data) and put it in `data/nmt` (the code below reads `data/nmt/ara_eng.txt`).

In [2]:
import pandas as pd
from torchtext import data
from transformers import BertTokenizer
from offenseval.nn import Tokenizer

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')

# Special tokens of the multilingual BERT vocabulary, as strings...
init_token = bert_tokenizer.cls_token
eos_token = bert_tokenizer.sep_token
pad_token = bert_tokenizer.pad_token
unk_token = bert_tokenizer.unk_token

# ...and as vocabulary ids. The ids are what the fields below use, because the
# fields are built with use_vocab=False and therefore work on integers directly.
init_token_idx = bert_tokenizer.cls_token_id
eos_token_idx = bert_tokenizer.sep_token_id
pad_token_idx = bert_tokenizer.pad_token_id
unk_token_idx = bert_tokenizer.unk_token_id

# Project wrapper around the Hugging Face tokenizer; created once and shared
# by both language fields.
tokenizer = Tokenizer(bert_tokenizer)


def make_bert_field():
    """Build a torchtext field that maps raw text to BERT token ids.

    Both sides of the parallel corpus go through the same multilingual
    tokenizer, so the English and Arabic fields are configured identically.

    Returns:
        data.Field: a batch-first field without its own vocabulary that
        tokenizes with ``tokenizer.tokenize``, converts tokens to ids with
        ``tokenizer.convert_tokens_to_ids``, wraps each sequence in the
        CLS/SEP ids, pads with the PAD id, and (``include_lengths=True``)
        yields ``(ids, lengths)`` pairs when batched.
    """
    return data.Field(
        tokenize=tokenizer.tokenize,
        include_lengths=True,
        use_vocab=False,
        batch_first=True,
        preprocessing=tokenizer.convert_tokens_to_ids,
        init_token=init_token_idx,
        eos_token=eos_token_idx,
        pad_token=pad_token_idx,
        unk_token=unk_token_idx,
    )


ENGLISH = make_bert_field()
ARABIC = make_bert_field()

# Tab-separated parallel corpus: first column English, second column Arabic.
# The first line of the file is skipped as a header.
train_dataset = data.TabularDataset(
    "../../data/nmt/ara_eng.txt",
    format="tsv", skip_header=True,
    fields=[("english", ENGLISH), ("arabic", ARABIC)],
)



In [3]:
idx = 101
# Sanity check: decode one sentence pair back into word pieces,
# English first and Arabic second.
tuple(
    bert_tokenizer.convert_ids_to_tokens(ids)
    for ids in (train_dataset[idx].english, train_dataset[idx].arabic)
)

(['tom', 'quit', '.'], ['تو', '##م', 'است', '##قال', '.'])

Now create an iterator over the sentence pairs and accumulate the differences between the English and Arabic pooled embeddings.

In [4]:
BATCH_SIZE = 16

# `device` is the torch.device chosen when the model was loaded. It is reused
# here instead of being rebound to a plain string, so batches land on the same
# device as the model and the name keeps a single type throughout the notebook.
# Batches are bucketed and sorted by the length of the Arabic side.
train_it = data.BucketIterator(
    train_dataset, batch_size=BATCH_SIZE, device=device,
    sort_key=lambda x: len(x.arabic), sort_within_batch=True,
)

In [5]:
from tqdm.notebook import tqdm
# Running sum of (English - Arabic) pooled embeddings, kept on the CPU.
# The width 768 is hard-coded and must match the size of the pooled output
# returned by bert_model — TODO confirm against the model's hidden size.
ar_to_en = torch.zeros(768)

model.eval()

# This is inference only. Disabling autograd stops a computation graph from
# being built for every forward pass, which would otherwise waste memory.
with torch.no_grad():
    for batch in tqdm(train_it):
        # The fields were created with include_lengths=True, so each batch
        # attribute is an (ids, lengths) pair; only the ids are needed here.
        ar_text, _ = batch.arabic
        en_text, _ = batch.english
        # NOTE(review): no attention_mask is passed. If bert_model is a
        # Hugging Face BertModel, padded positions are attended to like real
        # tokens — confirm this matches how the classifier calls the encoder.
        _, pooled_ar = bert_model(ar_text)
        _, pooled_en = bert_model(en_text)

        # Sum the per-pair differences over the batch dimension and move the
        # result to the CPU before accumulating. No .detach() is needed
        # because nothing is tracked under no_grad.
        ar_to_en += (pooled_en - pooled_ar).sum(dim=0).cpu()

HBox(children=(FloatProgress(value=0.0, max=1540.0), HTML(value='')))




We divide by the number of sentence pairs in the dataset to obtain the mean difference vector.

In [6]:
# Turn the accumulated sum into a mean over all sentence pairs.
# NOTE: ar_to_en is rebound from its own previous value, so running this cell
# a second time divides again; re-run the accumulation cell first.
ar_to_en = ar_to_en.div(len(train_dataset))

In [7]:
# The English-to-Arabic shift is the same vector pointing the other way.
en_to_ar = ar_to_en.neg()