In [None]:
from datasets import Dataset, DatasetDict
from huggingface_hub import hf_hub_download

def load_transliteration_split(repo_id, folder_path, split_name):
    """Load one split of the parallel name corpus as a `Dataset`.

    Two line-aligned text files are fetched from a Hugging Face *dataset*
    repository: `<folder_path>/<split_name>.ar` and
    `<folder_path>/<split_name>.en`. Line *i* of one file is paired with
    line *i* of the other.

    Args:
        repo_id: Hub repository id of the dataset.
        folder_path: Folder inside the repository that holds the split files.
        split_name: File stem of the split (e.g. "train", "dev", "test").

    Returns:
        A `Dataset` with two string columns, 'arabic' and 'english'.

    Raises:
        ValueError: If the two files do not have the same number of lines.
    """
    # resolve both files first (download or cache hit), then read them
    local_paths = {
        extension: hf_hub_download(
            repo_id=repo_id,
            filename=f"{folder_path}/{split_name}.{extension}",
            repo_type="dataset"
        )
        for extension in ("ar", "en")
    }

    lines_by_extension = {}
    for extension, local_path in local_paths.items():
        with open(local_path, 'r', encoding='utf-8') as handle:
            lines_by_extension[extension] = handle.read().splitlines()

    arabic_names = lines_by_extension["ar"]
    english_names = lines_by_extension["en"]

    # the files are only useful if they are line-aligned
    if len(arabic_names) != len(english_names):
        raise ValueError(f"Mismatch in line counts for {split_name} split: {len(arabic_names)} Arabic, {len(english_names)} English")

    # one record per aligned line pair
    records = []
    for ar_name, en_name in zip(arabic_names, english_names):
        records.append({'arabic': ar_name, 'english': en_name})

    return Dataset.from_list(records)

# Hub repository and the folder inside it that holds the split files
repo_id = "moha/Arabic-English-Transliteration-Dataset"
folder_path = "EN-AR transliteration dataset"

# fetch the three splits; the file stems in the repo are train / dev / test
train_dataset, dev_dataset, test_dataset = (
    load_transliteration_split(repo_id, folder_path, file_stem)
    for file_stem in ("train", "dev", "test")
)

# expose them under the conventional split names ('dev' becomes 'validation')
splits_by_name = {
    "train": train_dataset,
    "validation": dev_dataset,
    "test": test_dataset,
}
raw_datasets = DatasetDict(splits_by_name)

# show the overall structure and the first training record
print(raw_datasets)
print(raw_datasets["train"][0])

In [None]:
# Download the pre-trained model weights (a release asset of secryst/rababa-models).
# Create the directory that will hold the checkpoint (-p: no error if it exists).
!mkdir -p log_dir/CA_MSA.base.cbhg/models

# download the PyTorch model checkpoint (~400 MB)
# NOTE(review): curl is run with -sSL but without --fail, so an HTTP error page
# would be written to the .pt file instead of aborting — confirm the size below.
!curl -sSL https://github.com/secryst/rababa-models/releases/download/0.1/2000000-snapshot.pt -o log_dir/CA_MSA.base.cbhg/models/2000000-snapshot.pt

# verify the download size and file placement
!ls -lh log_dir/CA_MSA.base.cbhg/models/2000000-snapshot.pt

# NOTE(review): this message is printed unconditionally; it does not check the
# exit status of the commands above.
print("Rababa model weights downloaded and placed successfully. Proceed to Block 3.")

In [None]:
import os
from datasets import DatasetDict

# temporary filenames
TEMP_INPUT_FILE = "arabic_input.txt"
TEMP_OUTPUT_FILE = "arabic_diacritized.txt"
# path to the config file
CONFIG_FILE = "arabic/config/cbhg.yml"

# create a copy of the dataset to store results
# so we don't overwrite the original immediately
processed_datasets = raw_datasets.copy()

# splits to process
splits = ["train", "validation", "test"]

for split in splits:
    print(f"--- Processing Split: {split} ---")

    # get the raw Arabic list from the current split
    arabic_names_list = raw_datasets[split]["arabic"]

    # write the raw Arabic names to a temporary text file
    # Rababa's script needs a file on disk to read from
    print(f"Writing {len(arabic_names_list)} lines to temp file...")
    with open(TEMP_INPUT_FILE, 'w', encoding='utf-8') as f:
        f.write('\n'.join(arabic_names_list))

    # run Rababa via command line
    # using the '!' shell command to run the python script 'diacritize.py'
    print("Running Rababa diacritization... (This may take a while)")
    !python diacritize.py \
        --model_kind "cbhg" \
        --config {CONFIG_FILE} \
        --text_file {TEMP_INPUT_FILE} \
        > {TEMP_OUTPUT_FILE}

    # read the results back into Python
    print("Reading diacritized output...")
    with open(TEMP_OUTPUT_FILE, 'r', encoding='utf-8') as f:
        diacritized_names_list = f.read().splitlines()

    # validation: input and output lengths match
    if len(diacritized_names_list) != len(arabic_names_list):
        print(f"Error in split '{split}': Length mismatch! (Input: {len(arabic_names_list)}, Output: {len(diacritized_names_list)})")
        # TODO: add logic here to handle mismatches (e.g., stop or pad)
        break

    # add new column to the dataset
    # If the column already exists (from a previous run), remove it first
    if "arabic_diacritized" in processed_datasets[split].column_names:
        processed_datasets[split] = processed_datasets[split].remove_columns("arabic_diacritized")

    processed_datasets[split] = processed_datasets[split].add_column(
        "arabic_diacritized",
        diacritized_names_list
    )

    # cleanup temporary files
    if os.path.exists(TEMP_INPUT_FILE): os.remove(TEMP_INPUT_FILE)
    if os.path.exists(TEMP_OUTPUT_FILE): os.remove(TEMP_OUTPUT_FILE)
    print(f"Split '{split}' completed successfully.\n")

print("All splits processed.")
print(processed_datasets)

# verify a sample from each split
print("\n--- Verification Samples ---")
for split in splits:
    print(f"{split}: {processed_datasets[split]['arabic'][0]} -> {processed_datasets[split]['arabic_diacritized'][0]}")

In [None]:
# Romanised Phonetic Transcription
# convert the arabic_diacritized column into a standardised
# Latin-character format (Buckwalter transliteration)

from arabic_buckwalter_transliteration.transliteration import arabic_to_buckwalter
import pyarabic.araby as araby

# define the transcription function
def transcribe_batch(batch):
    """
    Add a 'romanized_input' column holding the Buckwalter (ASCII)
    transliteration of every entry in batch['arabic_diacritized'].
    Example: 'بُولْك' -> 'buwlok'

    Missing or empty entries, and entries whose conversion raises, become "".
    The batch is updated in place and also returned.
    """
    def _romanize(text):
        # nothing to convert for None / empty strings
        if not text:
            return ""

        # tatweel is a purely typographic stretching character: drop it first
        without_tatweel = araby.strip_tatweel(text)
        try:
            return arabic_to_buckwalter(without_tatweel)
        except Exception as e:
            # best effort: report the problem and keep the row aligned
            print(f"Error during conversion: {e}")
            return ""

    batch['romanized_input'] = [_romanize(text) for text in batch['arabic_diacritized']]
    return batch

# apply to all splits
print("Applying Romanized transcription...")

# Snapshot the split names up front: the loop below reassigns entries of
# processed_datasets, so it iterates this list rather than a live dict view.
keys_to_process = list(processed_datasets.keys())

for split_name in keys_to_process:
    print(f"Processing {split_name} split...")
    # batched=True hands transcribe_batch a dict of column lists
    processed_datasets[split_name] = processed_datasets[split_name].map(
        transcribe_batch,
        batched=True
    )

# Verification: show one training example in all three forms
print("\n--- Transcription Sample ---")
sample = processed_datasets['train'][0]
print(f"Arabic (Diacritized): {sample['arabic_diacritized']}")
print(f"Romanized Input:      {sample['romanized_input']}")
print(f"Target English:       {sample['english']}")

In [None]:
# Vocabulary Generation
# Define Special Tokens
# these are critical for the Seq2Seq model structure:
# ids 0-3 are reserved here, and character ids are numbered after them
# (see start_idx = len(SPECIAL_TOKENS) further down in this cell).
SPECIAL_TOKENS = {'<PAD>': 0, '<UNK>': 1, '<SOS>': 2, '<EOS>': 3}

def build_vocab(dataset_dict, column_name):
    """
    Collect every distinct character found in `column_name` across all splits
    of `dataset_dict` and return them as a sorted list.

    Empty / None entries are skipped.
    """
    seen = {
        char
        for split in dataset_dict.keys()
        for text in dataset_dict[split][column_name]
        if text
        for char in text
    }
    return sorted(seen)

# Build Source (Input) and Target (Output) Vocabularies
print("Building vocabularies from all splits...")

# every split is scanned so that no character is missing from the vocabulary
src_chars = build_vocab(processed_datasets, 'romanized_input')
tgt_chars = build_vocab(processed_datasets, 'english')

# the space character must be present in both vocabularies;
# re-sort after inserting it so the id assignment stays ordered
for char_list in (src_chars, tgt_chars):
    if ' ' not in char_list:
        char_list.append(' ')
        char_list.sort()

# character ids are numbered right after the reserved special-token ids
start_idx = len(SPECIAL_TOKENS)

# Source Mappings (Arabic/Romanized): characters first, special tokens appended
src_char2id = {char: idx for idx, char in enumerate(src_chars, start=start_idx)}
src_char2id.update(SPECIAL_TOKENS)
src_id2char = {token_id: token for token, token_id in src_char2id.items()}

# Target Mappings (English): same layout as the source side
tgt_char2id = {char: idx for idx, char in enumerate(tgt_chars, start=start_idx)}
tgt_char2id.update(SPECIAL_TOKENS)
tgt_id2char = {token_id: token for token, token_id in tgt_char2id.items()}

print(f"Source Vocab Size: {len(src_char2id)}")
print(f"Target Vocab Size: {len(tgt_char2id)}")

# quick look at the first few entries of each mapping
print(f"Source Chars (Sample): {list(src_char2id.keys())[:10]}")
print(f"Target Chars (Sample): {list(tgt_char2id.keys())[:10]}")


In [None]:
import json
import os
from datasets import DatasetDict

# setup Artifacts Directory
os.makedirs("artifacts", exist_ok=True)

# save Vocabularies (CRITICAL STEP): inference needs the exact same char -> id maps
print("Saving vocabularies to 'artifacts/'...")
with open("artifacts/src_char2id.json", "w", encoding="utf-8") as f:
    json.dump(src_char2id, f, ensure_ascii=False, indent=4)

with open("artifacts/tgt_char2id.json", "w", encoding="utf-8") as f:
    json.dump(tgt_char2id, f, ensure_ascii=False, indent=4)

# DatasetDict format
# DatasetDict is itself a dict subclass, so testing `isinstance(..., dict)`
# is always true; only wrap when the object is not already a DatasetDict.
if not isinstance(processed_datasets, DatasetDict):
    print("Converting dictionary back to DatasetDict...")
    processed_datasets = DatasetDict(processed_datasets)

# fixed length (in tokens, special tokens included) of every encoded sequence
MAX_LENGTH = 32

def tokenize_and_pad(batch):
    """Encode a batch to fixed-length id sequences of MAX_LENGTH tokens.

    Adds two columns to `batch` and returns it:
      * 'input_ids': chars of 'romanized_input' + <EOS>, right-padded with <PAD>.
      * 'labels':    <SOS> + chars of 'english' + <EOS>, right-padded with <PAD>.
    Characters missing from a vocabulary map to <UNK>; over-long texts are
    truncated so that the special tokens always fit.
    """
    def _encode(text, vocab, budget):
        # map characters to ids, keeping at most `budget` of them
        return [vocab.get(ch, vocab['<UNK>']) for ch in list(text)][:budget]

    def _source_row(text):
        body = _encode(text, src_char2id, MAX_LENGTH - 1)
        filler = [src_char2id['<PAD>']] * (MAX_LENGTH - len(body) - 1)
        return body + [src_char2id['<EOS>']] + filler

    def _target_row(text):
        body = _encode(text, tgt_char2id, MAX_LENGTH - 2)
        filler = [tgt_char2id['<PAD>']] * (MAX_LENGTH - len(body) - 2)
        return [tgt_char2id['<SOS>']] + body + [tgt_char2id['<EOS>']] + filler

    # Input (Source) Processing
    input_ids_batch = [_source_row(text) for text in batch['romanized_input']]

    # Output (Target) Processing
    labels_batch = [_target_row(text) for text in batch['english']]

    batch['input_ids'] = input_ids_batch
    batch['labels'] = labels_batch
    return batch

# Apply & Save Processed Data
print("Tokenizing and Padding datasets...")
tokenized_datasets = processed_datasets.map(
    tokenize_and_pad,
    batched=True,
)
# only the two id columns are exposed, and as torch tensors
tokenized_datasets.set_format(
    type='torch',
    columns=['input_ids', 'labels'],
)

# persist the tokenized splits next to the vocabularies
tokenized_datasets.save_to_disk("artifacts/tokenized_data")

print("Data prep complete. Artifacts saved.")
print(f"Source Vocab Size: {len(src_char2id)} | Target Vocab Size: {len(tgt_char2id)}")

In [None]:
from torch.utils.data import DataLoader

BATCH_SIZE = 64 # Standard for Colab GPUs

# Create DataLoaders: only the training split is shuffled
train_loader = DataLoader(
    tokenized_datasets['train'],
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_loader, test_loader = (
    DataLoader(tokenized_datasets[split_key], batch_size=BATCH_SIZE)
    for split_key in ('validation', 'test')
)

print(f"Train Batches: {len(train_loader)}")
print(f"Val Batches:   {len(val_loader)}")

In [None]:
# define Transformer Architecture
import torch
import torch.nn as nn
import math

class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position information to sequence-first embeddings.

  The table is computed once and stored as the non-trainable buffer `pos_enc`
  of shape [max_len, 1, d_model]; even feature columns hold sines and odd
  columns hold cosines. Dropout is applied to the sum.

  Args:
    d_model: Embedding size. Both even and odd sizes are supported.
    max_len: Number of positions in the pre-computed table.
    dropout: Dropout probability applied in `forward`.
  """

  def __init__(self, d_model, max_len=5000, dropout=0.1):
    super(PositionalEncoding, self).__init__()
    self.dropout = nn.Dropout(p=dropout)

    # compute position encodings once in log space
    pos_enc = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
    angles = position * div_term  # [max_len, ceil(d_model / 2)]

    pos_enc[:, 0::2] = torch.sin(angles)
    # With an odd d_model there is one cosine column fewer than sine columns,
    # so trim the angles to the d_model // 2 columns that actually exist.
    pos_enc[:, 1::2] = torch.cos(angles[:, : d_model // 2])

    # [max_len, d_model] -> [max_len, 1, d_model] so it broadcasts over the batch
    pos_enc = pos_enc.unsqueeze(1)

    self.register_buffer('pos_enc', pos_enc)

  def forward(self, x):
    """Return dropout(x + encoding) for x of shape [seq_len, batch_size, embedding_dim]."""
    x = x + self.pos_enc[:x.size(0), :]
    return self.dropout(x)

class Transformer(nn.Module):
  """Character-level sequence-to-sequence model built on `nn.Transformer`.

  Token ids arrive batch-first ([batch_size, seq_len]); they are transposed to
  the sequence-first layout used by the wrapped `nn.Transformer`, embedded,
  scaled by sqrt(d_model) and given positional encodings. `forward` returns
  batch-first logits over the target vocabulary.

  Note: the positional encoder is created with its own default dropout; the
  `dropout` argument is only forwarded to `nn.Transformer`.
  """

  def __init__(
      self, src_vocab_size, tgt_vocab_size, d_model, nhead,
      num_encoder_layers, num_decoder_layers, dim_feedforward, dropout=0.1
  ):
    super().__init__()

    # separate lookup tables for source and target characters
    self.src_embedding = nn.Embedding(src_vocab_size, d_model)
    self.tgt_embedding = nn.Embedding(tgt_vocab_size, d_model)
    # one positional encoder shared by both sides
    self.pos_encoder = PositionalEncoding(d_model)
    # encoder + decoder stacks
    self.transformer = nn.Transformer(
        d_model=d_model,
        nhead=nhead,
        num_encoder_layers=num_encoder_layers,
        num_decoder_layers=num_decoder_layers,
        dim_feedforward=dim_feedforward,
        dropout=dropout,
    )
    # projection from model space back to target-vocabulary logits
    self.generator = nn.Linear(d_model, tgt_vocab_size)
    self.d_model = d_model

  def _embed(self, token_ids, embedding):
    """[batch, seq] ids -> [seq, batch, d_model] scaled, position-encoded embeddings."""
    seq_first_ids = token_ids.transpose(0, 1)
    scaled = embedding(seq_first_ids) * math.sqrt(self.d_model)
    return self.pos_encoder(scaled)

  def forward(
      self, src, tgt, src_key_padding_mask, tgt_key_padding_mask,
      memory_key_padding_mask, tgt_mask
  ):
    """Return logits of shape [batch, tgt_seq_len, tgt_vocab_size]."""
    src_emb = self._embed(src, self.src_embedding)
    tgt_emb = self._embed(tgt, self.tgt_embedding)

    hidden = self.transformer(
        src_emb,
        tgt_emb,
        tgt_mask=tgt_mask,
        src_key_padding_mask=src_key_padding_mask,
        tgt_key_padding_mask=tgt_key_padding_mask,
        memory_key_padding_mask=memory_key_padding_mask,
    )
    # [seq, batch, vocab] -> [batch, seq, vocab]
    logits = self.generator(hidden)
    return logits.transpose(0, 1)

  # helpers for inference
  def encode(self, src, src_mask):
    """Run only the encoder; the result is sequence-first."""
    return self.transformer.encoder(self._embed(src, self.src_embedding), src_mask)

  def decode(self, tgt, memory, tgt_mask):
    """Run only the decoder on `tgt` against encoder output `memory` (no projection)."""
    return self.transformer.decoder(self._embed(tgt, self.tgt_embedding), memory, tgt_mask)


In [None]:
# configuration: use the GPU when one is available
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# model hyperparameters
SRC_VOCAB_SIZE = len(src_char2id)
TGT_VOCAB_SIZE = len(tgt_char2id)
EMB_SIZE = 256            # embedding dimension (d_model)
NHEAD = 4                 # number of attention heads
FFN_HID_DIM = 512         # feed-forward hidden dimension
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3
DROPOUT = 0.1

# instantiate the model and move it to the target device
model = Transformer(
    src_vocab_size=SRC_VOCAB_SIZE,
    tgt_vocab_size=TGT_VOCAB_SIZE,
    d_model=EMB_SIZE,
    nhead=NHEAD,
    num_encoder_layers=NUM_ENCODER_LAYERS,
    num_decoder_layers=NUM_DECODER_LAYERS,
    dim_feedforward=FFN_HID_DIM,
    dropout=DROPOUT,
).to(DEVICE)

# weight initialization: Xavier-uniform for every parameter with more than one dimension
for p in model.parameters():
    if p.dim() <= 1:
        continue
    nn.init.xavier_uniform_(p)

trainable_count = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Model initialized on {DEVICE}")
print(f"Trainable Parameters: {trainable_count:,}")

In [None]:
def generate_square_subsequent_mask(sz):
    """Build the (sz, sz) look-ahead mask on DEVICE.

    Entries strictly above the diagonal are -inf (future positions are
    blocked); the diagonal and everything below it are 0.0.
    """
    all_blocked = torch.full((sz, sz), float('-inf'), device=DEVICE)
    # keep -inf only strictly above the diagonal; the rest becomes 0.0
    return torch.triu(all_blocked, diagonal=1)

def create_mask(src, tgt):
    """Build the masks for one batch of batch-first id tensors.

    Returns, in order: source padding mask, target padding mask, memory
    padding mask (the same tensor as the source padding mask) and the
    look-ahead mask for the target. All are float masks holding 0.0 for
    positions that may be attended to and -inf for blocked ones.
    """
    def _additive_padding_mask(token_ids, pad_id):
        # 0.0 where a real token sits, -inf where the position is padding
        is_pad = (token_ids == pad_id)
        allowed = torch.zeros(is_pad.shape, dtype=torch.float, device=is_pad.device)
        return allowed.masked_fill(is_pad, float('-inf'))

    tgt_mask = generate_square_subsequent_mask(tgt.shape[1])

    src_padding_mask = _additive_padding_mask(src, src_char2id['<PAD>'])
    tgt_padding_mask = _additive_padding_mask(tgt, tgt_char2id['<PAD>'])

    # cross-attention looks at the encoder output, which is padded exactly
    # like the source sequence
    memory_key_padding_mask = src_padding_mask

    return src_padding_mask, tgt_padding_mask, memory_key_padding_mask, tgt_mask

# loss function: positions whose target is <PAD> contribute nothing to the loss
PAD_TARGET_ID = tgt_char2id['<PAD>']
loss_fn = nn.CrossEntropyLoss(ignore_index=PAD_TARGET_ID)

# Adam over all model parameters
optimizer = torch.optim.Adam(
    model.parameters(),
    lr=0.0001,
    betas=(0.9, 0.98),
    eps=1e-9,
)

In [None]:
from tqdm import tqdm
import json
import torch
import wandb

# W&B Login
# NOTE(review): `userdata` is not imported in any cell visible here (on Colab
# it is provided by `google.colab`). If it is missing, the resulting NameError
# is swallowed by the handler below and reported as a login problem — confirm
# the import exists in the runtime.
try:
    wandb_key = userdata.get('WB_TOKEN')
    wandb.login(key=wandb_key)
except Exception as e:
    print(f"Warning: Could not log in to W&B. Check your 'WB_TOKEN' secret. Error: {e}")

# Define & Save Model Configuration
# Every value is read from the constant the model / loaders were actually built
# with, so the saved and logged config cannot drift from the real run when a
# hyperparameter is changed in an earlier cell.
model_config = {
    "src_vocab_size": len(src_char2id),
    "tgt_vocab_size": len(tgt_char2id),
    "emb_size": EMB_SIZE,
    "nhead": NHEAD,
    "num_encoder_layers": NUM_ENCODER_LAYERS,
    "num_decoder_layers": NUM_DECODER_LAYERS,
    "ffn_hid_dim": FFN_HID_DIM,
    "dropout": DROPOUT,
    "max_length": MAX_LENGTH,
    "batch_size": BATCH_SIZE,
    "epochs": 12,
    "device": str(DEVICE)
}

# save config locally
with open("artifacts/model_config.json", "w", encoding="utf-8") as f:
    json.dump(model_config, f, indent=4)

# initialize W&B Run
wandb.init(
    project="arabic-transliteration",
    name="char-transformer-v1",
    config=model_config
)
# log gradient histograms every 100 batches
wandb.watch(model, log="gradients", log_freq=100)

def train_epoch(model, optimizer):
    """Run one optimisation pass over `train_loader`; return the mean batch loss.

    Teacher forcing is used: the decoder is fed the target without its last
    token and is scored against the target without its first token.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(train_loader, desc="Training"):
        src = batch['input_ids'].to(DEVICE)
        tgt = batch['labels'].to(DEVICE)

        # shift by one position: decoder input vs. expected output
        decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

        # build the masks and make sure each one lives on DEVICE
        src_padding_mask, tgt_padding_mask, memory_key_padding_mask, tgt_mask = (
            mask.to(DEVICE) for mask in create_mask(src, decoder_input)
        )

        logits = model(
            src, decoder_input,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
            tgt_mask=tgt_mask
        )

        optimizer.zero_grad()
        # flatten to [batch * seq, vocab] vs. [batch * seq] for the loss
        flat_logits = logits.reshape(-1, logits.shape[-1])
        loss = loss_fn(flat_logits, expected.reshape(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    return running_loss / len(train_loader)

def evaluate(model):
    """Return the mean batch loss over `val_loader`, without gradient tracking."""
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            src = batch['input_ids'].to(DEVICE)
            tgt = batch['labels'].to(DEVICE)

            # same one-position shift as in training
            decoder_input, expected = tgt[:, :-1], tgt[:, 1:]

            src_padding_mask, tgt_padding_mask, memory_key_padding_mask, tgt_mask = (
                mask.to(DEVICE) for mask in create_mask(src, decoder_input)
            )

            logits = model(
                src, decoder_input,
                src_key_padding_mask=src_padding_mask,
                tgt_key_padding_mask=tgt_padding_mask,
                memory_key_padding_mask=memory_key_padding_mask,
                tgt_mask=tgt_mask
            )
            flat_logits = logits.reshape(-1, logits.shape[-1])
            running_loss += loss_fn(flat_logits, expected.reshape(-1)).item()

    return running_loss / len(val_loader)

# run training
NUM_EPOCHS = model_config['epochs']
best_val_loss = float('inf')

print(f"Starting training for {NUM_EPOCHS} epochs...")

for epoch in range(1, NUM_EPOCHS + 1):
    train_loss = train_epoch(model, optimizer)
    val_loss = evaluate(model)

    print(f"Epoch: {epoch} | Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")

    # log to W&B
    epoch_metrics = {"train_loss": train_loss, "val_loss": val_loss, "epoch": epoch}
    wandb.log(epoch_metrics)

    # only checkpoint when the validation loss improved
    if not (val_loss < best_val_loss):
        continue

    best_val_loss = val_loss
    save_path = "artifacts/best_transformer_model.pth"

    # local copy of the weights
    torch.save(model.state_dict(), save_path)

    # backup copy uploaded to W&B
    wandb.save(save_path)

    print(f"  -> New best model saved to {save_path}")

wandb.finish()

In [None]:
import torch

# Path of the checkpoint to load. Defaults to the file the training loop
# writes; change it to load weights from somewhere else.
MODEL_PATH = "artifacts/best_transformer_model.pth"

# load weights -> this can be changed to load from latest w&b run
# Tensors are mapped onto the GPU when one is available, otherwise onto the CPU.
map_location = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

try:
    model.load_state_dict(torch.load(MODEL_PATH, map_location=map_location))
    model.to(DEVICE)
    print(f"Successfully loaded model from: {MODEL_PATH}")
except FileNotFoundError:
    # the model keeps whatever weights it currently has
    print(f"ERROR: Could not find file at {MODEL_PATH}. Please check the path.")
except RuntimeError as e:
    # load_state_dict raises RuntimeError on missing/unexpected keys or shape mismatches
    print(f"ERROR: Architecture mismatch. Ensure the model class matches the saved weights.\nDetails: {e}")

In [None]:
import torch

def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily decode a single source sequence.

    Args:
        model: Model exposing `encode`, `decode` and `generator`.
        src: Source ids of shape [1, src_len] (batch size must be 1).
        src_mask: Attention mask passed to `model.encode`.
        max_len: Maximum length of the returned sequence, start symbol included.
        start_symbol: Id of the token the decoder starts from.

    Returns:
        Long tensor of shape [1, out_len] that starts with `start_symbol` and
        ends with <EOS> if it was produced within `max_len` tokens.
    """
    src = src.to(DEVICE)
    src_mask = src_mask.to(DEVICE)
    eos_id = tgt_char2id['<EOS>']

    # inference only: no autograd graph is needed, which saves memory and time
    with torch.no_grad():
        # encode the source once
        # Input: [1, Seq_Len] -> Encoder Output: [Seq_Len, 1, Emb] (sequence-first)
        memory = model.encode(src, src_mask)

        # initialize decoder input with SOS
        # Shape: [1, 1] (Batch, Seq)
        ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(DEVICE)

        for _ in range(max_len - 1):
            # look-ahead mask for the sequence generated so far; as a bool
            # mask, True marks positions that must not be attended to
            sz = ys.size(1)
            tgt_mask = (generate_square_subsequent_mask(sz).type(torch.bool)).to(DEVICE)

            # decoder output is [Seq_Len, Batch, Emb] -> make it batch-first
            out = model.decode(ys, memory, tgt_mask).transpose(0, 1)

            # scores for the LAST position only
            prob = model.generator(out[:, -1])

            # pick the highest-scoring token (.item() is valid because batch size is 1)
            _, next_word = torch.max(prob, dim=1)
            next_word = next_word.item()

            # append to the sequence
            ys = torch.cat([ys, torch.ones(1, 1).type_as(src.data).fill_(next_word)], dim=1)

            # stop if EOS is generated
            if next_word == eos_id:
                break

    return ys

In [None]:
def _ids_to_string(token_ids, id2char):
    """Turn decoded ids into text: skip <SOS>/<PAD>, stop at <EOS>, ignore unknown ids."""
    chars = []
    for token_id in token_ids:
        token = id2char.get(int(token_id))
        if token == '<EOS>':
            break
        if token is None or token in ('<SOS>', '<PAD>'):
            continue
        chars.append(token)
    return "".join(chars)


def transliterate(arabic_name, model):
    """Transliterate one Arabic name into Latin characters with `model`.

    The name is romanized to Buckwalter, encoded like the training inputs
    (truncated to MAX_LENGTH - 1 characters, then <EOS>), decoded greedily and
    mapped back to a string.

    Args:
        arabic_name: Arabic text to transliterate.
        model: Trained model; it is switched to eval mode.

    Returns:
        The transliteration, or "" when the input is empty or cannot be romanized.
    """
    model.eval()

    # nothing to do for None / empty input
    if not arabic_name:
        return ""

    # preprocessing
    # NOTE(review): the training inputs were romanized from the Rababa-
    # diacritized column, while this function romanizes `arabic_name` as given.
    # Pass diacritized text for inputs that match training — confirm intent.
    clean_text = araby.strip_tatweel(arabic_name)
    try:
        romanized = arabic_to_buckwalter(clean_text)
    except Exception as e:
        print(f"Error during conversion: {e}")
        romanized = ""

    if not romanized:
        return ""

    # tokenization — mirror tokenize_and_pad: truncate, then close with <EOS>
    src_ids = [src_char2id.get(c, src_char2id['<UNK>']) for c in romanized]
    src_ids = src_ids[:MAX_LENGTH - 1] + [src_char2id['<EOS>']]

    # use unsqueeze(0) to create [Batch=1, Seq_Len]
    src_tensor = torch.tensor(src_ids, dtype=torch.long).unsqueeze(0)

    # square (Seq_Len, Seq_Len) bool mask; all False = nothing is blocked
    num_tokens = src_tensor.shape[1]
    src_mask = torch.zeros(num_tokens, num_tokens, dtype=torch.bool)

    # run inference; MAX_LENGTH is the sequence length the model was trained with
    tgt_tokens = greedy_decode(
        model, src_tensor, src_mask,
        max_len=MAX_LENGTH, start_symbol=tgt_char2id['<SOS>']
    )

    return _ids_to_string(tgt_tokens.flatten().tolist(), tgt_id2char)

# make sure the model sits on the device used for decoding
model.to(DEVICE)

names_to_test = ["أحمد محمد عبدالله العمرو"]

# two left-aligned, 30-character-wide columns
table_header = f"{'INPUT':<30} | {'OUTPUT':<30}"
print(table_header)
print("-" * 65)
for name in names_to_test:
    output = transliterate(name, model)
    table_row = f"{name:<30} | {output:<30}"
    print(table_row)