In [1]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

We'll use the stock multilingual MBart50 model, with a decoder. Languages covered are listed in Table 6 here: https://arxiv.org/pdf/2008.00401.pdf 

Note that `ar`, `fa`, and `ur` are present.

In [2]:
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")

In [3]:
from transformers import AutoTokenizer

In [4]:
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50")

We're doing a slightly odd thing here: "translating" from Arabic to Arabic.

In [5]:
tokenizer.src_lang = "ar-AR"
tokenizer.tgt_lang = "ar-AR"

In [6]:
len(tokenizer.vocab)

250054

In [7]:
tokenizer.tokenize('لأن قوله لا يفيد إلا الظن والظن لا يغني من الحق شيئا ولم يحصل الغرض ')

['▁لأن',
 '▁قوله',
 '▁لا',
 '▁ي',
 'فيد',
 '▁إلا',
 '▁ال',
 'ظن',
 '▁وال',
 'ظن',
 '▁لا',
 '▁ي',
 'غني',
 '▁من',
 '▁الحق',
 '▁شيئا',
 '▁ولم',
 '▁يحصل',
 '▁ال',
 'غرض',
 '▁']

Now we load the training and development data sets from the RDD corpus of Dong and Smith 2018.  Only 10% of each is used here, i.e. around 700,000 exemplars in the training set.

In [8]:
import datasets

In [9]:
ds = datasets.load_dataset('csv', data_files={'train': '../corpora/train.tsv',
                                     'validation': '../corpora/eval.tsv'}, 
                  delimiter='\t', quoting=3)

Using custom data configuration default-8521b9f93f7a9324
Reusing dataset csv (/home/jds/.cache/huggingface/datasets/csv/default-8521b9f93f7a9324/0.0.0/2dc6629a9ff6b5697d82c25b73731dd440507a69cbce8b425db50b751e8fcfd0)


In [10]:
ds

DatasetDict({
    train: Dataset({
        features: ['ar', 'ar.1'],
        num_rows: 599553
    })
    validation: Dataset({
        features: ['ar', 'ar.1'],
        num_rows: 201019
    })
})

The boring code to tokenize the input and retrieve the vocabulary IDs.

In [11]:
max_input_length = 128
max_target_length = max_input_length
source_lang = "ar.1"
target_lang = "ar"

def preprocess(examples):
    inputs = examples[source_lang]
    targets = examples[target_lang]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True, padding='max_length')
    
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=max_target_length, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [12]:
preprocess(ds['train'][:2])

{'input_ids': [[3, 716, 2862, 902, 18789, 39978, 240, 99147, 6, 102840, 10488, 20223, 1933, 5319, 250, 7024, 20258, 83289, 826, 154149, 1423, 39654, 80846, 9, 2567, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [3, 88315, 6008, 878, 1282, 4995, 50, 74469, 4714, 1333, 27949, 556, 12011, 133369, 412, 33361, 6294, 359, 61825, 50, 6, 147746, 130981, 39141, 79370, 53929, 376, 73443, 323, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
token_ds = ds.map(preprocess, batched=True)

HBox(children=(FloatProgress(value=0.0, max=600.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=202.0), HTML(value='')))




Huggingface's new `Trainer` is very welcome, as it allowed me to cut down all the detailed steps in the training loop.

In [14]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

A batch size of 12 is all I could fit on a 24GB graphics card.  We will run just 2 epochs (typical of finetuning).

In [15]:
training_args = Seq2SeqTrainingArguments(
    output_dir = './results',          # output directory
    num_train_epochs = 2,              # total # of training epochs
    per_device_train_batch_size = 12,  # batch size per device during training
    per_device_eval_batch_size = 12,   # batch size for evaluation
    warmup_steps = 500,                # number of warmup steps for learning rate scheduler
    weight_decay = 0.01,               # strength of weight decay
    logging_dir = './logs',            # directory for storing logs
)

Code off the web to add more metrics for training, in addition to just the loss.

In [16]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)
    
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [17]:
trainer = Seq2SeqTrainer(
    model = model,                         
    args = training_args,                  
    train_dataset = token_ds['train'],         
    eval_dataset = token_ds['validation'],
    compute_metrics = compute_metrics
)

In [None]:
trainer.train()

Step,Training Loss
500,3.2284
1000,0.5548
1500,0.1918
2000,0.1793
2500,0.1669
3000,0.157
3500,0.1527
4000,0.1475
4500,0.1449
5000,0.1421


In [19]:
model.eval()

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(250054, 1024, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(250054, 1024, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=T

In [20]:
import torch
def correct(sent):
    tokens = tokenizer(sent, max_length=max_input_length, 
                  truncation=True, padding='max_length', return_tensors='pt').to(device='cuda')
    with torch.no_grad():
        gen_tokens = model.generate(**tokens)
    
    return ' '.join(tokenizer.tokenize(sent)), tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [22]:
with open('../corpora/test.x.txt') as f:
    test = f.read().splitlines()

In [23]:
with open('../corpora/test.y.txt') as f:
    gold = f.read().splitlines()

In [24]:
%%time
corrected = [correct(line) for line in test[:1000]]

CPU times: user 8min 5s, sys: 57.4 s, total: 9min 3s
Wall time: 9min 3s


In [35]:
f"{(9 * 60 + 3)/1000} secs/correction"

'0.543 secs/correction'

In [25]:
import fastwer

In [26]:
corrected_strings = [corr[0] for _, corr in corrected]

In [27]:
f"WER: {fastwer.score(corrected_strings, gold[:1000])}"

'WER: 17.4812'

In [28]:
f"CER: {fastwer.score(corrected_strings, gold[:1000], char_level=True)}"

'CER: 5.1542'

For comparison, the CER before correction:
    

In [29]:
f"CER: {fastwer.score(test[:1000], gold[:1000], char_level=True)}"

'CER: 11.1806'

In [30]:
f"CER Improvement: {round((11.1861-6.3847)/11.1861 * 100, 2)}%"

'CER Improvement: 42.92%'

In [32]:
with open('test.rec.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in corrected_strings)

In [33]:
with open('../d3/test.rec.txt') as f:
    test = f.read().splitlines()

In [34]:
with open('../d3/test.gt.txt') as f:
    gold = f.read().splitlines()

In [36]:
corrected = [correct(line) for line in test]

In [37]:
corrected_strings = [corr[0] for _, corr in corrected]

In [38]:
with open('../d4/test.rec.txt', 'w') as f:
    f.writelines(f'{line}\n' for line in corrected_strings)

In [40]:
import re

In [41]:
noise = re.compile(""" ّ    | # Tashdīd / Shadda
                       َ    | # Fatḥa
                       ً    | # Tanwīn Fatḥ / Fatḥatān
                       ُ    | # Ḍamma
                       ٌ    | # Tanwīn Ḍamm / Ḍammatān
                       ِ    | # Kasra
                       ٍ    | # Tanwīn Kasr / Kasratān
                       ْ    | # Sukūn
                       ۡ    | # Quranic Sukūn
                       ࣰ    | # Quranic Open Fatḥatān
                       ࣱ    | # Quranic Open Ḍammatān
                       ࣲ    | # Quranic Open Kasratān
                       ٰ    | # Dagger Alif
                       ـ     # Taṭwīl / Kashīda
                   """, re.VERBOSE)
