In [None]:
! pip install -U accelerate
! pip install -U transformers
!pip install datasets
!pip install evaluate
!pip install rouge_score

from datasets import load_dataset,DatasetDict
from transformers import AutoTokenizer,AutoModel,M2M100ForConditionalGeneration,DataCollatorForSeq2Seq
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import evaluate
import numpy as np

wmt16 = load_dataset("wmt16","tr-en")

model = M2M100ForConditionalGeneration.from_pretrained("facebook/nllb-200-distilled-600M",output_attentions=True,output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")



In [None]:
# wmt16 = DatasetDict(train=wmt16['train'].select(range(100)),
#             validation=wmt16['validation'].select(range(100)),
#             test=wmt16['test'].select(range(100)))

Let's tokenize one of the instances printed above which we'll let the model generate an output and then, take a look at the model architecture.

In [None]:
ex_input = wmt16['train'][0]['translation']['en'][:]
tokenized_ex_input = tokenizer(ex_input,return_tensors='pt')
print(f"Before Tokenization: {ex_input}")
print(f"After Tokenization: {'/'.join([tokenizer.decode(x) for x in tokenized_ex_input.input_ids[0].tolist()])}\n")
ex_input = wmt16['train'][1]['translation']['en'][:]
ex_output = wmt16['train'][1]['translation']['tr'][:]
tokenized_ex_input = tokenizer(ex_input,return_tensors='pt')
tokenized_ex_output = tokenizer(ex_output,return_tensors='pt')
print(f"Before Tokenization: {ex_input}")
print(f"After Tokenization: {'/'.join([tokenizer.decode(x) for x in tokenized_ex_input.input_ids[0].tolist()])}")
print(f"Golden Output: {ex_output}\n")

Before Tokenization: Kosovo's privatisation process is under scrutiny
After Tokenization: eng_Latn/Kosovo/'/s/priv/atis/ation/process/is/under/sc/rut/iny/</s>

Before Tokenization: Kosovo is taking a hard look at its privatisation process in light of recurring complaints.
After Tokenization: eng_Latn/Kosovo/is/taking/a/hard/look/at/its/priv/atis/ation/process/in/light/of/recur/ring/compla/ints/./</s>
Golden Output: Kosova, tekrar eden şikayetler ışığında özelleştirme sürecini incelemeye alıyor.



It seems like the tokenizer estimates the source language which is later passed into the input as a token and most words are further tokenized than one token which is presumably due to the model being a general purpose serving machine translation for ~200 languages.

In [None]:
model.eval()
with torch.no_grad():
  ex_generated_output = model.generate(**tokenized_ex_input,forced_bos_token_id = tokenizer.lang_code_to_id['tur_Latn'])
  ex_generated_output = tokenizer.batch_decode(ex_generated_output,skip_special_tokens=True)[0]

print(f"Input Sentence: {ex_input}")
print(f"Actual Output Sentence: {ex_output}")
print(f"Generated Output Sentence: {ex_generated_output}")

Input Sentence: Kosovo is taking a hard look at its privatisation process in light of recurring complaints.
Actual Output Sentence: Kosova, tekrar eden şikayetler ışığında özelleştirme sürecini incelemeye alıyor.
Generated Output Sentence: Kosovo, tekrar tekrar gelen şikayetler ışığında özelleştirme sürecine ciddi bir bakış açısı veriyor.


Let's generate the datasets for test training and validation and further preprocess the source and target sentences for tokenization.

In [None]:

wmt16=wmt16.map(lambda x: {'en':x['translation']['en'],'tr':x['translation']['tr']})
wmt16=wmt16.remove_columns(['translation'])
max_token = np.max(wmt16['train'].map(lambda x: {'len': len(tokenizer(x['en'],return_tensors='pt').input_ids[0])})['len'])
def preprocess_data(example):

  sou_sen = example['en']
  tar_sen = example['tr']

  model_inputs = tokenizer(sou_sen,return_tensors='pt',truncation=True,max_length=max_token,padding='max_length')
  labels=tokenizer(tar_sen,return_tensors='pt',truncation=True,max_length=max_token,padding='max_length')
  model_inputs['labels'] = labels['input_ids'][0]
  model_inputs['input_ids'] = model_inputs['input_ids'][0]
  model_inputs['attention_mask'] = model_inputs['attention_mask'][0]

  return model_inputs

wmt16 = wmt16.map(preprocess_data)


Map:   0%|          | 0/1001 [00:00<?, ? examples/s]

Let's finetune the mdoel with Seq2SeqTrainer. The batch size was set considering the memory limit of the gpu's ram and save it when the training is finished.

In [None]:

data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer,model=model)
metric=evaluate.load("rouge")
def compute_metric(pred_label):

  preds,labels = pred_label
  token_preds = tokenizer.batch_decode(preds,skip_special_tokens=True)
  token_labels = tokenizer.batch_decode(labels,skip_special_tokens=True)

  value=metric.compute(predictions=[token_preds],references=[token_labels])

  return value

training_args=Seq2SeqTrainingArguments(
    output_dir="/content/results",
    evaluation_strategy="epoch", # run validation at the end of each epoch
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=7,
    per_device_eval_batch_size=7,
    num_train_epochs=2,
    load_best_model_at_end=True,
    fp16=True
)
trainer=Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=wmt16['train'],
    eval_dataset=wmt16['validation'],
    tokenizer=tokenizer,
    # data_collator=data_collator,
    # compute_metrics=compute_metric,
)
trainer.train()
trainer.load_model("/fined_tuned")

You're using a NllbTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,0.1675,0.234704


Epoch,Training Loss,Validation Loss
1,0.1675,0.234704
2,0.1314,0.230104


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=58788, training_loss=0.17176363955973573, metrics={'train_runtime': 10583.7615, 'train_samples_per_second': 38.881, 'train_steps_per_second': 5.555, 'total_flos': 1.7156498159940403e+17, 'train_loss': 0.17176363955973573, 'epoch': 2.0})

Now that we trained the model for 2 epochs and there is a decrease in both training and validation's log loss, we can predict the candidates for both the old and new model and then, compare their ROUGE scores (1-2 gram and L) to see if the translations have improved after fine-tuning in the test data

In [6]:

last_model = trainer.model
device = torch.device("cpu")
last_model.to(device)
model_old = M2M100ForConditionalGeneration.from_pretrained("facebook/nllb-200-distilled-600M",output_attentions=True,output_hidden_states=True)

def tokenize_input(model,example):

  tokenized_items_input = tokenizer(example['en'],return_tensors='pt',max_length=max_token,padding=True,truncation=True)
  tokenized_items_output = model.generate(**tokenized_items_input,forced_bos_token_id = tokenizer.lang_code_to_id['tur_Latn'])[0]
  sentence_output = tokenizer.decode(tokenized_items_output,skip_special_tokens=True)

  return sentence_output

wmt16_test=wmt16['test'].map(lambda x: {'pre_prediction':tokenize_input(model_old,x)})
wmt16_test=wmt16_test.map(lambda x: {'prediction':tokenize_input(last_model,x)})


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]



Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [20]:
rouge = evaluate.load("rouge")
pt_pred = wmt16_test['pre_prediction']
ft_pred = wmt16_test['prediction']
actual = wmt16_test['tr']
print(f"Pretrained Model Rouge Scores:{rouge.compute(predictions=pt_pred,references=actual)}")
print(f"Finetuned Model Rouge Scores:{rouge.compute(predictions=ft_pred,references=actual)}")

Pretrained Model Rouge Scores:{'rouge1': 0.5374327598164357, 'rouge2': 0.3268135772139561, 'rougeL': 0.4861250218188812, 'rougeLsum': 0.48607387254017237}
Finetuned Model Rouge Scores:{'rouge1': 0.5431571542831495, 'rouge2': 0.334978474363637, 'rougeL': 0.4948124766884754, 'rougeLsum': 0.4950863208850611}


Based on the results above Finetuned model's candidates better represent the test data in terms of Rouge compared to Pretrained model's candidates. It seems that the reference sentences are also generated through MT but this experiment still validates the usefulness of pretrain-finetune paradigm in machine translation

In [None]:
import os
os._exit(00)