In [9]:
from datasets import load_dataset
from transformers import AutoTokenizer

In [2]:
# Load the English/French pair of the KDE4 corpus.
# `datasets` warns that `trust_remote_code=True` will become mandatory to load
# this dataset, so opt in explicitly. NOTE(review): this flag allows the
# dataset's own loading code to run locally -- only keep it for trusted sources.
raw_dataset = load_dataset("kde4", lang1="en", lang2="fr", trust_remote_code=True)
raw_dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [3]:
# Keep 90% of the rows for training; the fixed seed makes the split repeatable.
split_dataset = raw_dataset["train"].train_test_split(train_size=0.9, seed=20)

# Expose the held-out rows under the key "validation" instead of "test".
split_dataset["validation"] = split_dataset.pop("test")

# Show one training pair as a quick sanity check.
split_dataset["train"][1]["translation"]

{'en': 'Default to expanded threads',
 'fr': 'Par défaut, développer les fils de discussion'}

In [6]:
from transformers import pipeline

# Checkpoint shared by the pipeline, tokenizer and model cells of this notebook.
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"

# Ready-made translation pipeline built on that checkpoint.
translator = pipeline(task="translation", model=model_checkpoint)

[{'translation_text': 'Mon nom est Tyler'}]

In [7]:
# Try the pretrained pipeline on a short English sentence.
translator("my name is tyler")

[{'translation_text': 'Mon nom est Tyler'}]

In [10]:
# Load the tokenizer that belongs to the checkpoint. Tensor conversion is
# requested per call (`tokenizer(..., return_tensors="pt")`) or left to the data
# collator; it is not a loading option, so nothing is passed here.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [14]:
# Fetch one training pair and split it into its two languages.
example_pair = split_dataset["train"][1]["translation"]
en_inputs = example_pair["en"]
fr_inputs = example_pair["fr"]

# The English text is tokenized into `input_ids`; the French text, passed as
# `text_target`, is tokenized into `labels`.
inputs = tokenizer(en_inputs, text_target=fr_inputs)
inputs

{'input_ids': [47591, 12, 9842, 19634, 9, 0], 'attention_mask': [1, 1, 1, 1, 1, 1], 'labels': [577, 5891, 2, 3184, 16, 2542, 5, 1710, 0]}

In [20]:
# Upper bound (in tokens) applied when tokenizing sources and targets.
max_length = 128


def preprocess(examples):
    """Tokenize a batch of translation pairs.

    Args:
        examples: batch mapping whose ``"translation"`` entry is a list of
            dicts with ``"en"`` and ``"fr"`` strings.

    Returns:
        The tokenizer output for the English texts, with the French texts
        supplied as ``text_target``; both are truncated to ``max_length``.
    """
    pairs = examples["translation"]
    sources = [pair["en"] for pair in pairs]
    translations = [pair["fr"] for pair in pairs]

    return tokenizer(
        sources,
        text_target=translations,
        max_length=max_length,
        truncation=True,
    )


In [22]:
# Tokenize every split batch-wise and drop the original columns so that only
# the tokenizer's outputs remain.
tokenized_datasets = split_dataset.map(
    preprocess,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)

In [24]:
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Load the seq2seq model for the checkpoint.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# The collator is given the model as well as the tokenizer; the batch it
# produces below also contains `decoder_input_ids`.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [30]:
# Collate two tokenized training rows (indices 1 and 2) to inspect the padding.
sample_rows = [tokenized_datasets["train"][row_idx] for row_idx in (1, 2)]
batch = collator(sample_rows)

print(batch.keys())
# The shorter label sequence is padded with -100.
print(batch.labels)

dict_keys(['input_ids', 'attention_mask', 'labels', 'decoder_input_ids'])
tensor([[  577,  5891,     2,  3184,    16,  2542,     5,  1710,     0,  -100,
          -100,  -100,  -100,  -100,  -100,  -100],
        [ 1211,     3,    49,  9409,  1211,     3, 29140,   817,  3124,   817,
           550,  7032,  5821,  7907, 12649,     0]])


In [37]:
import evaluate

# sacreBLEU metric, used by the example below and by `compute_metrics`.
metric = evaluate.load(path="sacrebleu")

Using the latest cached version of the module from /Users/tylerklimas/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--sacrebleu/28676bf65b4f88b276df566e48e603732d0b4afd237603ebdf92acaacf5be99b (last modified on Fri Mar  8 12:35:39 2024) since it couldn't be found locally at evaluate-metric--sacrebleu, or remotely on the Hugging Face Hub.


In [38]:
# One candidate translation ...
predictions = ["This plugin lets you translate web pages between several languages automatically."]

# ... and, for that candidate, a list holding its single reference.
references = [["This plugin allows you to automatically translate web pages between several languages."]]

# The two sentences are nearly identical, so the score comes out high.
metric.compute(predictions=predictions, references=references)

{'score': 46.750469682990165,
 'counts': [11, 6, 4, 3],
 'totals': [12, 11, 10, 9],
 'precisions': [91.66666666666667,
  54.54545454545455,
  40.0,
  33.333333333333336],
 'bp': 0.9200444146293233,
 'sys_len': 12,
 'ref_len': 13}

In [41]:
import numpy as np

def compute_metrics(eval_preds):
    """Score generated translations against their references with sacreBLEU.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays.
            ``predictions`` may itself be a tuple, in which case only its
            first element is decoded.

    Returns:
        dict: ``{"bleu": score}`` where ``score`` is the ``"score"`` entry of
        the sacreBLEU result.
    """
    preds, labels = eval_preds

    # Keep only the first element when predictions arrive as a tuple.
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 marks padded/ignored positions and is not a real token id, so it
    # must be replaced by the pad token before decoding. Labels always carry
    # it; predictions can too once batches are padded together.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds = [pred.strip() for pred in decoded_preds]
    # Wrap each *decoded* reference string in its own list (one reference per
    # prediction). Iterating the raw id array here would fail on `.strip()`.
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)

    return {"bleu": result["score"]}
    
        
    


In [52]:
from huggingface_hub import notebook_login

# Show the Hugging Face login widget in the notebook.
# NOTE(review): presumably needed because the training arguments below set
# `push_to_hub=True` -- confirm before removing.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [54]:
from transformers import Seq2SeqTrainingArguments

args = Seq2SeqTrainingArguments(
    output_dir="marian-finetuned-kde4-en-to-fr",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # fp16 mixed precision is only supported on CUDA/NPU/certain XPU devices,
    # so it stays disabled here.
    fp16=False,
    # --- evaluation ---
    evaluation_strategy="no",
    predict_with_generate=True,
    # --- checkpoints / upload ---
    save_strategy="epoch",
    save_total_limit=3,
    push_to_hub=True,
)

ValueError: FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX).