In [1]:
!pip install git+https://github.com/huggingface/datasets
!pip install transformers[torch]
!pip install evaluate
!pip install sentencepiece

Collecting git+https://github.com/huggingface/datasets
  Cloning https://github.com/huggingface/datasets to /tmp/pip-req-build-rz_tixmc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/datasets /tmp/pip-req-build-rz_tixmc
  Resolved https://github.com/huggingface/datasets to commit c65315e4a8308f04fcb025039afe2a2e43b5684e
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting pyarrow-hotfix (from datasets==2.15.1.dev0)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.15.1.dev0)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets==2.15.1.dev0)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K  

In [2]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook (renders a login
# widget as the cell output). The training configuration further down sets
# push_to_hub=True, so this presumably has to succeed before training starts.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import numpy as np
import evaluate

from datasets import load_dataset
from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer


def preprocess_function(examples, src="zul", tgt="eng", max_length=128):
    """Tokenize one batch of parallel sentences for seq2seq training.

    Intended for ``Dataset.map(preprocess_function, batched=True)``; relies on
    the module-level ``tokenizer`` being defined before it is called.

    Args:
        examples: Batch mapping from column name to a list of sentences.
        src: Name of the column holding the source-language text.
        tgt: Name of the column holding the target-language text.
        max_length: Maximum sequence length passed to the tokenizer; longer
            sequences are truncated.

    Returns:
        The tokenizer's output for the batch (source sentences as inputs,
        target sentences supplied through ``text_target``).
    """
    inputs = list(examples[src])
    targets = list(examples[tgt])
    return tokenizer(
        inputs,
        text_target=targets,
        max_length=max_length,
        truncation=True,
    )

def postprocess_text(preds, labels):
    """Normalize decoded text before it is handed to the metric.

    Args:
        preds: Iterable of decoded prediction strings.
        labels: Iterable of decoded reference strings.

    Returns:
        A ``(preds, labels)`` tuple in which every string has its surrounding
        whitespace stripped and every label is wrapped in a one-element list
        (one list of references per prediction).
    """
    stripped_preds = []
    for text in preds:
        stripped_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return stripped_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for a batch of predictions.

    Relies on the module-level ``tokenizer`` and on ``postprocess_text``.

    Args:
        eval_preds: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` is a tuple, only its first element is used.
            Positions equal to ``-100`` are treated as padding.

    Returns:
        Dict with ``"bleu"`` (the sacreBLEU ``score``) and ``"gen_len"`` (mean
        number of non-padding tokens per prediction), both rounded to 4
        decimal places.
    """
    # The code below reads result["score"], which is the key produced by the
    # sacreBLEU metric. The "accuracy" metric has no such key and compares
    # class ids rather than decoded text. Requires the `sacrebleu` package.
    metric = evaluate.load("sacrebleu")

    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id
    # -100 is not a valid token id and cannot be decoded; swap it for the pad
    # token in both arrays so skip_special_tokens drops it.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

# Training script: load the parallel corpus, tokenize the training split and
# fine-tune the pretrained checkpoint with Seq2SeqTrainer.
# `tokenizer` is assigned here as a module-level name because
# preprocess_function and compute_metrics read it as a global.
if __name__ == "__main__":
    # Load the "eng-zul" configuration of the dataset and pick out its splits.
    dataset = load_dataset("dsfsi/vukuzenzele-sentence-aligned", "eng-zul")
    sentences = dataset["train"]
    # NOTE(review): `eval` shadows the Python builtin of the same name. This
    # split is also never tokenized and is not handed to the trainer below.
    eval = dataset["test"]

    # The tokenizer comes from the same checkpoint as the model and is
    # configured with source language "zu" and target language "en".
    tokenizer = M2M100Tokenizer.from_pretrained("dsfsi/zu-en-m2m100-gov", src_lang="zu", tgt_lang="en")
    # Tokenize the training split in batches; must run after `tokenizer` exists.
    tokenized_sentences = sentences.map(preprocess_function, batched=True)

    model = M2M100ForConditionalGeneration.from_pretrained("dsfsi/zu-en-m2m100-gov")


    # Collator built from the tokenizer and model; presumably pads each batch
    # dynamically — confirm against the transformers documentation.
    data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

    training_args = Seq2SeqTrainingArguments(
        output_dir="output",
        # No evaluation is scheduled, so compute_metrics is presumably never
        # invoked by trainer.train() — TODO confirm.
        evaluation_strategy="no",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        weight_decay=0.01,
        save_total_limit=3,
        num_train_epochs=2,
        predict_with_generate=True,
        # NOTE(review): fp16 presumably requires a CUDA GPU — confirm the runtime.
        fp16=True,
        # Uploads to the Hugging Face Hub; presumably relies on the
        # notebook_login() call made in an earlier cell.
        push_to_hub=True,
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_sentences,
        # Evaluation split is disabled; note that `eval` above is still the raw,
        # untokenized split, so enabling this line as-is would likely fail.
        # eval_dataset=eval,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    trainer.train()

Downloading readme:   0%|          | 0.00/31.6k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/615k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/151k [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3024 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/756 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Map:   0%|          | 0/3024 [00:00<?, ? examples/s]

config.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Step,Training Loss
