In [None]:
!pip install sacrebleu evaluate transformers[torch] wandb datasets peft bitsandbytes accelerate

In [None]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoTokenizer, DataCollatorForSeq2Seq, BitsAndBytesConfig
from datasets import load_dataset, load_from_disk, Dataset, DatasetDict, concatenate_datasets
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, TaskType
import torch
import evaluate
import numpy as np

In [None]:
# Identifier of the pretrained checkpoint; reused below to load both the
# tokenizer and the seq2seq model so the two always match.
model_path = 'google/umt5-small'

In [None]:
# Tokenizer belonging to the checkpoint named in `model_path`.
tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Load the parallel corpus from a local directory. The path is relative, so it
# resolves against the notebook's current working directory.
dataset = load_from_disk("data/tedx")

In [None]:
# Inspect the available splits, their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 302426
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 75607
    })
})

In [None]:
def _subsample(split_name, num_rows, seed=42):
    """Return the first ``num_rows`` rows of ``dataset[split_name]`` after a seeded shuffle."""
    shuffled = dataset[split_name].shuffle(seed=seed)
    return shuffled.select(range(num_rows))


# Both translation directions use the same seed and sizes, so they draw the
# same sentence pairs; only the preprocessing applied later differs.
train_split_en_tr = _subsample('train', 40000)
test_split_en_tr = _subsample('test', 10000)

train_split_tr_en = _subsample('train', 40000)
test_split_tr_en = _subsample('test', 10000)

# One DatasetDict per direction.
en_tr = DatasetDict({'train': train_split_en_tr, 'test': test_split_en_tr})
tr_en = DatasetDict({'train': train_split_tr_en, 'test': test_split_tr_en})

In [None]:
# Inspect the subsampled English -> Turkish DatasetDict.
en_tr

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 10000
    })
})

In [None]:
# Inspect the subsampled Turkish -> English DatasetDict.
tr_en

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 40000
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 10000
    })
})

In [None]:
# Keys of the two languages inside each row's "translation" dict.
source_lang = "en"
target_lang = "tr"

# Task prefixes prepended to the model input, one per translation direction.
prefix_en_to_tr = "translate English to Turkish: "
prefix_tr_to_en = "translate Turkish to English: "


def _tokenize_pairs(examples, prefix, input_key, label_key):
    """Tokenize a batch of translation pairs for one direction.

    Args:
        examples: Batch from ``Dataset.map(batched=True)``; its "translation"
            entry is a list of dicts keyed by language code.
        prefix: Task prefix prepended to every input sentence.
        input_key: Language key whose text becomes the model input.
        label_key: Language key whose text becomes the target.

    Returns:
        The tokenizer output for the prefixed inputs, with the targets
        tokenized via ``text_target``; both truncated to 512 tokens.
    """
    pairs = examples["translation"]
    prompts = [prefix + pair[input_key] for pair in pairs]
    references = [pair[label_key] for pair in pairs]
    return tokenizer(prompts, text_target=references, max_length=512, truncation=True)


def preprocess_en_to_tr(examples):
    """Tokenize a batch as English input -> Turkish target."""
    return _tokenize_pairs(examples, prefix_en_to_tr, source_lang, target_lang)


def preprocess_tr_to_en(examples):
    """Tokenize a batch as Turkish input -> English target."""
    return _tokenize_pairs(examples, prefix_tr_to_en, target_lang, source_lang)


# Tokenize each direction; the original columns are kept alongside the new ones.
tokenized_en_to_tr = en_tr.map(preprocess_en_to_tr, batched=True)
tokenized_tr_to_en = tr_en.map(preprocess_tr_to_en, batched=True)


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
# Columns carried into the combined dataset. Listing them explicitly drops the
# 'id' column, so the result has the same four columns as before.
MODEL_COLUMNS = ['input_ids', 'attention_mask', 'labels', 'translation']


def _merge_directions(split):
    """Stack the en->tr and tr->en examples of ``split`` into a single Dataset.

    Uses ``concatenate_datasets`` instead of rebuilding the dataset from
    concatenated Python lists, which would pull every column of both datasets
    into Python memory first. Row order is unchanged: all en->tr rows come
    first, followed by all tr->en rows.
    """
    return concatenate_datasets([
        tokenized_en_to_tr[split].select_columns(MODEL_COLUMNS),
        tokenized_tr_to_en[split].select_columns(MODEL_COLUMNS),
    ])


train_dataset = _merge_directions('train')
test_dataset = _merge_directions('test')

# Bidirectional training / evaluation data handed to the trainer.
tokenized_datasets = DatasetDict({
    'train': train_dataset,
    'test': test_dataset
})


In [None]:
# Inspect the combined (both directions) train/test datasets.
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'translation'],
        num_rows: 80000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'translation'],
        num_rows: 20000
    })
})

In [None]:
# SacreBLEU metric from the `evaluate` library; compute_metrics uses it to
# score decoded predictions against decoded references.
metric = evaluate.load("sacrebleu")

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from decoded predictions and references.

    Args:
        preds: Decoded prediction strings.
        labels: Decoded reference strings, one per prediction.

    Returns:
        A ``(preds, labels)`` tuple where ``preds`` is a list of stripped
        strings and ``labels`` is a list of single-element lists, i.e. one
        list of references per prediction.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Compute BLEU and mean generated length for an evaluation pass.

    Args:
        eval_preds: ``(predictions, labels)`` pair supplied by the trainer.
            ``predictions`` may itself be a tuple, in which case its first
            element is used. Both are expected to be integer token-id arrays.

    Returns:
        Dict with ``"bleu"`` (SacreBLEU score) and ``"gen_len"`` (mean number
        of non-padding tokens per prediction), each rounded to 4 decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    pad_id = tokenizer.pad_token_id

    # -100 is the padding / ignore value used for labels, and predictions can
    # be padded with it as well when batches of different lengths are
    # gathered. It is not a valid token id, so map it back to the pad token in
    # BOTH arrays before decoding (previously only labels were handled).
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Counted after the -100 replacement so padding never inflates the length.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return {k: round(v, 4) for k, v in result.items()}

In [None]:
# Load the pretrained seq2seq checkpoint with default settings. No
# quantization config or PEFT adapter is applied here, although both are
# imported at the top of the notebook.
model = AutoModelForSeq2SeqLM.from_pretrained(model_path)

In [None]:
# Print the module tree of the loaded model.
model

UMT5ForConditionalGeneration(
  (shared): Embedding(256384, 512)
  (encoder): UMT5Stack(
    (embed_tokens): Embedding(256384, 512)
    (block): ModuleList(
      (0-7): 8 x UMT5Block(
        (layer): ModuleList(
          (0): UMT5LayerSelfAttention(
            (SelfAttention): UMT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): UMT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): UMT5LayerFF(
            (DenseReluDense): UMT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=Fal

In [None]:
# Value used to pad label sequences. compute_metrics maps this same value
# (-100) back to the pad token id before decoding.
label_pad_token_id=-100

# Collator that pads inputs and labels per batch.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model, # NOTE(review): a `peft_model` alternative was left commented out here
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8  # padded lengths are rounded up to a multiple of 8
)

In [None]:
import wandb

# Never hardcode the API key in the notebook: it ends up in version control and
# in shared copies. Called without `key`, wandb.login() resolves credentials
# itself (the WANDB_API_KEY environment variable or a previously stored login)
# and otherwise prompts interactively.
wandb.login()


In [None]:
# Hyperparameters for the fine-tuning run, grouped by purpose.
training_args = Seq2SeqTrainingArguments(
    # --- bookkeeping ---
    output_dir="/content/checkpoints1",
    # NOTE(review): the run name mentions QLoRA, but the trainer below receives
    # the plain `model`; confirm the name is still accurate.
    run_name="umt5-machine-translation-QLoRA4",
    save_total_limit=3,
    # --- optimisation ---
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    fp16=False,
    # --- evaluation ---
    eval_strategy="steps",
    # NOTE(review): with 80,000 training rows and batch size 8 this is
    # presumably the final step of the 20th epoch (single device, no gradient
    # accumulation assumed), i.e. evaluation only runs once at the very end.
    eval_steps=200000,
    per_device_eval_batch_size=16,
    eval_accumulation_steps=20,
    predict_with_generate=True,
)

# Trainer wiring: model, data, collation and the BLEU-based metric function.
# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour of
# `processing_class=`; confirm against the installed version.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the arguments configured above.
# NOTE(review): the saved output of this cell ends in KeyboardInterrupt, so the
# recorded run was stopped manually before completing.
trainer.train()

Step,Training Loss,Validation Loss


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Evaluate on the combined test split. Metrics come from compute_metrics and
# are reported with the "eval" key prefix.
eval_results = trainer.evaluate(tokenized_datasets["test"], metric_key_prefix="eval")
print("Evaluation results:", eval_results)

Evaluation results: {'eval_loss': 2.4954159259796143, 'eval_bleu': 12.5726, 'eval_gen_len': 1.0, 'eval_runtime': 1190.5137, 'eval_samples_per_second': 16.799, 'eval_steps_per_second': 1.05}


### Inference

In [None]:
# Reuse the task prefixes defined for preprocessing instead of rebinding
# `tr_en` / `en_tr` to strings: those names hold the DatasetDicts, and
# overwriting them breaks re-running the earlier cells.
text = prefix_tr_to_en + "Hangi müzikleri seviyorsun"

# NOTE(review): `inference_model` was never defined anywhere in this notebook;
# the fine-tuned `model` is assumed to be what was meant. eval() disables dropout.
inference_model = model.eval()

# Move the encoded inputs to the model's device (the model may sit on the GPU
# after training) and pass the attention mask along with the input ids.
inputs = tokenizer(text, return_tensors="pt").to(inference_model.device)

# Sampling-based decoding, so repeated runs can return different translations.
outputs = inference_model.generate(**inputs, max_new_tokens=512, do_sample=True, top_k=30, top_p=0.95)

translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text)
print(translation)

translate Turkish to English: Hangi müzikleri seviyorsun
What music do you love?
