# Requirements

In [29]:
!pip install wandb transformers[torch] torch pandas datasets evaluate sacrebleu
!pip install accelerate -U

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


# Data

In [63]:
import datasets
from itertools import chain
from transformers import AutoTokenizer


def load_dataset(oversampling_factor=10, seed=42):
    """Load SumTablets and rebalance the training split.

    Non-"Administrative" training examples are repeated ``oversampling_factor``
    times, concatenated with the administrative examples, and shuffled. The
    validation and test splits are only shuffled.

    Args:
        oversampling_factor: How many copies of each non-administrative
            training example to keep (1 means no oversampling). Defaults to
            10, the value previously hard-coded here.
        seed: Seed used for every shuffle. Defaults to 42.

    Returns:
        The loaded dataset object with its "train", "validation" and "test"
        entries replaced by the rebalanced / shuffled versions.

    Raises:
        ValueError: If ``oversampling_factor`` is smaller than 1 (there would
            be nothing to concatenate).
    """
    if oversampling_factor < 1:
        raise ValueError("oversampling_factor must be >= 1")

    dataset = datasets.load_dataset("colesimmons/SumTablets")

    # Oversample non-administrative examples in the training set
    def oversample_non_administrative(dataset_, oversampling_factor):
        admin_examples = dataset_.filter(
            lambda example: example["genre"] == "Administrative"
        )
        print("Num admin: ", len(admin_examples))

        non_admin_examples = dataset_.filter(
            lambda example: example["genre"] != "Administrative"
        )
        print("Num non-admin: ", len(non_admin_examples))

        # Repeating the same dataset object N times duplicates its rows N times.
        oversampled_non_admin = datasets.concatenate_datasets(
            [non_admin_examples] * oversampling_factor
        )
        print("Num non-admin after: ", len(oversampled_non_admin))

        balanced = datasets.concatenate_datasets(
            [admin_examples, oversampled_non_admin]
        )
        return balanced.shuffle(seed=seed)

    dataset["train"] = oversample_non_administrative(
        dataset["train"], oversampling_factor
    )
    dataset["validation"] = dataset["validation"].shuffle(seed=seed)
    dataset["test"] = dataset["test"].shuffle(seed=seed)
    return dataset


def tokenize(
    dataset,
    *,
    encoder_tokenizer,
    decoder_tokenizer,
    max_length=256,
    cache_location=None,
):
    """Tokenize every split and keep only examples that fit in ``max_length``.

    Each split is tokenized without padding or truncation. Examples whose
    ``input_ids`` and ``labels`` both fit within ``max_length`` are kept as-is.
    Longer examples are split on the "<SURFACE>" marker into one example per
    surface, re-tokenized, and kept if the pieces now fit; pieces that are
    still too long are dropped.

    Args:
        dataset: Mapping of split name to dataset (indexed by split name and
            mapped as a whole), with "glyphs" and "transliteration" text
            columns plus the "id", "glyph_names", "genre" and "period"
            columns that are removed here.
        encoder_tokenizer: Tokenizer applied to the "glyphs" column.
        decoder_tokenizer: Tokenizer applied to the "transliteration" column.
        max_length: Maximum number of tokens allowed on either side.
        cache_location: If truthy, the result is saved there with
            ``save_to_disk``.

    Returns:
        A ``datasets.DatasetDict`` with the same split names as the input, so
        the result can be indexed by split and reloaded with
        ``datasets.load_from_disk``.
    """
    surface_marker = "<SURFACE>"

    def _tokenize(examples):
        inputs = encoder_tokenizer(
            examples["glyphs"],
            padding=False,
            truncation=False,
        )
        labels = decoder_tokenizer(
            examples["transliteration"],
            padding=False,
            truncation=False,
        )
        return {"input_ids": inputs["input_ids"], "labels": labels["input_ids"]}

    def _fits(example):
        return (
            len(example["input_ids"]) <= max_length
            and len(example["labels"]) <= max_length
        )

    def _too_long(example):
        return not _fits(example)

    def _split_overly_long_examples_by_surface(examples):
        """Split examples that are too long into one example per surface."""
        glyphs = []
        transliterations = []
        # Walk the two text columns in lockstep so that the i-th glyph piece
        # always stays paired with the i-th transliteration piece.
        for glyph_text, transliteration_text in zip(
            examples["glyphs"], examples["transliteration"]
        ):
            glyph_parts = [p for p in glyph_text.split(surface_marker) if p]
            transliteration_parts = [
                p for p in transliteration_text.split(surface_marker) if p
            ]
            if len(glyph_parts) != len(transliteration_parts):
                # The surfaces cannot be aligned; keep the example whole so it
                # is counted (and dropped) as still too long instead of
                # producing mismatched pairs.
                glyphs.append(glyph_text)
                transliterations.append(transliteration_text)
                continue
            glyphs.extend(surface_marker + part for part in glyph_parts)
            transliterations.extend(
                surface_marker + part for part in transliteration_parts
            )
        return {"glyphs": glyphs, "transliteration": transliterations}

    def _process_split(split):
        # These are below the max length, so they're good
        below_max_length = split.filter(_fits)
        print("Below max length: ", len(below_max_length))

        # These are above the max length, so we need to do some extra work
        above_max_length = split.filter(_too_long)
        print("Above max length: ", len(above_max_length))
        if len(above_max_length) == 0:
            return below_max_length

        # The split changes the number of rows, so every existing column
        # (including the stale input_ids / labels) must be dropped first.
        above_max_length = above_max_length.map(
            _split_overly_long_examples_by_surface,
            batched=True,
            remove_columns=above_max_length.column_names,
        )
        above_max_length = above_max_length.map(_tokenize, batched=True)

        fixed = above_max_length.filter(_fits)
        print("Fixed: ", len(fixed))
        print("Still above max length: ", len(above_max_length) - len(fixed))
        return datasets.concatenate_datasets([below_max_length, fixed])

    # Tokenize the tablets, removing old columns
    columns_to_remove = [
        "id",
        "glyph_names",
        "genre",
        "period",
    ]
    dataset_ = dataset.map(_tokenize, batched=True, remove_columns=columns_to_remove)

    processed = {}
    for split_name, split in dataset_.items():
        print("Split: ", split_name)
        processed[split_name] = _process_split(split)
    result = datasets.DatasetDict(processed)

    if cache_location:
        result.save_to_disk(cache_location)

    return result

In [None]:
# Maximum number of tokens per example; passed to tokenize() as max_length
# and to the data collator built in train().
MAX_LENGTH = 256

In [None]:
# Set to True to reuse the tokenized dataset previously saved at
# DATASET_CACHE_LOCATION instead of rebuilding it.
LOAD_DATASET_FROM_CACHE = False
DATASET_CACHE_LOCATION = "./dataset"

# Separate tokenizers: one for the glyph (encoder) side and one for the
# transliteration (decoder) side.
encoder_tokenizer = AutoTokenizer.from_pretrained(
    "colesimmons/SumerianGlyphTokenizer_Roberta"
)
decoder_tokenizer = AutoTokenizer.from_pretrained(
    "colesimmons/SumerianTransliterationTokenizer_Roberta"
)

if LOAD_DATASET_FROM_CACHE:
    dataset = datasets.load_from_disk(DATASET_CACHE_LOCATION)
else:
    # Build from scratch; tokenize() also writes the result to
    # DATASET_CACHE_LOCATION because cache_location is passed.
    dataset = load_dataset()
    dataset = tokenize(
        dataset,
        encoder_tokenizer=encoder_tokenizer,
        decoder_tokenizer=decoder_tokenizer,
        max_length=MAX_LENGTH,
        cache_location=DATASET_CACHE_LOCATION,
    )

In [83]:
# Sanity check: show the first training example raw, then decoded back to text
# with the tokenizer that produced each side.
example = dataset["train"][0]
print(example)
print(encoder_tokenizer.decode(example["input_ids"]))
# -100 entries are skipped before decoding (presumably ignore-index label
# padding, which is not a real token id -- TODO confirm).
print(decoder_tokenizer.decode([t for t in example["labels"] if t != -100]))

{'input_ids': [0, 250002, 7450, 250002, 250004, 250002, 28, 4571, 9, 561, 9, 85, 9, 143, 18504, 13, 250004, 250002, 10, 2402, 71, 966, 256, 9, 192, 9, 112, 18504, 112, 304, 250004, 250002, 10, 2402, 71, 966, 256, 9, 192, 9, 112, 18504, 112, 304, 250004, 250002, 51, 9387, 9, 24854, 301, 8152, 9, 208, 17, 735, 9, 93, 9, 4807, 363, 256, 9, 402, 9, 76582, 18504, 429, 250002, 250004, 250002, 51, 9387, 9, 24854, 301, 8152, 9, 208, 17, 735, 9, 93, 9, 4807, 363, 256, 9, 402, 9, 76582, 18504, 429, 250002, 250004, 250002, 842, 794, 80, 9, 6820, 27495, 76, 4571, 9, 561, 9, 85, 9, 143, 18504, 13, 250004, 250002, 17, 735, 9, 93, 9, 4807, 363, 842, 9, 85, 9, 18, 16480, 250004, 250002, 24, 9, 232287, 34, 963, 250004, 250002, 6494, 9, 12283, 9, 13, 250004, 250002, 6, 232287, 34, 963, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# Model

In [30]:
import os

import wandb

# Never hard-code the API key in the notebook: anyone who can read the file
# (or its git history) can use it. The key that used to be written here should
# be revoked. Read it from the environment instead; when WANDB_API_KEY is not
# set, key=None lets wandb.login() fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))



True

In [95]:
from typing import Optional

from transformers import (
    EncoderDecoderModel,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
)

# Root directory under which train() writes each run's checkpoints
# (MODELS_DIR/<run_name>) and final model (MODELS_DIR/<run_name>/best_model).
MODELS_DIR = "./models"


def load_new_model(encoder_path: str, decoder_path: str):
    """Assemble a fresh encoder-decoder model from two pretrained checkpoints.

    The embedding matrices of both halves are resized to the vocabulary sizes
    of the notebook-level ``encoder_tokenizer`` / ``decoder_tokenizer``, the
    decoder config is flagged as a cross-attending decoder, the start and pad
    token ids are taken from the decoder tokenizer, and every encoder
    parameter is frozen.

    Args:
        encoder_path: Checkpoint path or name for the encoder.
        decoder_path: Checkpoint path or name for the decoder.

    Returns:
        The configured ``EncoderDecoderModel``.
    """
    seq2seq = EncoderDecoderModel.from_encoder_decoder_pretrained(
        encoder_path, decoder_path
    )

    # Match each embedding table to its tokenizer's vocabulary size.
    seq2seq.encoder.resize_token_embeddings(len(encoder_tokenizer))
    seq2seq.decoder.resize_token_embeddings(len(decoder_tokenizer))

    decoder_config = seq2seq.decoder.config
    decoder_config.is_decoder = True
    decoder_config.add_cross_attention = True

    shared_config = seq2seq.config
    shared_config.decoder_start_token_id = decoder_tokenizer.bos_token_id
    shared_config.pad_token_id = decoder_tokenizer.pad_token_id

    # Freeze the encoder: none of its parameters receive gradients.
    for weight in seq2seq.encoder.parameters():
        weight.requires_grad = False

    return seq2seq


def load_model(
    *,
    encoder_path: Optional[str] = None,
    decoder_path: Optional[str] = None,
    model_name: Optional[str] = None,
):
    """Load an encoder-decoder model, either newly assembled or pretrained.

    All arguments are keyword-only and optional, so callers only need to pass
    the group they actually use.

    Args:
        encoder_path: Encoder checkpoint; used together with ``decoder_path``.
        decoder_path: Decoder checkpoint; used together with ``encoder_path``.
        model_name: Saved encoder-decoder model to load when the two paths are
            not both given.

    Returns:
        The model built by ``load_new_model`` when both paths are truthy,
        otherwise the model loaded from ``model_name``.

    Raises:
        ValueError: If neither both paths nor ``model_name`` are provided.
    """
    # Both paths take precedence over model_name when everything is supplied.
    if encoder_path and decoder_path:
        return load_new_model(encoder_path, decoder_path)
    if model_name:
        return EncoderDecoderModel.from_pretrained(model_name)
    raise ValueError("Must provide either encoder_path and decoder_path or model_name")


def train(
    *,
    model: EncoderDecoderModel,
    run_name: str,
    lr: float,
    num_epochs: int,
    train_batch_size: int,
    eval_batch_size: int,
    warmup_steps: int,
    eval_steps: int,
):
    """Fine-tune ``model`` on the notebook-level ``dataset`` and save the result.

    Trains on ``dataset["train"]``, evaluates on ``dataset["validation"]``
    every ``eval_steps`` steps, reloads the checkpoint with the lowest
    ``eval_loss`` at the end, and writes the model, metrics and trainer state
    under ``MODELS_DIR/<run_name>``. The run is logged to the Weights & Biases
    project "transliteration" under ``run_name``.

    Args:
        model: The encoder-decoder model to train.
        run_name: Name of the run; used for the output directory and for W&B.
        lr: Learning rate.
        num_epochs: Number of training epochs.
        train_batch_size: Per-device training batch size.
        eval_batch_size: Per-device evaluation batch size.
        warmup_steps: Number of warmup steps.
        eval_steps: Evaluate every this many steps (a positive integer).
    """
    # load_best_model_at_end requires save_steps to be a round multiple of
    # eval_steps. Stay as close as possible to the intended 200 while keeping
    # that invariant (eval_steps=100 -> 200, eval_steps=500 -> 500).
    save_steps = max(1, round(200 / eval_steps)) * eval_steps

    training_args = Seq2SeqTrainingArguments(
        # Run info
        output_dir=f"{MODELS_DIR}/{run_name}",
        run_name=run_name,
        # Logging
        logging_steps=10,
        # Saving
        save_steps=save_steps,
        save_total_limit=5,
        # Train
        bf16=True,
        learning_rate=lr,
        num_train_epochs=num_epochs,
        per_device_train_batch_size=train_batch_size,
        warmup_steps=warmup_steps,
        # Eval
        eval_steps=eval_steps,
        per_device_eval_batch_size=eval_batch_size,
        evaluation_strategy="steps",
        predict_with_generate=True,
        generation_max_length=decoder_tokenizer.model_max_length,
        # Return model
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
    )

    data_collator = DataCollatorForSeq2Seq(
        encoder_tokenizer, model=model, pad_to_multiple_of=128, max_length=MAX_LENGTH
    )

    trainer = Seq2SeqTrainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset["validation"],
        tokenizer=encoder_tokenizer,
        data_collator=data_collator,
    )

    # The run is created here rather than by the Trainer's W&B integration, so
    # the name has to be passed explicitly or run_name never reaches W&B.
    wandb.init(
        project="transliteration",
        name=run_name,
    )

    try:
        train_result = trainer.train()

        trainer.save_model(f"{MODELS_DIR}/{run_name}/best_model")
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
    finally:
        # Close the W&B run even when training fails or is interrupted.
        wandb.finish()

[INFO|configuration_utils.py:726] 2024-03-03 23:27:00,780 >> loading configuration file /workspace/results/encoder/checkpoint-1200/config.json
[INFO|configuration_utils.py:791] 2024-03-03 23:27:00,783 >> Model config XLMRobertaConfig {
  "_name_or_path": "/workspace/results/encoder/checkpoint-1200",
  "architectures": [
    "XLMRobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.38.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250005
}

[INFO|modeling_utils.py:3254] 20

[INFO|training_args.py:1902] 2024-03-03 23:39:02,642 >> PyTorch: setting up devices
[INFO|training_args.py:1611] 2024-03-03 23:39:02,644 >> The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


03/03/2024 23:39:02 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=100,
eval_steps=100,
evaluation_strategy=steps,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2'

[INFO|trainer.py:1812] 2024-03-03 23:39:03,326 >> ***** Running training *****
[INFO|trainer.py:1813] 2024-03-03 23:39:03,327 >>   Num examples = 2,502
[INFO|trainer.py:1814] 2024-03-03 23:39:03,329 >>   Num Epochs = 30
[INFO|trainer.py:1815] 2024-03-03 23:39:03,329 >>   Instantaneous batch size per device = 8
[INFO|trainer.py:1818] 2024-03-03 23:39:03,331 >>   Total train batch size (w. parallel, distributed & accumulation) = 8
[INFO|trainer.py:1819] 2024-03-03 23:39:03,332 >>   Gradient Accumulation steps = 1
[INFO|trainer.py:1820] 2024-03-03 23:39:03,333 >>   Total optimization steps = 9,390
[INFO|trainer.py:1821] 2024-03-03 23:39:03,336 >>   Number of trainable parameters = 584,710,293
[INFO|integration_utils.py:722] 2024-03-03 23:39:03,339 >> Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss
100,4.6165,4.555409
200,4.5657,4.572995
300,4.5014,4.622006
400,4.561,4.635164
500,4.5299,4.573621
600,4.5122,5.031684
700,4.5342,4.83883
800,4.464,7.888671
900,4.4502,7.712603
1000,4.4533,6.49738


[INFO|trainer.py:3376] 2024-03-03 23:40:21,015 >> ***** Running Evaluation *****
[INFO|trainer.py:3378] 2024-03-03 23:40:21,016 >>   Num examples = 278
[INFO|trainer.py:3381] 2024-03-03 23:40:21,017 >>   Batch size = 8
[INFO|trainer.py:3067] 2024-03-03 23:40:29,454 >> Saving model checkpoint to /workspace/results/enc-dec-1/checkpoint-100
[INFO|configuration_utils.py:473] 2024-03-03 23:40:29,468 >> Configuration saved in /workspace/results/enc-dec-1/checkpoint-100/config.json
[INFO|configuration_utils.py:614] 2024-03-03 23:40:29,480 >> Configuration saved in /workspace/results/enc-dec-1/checkpoint-100/generation_config.json
[INFO|modeling_utils.py:2454] 2024-03-03 23:40:34,495 >> Model weights saved in /workspace/results/enc-dec-1/checkpoint-100/model.safetensors
[INFO|tokenization_utils_base.py:2459] 2024-03-03 23:40:34,510 >> tokenizer config file saved in /workspace/results/enc-dec-1/checkpoint-100/tokenizer_config.json
[INFO|tokenization_utils_base.py:2468] 2024-03-03 23:40:34,521 >

### Metrics

In [99]:
import numpy as np
from datasets import load_metric
import os

# The loader announces that trust_remote_code=True will become mandatory for
# this metric, so opt in explicitly.
metric = load_metric("sacrebleu", trust_remote_code=True)

# BASE_PATH is not assigned anywhere in this notebook, so a fresh
# "Restart & Run All" would fail with a NameError here. Keep a value defined
# by an earlier cell if there is one, otherwise use the current directory.
BASE_PATH = globals().get("BASE_PATH", ".")
preds_dir = f"{BASE_PATH}/predictions"
os.makedirs(preds_dir, exist_ok=True)


def postprocess_text(preds, labels):
    """Trim whitespace from predictions and labels.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        A ``(preds, labels)`` tuple: a list of stripped predictions, and a
        list in which each stripped label is wrapped in its own one-item list.
    """
    cleaned_preds = []
    for text in preds:
        cleaned_preds.append(text.strip())

    wrapped_labels = []
    for text in labels:
        wrapped_labels.append([text.strip()])

    return cleaned_preds, wrapped_labels


def compute_metrics(eval_preds):
    """Score generated token ids against the labels.

    Decodes both with ``decoder_tokenizer`` (after replacing -100 by the pad
    token id), writes each prediction followed by its reference to
    ``predictions.txt`` inside ``preds_dir``, and computes the score with the
    module-level ``metric``.

    Args:
        eval_preds: A ``(predictions, labels)`` pair of token-id arrays; when
            ``predictions`` is a tuple, only its first element is used.

    Returns:
        Dict with "bleu" (the metric's "score") and "gen_len" (mean number of
        non-pad tokens per prediction), both rounded to 4 decimals.
    """
    generated, references = eval_preds
    if isinstance(generated, tuple):
        generated = generated[0]

    pad_id = decoder_tokenizer.pad_token_id

    def _decode(token_ids):
        # -100 cannot be decoded, so swap it for the pad id first.
        cleaned = np.where(token_ids != -100, token_ids, pad_id)
        texts = decoder_tokenizer.batch_decode(cleaned, skip_special_tokens=True)
        return cleaned, texts

    generated, pred_texts = _decode(generated)
    _, label_texts = _decode(references)

    # Some simple post-processing
    pred_texts, label_texts = postprocess_text(pred_texts, label_texts)

    # Dump prediction / reference pairs; the file is rewritten on every call.
    out_path = os.path.join(preds_dir, "predictions.txt")
    with open(out_path, "w", encoding="utf-8") as outfile:
        for pred_text, label_group in zip(pred_texts, label_texts):
            outfile.write(pred_text + "\n")
            outfile.write(label_group[0] + "\n")  # each label is a one-item list

    bleu = metric.compute(predictions=pred_texts, references=label_texts)
    lengths = [np.count_nonzero(row != pad_id) for row in generated]

    scores = {"bleu": bleu["score"], "gen_len": np.mean(lengths)}
    return {key: round(value, 4) for key, value in scores.items()}


def _compute_metrics(eval_preds):
    predictions, labels = eval_preds
    print(predictions)
    print(labels)
    return {"bleu": 0}
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds = [pred.strip() for pred in decoded_preds]
    decoded_labels = [[label.strip()] for label in decoded_labels]

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    return {"bleu": result["score"]}

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
