# A minimal example of my problems with training byT5


## Dependencies

In [1]:
import torch
import transformers
import datasets              

## Helper Functions

In [2]:
def models_identical(model1, model2) -> bool:
    unequal_states = []
    for (name_mo, params_mo), (name_mn, params_mn) in zip(model1.state_dict().items(), model2.state_dict().items()):
        assert name_mo == name_mn
        if not torch.equal(params_mo, params_mn):
            unequal_states.append(name_mo)
    return len(unequal_states) == 0

In [3]:
if 'model' in locals():
    del model
    
if 'model_orig' in locals():
    del model_orig


## Set-up

In [4]:
checkpoint_name = "google/byt5-small"
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = transformers.T5ForConditionalGeneration.from_pretrained(checkpoint_name).to(device)
model_orig = transformers.T5ForConditionalGeneration.from_pretrained(checkpoint_name).to(device)

In [5]:
train_data = {"input_ids" : torch.tensor([[105, 114, 114, 35, 101, 100, 117, 0, 0, 0]]), 
              "attention_mask" : torch.tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]) ,
              "labels" : torch.tensor([[117, 100, 101, 35, 114, 114, 105, -100, -100, -100]])}
# train_data = {k: v.to(device) for k, v in train_data.items()} # muss das überhaupt?
print(train_data)
# convert to a Dataset object
ds_train = datasets.Dataset.from_dict(train_data)
print(ds_train[0])
# evaluation data is just a copy of train_data
ds_eval = ds_train

{'input_ids': tensor([[105, 114, 114,  35, 101, 100, 117,   0,   0,   0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]), 'labels': tensor([[ 117,  100,  101,   35,  114,  114,  105, -100, -100, -100]])}
{'input_ids': [105, 114, 114, 35, 101, 100, 117, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 0, 0, 0], 'labels': [117, 100, 101, 35, 114, 114, 105, -100, -100, -100]}


In [6]:
epochs = 200

In [7]:
# mininal training arguments
training_args = transformers.Seq2SeqTrainingArguments(
    output_dir="test",
    predict_with_generate=True,
    # evaluation_strategy = "steps",
    # fp16=True, ####### Hier spielt die Musik 
    eval_steps=100,
    num_train_epochs=epochs,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    )

In [8]:
# initialize the trainer
trainer = transformers.Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_eval,
)


# A. With `Trainer.train()`

In [9]:
trainer.train()

***** Running training *****
  Num examples = 1
  Num Epochs = 200
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 299637760


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=200, training_loss=3.6698907470703124, metrics={'train_runtime': 32.3615, 'train_samples_per_second': 6.18, 'train_steps_per_second': 6.18, 'total_flos': 3588865536000.0, 'train_loss': 3.6698907470703124, 'epoch': 200.0})

In [35]:
# Check whether models are identical
print("Untrained model and trained model are the same:", 
    models_identical(model_orig, model)
    )

print("Untrained model and trained model are the same:", 
    models_identical(model_orig, trainer.model)
    )

Untrained model and trained model are the same: False
Untrained model and trained model are the same: False


In [36]:
print("Model and trainer.model are the same:", 
    models_identical(model, trainer.model)
    )

Model and trainer.model are the same: True


In [37]:
print(trainer.state.log_history)

[{'eval_loss': 4.751281721837586e-06, 'eval_runtime': 0.0678, 'eval_samples_per_second': 14.758, 'eval_steps_per_second': 14.758, 'epoch': 250.0, 'step': 250}, {'loss': 1.3045, 'learning_rate': 0.0, 'epoch': 500.0, 'step': 500}, {'eval_loss': 4.5980661411704205e-07, 'eval_runtime': 0.017, 'eval_samples_per_second': 58.786, 'eval_steps_per_second': 58.786, 'epoch': 500.0, 'step': 500}, {'train_runtime': 88.2316, 'train_samples_per_second': 5.667, 'train_steps_per_second': 5.667, 'total_flos': 8972163840000.0, 'train_loss': 1.3044609375, 'epoch': 500.0, 'step': 500}]


## A. Application

In [13]:
for batch in trainer.get_train_dataloader():
    break
batch_without_labels = {k: v.to(device) for k, v in batch.items() if k != "labels"}
print(batch_without_labels)
print(trainer.model.generate(**batch_without_labels, num_beams=2, early_stopping=True, max_length=10)[0])
print(model.generate(**batch_without_labels, num_beams=2, early_stopping=True, max_length=10)[0])

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0,
  "transformers_version": "4.26.0"
}



{'input_ids': tensor([[105, 114, 114,  35, 101, 100, 117,   0,   0,   0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0]], device='cuda:0')}
tensor([  0, 117, 100, 101,  35, 114, 114, 105, 105, 114], device='cuda:0')
tensor([  0, 117, 100, 101,  35, 114, 114, 105, 105, 114], device='cuda:0')


fp16=False : tensor([  0, 117, 100, 101,  35, 114, 114, 105, 105, 114], device='cuda:0')
fp16=True : tensor([0, 0, 1], device='cuda:0')

# B. pytorch-style loop but with some trainer stuff 

In [None]:
# create an optimizer
trainer.create_optimizer()

# get the first (and only) batch
for batch in trainer.get_train_dataloader():
    break
batch = {k: v.to(device) for k, v in batch.items()}

# loop over batches
for i in range(epochs):
    # get loss
    outputs = trainer.model(**batch)
    loss = outputs.loss
    # gradients and backprop
    loss.backward()
    trainer.optimizer.step()
    trainer.optimizer.zero_grad()

In [None]:
# Check whether models are identical
print("Untrained model and trained model are the same: ", 
      models_identical(model_orig, model)
      )

print("Untrained model and trained model are the same: ", 
      models_identical(model_orig, trainer.model)
      )

In [1]:
del model, model_orig

NameError: name 'model' is not defined