In [1]:
#https://discuss.huggingface.co/t/issues-with-fine-tuning-an-encoder-decoder-model/48880

from transformers import EncoderDecoderConfig, EncoderDecoderModel, BertTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate, datasets

In [2]:
batch_size=8
encoder_max_length=128
decoder_max_length=16

In [3]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertLMHeadModel: ['cls.seq_relationship.bias', 'cls.seq_relations

In [4]:
dataset = datasets.load_dataset("bookcorpus")

In [5]:
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder.decoder_start_token_id = tokenizer.cls_token_id
# model.config.bos_token_id = tokenizer.bos_token_id

In [6]:
model.config.vocab_size = tokenizer.vocab_size
model.config.max_length = 16
model.config.min_length = 4
model.config.no_repeat_ngram_size = 1
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [7]:
model.tie_weights()

In [8]:
# def process_data_to_model_inputs(batch):
#     inputs = tokenizer(batch["title_s_article_s"], padding="max_length", truncation=True, max_length=encoder_max_length)
#     outputs = tokenizer(batch["highlight"], padding="max_length", truncation=True, max_length=decoder_max_length)

#     batch["input_ids"] = inputs.input_ids
#     batch["attention_mask"] = inputs.attention_mask
#     batch["decoder_input_ids"] = outputs.input_ids
#     batch["decoder_attention_mask"] = outputs.attention_mask
#     batch["labels"] = outputs.input_ids.copy()

#     # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
#     # We have to make sure that the PAD token is ignored
#     batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

#     return batch
# # end

In [9]:
def process_data_to_model_inputs(batch):
    inputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=encoder_max_length)
    # outputs = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=decoder_max_length)

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = inputs.input_ids
    batch["decoder_attention_mask"] = inputs.attention_mask
    batch["labels"] = inputs.input_ids.copy()

    # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`.
    # We have to make sure that the PAD token is ignored
    batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

    return batch
# end

In [10]:
# train_data = dataset["train"].select(range(int(len(dataset['train'])/2)))

train_data = dataset["train"].select(range(72000))
# train_data = dataset["train"]

train_data = train_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=['text']
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

Map:   0%|          | 0/72000 [00:00<?, ? examples/s]

In [11]:
val_data = dataset["train"].select(range(72000, 75000))
# val_data = dataset["train"].select(range(int(len(dataset['train'])/10*9)),len(dataset['train']))

val_data = val_data.map(
    process_data_to_model_inputs,
    batched=True,
    batch_size=batch_size,
    remove_columns=['text']
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

In [12]:
training_args = Seq2SeqTrainingArguments(
    output_dir="outputs",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=100,
    save_steps=1000,
    eval_steps=100,
    warmup_steps=0,
    overwrite_output_dir=True,
    save_total_limit=1,
    num_train_epochs = 12,
    learning_rate=1e-4,
    fp16=True,
    lr_scheduler_type = "cosine"
)

In [13]:
rouge = evaluate.load("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)
    # print('START')
    # for x,y in zip(pred_str[:10],label_str[:10]):
    #     print('PRED: ',x,"LABEL: ",y)
    # print('END')
    rouge_output = rouge.compute(
        predictions=pred_str,
        references=label_str
    )
    return rouge_output
# end

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)

Using amp fp16 backend


In [15]:
trainer.train()

***** Running training *****
  Num examples = 72000
  Num Epochs = 12
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 108000
Trainer is attempting to log a value of "{'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': None, 'use_bfloat16': False, 'pruned_heads': {}, 'tie_word_embeddings': True, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'chunk_siz

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.0012,0.00169,6.3e-05,0.0,6.2e-05,6.2e-05
2,0.001,0.010402,0.030519,0.0,0.028822,0.028861
3,0.0001,0.009558,0.020017,0.0,0.01995,0.019981
4,0.0009,0.016854,0.023511,0.002311,0.023412,0.023363
5,0.0,0.008225,0.029937,0.0,0.028753,0.028709
6,0.0,0.010556,0.000305,0.0,0.000309,0.000302
7,0.0,0.00862,0.000101,0.0,0.000101,0.000103
8,0.0,0.015109,0.054362,0.0,0.052278,0.052245
9,0.0,0.009316,0.067935,2.2e-05,0.060481,0.060513
10,0.0,0.007644,0.078172,0.000276,0.067848,0.067829


Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-2000
Configuration saved in outputs/checkpoint-2000/config.json
Model weights saved in outputs/checkpoint-2000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-2000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-2000/special_tokens_map.json
Deleting older checkpoint [outputs/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to outputs/checkpoint-3000
Configuration saved in outputs/checkpoint-3000/config.json
Model weights saved in outputs/checkpoint-3000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-3000/tokenizer_config.json
Special t

TrainOutput(global_step=108000, training_loss=0.0036797359274930936, metrics={'train_runtime': 23403.1064, 'train_samples_per_second': 36.918, 'train_steps_per_second': 4.615, 'total_flos': 2.6501217140736e+17, 'train_loss': 0.0036797359274930936, 'epoch': 12.0})

In [27]:
dataset['train'][0]

{'text': 'usually , he would be tearing around the living room , playing with his toys .'}

In [40]:
test_input = tokenizer.encode_plus('good morning', return_tensors='pt')
test_input = test_input.to('cuda')

In [36]:
# test_input

{'input_ids': tensor([[  101,  2788,  1010,  2002,  2052,  2022, 13311,  2105,  1996,  2542,
          2282,  1010,  2652,  2007,  2010, 10899,   102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [37]:
# [tokenizer.decode(i) for i in model.generate(test_input['input_ids']).cpu().tolist()[0]]

['[ C L S ]',
 '[ S E P ]',
 ',',
 'a n d',
 'w a s',
 'b e',
 'c o',
 'n',
 'd',
 'h a d',
 'h a s',
 'i s',
 'd o e s',
 'd o',
 'h a v e',
 'o f']

In [41]:
model.generate(test_input['input_ids'])

tensor([[ 101,  102, 1010, 1998, 2001, 2022, 2522, 1050, 1040, 2018, 2038, 2003,
         2515, 2079, 2031, 1997]], device='cuda:0')