## **Warm-starting BERT2BERT for CNN/Dailymail**

***Note***: This notebook only uses a few training, validation, and test data samples for demonstration purposes. To fine-tune an encoder-decoder model on the full training data, the user should change the training and data preprocessing parameters accordingly as highlighted by the comments.


### **Data Preprocessing**


In [1]:
%%capture
!pip install datasets==1.0.2
!pip install transformers==4.2.1

import datasets
import transformers
import pandas as pd

Load From File

In [6]:
# Check that dataset file is available
import glob
import pathlib

version_files = glob.glob('./versions_*.csv')

if len(version_files) == 0:
  raise SystemError("Cant find any versions!!")


def get_last_modification_time(file_name):
  fname = pathlib.Path(file_name)
  return fname.stat().st_mtime

# TODO: get latest uploaded version..
version_files.sort(key=get_last_modification_time)
version_file = version_files[0]
print(f"Using - {version_file}")

Using - ./versions_7.csv


In [7]:
pd.options.display.max_colwidth = 10000


# Load the dataset into a pandas dataframe.
df = pd.read_csv(version_file)

In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, BertTokenizerFast
from torch.utils.data import random_split

pd.options.display.max_colwidth = 10000
  
tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
model = AutoModelForMaskedLM.from_pretrained("avichr/heBERT")

tokenizer.bos_token = tokenizer.cls_token
tokenizer.eos_token = tokenizer.sep_token

dataset_to_split = datasets.Dataset.from_pandas(df)
print(dataset_to_split)

splitted_datasets = dataset_to_split.train_test_split(
    test_size=0.1,
    train_size=0.9)

test_dataset = splitted_datasets['test']
train_dataset = splitted_datasets['train']

splitted_train_datasets = train_dataset.train_test_split(
    test_size=0.1,
    train_size=0.9)

train_dataset = splitted_train_datasets['train']
validation_dataset = splitted_train_datasets['test']

print(test_dataset)
print(train_dataset)
print(validation_dataset)

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/299k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 11423)
Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 1143)
Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 9252)
Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 1028)


In [9]:
batch_size=4  # change to 16 for full training
encoder_max_length=512
decoder_max_length=128

def clean_title(title):
  title = title.replace(" ", "")
  return title

def normalize_data(batch):
  batch["older"] = [clean_title(title) for title in batch["older"]]
  batch["newer"] = [clean_title(title) for title in batch["newer"]]
  return batch

def process_data_to_model_inputs(batch):
  # tokenize the inputs and labels
  batch = normalize_data(batch)
  inputs = tokenizer(batch["older"], padding="max_length", truncation=True, max_length=encoder_max_length)
  outputs = tokenizer(batch["newer"], padding="max_length", truncation=True, max_length=decoder_max_length)

  batch["input_ids"] = inputs.input_ids
  batch["attention_mask"] = inputs.attention_mask
  batch["decoder_input_ids"] = outputs.input_ids
  batch["decoder_attention_mask"] = outputs.attention_mask
  batch["labels"] = outputs.input_ids.copy()

  # because BERT automatically shifts the labels, the labels correspond exactly to `decoder_input_ids`. 
  # We have to make sure that the PAD token is ignored
  batch["labels"] = [[-100 if token == tokenizer.pad_token_id else token for token in labels] for labels in batch["labels"]]

  return batch

train_data = train_dataset
# only use 32 training examples for notebook - DELETE LINE FOR FULL TRAINING
#train_data = train_data.select(range(32))
print(train_data)

train_data = train_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["older", "newer", "Unnamed: 0", "article_id"]
)
train_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)


val_data = validation_dataset
# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
#val_data = val_data.select(range(16))
print(val_data)

val_data = val_data.map(
    process_data_to_model_inputs, 
    batched=True, 
    batch_size=batch_size, 
    remove_columns=["older", "newer", "Unnamed: 0", "article_id"]
)
val_data.set_format(
    type="torch", columns=["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"],
)

Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 9252)


  0%|          | 0/2313 [00:00<?, ?ba/s]

Dataset(features: {'Unnamed: 0': Value(dtype='int64', id=None), 'article_id': Value(dtype='string', id=None), 'older': Value(dtype='string', id=None), 'newer': Value(dtype='string', id=None)}, num_rows: 1028)


  0%|          | 0/257 [00:00<?, ?ba/s]

### **Warm-starting the Encoder-Decoder Model**

In [10]:
from transformers import EncoderDecoderModel

bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [11]:
# set special tokens
bert2bert.config.decoder_start_token_id = tokenizer.bos_token_id
bert2bert.config.eos_token_id = tokenizer.eos_token_id
bert2bert.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
bert2bert.config.vocab_size = bert2bert.config.decoder.vocab_size
bert2bert.config.max_length = 142
bert2bert.config.min_length = 56
bert2bert.config.no_repeat_ngram_size = 3
bert2bert.config.early_stopping = True
bert2bert.config.length_penalty = 2.0
bert2bert.config.num_beams = 4

### **Fine-Tuning Warm-Started Encoder-Decoder Models**

For the `EncoderDecoderModel` framework, we will use the `Seq2SeqTrainingArguments` and the `Seq2SeqTrainer`. Let's import them.

In [12]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer

Also, we need to define a function to correctly compute the ROUGE score during validation. ROUGE is a much better metric to track during training than only language modeling loss.

In [13]:
!pip install rouge_score

# load rouge for validation
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

Collecting rouge_score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl (22 kB)
Installing collected packages: rouge-score
Successfully installed rouge-score-0.0.4


Downloading:   0%|          | 0.00/1.66k [00:00<?, ?B/s]

Cool! Finally, we start training.

In [None]:
# set training arguments - these params are not really tuned, feel free to change
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    evaluation_strategy="steps",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    predict_with_generate=True,
    logging_steps=2,  # set to 1000 for full training
    save_steps=16,  # set to 500 for full training
    eval_steps=4,  # set to 8000 for full training
    warmup_steps=1,  # set to 2000 for full training
    max_steps=16, # delete for full training
    overwrite_output_dir=True,
    save_total_limit=3,
    fp16=True, 
)

# instantiate trainer
trainer = Seq2SeqTrainer(
    model=bert2bert,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_data,
    eval_dataset=val_data,
)
trainer.train()

  self.args.max_grad_norm,


Step,Training Loss,Validation Loss


To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor'). (Triggered internally at  /pytorch/aten/src/ATen/native/BinaryOps.cpp:467.)
  return torch.floor_divide(self, other)


### **Evaluation**

Awesome, we finished training our dummy model. Let's now evaluated the model on the test data. We make use of the dataset's handy `.map()` function to generate a summary of each sample of the test data.

In [None]:
import datasets
from transformers import BertTokenizer, EncoderDecoderModel

tokenizer = AutoTokenizer.from_pretrained("avichr/heBERT")
model = AutoModelForMaskedLM.from_pretrained("avichr/heBERT")
model.to("cuda")

test_data = test_dataset

# only use 16 training examples for notebook - DELETE LINE FOR FULL TRAINING
#test_data = test_data.select(range(16))

batch_size = 16  # change to 64 for full evaluation

# map data correctly
def generate_better_title(batch):
    # Tokenizer will automatically set [BOS] <text> [EOS]
    # cut off at BERT max length 512
    inputs = tokenizer(batch["older"], padding="max_length", truncation=True, max_length=512, return_tensors="pt")
    input_ids = inputs.input_ids.to("cuda")
    attention_mask = inputs.attention_mask.to("cuda")

    outputs = model.generate(input_ids, attention_mask=attention_mask)

    # all special tokens including will be removed
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str
    #print(batch["older"])
    #print(batch["pred"])

    return batch

results = test_data.map(generate_better_title, batched=True, batch_size=batch_size, remove_columns=["older"])

pred_str = results["pred"]
label_str = results["newer"]

print(generate_better_title({"older": ["◊ê◊†◊©◊ô◊ù◊ü ◊†◊î◊®◊í◊ï", "@@@@", "◊ï◊ï◊ê◊ï! ◊û◊ô ◊ê◊†◊ó◊†◊ï?", "◊ê◊ú◊ï◊î◊ô◊ù◊ê◊ï◊î◊ë◊ê◊ï◊™◊ô◊ï◊ê◊™◊õ◊ï◊ú◊†◊ï◊ï◊ï◊ê◊ï◊ô◊©◊õ◊§◊®◊î"]}))

rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)
#model.save_pretrained('./sample_data')

HBox(children=(FloatProgress(value=0.0, max=72.0), HTML(value='')))


{'older': ['◊ê◊†◊©◊ô◊ù◊ü ◊†◊î◊®◊í◊ï', '@@@@', '◊ï◊ï◊ê◊ï! ◊û◊ô ◊ê◊†◊ó◊†◊ï?', '◊ê◊ú◊ï◊î◊ô◊ù◊ê◊ï◊î◊ë◊ê◊ï◊™◊ô◊ï◊ê◊™◊õ◊ï◊ú◊†◊ï◊ï◊ï◊ê◊ï◊ô◊©◊õ◊§◊®◊î'], 'pred': ['◊ê◊†◊©◊ô◊ù◊ü ◊†◊î◊®◊í◊ï', '@ @ @ @', '◊ï◊ï◊ê◊ï! ◊û◊ô ◊ê◊†◊ó◊†◊ï?', '◊ê◊ú◊ï◊î◊ô◊ù◊ê◊ï◊î◊ë◊ê◊ï◊™◊ô◊ï◊ê◊™◊õ◊ï◊ú◊†◊ï◊ï◊ï◊ê◊ï◊ô◊©◊õ◊§◊®◊î']}
Score(precision=0.02933070866141732, recall=0.02811315252260134, fmeasure=0.028258967629046366)


The fully trained *BERT2BERT* model is uploaded to the ü§ómodel hub under [patrickvonplaten/bert2bert_cnn_daily_mail](https://huggingface.co/patrickvonplaten/bert2bert_cnn_daily_mail). 

The model achieves a ROUGE-2 score of **18.22**, which is even a little better than reported in the paper.

For some summarization examples, the reader is advised to use the online inference API of the model, [here](https://huggingface.co/patrickvonplaten/bert2bert_cnn_daily_mail).