In [4]:
from transformers import LEDTokenizer, LEDForConditionalGeneration
from datasets import load_dataset

# Length budgets (in tokens) for source documents and target summaries.
max_input_length, max_target_length = 16384, 256

# Tokenizer and seq2seq model are loaded from the same pretrained checkpoint.
led_checkpoint = "allenai/led-base-16384"
model = LEDForConditionalGeneration.from_pretrained(led_checkpoint)
tokenizer = LEDTokenizer.from_pretrained(led_checkpoint)

In [5]:
# Load the "english" configuration of the dennlinger/eur-lex-sum dataset.
dataset = load_dataset(path="dennlinger/eur-lex-sum", name="english")

In [9]:
# Tokenise data 
def preprocess_data(dataset):
    """Tokenise the training split into padded model inputs and labels.

    Reads the ``'reference'`` (source text) and ``'summary'`` (target text)
    columns of ``dataset['train']`` and encodes them with the module-level
    ``tokenizer``. Inputs are padded/truncated to ``max_input_length`` tokens
    and targets to ``max_target_length`` tokens.

    Args:
        dataset: Mapping with a ``'train'`` split that exposes ``'reference'``
            and ``'summary'`` columns of strings.

    Returns:
        The tokenizer's encoding of the inputs (PyTorch tensors) with two
        extra entries:

        * ``global_attention_mask``: tensor shaped like ``input_ids`` holding
          1 at the first position of every row and 0 everywhere else.
        * ``labels``: target token ids, with padding positions set to -100.
    """
    train_split = dataset["train"]
    # Materialise the columns as plain lists; the tokenizer expects a list of
    # strings and the column accessor may hand back a lazy sequence instead.
    inputs = list(train_split["reference"])
    targets = list(train_split["summary"])

    # Tokenise inputs
    model_inputs = tokenizer(
        inputs,
        max_length=max_input_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # The first token of each sequence gets global attention so it can see the
    # whole input context. The mask is built as a tensor matching input_ids so
    # every entry of the returned encoding has the same type, and no row
    # shares storage with another (the old `[row] * n` list aliased one row).
    input_ids = model_inputs["input_ids"]
    global_attention_mask = input_ids.new_zeros(input_ids.shape)
    global_attention_mask[:, 0] = 1
    model_inputs["global_attention_mask"] = global_attention_mask

    # Tokenise targets. `text_target` replaces the deprecated
    # `as_target_tokenizer()` context manager.
    labels = tokenizer(
        text_target=targets,
        max_length=max_target_length,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )

    # Replace padding ids with -100 so padded positions are ignored by the
    # loss instead of being trained on.
    label_ids = labels["input_ids"]
    model_inputs["labels"] = label_ids.masked_fill(
        label_ids == tokenizer.pad_token_id, -100
    )
    return model_inputs