In [1]:
!pip install transformers==4.20.0
!pip install keras_nlp==0.3.0
!pip install datasets
!pip install huggingface-hub
!pip install nltk
!pip install rouge-score



In [2]:
import os
import logging

import nltk
import numpy as np
import tensorflow as tf
from tensorflow import keras

# Set TOKENIZERS_PARALLELISM to "false" for the Hugging Face tokenizers backend.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Restrict TensorFlow's logger to ERROR-level messages.
tf.get_logger().setLevel(level=logging.ERROR)



In [3]:
import keras_nlp

In [4]:
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

In [5]:
from datasets  import load_dataset

In [6]:
# Identifiers for the pretrained checkpoint and the dataset loaded below.
model_path, data_path = "t5-small", "EdinburghNLP/xsum"

In [7]:
# Load only the "train" split of the dataset named by `data_path`.
dataset = load_dataset(path=data_path, split="train")

In [8]:
# Display the dataset summary (feature names and row count).
dataset

Dataset({
    features: ['document', 'summary', 'id'],
    num_rows: 204045
})

In [9]:
# Inspect the record at index 0 to see what a raw example looks like.
dataset[0]

 'summary': 'Clean-up operations are continuing across the Scottish Borders and Dumfries and Galloway after flooding caused by Storm Frank.',
 'id': '35232142'}

In [10]:
# Instantiate the tokenizer belonging to the `model_path` checkpoint.
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_path)

In [11]:
# Sanity-check the tokenizer on a single document. Index the row first
# (`dataset[1]`) so only that record is read; `dataset['document']` would
# fetch the entire column before the `[1]` is applied.
tokenizer(dataset[1]['document'], truncation=True, return_tensors='tf')

{'input_ids': <tf.Tensor: shape=(1, 192), dtype=int32, numpy=
array([[   71,  1472,  6196,   877,   326,    44,     8,  9108,    86,
           29,    16,  6000,  1887,    44,    81, 11484,    10,  1755,
          272,  4209,    30,  1856,    11,  2554,   130,  1380,    12,
         1175,     8,  1595,     5,   282,    79,     3,  9094,  1067,
           79,  1509,     8,   192, 14264,     6,     3, 16669,   596,
           18,   969,    18,  1583,    16,     8,   443,  2447,     6,
            3,    35,  6106, 19565,    57, 12314,     7,     5,   555,
           13,     8,  1552,  1637,    19,    45,  3434,     6,     8,
          119,    45,  1473,    11, 14441,     5,    94,    47,    70,
          166,   706,    16,  5961,  5316,     5,    37,  2535,    13,
           80,    13,     8, 14264,   243,   186,    13,     8,  9234,
          141,   646,   525, 12770,     7,    30,  1476,    11,   175,
          141,   118, 10932,     5,  2867,  1637,    43, 13666,  3709,
        11210, 

In [12]:
# Carve out a subset of the data: 25% of the rows for training and 5% for
# evaluation. The fixed seed makes the random split reproducible, so a
# Restart & Run All produces the same train/test rows every time.
final_dataset = dataset.train_test_split(
    train_size=0.25, test_size=0.05, seed=42
)

In [13]:
# Display the train/test splits and their row counts.
final_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 51011
    })
    test: Dataset({
        features: ['document', 'summary', 'id'],
        num_rows: 10203
    })
})

In [14]:
def tokenization(examples):
    """Tokenize a batch of records for summarization fine-tuning.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping with a ``"document"`` list (source texts)
            and a ``"summary"`` list (target texts).

    Returns:
        dict with ``input_ids``, ``labels`` and ``attention_mask``; each value
        is a list holding one *unpadded* list of token ids per example.
    """
    # Prefix every source text with the task instruction.
    inputs = ['summarize: ' + doc for doc in examples["document"]]

    # Deliberately no `padding=True` / `return_tensors` here. Padding at this
    # stage pads every example to the longest one in the whole map batch and
    # stores that padding in the dataset, and it pads the labels with the pad
    # token id, so padding positions would count towards the loss.
    # `DataCollatorForSeq2Seq` pads each training batch dynamically and fills
    # label padding with -100, which the loss ignores.
    model_inputs = tokenizer(inputs, truncation=True)

    with tokenizer.as_target_tokenizer():
        targets = tokenizer(examples['summary'], truncation=True)

    return {'input_ids': model_inputs['input_ids'],
            'labels': targets['input_ids'],
            'attention_mask': model_inputs['attention_mask']}

In [15]:
# Apply `tokenization` to every split, one batch of rows at a time.
tokenized_dataset = final_dataset.map(function=tokenization, batched=True)

  0%|          | 0/52 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [16]:
# Confirm the tokenizer output columns were added next to the raw columns.
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 51011
    })
    test: Dataset({
        features: ['document', 'summary', 'id', 'input_ids', 'labels', 'attention_mask'],
        num_rows: 10203
    })
})

In [17]:
# Drop the raw text and id columns so only the tokenized fields remain.
processed_data = tokenized_dataset.remove_columns(
    column_names=['document', 'summary', 'id']
)

In [18]:
# Display the remaining columns of each split.
processed_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 51011
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 10203
    })
})

In [19]:
# Load the pretrained seq2seq model (TensorFlow) for the `model_path` checkpoint.
model = TFAutoModelForSeq2SeqLM.from_pretrained(pretrained_model_name_or_path=model_path)

All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


In [20]:
# Collator that assembles seq2seq examples into batches of TensorFlow tensors.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    return_tensors="tf",
)

In [21]:
# Training pipeline: shuffled batches of 32, collated by `data_collator`;
# the final partial batch is dropped.
train_dataset = model.prepare_tf_dataset(
    processed_data['train'],
    collate_fn=data_collator,
    tokenizer=tokenizer,
    batch_size=32,
    shuffle=True,
    drop_remainder=True,
)

In [22]:
# Evaluation pipeline: fixed order, batches of 32, collated by `data_collator`.
# Keep the final partial batch (`drop_remainder=False`) so every test example
# is evaluated; dropping it would silently exclude the last
# `num_rows % 32` examples from the validation loss and metrics.
test_dataset = model.prepare_tf_dataset(
    processed_data['test'],
    batch_size=32,
    tokenizer=tokenizer,
    collate_fn=data_collator,
    shuffle=False,
    drop_remainder=False,
)

In [23]:
# Adam with a fixed learning rate of 2e-5.
optimizer = keras.optimizers.Adam(
    learning_rate=2e-5,
)
# No `loss` argument is given, so the model's internal loss computation is used.
model.compile(
    optimizer=optimizer,
)

No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! To disable this behaviour please pass a loss argument, or explicitly pass `loss=None` if you do not want your model to compute a loss.


In [24]:
# ROUGE-L metric object; `metric_fn` below calls it on decoded label and
# prediction strings and reads its "f1_score" entry.
rouge_l = keras_nlp.metrics.RougeL()

In [25]:
def metric_fn(eval_predictions):
    """Compute the ROUGE-L F1 score for one evaluation pass.

    Args:
        eval_predictions: ``(predictions, labels)`` pair of token-id
            sequences. Label positions that were masked out hold negative ids.

    Returns:
        dict: ``{"RougeL": <F1 score>}``.
    """
    predictions, labels = eval_predictions
    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # Negative (masked) ids cannot be decoded, so swap them for the pad token.
    # Build new arrays rather than writing into the caller's label arrays.
    pad_id = tokenizer.pad_token_id
    clean_labels = [np.where(np.asarray(row) < 0, pad_id, row) for row in labels]
    decoded_labels = tokenizer.batch_decode(clean_labels, skip_special_tokens=True)

    # Keras metrics accumulate state across calls. Reset first so the score
    # reflects only this evaluation pass, not a running average over all
    # previous epochs.
    rouge_l.reset_state()
    result = rouge_l(decoded_labels, decoded_predictions)

    # Report only the F1 component of the ROUGE-L result.
    return {"RougeL": result["f1_score"]}

In [26]:
from transformers.keras_callbacks import KerasMetricCallback

metric_callback = KerasMetricCallback(
    metric_fn, eval_dataset=test_dataset)

In [27]:
callbacks = [metric_callback]

In [28]:
model.fit(train_dataset, validation_data=test_dataset, epochs=5, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x780929b3b2e0>

In [29]:
# Write the fine-tuned weights to 'summarized_model' (relative to the working directory).
model.save_weights(filepath='summarized_model')

In [32]:
# Save the tokenizer's vocabulary file into the given directory (the recorded
# output shows a single `spiece.model` file being written).
# NOTE(review): the absolute path is Kaggle-specific — adjust it when running elsewhere.
tokenizer.save_vocabulary("/kaggle/working/")

('/kaggle/working/spiece.model',)

In [33]:
from transformers import pipeline

In [34]:
# Wrap the fine-tuned model and its tokenizer in a TensorFlow summarization pipeline.
pipe = pipeline(
    task="summarization",
    model=model,
    tokenizer=tokenizer,
    framework="tf",
)

In [43]:
# Show the record at index 12 (document plus reference summary) for
# comparison with the generated summary below.
dataset[12]

{'document': 'Administrators confirmed the redundancies affecting 38 staff at Galashiels-based Murray and Burrell.\nThe business, established in 1928, went into administration last week citing "adverse trading conditions".\nThere are hopes some of the workers affected could find posts at another building firm in nearby Melrose which currently requires staff.\nThomson Cooper partner Richard Gardiner was appointed as administrator at Murray and Burrell on Monday.\nA statement confirmed: "Directors explored all options in an effort to preserve trading and jobs.\n"Regrettably, 38 jobs were lost as there is no prospect of continuing to trade."\nSouth of Scotland MSP Rachael Hamilton described it as a "sad day for the Borders".\nHowever, some of the workers laid off could find employment with a Melrose-based company.\nJS Crawford has said that, with several housing projects on its books, it needs staff.',
 'summary': 'Dozens of jobs have been lost after efforts to save an historic building f

In [44]:
# Summarize the document of record 12. Index the row first so only that record
# is read; `dataset['document']` would fetch the entire column before `[12]`
# is applied.
pipe(dataset[12]['document'])

Your max_length is set to 200, but you input_length is only 195. You might consider decreasing max_length manually, e.g. summarizer('...', max_length=97)


[{'summary_text': 'A small business in Scotland has been sacked after a number of workers were laid off by a company in a bid to preserve trading.'}]

# Thanks