In [None]:
from transformers import BartTokenizer, BartModel
import tensorflow as tf
import numpy as np

In [None]:
# Load the pretrained tokenizer for the 'facebook/bart-base' checkpoint; later cells
# use it both to encode the articles/summaries and to decode generated token ids.
tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

In [None]:
# Column names of the CSV files and, for each column, an empty-string default of type tf.string
CSV_COLUMNS = ['id', 'article', 'highlights']
RECORD_DEFAULTS = [tf.constant('', dtype=tf.string) for _ in CSV_COLUMNS]

In [None]:
# Load the train / validation / test splits from their CSV files
def _load_split(csv_path):
    """Open one CSV file as a CsvDataset using RECORD_DEFAULTS and header=True."""
    return tf.data.experimental.CsvDataset(
        filenames=csv_path,
        record_defaults=RECORD_DEFAULTS,
        header=True,
    )


train_dataset = _load_split('train.csv')
valid_dataset = _load_split('validation.csv')
test_dataset = _load_split('test.csv')


In [None]:
# Turn one CSV row (a sequence of positional fields) into a name -> value mapping
def to_dict(*fields):
    """Pair each positional field with the column name at the same index in CSV_COLUMNS.

    Pairing stops at the shorter of the two sequences, so surplus fields or
    surplus column names are dropped.
    """
    return {column: value for column, value in zip(CSV_COLUMNS, fields)}

# Apply the row -> dict conversion to every split in one pass
train_dataset_dict, valid_dataset_dict, test_dataset_dict = (
    split.map(to_dict)
    for split in (train_dataset, valid_dataset, test_dataset)
)


In [None]:
# Eager tokenization routine (invoked through tf.py_function)
def tokenize_bart(article_tensor, highlight_tensor):
    """Tokenize one article / highlight pair with the module-level `tokenizer`.

    Args:
        article_tensor: object whose ``.numpy()`` yields the UTF-8 bytes of the article.
        highlight_tensor: object whose ``.numpy()`` yields the UTF-8 bytes of the summary.

    Returns:
        A 3-tuple ``(input_ids, attention_mask, labels)`` of NumPy arrays: the
        article encoded to 512 positions, its attention mask, and the summary
        encoded to 128 positions with every pad-token id replaced by -100.
    """
    def _encode(tensor, limit):
        # Decode the raw bytes, then pad / truncate to exactly `limit` tokens.
        return tokenizer(
            tensor.numpy().decode('utf-8'),
            padding='max_length',
            truncation=True,
            max_length=limit,
            return_tensors='np'
        )

    source = _encode(article_tensor, 512)
    target = _encode(highlight_tensor, 128)

    # Replace padding positions in the target with -100
    # (presumably so the loss skips them -- confirm against the model's loss).
    target_ids = target['input_ids'][0]
    labels = np.where(target_ids == tokenizer.pad_token_id, -100, target_ids)

    return source['input_ids'][0], source['attention_mask'][0], labels

In [None]:
def tf_tokenize(example):
  """Run `tokenize_bart` on one example through tf.py_function.

  Args:
    example: mapping that provides 'article' and 'highlights' entries.

  Returns:
    Dict with 'input_ids' and 'attention_mask' (static shape [512]) and
    'labels' (static shape [128]), each requested from py_function as tf.int32.
  """
  ids, mask, targets = tf.py_function(
      func=tokenize_bart,
      inp=[example['article'], example['highlights']],
      Tout=(tf.int32, tf.int32, tf.int32),
  )

  # Declare the static shapes explicitly on the py_function outputs.
  ids.set_shape([512])
  mask.set_shape([512])
  targets.set_shape([128])

  return dict(input_ids=ids, attention_mask=mask, labels=targets)

In [None]:
def _build_pipeline(dataset, *, shuffle, repeat, batch_size=8, shuffle_buffer=1000):
    """Tokenize a split and turn it into batches ready for Keras.

    Args:
        dataset: dataset of dicts with 'article' and 'highlights' entries
            (the keys tf_tokenize reads).
        shuffle: if True, shuffle examples with a buffer of `shuffle_buffer`.
        repeat: if True, repeat the batched dataset indefinitely.
        batch_size: number of examples per batch.
        shuffle_buffer: buffer size passed to `shuffle`.

    Returns:
        The transformed dataset, with `prefetch` applied as the final step.
    """
    pipeline = dataset.map(tf_tokenize, num_parallel_calls=tf.data.AUTOTUNE)
    if shuffle:
        pipeline = pipeline.shuffle(shuffle_buffer)
    pipeline = pipeline.batch(batch_size)
    if repeat:
        pipeline = pipeline.repeat()
    # Prefetch last so it overlaps production of the final, fully transformed batches.
    return pipeline.prefetch(tf.data.AUTOTUNE)


# Training: shuffled and repeated, because fit() is driven by steps_per_epoch.
tokenized_train_dataset = _build_pipeline(train_dataset_dict, shuffle=True, repeat=True)

# Validation: NOT shuffled, so every epoch is scored on the same examples and
# val_loss is comparable across epochs. Still repeated, because fit() is called
# with validation_steps and must never run out of batches.
tokenized_validation_dataset = _build_pipeline(valid_dataset_dict, shuffle=False, repeat=True)

# Test: finite and in file order, so evaluate()/predict() without `steps`
# terminates after one pass instead of looping forever.
tokenized_test_dataset = _build_pipeline(test_dataset_dict, shuffle=False, repeat=False)

In [None]:
# Create the model: load the 'facebook/bart-base' checkpoint through the TensorFlow
# sequence-to-sequence auto class.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell at the top.
from transformers import TFAutoModelForSeq2SeqLM
model = TFAutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.
All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [None]:
# Adam optimizer with a fixed learning rate of 5e-5; handed to model.compile in the next cell.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

In [None]:
# No explicit loss is given to compile(); presumably the model falls back to its
# built-in loss computed from the 'labels' entry of each batch -- confirm.
model.compile(optimizer=optimizer)

# Both datasets repeat indefinitely, so the length of each epoch and of each
# validation pass is set by the explicit step counts.
model.fit(
    tokenized_train_dataset,
    validation_data=tokenized_validation_dataset,
    epochs=3,
    steps_per_epoch=625,
    validation_steps=187,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7b38a01c7210>

In [None]:
# Sample news article used to try out the fine-tuned model.
# The stray mojibake character left by a mis-decoded non-breaking space
# (after "the" in "the Kamla Nehru") has been replaced by a plain space;
# the rest of the article is unchanged.
text = """
A drunk teenage boy had to be rescued by security after jumping into a lions' enclosure at a zoo in western India.
Rahul Kumar, 17, clambered over the enclosure fence at the Kamla Nehru Zoological Park in Ahmedabad, and began running towards the animals,
shouting he would 'kill them'. Mr Kumar explained afterwards that he was drunk and 'thought I'd stand a good chance' against the predators.
Next level drunk: Intoxicated Rahul Kumar, 17, climbed into the lions' enclosure at a zoo in Ahmedabad and began running towards the animals
shouting 'Today I kill a lion!' Mr Kumar had been sitting near the enclosure when he suddenly made a dash for the lions, surprising zoo security.
The intoxicated teenager ran towards the lions, shouting: 'Today I kill a lion or a lion kills me!' A zoo spokesman said: 'Guards had earlier spotted
him close to the enclosure but had no idea he was planing to enter it. 'Fortunately, there are eight moats to cross before getting to where
the lions usually are and he fell into the second one, allowing guards to catch up with him and take him out. 'We then handed him over to the
police.' Brave fool: Fortunately, Mr Kumar  fell into a moat as he ran towards the lions and could be rescued by zoo security staff before reaching
the animals (stock image) Kumar later explained: 'I don't really know why I did it. 'I was drunk and thought I'd stand a good chance.
A police spokesman said: 'He has been cautioned and will be sent for psychiatric evaluation. 'Fortunately for him, the lions were asleep and
the zoo guards acted quickly enough to prevent a tragedy similar to that in Delhi.
Last year a 20-year-old man was mauled to death by a tiger in the Indian capital after climbing into its enclosure at the city zoo.
"""

In [None]:
# Encode the sample article as TensorFlow tensors, padded / truncated to 512 tokens
inputs = tokenizer(
    text,
    max_length=512,
    truncation=True,
    padding='max_length',
    return_tensors="tf",
)

TensorFlow and JAX classes are deprecated and will be removed in Transformers v5. We recommend migrating to PyTorch classes or pinning your version of Transformers.


In [None]:
# Generate summary token ids for the encoded article (max_length=128)
summary_ids = model.generate(
    inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=128
)

In [None]:
# Decode the first generated sequence back to text. The keyword is
# `skip_special_tokens` (the original `skip_special_char` is not a decode
# argument), so special tokens are now actually stripped from the summary.
prediction = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [None]:
# Save the model and its tokenizer side by side in the 'myFineTunedBart' directory.
model.save_pretrained('myFineTunedBart')
tokenizer.save_pretrained('myFineTunedBart')

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0}


('myFineTunedBart/tokenizer_config.json',
 'myFineTunedBart/special_tokens_map.json',
 'myFineTunedBart/vocab.json',
 'myFineTunedBart/merges.txt',
 'myFineTunedBart/added_tokens.json')

In [None]:
# ROUGE scores a candidate (the model output) against a reference text, so the
# generated `prediction` must be the candidate. The original cell had the two
# roles swapped, which exchanges the reported precision and recall.
# NOTE(review): `example_text` and `rouge` are not defined in any visible cell.
# This presumably expects `example_text` to hold the human-written summary and
# `rouge` to be a scorer exposing get_scores(hypotheses, references) -- confirm,
# and define both before this cell so the notebook survives Restart & Run All.
candidate = prediction
reference = example_text
rouge.get_scores(candidate, reference)