<a href="https://colab.research.google.com/github/code-infected/Pointer-summary-/blob/main/pointer_summary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install torch torchvision
!pip install pytorch-nlp
!pip install transformers
!pip install datasets transformers[sentencepiece]
!pip install sentencepiece
!pip install transformers tensorflow tensorflow_datasets
!pip install numpy




In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TFGPT2LMHeadModel

# Pretrained GPT-2 tokenizer, extended with "[PAD]" as its padding token so
# that sequences can later be encoded with padding="max_length".
# NOTE(review): adding a token can grow the vocabulary; any model fed these
# ids presumably needs its embeddings sized to len(tokenizer) - confirm.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens(dict(pad_token='[PAD]'))

# First 10% of the CNN/DailyMail training split.
dataset = tfds.load(name="cnn_dailymail", split="train[:10%]")

def preprocess_data(data):
    """Decode one dataset record into plain Python strings.

    Args:
        data: Mapping with "article" and "highlights" entries; each value must
            expose ``.numpy()`` returning UTF-8 encoded bytes.

    Returns:
        An ``(article, summary)`` tuple of ``str``.
    """
    raw_fields = (data[key].numpy() for key in ("article", "highlights"))
    article_text, summary_text = (raw.decode("utf-8") for raw in raw_fields)
    return article_text, summary_text

max_seq_length = 512
batch_size = 4

# Decode every record once and keep only the reference summaries.
# NOTE(review): the article text is decoded and then discarded, so this
# fine-tunes a language model on highlights alone rather than on
# article -> summary pairs - confirm that this is intended.
summaries = [summary for _, summary in map(preprocess_data, dataset)]

# Tokenize in one call. `truncation=True` keeps every row at exactly
# `max_seq_length` ids (otherwise long summaries yield ragged rows), and the
# tokenizer's own attention mask marks real tokens (1) versus padding (0).
encodings = tokenizer(
    summaries,
    padding="max_length",
    truncation=True,
    max_length=max_seq_length,
    return_tensors="tf",
)
tokenized_data = encodings["input_ids"]
padding_mask = encodings["attention_mask"]

# Labels are the input ids with padding positions replaced by -100, the value
# the Hugging Face loss ignores, so padding does not contribute to the loss.
masked_labels = tf.where(
    padding_mask > 0,
    tokenized_data,
    tf.constant(-100, dtype=tokenized_data.dtype),
)

# The first 80% of the examples form the training set.
num_train = int(0.8 * int(tokenized_data.shape[0]))
train_dataset = (
    tf.data.Dataset.from_tensor_slices(
        ({"input_ids": tokenized_data[:num_train],
          "attention_mask": padding_mask[:num_train]},
         {"labels": masked_labels[:num_train]}))
    .shuffle(max(num_train, 1))
    # Batching gives every tensor the leading batch axis the model expects.
    .batch(batch_size)
)

# The loop below relies on tf.GradientTape and `trainable_variables`, so the
# TensorFlow implementation of GPT-2 is required (not the PyTorch class).
model = TFGPT2LMHeadModel.from_pretrained("gpt2")
# The tokenizer may carry tokens added after pretraining (e.g. a pad token);
# size the embedding matrix to match so every token id is a valid index.
model.resize_token_embeddings(len(tokenizer))
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)

num_epochs = 5
for epoch in range(num_epochs):
    total_loss = 0.0
    num_batches = 0

    for features, targets in train_dataset:
        with tf.GradientTape() as tape:
            # Pass labels at full length: the causal-LM head shifts them by
            # one position internally, so pre-shifting would misalign them.
            outputs = model(
                features["input_ids"],
                attention_mask=features["attention_mask"],
                labels=targets["labels"],
                training=True,
            )
            # The loss may come back per-token or already reduced depending
            # on the library version; reduce_mean yields a scalar either way.
            loss = tf.reduce_mean(outputs.loss)

        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        # Accumulate a Python float so no tensors are retained across steps.
        total_loss += float(loss)
        num_batches += 1

    # Guard against an empty training set.
    average_loss = total_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1}/{num_epochs}, Average Loss: {average_loss}")

prompt = "This is a news article about the latest developments in the COVID-19 pandemic."

# Encode the prompt WITHOUT padding: padding it to the training length would
# make the input longer than `max_length` below and would leave the model
# continuing from padding tokens instead of from the prompt text.
encoded_prompt = tokenizer(prompt, return_tensors="tf")
input_ids = encoded_prompt["input_ids"]

# `max_length` counts prompt tokens plus generated tokens. The attention mask
# and pad token id are passed explicitly so generation does not have to guess.
generated_summary = model.generate(
    input_ids,
    attention_mask=encoded_prompt["attention_mask"],
    max_length=100,
    num_return_sequences=1,
    no_repeat_ngram_size=2,
    pad_token_id=tokenizer.pad_token_id,
)

# Drop special tokens (padding / end-of-text) from the decoded output.
summary = tokenizer.decode(generated_summary[0], skip_special_tokens=True)
print("Generated Summary:")
print(summary)
