# Install dependencies and libraries

In [1]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (

# Import Packages

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer
from datasets import load_dataset
from torch.cuda.amp import GradScaler, autocast
import os
import warnings
warnings.filterwarnings('ignore')

# Stage 1: Data Collection

In [3]:
def load_data():
  dataset = load_dataset("cnn_dailymail", "3.0.0", split="train[:200]")
  return dataset

In [4]:
dataset_test = load_data()

for sample in dataset_test:
  text = sample['article']

print(text[:1000])

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

(Mental Floss) -- 1. Bobby Murcer's biggest fan . Former New York Yankee Bobby Murcer warms up  during Old Timers Day Yankee Stadium on July 7. Our first story has a fairy-tale middle and a horrible ending. In August of '77, Bobby Murcer of the Cubs promised to hit a home run for terminally ill fan Scott Crull. That night, Murcer hit two of them. Pretty amazing, especially when you consider Murcer only hit nine homers the whole next season. But that's not why Crull -- a 12-year-old from Calumet City, Illinois -- makes this list. Broadcasting the game nationally on ABC, Keith Jackson told the country how Murcer had fulfilled the dying boy's last wish. Eyes watered, spines tingled. There was only one problem -- nobody had ever told the boy he was dying. His parents were horrified. Weeks later, Crull passed away. 2. The good luck charmers . Every sport has its own strange traditions. I'd argue hockey's "throwing an octopus on the ice for good luck" is the weirdest. Tossing the eight-tenta

# Stage 2: Data preprocessing

In [5]:
class SummarizationDataset(Dataset):
  def __init__(self, dataset, tokenizer, max_input_length=256, max_target_length=128):
    self.dataset = dataset
    self.tokenizer = tokenizer
    self.max_input_length = max_input_length
    self.max_target_length = max_target_length

  def __len__(self):
    return len(self.dataset)

  def __getitem__(self, idx):
    article = self.dataset[idx]['article']
    summary = self.dataset[idx]['highlights']

    article = "summarize: " + article

    input_encoding = self.tokenizer(
        article,
        max_length=self.max_input_length,
        padding="max_length",
        truncation = True,
        return_tensors = "pt"
    )

    target_encoding = self.tokenizer(
        summary,
        max_length = self.max_target_length,
        padding = "max_length",
        truncation = True,
        return_tensors = "pt"
    )

    return {
        "input_ids": input_encoding["input_ids"].squeeze(),
        "attention_mask": input_encoding["attention_mask"].squeeze(),
        "labels": target_encoding["input_ids"].squeeze()
    }


# Stage 3: Model Building

In [6]:
class CustomTransformer(nn.Module):
  def __init__(self, vocab_size, d_model = 256, nhead = 4, num_encoder_layers=3,
               num_decoder_layers=3, dim_feedforward=1024, dropout=0.1):
    super(CustomTransformer, self).__init__()
    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoder = nn.Parameter(torch.zeros(1,512,d_model))
    self.transformer = nn.Transformer(
        d_model = d_model,
        nhead = nhead,
        num_encoder_layers = num_encoder_layers,
        num_decoder_layers = num_decoder_layers,
        dropout = dropout
    )

    self.fc_out = nn.Linear(d_model, vocab_size)
    self.d_model = d_model

  def forward(self, src, tgt, src_mask=None, tgt_mask=None):
    src = self.embedding(src) * torch.sqrt(torch.tensor(self.d_model, dtype = torch.float))
    src = src + self.pos_encoder[:, :src.size(1),:]
    tgt = self.embedding(tgt) * torch.sqrt(torch.tensor(self.d_model, dtype = torch.float))
    tgt = tgt + self.pos_encoder[:, :tgt.size(1),:]

    output = self.transformer(src.transpose(0,1), tgt.transpose(0,1), src_mask, tgt_mask)
    output = self.fc_out(output)
    return output.transpose(0,1)

  def generate_square_subsequent_mask(self, sz):
    mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0,1)
    mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
    return mask


In [7]:
def load_model_and_tokenizer():
  tokenizer = T5Tokenizer.from_pretrained("t5-small")
  model = CustomTransformer(vocab_size = tokenizer.vocab_size)
  return model, tokenizer

# Stage 4: Model Training

In [16]:
def train_model(model, dataloader, tokenizer, epochs=1, device="cuda" if torch.cuda.is_available() else "cpu"):
  model.to(device)
  optimizer = optim.Adam(model.parameters(), lr=0.001)
  criterion = nn.CrossEntropyLoss(ignore_index= tokenizer.pad_token_id)
  scaler = GradScaler()

  for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
      input_ids = batch["input_ids"].to(device)
      attention_mask = batch["attention_mask"].to(device)
      labels = batch["labels"].to(device)

      optimizer.zero_grad()

      decoder_input = labels[:,:-1]
      decoder_target = labels[:,1:]

      tgt_mask = model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)

      with autocast():
        output = model(input_ids, decoder_input,tgt_mask=tgt_mask)
        loss = criterion(output.reshape(-1,output.size(-1)), decoder_target.reshape(-1))

      scaler.scale(loss).backward()
      scaler.step(optimizer)
      scaler.update()

      total_loss += loss.item()
    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader)}")

  torch.save(model.state_dict(), "custom_transformer_model.pth")
  os.makedirs("summerizer_tokens", exist_ok=True)
  tokenizer.save_pretrained("tokenizer")


# Summarization


In [17]:
def summarize_text(model, tokenizer, text, max_length=128, min_length=16,
                   num_beams=4, device="cuda" if torch.cuda.is_available() else "cpu"):
  model.eval()
  model = model.to(device)
  text = "summarize: " + text
  encoding = tokenizer(
      text,
      max_length=256,
      padding="max_length",
      truncation=True,
      return_tensors="pt"
  )
  input_ids = encoding["input_ids"].to(device)
  generated_ids = beam_search(model, tokenizer, input_ids, max_length, min_length, device)
  summary = tokenizer.decode(generated_ids, skip_special_tokens=True)
  return summary


In [24]:
def beam_search(model, tokenizer,input_ids, max_length, min_length, device, beam_size=4):
  model.eval()
  sequences = [(input_ids,0.0)]
  for step in range(max_length):
    all_candidates = []
    for seq, score in sequences:
      decoder_input = seq[:,-1:].to(device) if step ==0 else seq[:,1:].to(device)
      tgt_mask = model.generate_square_subsequent_mask(decoder_input.size(1)).to(device)
      output = model(input_ids, decoder_input,tgt_mask = tgt_mask)
      log_probs = torch.log_softmax(output[:,-1,:], dim=-1)
      topk_log_probs,topk_ids = log_probs.topk(beam_size)

      for i in range(beam_size):
        candidate_seq = torch.cat([seq,topk_ids[:,i].unsqueeze(1)],dim=1)
        candidate_score = score - topk_log_probs[0,i].item()
        all_candidates.append((candidate_seq, candidate_score))
    sequences = sorted(all_candidates, key=lambda x: x[1])[:beam_size]
    if step>=min_length and all(tokenizer.eos_token_id in seq[0] for seq,_ in sequences):
      break
  return sequences[0][0][0]

# Executor

In [25]:
def main():
  dataset = load_data()
  model, tokenizer = load_model_and_tokenizer()

  train_dataset = SummarizationDataset(dataset, tokenizer)
  train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

  train_model(model, train_dataloader,tokenizer)

  sample_text = """
    The quick brown fox jumps over the lazy dog. This is a classic pangram used to test typewriters and keyboards.
    It contains every letter of the English alphabet. The fox is known for its agility and cunning, while the dog,
    in this case, is depicted as idle. This sentence has been used in various contexts to demonstrate text processing.
    The pangram is often employed in design and development to ensure that fonts and text rendering systems display
    all characters correctly. Its brevity and inclusivity make it a practical tool for testing.
  """

  summary = summarize_text(model, tokenizer, sample_text)

  print("Original Text: ", sample_text)
  print("Length of original Text: ", len(sample_text.split()))
  print("Summarized Text:", summary )
  print("Length of summarized Text: ", len(summary.split()))

if __name__ == '__main__':
  main()

Epoch 1/1, Loss: 8.110279064178467
Original Text:  
    The quick brown fox jumps over the lazy dog. This is a classic pangram used to test typewriters and keyboards.
    It contains every letter of the English alphabet. The fox is known for its agility and cunning, while the dog,
    in this case, is depicted as idle. This sentence has been used in various contexts to demonstrate text processing.
    The pangram is often employed in design and development to ensure that fonts and text rendering systems display
    all characters correctly. Its brevity and inclusivity make it a practical tool for testing.
  
Length of original Text:  91
Summarized Text: summarize: The quick brown fox jumps over the lazy dog. This is a classic pangram used to test typewriters and keyboards. It contains every letter of the English alphabet. The fox is known for its agility and cunning, while the dog, in this case, is depicted as idle. This sentence has been used in various contexts to demonstrate text pr