In [1]:
# Install Required Packages
!pip install datasets transformers tokenizers torch nltk rouge_score pandas numpy tqdm wandb tensorboardX sentencepiece einops matplotlib
!pip install accelerate regex
!pip install gdown langdetect ipywidgets

Collecting accelerate
  Using cached accelerate-1.6.0-py3-none-any.whl.metadata (19 kB)
Using cached accelerate-1.6.0-py3-none-any.whl (354 kB)
Installing collected packages: accelerate
Successfully installed accelerate-1.6.0
Collecting gdown
  Using cached gdown-5.2.0-py3-none-any.whl.metadata (5.8 kB)
Collecting langdetect
  Using cached langdetect-1.0.9-py3-none-any.whl
Collecting fqdn (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets)
  Using cached fqdn-1.5.1-py3-none-any.whl.metadata (1.4 kB)
Collecting isoduration (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipywidgets)
  Using cached isoduration-20.11.0-py3-none-any.whl.metadata (5.7 kB)
Collecting uri-template (from jsonschema[format-nongpl]>=4.18.0->jupyter-events>=0.6.0->jupyter-server<3,>=2.4.0->notebook>=4.4.1->widgetsnbextension~=3.5.0->ipyw

In [2]:
import os
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import nltk
from nltk.tokenize import sent_tokenize
from datasets import load_dataset, Dataset
from transformers import PreTrainedTokenizerFast
from tokenizers import ByteLevelBPETokenizer, Tokenizer
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import wandb  # Optional for experiment tracking
from tensorboardX import SummaryWriter
import logging
import regex as re
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.cuda.amp import autocast, GradScaler  # For mixed precision training
from torch.nn.utils import clip_grad_norm_  # For gradient clipping
import torch.cuda.amp as amp  # For mixed precision
from torch.utils.checkpoint import checkpoint  # For gradient checkpointing
import time  # For timing training
import json  # For saving configs
import matplotlib.pyplot as plt  # For plotting
import shutil  # For disk usage information
import random  # For seed setting

## Random Seed Initialization

The code below sets fixed random seeds across all libraries used in this project. This is critical for:

- **Reproducibility**: Ensures the same results can be obtained across different runs
- **Consistent evaluation**: Guarantees that the generated summaries remain consistent for proper analysis and comparison
- **Reliable generation**: With our temperature-based sampling approach, fixed seeds ensure consistent token selection during text generation
- **Deterministic behavior**: Makes debugging and validation possible by eliminating randomness as a variable

For academic and research contexts, reproducibility is a fundamental requirement. Without these seeds, the model would produce different summaries each time, making proper analysis and comparison impossible.

The `deterministic` and `benchmark` settings specifically configure cuDNN (the GPU backend PyTorch uses for many CUDA operations) to prioritize consistent results over performance optimizations.

In [4]:
# Set random seeds for reproducibility
# The same SEED is applied to Python's `random`, NumPy and PyTorch (CPU and all CUDA devices).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN flags: request deterministic kernels and switch off the benchmark auto-tuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Set up logging
# INFO-level root configuration plus a module-level logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Download necessary NLTK data
# 'punkt' is the NLTK resource fetched here (sent_tokenize is imported above).
nltk.download('punkt')

# Set device
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Check GPU specs if available
# Reports the name and total memory (in GB, base 1e9) of CUDA device 0.
if torch.cuda.is_available():
    print(f"GPU available: {torch.cuda.get_device_name(0)}")
    print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")

# Check disk space
# Disk usage is queried for the path "/" and printed in GB (base 1e9).
total, used, free = shutil.disk_usage("/")
print(f"Disk space - Total: {total/1e9:.1f} GB, Used: {used/1e9:.1f} GB, Free: {free/1e9:.1f} GB")

Using device: cpu
Disk space - Total: 372.8 GB, Used: 343.5 GB, Free: 29.4 GB


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nisha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [5]:
# Load CNN/DailyMail dataset
# Loads configuration "3.0.0" of the "cnn_dailymail" dataset via `datasets.load_dataset`.
cnn_dailymail = load_dataset("cnn_dailymail", "3.0.0")

# info
# Print the split structure and the number of examples in each split.
print("Dataset loaded successfully!")
print(f"Dataset structure: {cnn_dailymail}")
print(f"Number of training examples: {len(cnn_dailymail['train'])}")
print(f"Number of validation examples: {len(cnn_dailymail['validation'])}")
print(f"Number of test examples: {len(cnn_dailymail['test'])}")

# example
# Show the first training example: the article is truncated to 300 characters,
# the reference summary ('highlights') is printed in full.
sample = cnn_dailymail['train'][0]
print("\nSample article (first 300 chars):")
print(sample['article'][:300] + "...")
print("\nSample highlights (summary):")
print(sample['highlights'])

Dataset loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})
Number of training examples: 287113
Number of validation examples: 13368
Number of test examples: 11490

Sample article (first 300 chars):
LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported £20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappoi...

Sample highlights (summary):
Harry Potter star Daniel Radcliffe gets £20M fortune as he turns 18 Monday .
Young actor says he has no plans to fritter his cash away .
Radcliffe's

# Custom Tokenization Strategy for Abstractive Summarization

## The Tokenization Architecture

The project implements a custom `OptimizedTokenizer` class that wraps around Hugging Face's Rust-based tokenizers library instead of the more commonly used SentencePiece. This choice was deliberate and offers several advantages for our summarization task.

### Key Components of the Implementation:

1. **BPE Tokenizer**: The implementation uses the Byte-Pair Encoding (BPE) algorithm with a whitespace pre-tokenizer. BPE works by iteratively merging the most frequent pairs of adjacent symbols in the text.

2. **Special Tokens Management**: Four special tokens are explicitly defined with assigned IDs:
   - `<pad>` (ID 0): For padding sequences to uniform length
   - `<sos>` (ID 1): Start of sequence marker
   - `<eos>` (ID 2): End of sequence marker
   - `<unk>` (ID 3): For handling unknown tokens

3. **Fast Training Process**: The tokenizer is trained on a sampled subset of the data (around 200,000 texts including both articles and summaries).

4. **Vocabulary Size**: A vocabulary size of 32,000 tokens is used, which is large enough to capture the diversity of news language.

## Rationale for Avoiding SentencePiece

The implementation deliberately avoids SentencePiece for several reasons:

1. **Performance**: BPE with the Rust-based `tokenizers` implementation provides significant performance gains during both training and inference compared to SentencePiece.

2. **Efficiency**: Training the tokenizer takes minutes rather than hours, making development iterations faster.

3. **Memory Usage**: The Rust-based implementation is more memory-efficient, allowing for processing larger batches during training.

4. **Fine-grained Control**: The implementation provides explicit control over special token IDs, which is important for the transformer architecture.

5. **HuggingFace Integration**: Using `PreTrainedTokenizerFast` wrapper ensures compatibility with the broader ecosystem.

## Implementation Details

The implementation includes methods for:
- Training the tokenizer on a corpus (`train`)
- Encoding single texts (`encode`)
- Batch encoding multiple texts (`batch_encode`)
- Decoding token IDs back to text (`decode`)
- Saving and loading the tokenizer (`save`, `load`)

When used in the pipeline, the tokenizer truncates inputs to 512 tokens and targets to 128 tokens, which aligns with the characteristics of CNN/DailyMail articles and summaries.

For a text summarization task like ours, having an efficient tokenization process is crucial due to the large amount of text data being processed. The chosen approach optimizes for both training speed and runtime performance.

In [7]:
# Tokenizer Class
class OptimizedTokenizer:
    """BPE tokenizer built with Hugging Face ``tokenizers`` and wrapped in
    ``PreTrainedTokenizerFast``.

    Attributes:
        vocab_size: Target vocabulary size handed to the BPE trainer.
        tokenizer: The wrapped ``PreTrainedTokenizerFast``; ``None`` until
            ``train()`` or ``load()`` has been called.
        special_tokens: Special tokens, in the order they are given to the trainer.
        special_token_ids: Mapping from special token string to its expected id.
    """

    def __init__(self, vocab_size=32000):
        self.vocab_size = vocab_size
        self.tokenizer = None
        self.special_tokens = ["<pad>", "<sos>", "<eos>", "<unk>"]
        self.special_token_ids = {
            "<pad>": 0,
            "<sos>": 1,
            "<eos>": 2,
            "<unk>": 3
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    def _require_tokenizer(self):
        """Raise ``ValueError`` if no tokenizer has been trained or loaded yet."""
        if self.tokenizer is None:
            raise ValueError("Tokenizer not trained. Call train() first.")

    @staticmethod
    def _wrap_tokenizer_file(tokenizer_path):
        """Build a ``PreTrainedTokenizerFast`` from a ``tokenizer.json`` file."""
        from transformers import PreTrainedTokenizerFast

        return PreTrainedTokenizerFast(
            tokenizer_file=tokenizer_path,
            bos_token="<sos>",
            eos_token="<eos>",
            pad_token="<pad>",
            unk_token="<unk>"
        )

    def _tokenize(self, text_or_texts, max_length, padding, truncation):
        """Shared implementation of ``encode`` and ``batch_encode``."""
        self._require_tokenizer()

        # NOTE(review): add_special_tokens=True is forwarded, but train() does not
        # configure a post-processor, so <sos>/<eos> may not actually be inserted
        # -- confirm against the training pipeline before relying on it.
        encoding = self.tokenizer(
            text_or_texts,
            add_special_tokens=True,
            max_length=max_length,
            padding=padding,
            truncation=truncation,
            return_attention_mask=True,
            return_tensors=None
        )

        return {
            "input_ids": encoding["input_ids"],
            "attention_mask": encoding["attention_mask"]
        }

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def train(self, texts, model_prefix="tokenizer", num_samples=None):
        """Train a BPE tokenizer on ``texts`` and store it in ``model_prefix``.

        Args:
            texts: Sequence of strings to train on.
            model_prefix: Directory that receives ``tokenizer.json`` (created if missing).
            num_samples: If set and smaller than ``len(texts)``, train on a
                reproducible random sample of this many texts.
        """
        import random
        import tempfile

        from tokenizers import Tokenizer
        from tokenizers.models import BPE
        from tokenizers.pre_tokenizers import Whitespace
        from tokenizers.trainers import BpeTrainer

        os.makedirs(model_prefix, exist_ok=True)

        # Sample texts to speed up training if needed.
        # A private Random(42) yields the same sample as seeding the global
        # generator with 42, without resetting the notebook-wide RNG state.
        if num_samples and len(texts) > num_samples:
            texts = random.Random(42).sample(texts, num_samples)

        # Write the texts to a uniquely named temporary corpus file inside
        # model_prefix, so no unrelated "corpus.txt" can be overwritten or deleted.
        print(f"Writing {len(texts)} texts to file...")
        corpus_file = tempfile.NamedTemporaryFile(
            mode='w', encoding='utf-8', prefix='corpus_', suffix='.txt',
            dir=model_prefix, delete=False
        )
        corpus_path = corpus_file.name
        try:
            with corpus_file as f:
                for text in texts:
                    f.write(text + '\n')

            # Initialize and train the tokenizer
            print("Training tokenizer...")
            tokenizer = Tokenizer(BPE(unk_token="<unk>"))
            tokenizer.pre_tokenizer = Whitespace()

            trainer = BpeTrainer(
                vocab_size=self.vocab_size,
                special_tokens=self.special_tokens,
                min_frequency=2
            )
            tokenizer.train(files=[corpus_path], trainer=trainer)
        finally:
            # Always remove the temporary corpus, even when training fails.
            if os.path.exists(corpus_path):
                os.remove(corpus_path)

        # Save the tokenizer
        tokenizer_path = os.path.join(model_prefix, "tokenizer.json")
        tokenizer.save(tokenizer_path)
        print(f"Tokenizer saved to {tokenizer_path}")

        # Reload it through the transformers wrapper
        self.tokenizer = self._wrap_tokenizer_file(tokenizer_path)

        # Set special token IDs explicitly
        self.tokenizer.pad_token_id = 0
        self.tokenizer.bos_token_id = 1
        self.tokenizer.eos_token_id = 2
        self.tokenizer.unk_token_id = 3

        print("Tokenizer training complete!")
        print(f"Vocabulary size: {self.tokenizer.vocab_size}")

    def encode(self, text, max_length=None, padding="max_length", truncation=True):
        """Tokenize a single text.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` as returned by the
            wrapped tokenizer (``return_tensors=None``).

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        return self._tokenize(text, max_length, padding, truncation)

    def batch_encode(self, texts, max_length=None, padding="max_length", truncation=True):
        """Tokenize a list of texts; same return structure and errors as ``encode``."""
        return self._tokenize(texts, max_length, padding, truncation)

    def decode(self, token_ids, skip_special_tokens=True):
        """Convert token ids back to text.

        Args:
            token_ids: Ids to decode; a ``torch.Tensor`` is moved to the CPU and
                converted to a Python list first.
            skip_special_tokens: Forwarded to the wrapped tokenizer's ``decode``.

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        self._require_tokenizer()

        if isinstance(token_ids, torch.Tensor):
            token_ids = token_ids.cpu().tolist()

        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def save(self, path):
        """Save the tokenizer and the special-token id mapping into ``path``.

        Raises:
            ValueError: If the tokenizer has not been trained or loaded.
        """
        self._require_tokenizer()

        os.makedirs(path, exist_ok=True)
        self.tokenizer.save_pretrained(path)

        # Save the special token mapping separately
        with open(os.path.join(path, "special_tokens.json"), "w", encoding="utf-8") as f:
            json.dump(self.special_token_ids, f)

    def load(self, path):
        """Load a tokenizer from the directory ``path``.

        Uses ``path/tokenizer.json`` when present; otherwise falls back to
        ``PreTrainedTokenizerFast.from_pretrained(path)``. If
        ``path/special_tokens.json`` exists it replaces ``special_token_ids``.
        """
        tokenizer_path = os.path.join(path, "tokenizer.json")
        if os.path.exists(tokenizer_path):
            self.tokenizer = self._wrap_tokenizer_file(tokenizer_path)
        else:
            # Try loading as a pretrained tokenizer
            from transformers import PreTrainedTokenizerFast

            self.tokenizer = PreTrainedTokenizerFast.from_pretrained(path)

        # Load special token mapping if it exists
        special_tokens_path = os.path.join(path, "special_tokens.json")
        if os.path.exists(special_tokens_path):
            with open(special_tokens_path, "r", encoding="utf-8") as f:
                self.special_token_ids = json.load(f)

        # Ensure the tokenizer has the correct special tokens
        self.tokenizer.pad_token = "<pad>"
        self.tokenizer.bos_token = "<sos>"
        self.tokenizer.eos_token = "<eos>"
        self.tokenizer.unk_token = "<unk>"

        # Set special token IDs explicitly
        self.tokenizer.pad_token_id = self.special_token_ids["<pad>"]
        self.tokenizer.bos_token_id = self.special_token_ids["<sos>"]
        self.tokenizer.eos_token_id = self.special_token_ids["<eos>"]
        self.tokenizer.unk_token_id = self.special_token_ids["<unk>"]

In [8]:
# Initialize tokenizer
MAX_VOCAB_SIZE = 32000
tokenizer = OptimizedTokenizer(vocab_size=MAX_VOCAB_SIZE)

# Directory holding the previously trained tokenizer.
# This is a machine-specific absolute path; adjust it when running elsewhere.
TOKENIZER_DIR = r"D:\NLP-Project\processed_text_summarization_data\tokenizer"

# Check if tokenizer already exists
if os.path.exists(os.path.join(TOKENIZER_DIR, "tokenizer.json")):
    print("Loading existing tokenizer...")
    tokenizer.load(TOKENIZER_DIR)
    print(f"Tokenizer loaded with vocabulary size: {tokenizer.tokenizer.vocab_size}")
else:
    # Without this branch the cell would finish silently and the first
    # encode/decode call would fail with "Tokenizer not trained".
    print(f"WARNING: no tokenizer.json found in {TOKENIZER_DIR}; "
          "`tokenizer` is untrained until train() or load() is called.")

Loading existing tokenizer...
Tokenizer loaded with vocabulary size: 32000


# Transformer Architecture: Advanced Components for Summarization

## Positional Encoding

The positional encoding component addresses a fundamental limitation of transformer models: they have no inherent understanding of sequence order. Unlike RNNs, which process tokens sequentially, transformers process all tokens in parallel.

This implementation uses sinusoidal positional encodings, which add position-dependent patterns to each embedding. The mathematical properties of these sine/cosine functions allow the model to attend to relative positions, making it possible to understand the sequential nature of text while retaining the benefits of parallel processing.

## Enhanced Multi-Head Attention

The `ImprovedMultiHeadAttention` class represents a refined implementation of the attention mechanism that forms the core of the transformer. Key enhancements include:

- **Numerical Stability**: Uses a smaller-magnitude negative value (-1e4 instead of -1e9) for masked positions to prevent overflow in mixed precision training
- **Proper Initialization**: Weight matrices are initialized with Xavier uniform distribution to ensure stable gradient flow
- **Flexible Masking**: Supports multiple mask dimensions for different attention patterns

Multi-head attention allows the model to jointly attend to information from different representation subspaces at different positions. This enables capturing different types of dependencies in the text - some heads might focus on local syntactic patterns while others capture document-level semantic relationships.

## Position-wise Feed-Forward Networks

The feed-forward networks apply two linear transformations with a GELU activation in between. This creates a component that can model complex token-level transformations. Key features:

- **GELU Activation**: Uses Gaussian Error Linear Unit instead of ReLU, providing smoother gradients
- **Proper Dropout**: Applied after activation to improve regularization
- **Careful Initialization**: Parameter initialization designed to prevent vanishing/exploding gradients

## Encoder and Decoder Architecture

The encoder and decoder follow the classic transformer design but with several optimizations:

- **Pre-Layer Normalization**: Unlike the original transformer's post-layer norm, this implementation applies normalization before each sub-layer, significantly improving training stability
- **Residual Connections**: Carefully implemented skip connections help maintain gradient flow through deep networks
- **Shared Embeddings**: Input and output embeddings are shared to reduce parameters and improve regularization

## Complete Transformer Model

The `ImprovedTransformer` class brings everything together with several enhancements:

- **Efficient Masking**: Optimized logic for creating source and target masks
- **Three-Way Weight Tying**: Shares weights between encoder embeddings, decoder embeddings, and the output projection layer
- **Beam Search Generation**: Implements a sophisticated beam search algorithm with top-k sampling for better summary quality
- **Temperature Control**: Allows controlling the randomness in the generation process

These architectural choices reflect both the original transformer design principles and more recent improvements developed by the NLP community, creating a model particularly well-suited for abstractive summarization.

In [10]:
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to embeddings, then apply dropout.

    Args:
        d_model: Size of the last (feature) dimension of the inputs. Odd values
            are supported.
        max_seq_length: Number of positions for which encodings are precomputed.
        dropout: Dropout probability applied after adding the encodings.
    """

    def __init__(self, d_model, max_seq_length=5000, dropout=0.1):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_seq_length, d_model)
        position = torch.arange(0, max_seq_length, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))

        # Even feature indices get the sine, odd indices the cosine.
        pe[:, 0::2] = torch.sin(position * div_term)
        # For odd d_model there is one fewer odd index than even index, so the
        # frequency vector is trimmed to match (a no-op when d_model is even).
        pe[:, 1::2] = torch.cos(position * div_term[: d_model // 2])
        pe = pe.unsqueeze(0)  # (1, max_seq_length, d_model)

        # Registered as a buffer: follows the module across devices and is part
        # of the state dict under the key 'pe'.
        self.register_buffer('pe', pe)

    def forward(self, x):
        """Return ``dropout(x + pe[:, :x.size(1)])``.

        Args:
            x: Tensor indexed as (batch, sequence, d_model).

        Raises:
            RuntimeError: If the sequence length exceeds ``max_seq_length``.
        """
        seq_len = x.size(1)
        if seq_len > self.pe.size(1):
            # Previously surfaced as an opaque broadcasting RuntimeError.
            raise RuntimeError(
                f"Sequence length {seq_len} exceeds the maximum supported "
                f"length {self.pe.size(1)} of PositionalEncoding"
            )
        x = x + self.pe[:, :seq_len, :].to(x.device)
        return self.dropout(x)

class ImprovedMultiHeadAttention(nn.Module):
    """Scaled dot-product attention over ``num_heads`` parallel heads.

    Args:
        d_model: Feature size of the inputs and outputs; must be divisible by
            ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()

        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"

        self.d_model = d_model
        self.num_heads = num_heads
        self.d_k = d_model // num_heads  # per-head feature size

        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        self.W_o = nn.Linear(d_model, d_model)

        # Xavier-uniform weights and zero biases for all four projections.
        for projection in (self.W_q, self.W_k, self.W_v, self.W_o):
            nn.init.xavier_uniform_(projection.weight)
            nn.init.zeros_(projection.bias)

        self.dropout = nn.Dropout(dropout)

    def _split_heads(self, projected, batch_size):
        """Reshape (batch, seq, d_model) into (batch, heads, seq, d_k)."""
        return projected.view(batch_size, -1, self.num_heads, self.d_k).transpose(1, 2)

    def forward(self, query, key, value, mask=None, return_attention=False):
        """Attend from ``query`` to ``key``/``value``.

        Args:
            query, key, value: Tensors whose first dimension is the batch and
                whose last dimension is ``d_model``.
            mask: Optional mask; positions where ``mask == 0`` receive a score
                of -1e4 before the softmax. 2-D masks gain two singleton
                dimensions after the batch dimension, 3-D masks gain one; masks
                of any other rank are used as given.
            return_attention: If True, also return the (post-dropout) attention
                weights.

        Returns:
            The projected output, or ``(output, attn_weights)`` when
            ``return_attention`` is True.
        """
        batch_size = query.size(0)

        q_heads = self._split_heads(self.W_q(query), batch_size)
        k_heads = self._split_heads(self.W_k(key), batch_size)
        v_heads = self._split_heads(self.W_v(value), batch_size)

        scores = torch.matmul(q_heads, k_heads.transpose(-2, -1)) / math.sqrt(self.d_k)

        if mask is not None:
            # Expand low-rank masks so they broadcast against (batch, heads, q, k).
            if mask.dim() == 2:
                mask = mask[:, None, None, :]
            elif mask.dim() == 3:
                mask = mask[:, None, :, :]
            scores = scores.masked_fill(mask == 0, -1e4)

        attn_weights = self.dropout(F.softmax(scores, dim=-1))

        # Weighted sum of values, then merge the heads back into d_model.
        merged = (
            torch.matmul(attn_weights, v_heads)
            .transpose(1, 2)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )
        output = self.W_o(merged)

        return (output, attn_weights) if return_attention else output

In [11]:
class FeedForward(nn.Module):
    """Position-wise feed-forward network: linear -> activation -> dropout -> linear.

    Args:
        d_model: Input and output feature size.
        d_ff: Hidden feature size.
        dropout: Dropout probability applied after the activation.
        activation: Either ``'relu'`` or ``'gelu'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, d_model, d_ff, dropout=0.1, activation='gelu'):
        super().__init__()

        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

        # Look the activation function up by name.
        supported = {'relu': F.relu, 'gelu': F.gelu}
        if activation not in supported:
            raise ValueError(f"Unsupported activation: {activation}")
        self.activation = supported[activation]

        # Xavier-uniform weights and zero biases, first layer then second.
        for layer in (self.linear1, self.linear2):
            nn.init.xavier_uniform_(layer.weight)
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Apply the two-layer network to the last dimension of ``x``."""
        hidden = self.activation(self.linear1(x))
        hidden = self.dropout(hidden)
        return self.linear2(hidden)

class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer normalizes its input first (pre-layer-norm), and its
    dropped-out result is added back onto the un-normalized input.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1, activation='gelu'):
        super().__init__()

        self.self_attn = ImprovedMultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout, activation)
        self.norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the block on ``x``; ``mask`` is forwarded to the self-attention."""
        # Sub-layer 1: self-attention over the normalized input.
        normed = self.norm1(x)
        x = x + self.dropout1(
            self.self_attn(query=normed, key=normed, value=normed, mask=mask)
        )

        # Sub-layer 2: position-wise feed-forward.
        x = x + self.dropout2(self.feed_forward(self.norm2(x)))
        return x

class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Each sub-layer normalizes its input first (pre-layer-norm), and its
    dropped-out result is added back onto the un-normalized input.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout=0.1, activation='gelu'):
        super().__init__()

        self.self_attn = ImprovedMultiHeadAttention(d_model, num_heads, dropout)
        self.cross_attn = ImprovedMultiHeadAttention(d_model, num_heads, dropout)
        self.feed_forward = FeedForward(d_model, d_ff, dropout, activation)
        self.norm1 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm2 = nn.LayerNorm(d_model, eps=1e-6)
        self.norm3 = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout1 = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.dropout3 = nn.Dropout(dropout)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Run the block on ``x``.

        Args:
            x: Decoder-side input.
            enc_output: Encoder output used as key and value of the cross-attention.
            src_mask: Mask forwarded to the cross-attention.
            tgt_mask: Mask forwarded to the self-attention.
        """
        # Sub-layer 1: self-attention over the normalized decoder input.
        normed = self.norm1(x)
        x = x + self.dropout1(
            self.self_attn(query=normed, key=normed, value=normed, mask=tgt_mask)
        )

        # Sub-layer 2: attend from the decoder state to the encoder output.
        x = x + self.dropout2(
            self.cross_attn(query=self.norm2(x), key=enc_output, value=enc_output, mask=src_mask)
        )

        # Sub-layer 3: position-wise feed-forward.
        x = x + self.dropout3(self.feed_forward(self.norm3(x)))
        return x

In [12]:
# Encoder and Decoder
class Encoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of EncoderLayers.

    The embedding uses ``padding_idx=0`` and is initialized from a normal
    distribution with standard deviation ``d_model ** -0.5``.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1, activation='gelu'):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model, dropout=dropout)

        blocks = []
        for _ in range(num_layers):
            blocks.append(EncoderLayer(d_model, num_heads, d_ff, dropout, activation))
        self.layers = nn.ModuleList(blocks)

        # Final normalization applied to the output of the last layer.
        self.norm = nn.LayerNorm(d_model, eps=1e-6)

        nn.init.normal_(self.embedding.weight, mean=0, std=d_model**-0.5)

    def forward(self, x, mask=None):
        """Encode token ids ``x``; ``mask`` is passed to every layer."""
        # Embeddings are scaled by sqrt(embedding_dim) before positions are added.
        scale = math.sqrt(self.embedding.embedding_dim)
        hidden = self.pos_encoding(self.embedding(x) * scale)

        for block in self.layers:
            hidden = block(hidden, mask)

        return self.norm(hidden)

class Decoder(nn.Module):
    """Token embedding + positional encoding followed by a stack of DecoderLayers.

    The embedding uses ``padding_idx=0`` and is initialized from a normal
    distribution with standard deviation ``d_model ** -0.5``.
    """

    def __init__(self, vocab_size, d_model, num_heads, d_ff, num_layers, dropout=0.1, activation='gelu'):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, d_model, padding_idx=0)
        self.pos_encoding = PositionalEncoding(d_model, dropout=dropout)

        blocks = []
        for _ in range(num_layers):
            blocks.append(DecoderLayer(d_model, num_heads, d_ff, dropout, activation))
        self.layers = nn.ModuleList(blocks)

        # Final normalization applied to the output of the last layer.
        self.norm = nn.LayerNorm(d_model, eps=1e-6)

        nn.init.normal_(self.embedding.weight, mean=0, std=d_model**-0.5)

    def forward(self, x, enc_output, src_mask=None, tgt_mask=None):
        """Decode token ids ``x`` against ``enc_output``; masks are passed to every layer."""
        # Embeddings are scaled by sqrt(embedding_dim) before positions are added.
        scale = math.sqrt(self.embedding.embedding_dim)
        hidden = self.pos_encoding(self.embedding(x) * scale)

        for block in self.layers:
            hidden = block(hidden, enc_output, src_mask, tgt_mask)

        return self.norm(hidden)

In [13]:
# Transformer Model
class ImprovedTransformer(nn.Module):
    """Encoder-decoder transformer with a tied output projection.

    The output projection always shares its weight with the decoder embedding;
    with ``share_embeddings=True`` the encoder embedding shares it as well.
    Token ids 0, 1 and 2 are treated as pad, start-of-sequence and
    end-of-sequence respectively.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, d_model=512, num_heads=8,
                 d_ff=2048, num_layers=6, dropout=0.1, activation='gelu',
                 share_embeddings=True):
        super().__init__()

        self.pad_token_id = 0
        self.sos_token_id = 1
        self.eos_token_id = 2

        self.encoder = Encoder(
            vocab_size=src_vocab_size, d_model=d_model, num_heads=num_heads,
            d_ff=d_ff, num_layers=num_layers, dropout=dropout, activation=activation,
        )
        self.decoder = Decoder(
            vocab_size=tgt_vocab_size, d_model=d_model, num_heads=num_heads,
            d_ff=d_ff, num_layers=num_layers, dropout=dropout, activation=activation,
        )

        self.final_layer = nn.Linear(d_model, tgt_vocab_size, bias=False)

        # Weight tying: the decoder embedding is the single shared parameter.
        tied_weight = self.decoder.embedding.weight
        if share_embeddings:
            self.encoder.embedding.weight = tied_weight
        self.final_layer.weight = tied_weight

    def create_src_mask(self, src):
        """Return a boolean (batch, 1, 1, src_len) mask that is True at non-pad tokens."""
        return (src != self.pad_token_id)[:, None, None, :]

    def create_tgt_mask(self, tgt):
        """Return a boolean (batch, 1, tgt_len, tgt_len) mask.

        Entry [b, 0, i, j] is True when ``j <= i`` (lower triangle) and
        ``tgt[b, i]`` is not the pad token.
        """
        length = tgt.size(1)
        not_pad_rows = (tgt != self.pad_token_id)[:, None, :, None]
        lower_triangle = torch.tril(torch.ones((length, length), device=tgt.device)).bool()
        return not_pad_rows & lower_triangle[None, None, :, :]

    def forward(self, src, tgt):
        """Return vocabulary logits for ``tgt`` conditioned on ``src`` (both token-id tensors)."""
        src_mask = self.create_src_mask(src)
        tgt_mask = self.create_tgt_mask(tgt)

        memory = self.encoder(src, src_mask)
        decoded = self.decoder(tgt, memory, src_mask, tgt_mask)

        return self.final_layer(decoded)

In [14]:
# Setup constants
# Maximum token lengths for model inputs (articles) and outputs (summaries).
MAX_INPUT_LENGTH = 512
MAX_OUTPUT_LENGTH = 128

# Load the tokenizer (uses the OptimizedTokenizer class defined earlier in this notebook)
# NOTE: the directory below is a machine-specific absolute path.
tokenizer = OptimizedTokenizer(vocab_size=32000)
tokenizer.load(r"D:\NLP-Project\processed_text_summarization_data\tokenizer")
print(f"Tokenizer loaded with vocabulary size: {tokenizer.tokenizer.vocab_size}")

Tokenizer loaded with vocabulary size: 32000


In [15]:
# Load model from checkpoint
# NOTE: the path below is a machine-specific absolute path.
best_model_path = r"D:\NLP-Project\processed_text_summarization_data\best_model (1).pt"
# SECURITY: torch.load unpickles the file -- only load checkpoints from a trusted source.
# NOTE(review): the name `checkpoint` shadows the `checkpoint` function imported from
# torch.utils.checkpoint at the top of the notebook; it is kept because later cells
# may rely on this variable.
checkpoint = torch.load(best_model_path, map_location=device)

# Architecture used when the checkpoint does not carry its own configuration.
DEFAULT_MODEL_CONFIG = {
    'vocab_size': 32000,
    'd_model': 768,
    'num_heads': 12,
    'd_ff': 3072,
    'num_layers': 6,
    'dropout': 0.1,
    'activation': 'gelu',
}

# Get model config from checkpoint (or fall back to the defaults) so that
# `config` is defined on both paths.
if 'model_config' in checkpoint:
    config = checkpoint['model_config']
    print(f"Model config from checkpoint: {config}")
else:
    print("Model config not found in checkpoint, using default values")
    config = dict(DEFAULT_MODEL_CONFIG)

# Recreate the model with the selected architecture (single construction site).
model = ImprovedTransformer(
    src_vocab_size=config['vocab_size'],
    tgt_vocab_size=config['vocab_size'],
    d_model=config['d_model'],
    num_heads=config['num_heads'],
    d_ff=config['d_ff'],
    num_layers=config['num_layers'],
    dropout=config['dropout'],
    activation=config['activation']
).to(device)

# Load the model weights
model.load_state_dict(checkpoint['model_state_dict'])
print("Model loaded successfully!")

# Set model to evaluation mode
model.eval()

Model config from checkpoint: {'vocab_size': 32000, 'd_model': 768, 'num_heads': 12, 'd_ff': 3072, 'num_layers': 6, 'dropout': 0.1, 'activation': 'gelu'}
Model loaded successfully!


ImprovedTransformer(
  (encoder): Encoder(
    (embedding): Embedding(32000, 768, padding_idx=0)
    (pos_encoding): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): ImprovedMultiHeadAttention(
          (W_q): Linear(in_features=768, out_features=768, bias=True)
          (W_k): Linear(in_features=768, out_features=768, bias=True)
          (W_v): Linear(in_features=768, out_features=768, bias=True)
          (W_o): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForward(
          (linear1): Linear(in_features=768, out_features=3072, bias=True)
          (linear2): Linear(in_features=3072, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (norm2): LayerNorm((768,), eps=1e-06

In [16]:
import gdown

# Google Drive share link of the tweet dataset archive and its local file name.
url = "https://drive.google.com/file/d/1RDfNmQroAwyrtQL3SExGAvrYIEqBARa0/view"
output_file = 'nsw-project-tweet-datasets.zip'

# Check if file already exists
if os.path.exists(output_file):
    print(f"File {output_file} already exists. Skipping download.")
else:
    print(f"Downloading dataset from Google Drive...")
    # Convert to direct download link: the file id is the path segment before "/view".
    file_id = url.split('/')[-2]
    direct_url = f'https://drive.google.com/uc?id={file_id}'

    downloaded_path = gdown.download(direct_url, output_file, quiet=False)
    # Do not report success unless the archive is actually on disk.
    if downloaded_path is None or not os.path.exists(output_file):
        raise RuntimeError(f"Download failed: {direct_url} was not saved to {output_file}")
    print(f"Download complete: {output_file}")

File nsw-project-tweet-datasets.zip already exists. Skipping download.


In [17]:
# Unpack the downloaded archive and index the CSV files it contains
import zipfile

zip_path = 'nsw-project-tweet-datasets.zip'
output_dir = 'twitter_data'
os.makedirs(output_dir, exist_ok=True)

print(f"Extracting {zip_path} to {output_dir}...")
with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(output_dir)

# Walk the extracted tree and sort every CSV into the tweet list or the
# network list according to its file-name suffix; other files are ignored.
tweet_files = []
network_files = []

for folder, _subdirs, names in os.walk(output_dir):
    for name in names:
        full_path = os.path.join(folder, name)
        if name.endswith('_tweets.csv'):
            tweet_files.append(full_path)
        elif name.endswith('_network.csv'):
            network_files.append(full_path)

print(f"Extracted {len(tweet_files)} tweet files and {len(network_files)} network files")
print("\nTweet files:")
for tweet_path in tweet_files:
    print(f" - {tweet_path}")

Extracting nsw-project-tweet-datasets.zip to twitter_data...
Extracted 9 tweet files and 9 network files

Tweet files:
 - twitter_data\#AYODHYAVERDICT_tweets.csv
 - twitter_data\#cancelallBlueTicksinIndia_tweets.csv
 - twitter_data\#HistoryOfAyodhya_tweets.csv
 - twitter_data\#jnuprotest_tweets.csv
 - twitter_data\#Kashmir_tweets.csv
 - twitter_data\#KejriwalMustResign_tweets.csv
 - twitter_data\#ShivSenaCheatsMaharashtra_tweets.csv
 - twitter_data\#ShutDownJNU_tweets.csv
 - twitter_data\#WhereIsAmitShah_tweets.csv


# Twitter Data Preprocessing

The code processes Twitter data to make it suitable for summarization. It takes tweets organized by hashtags and transforms them into clean documents by:

1. Reading CSV files containing tweets from different political hashtags in India
2. Cleaning each tweet by removing URLs, mentions, `#` symbols (the hashtag text itself is kept), retweet (`RT`) markers, and extra spaces
3. Checking if tweets are in English using language detection
4. Combining all English tweets from the same hashtag into one document

This approach makes sense because:
- Raw tweets contain a lot of noise (links, @mentions, etc.) that would confuse the model
- By filtering for English only, we ensure the model can properly understand the content
- Grouping by hashtag creates natural topic boundaries that make summarization more meaningful
- The preprocessing reduces the data to its essential content, making it easier for the model to identify key themes


In [19]:
# Preprocess Twitter data into documents by hashtag
from langdetect import detect
import re

# Directory that receives one "<hashtag>_document.txt" file per hashtag.
# exist_ok=True keeps this cell safe to re-run when the directory already exists.
processed_dir = 'processed_data'
os.makedirs(processed_dir, exist_ok=True)

def clean_tweet(text):
    """Strip Twitter-specific noise from a single tweet.

    Removes URLs, @mentions, '#' symbols (the hashtag text is kept) and
    retweet "RT" markers, then collapses all whitespace runs to single spaces.

    Args:
        text: Raw tweet cell value. Missing values (None/NaN) are accepted;
            any other non-string value is converted with ``str()``.

    Returns:
        The cleaned tweet as a string; "" for missing values.
    """
    if pd.isna(text):
        return ""

    # Cells that pandas parsed as numbers would otherwise make re.sub raise TypeError
    text = str(text)

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove user mentions
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags symbol (but keep the text)
    text = re.sub(r'#', '', text)

    # Remove the retweet marker. The leading \b restricts the match to a
    # standalone "RT" token so words that merely end in "RT" (e.g. "SMART phone")
    # are no longer mangled.
    text = re.sub(r'\bRT\s+', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def is_english(text):
    """Return True when ``text`` is detected as English.

    Args:
        text: Candidate text. Anything that is not a string (None, NaN,
            numbers, ...) and any string shorter than 10 characters is
            rejected without running language detection.

    Returns:
        True if ``detect(text)`` reports 'en'; False otherwise, including
        when detection itself fails.
    """
    if not isinstance(text, str) or len(text) < 10:
        return False

    try:
        return detect(text) == 'en'
    except Exception:
        # Detection errors are treated as "not English". Catching Exception
        # rather than using a bare except keeps Ctrl-C (KeyboardInterrupt)
        # and SystemExit working during long filtering runs.
        return False

# Build one plain-text document per hashtag from the English tweets in its CSV.
# hashtag_documents maps the hashtag name (taken from the file name) to the
# newline-joined cleaned tweets; each document is also written to processed_dir.
hashtag_documents = {}

for file_path in tweet_files:
    # Extract hashtag name from file path
    hashtag = os.path.basename(file_path).replace("_tweets.csv", "")
    # The file names already start with '#', so only prepend one when it is
    # missing; this avoids printing a doubled "##" while leaving the key unchanged.
    label = hashtag if hashtag.startswith("#") else f"#{hashtag}"
    print(f"Processing {label}...")

    # Load the tweets
    df = pd.read_csv(file_path)
    print(f"  - Loaded {len(df)} tweets")

    # Find the text column: the first candidate name present in this CSV wins
    text_column = next(
        (col for col in ['text', 'tweet', 'content', 'status', 'full_text'] if col in df.columns),
        None,
    )

    if text_column is None:
        print(f"  - Warning: No text column found in {hashtag}")
        continue

    # Clean and filter tweets
    df['cleaned_text'] = df[text_column].apply(clean_tweet)

    # Filter for English tweets
    english_tweets = df[df['cleaned_text'].apply(is_english)]
    print(f"  - Found {len(english_tweets)} English tweets out of {len(df)} total")

    if len(english_tweets) > 0:
        # Create a document for this hashtag (one cleaned tweet per line)
        document = "\n".join(english_tweets['cleaned_text'].tolist())
        hashtag_documents[hashtag] = document

        # Save to file; UTF-8 is explicit because tweets contain non-ASCII characters
        doc_path = os.path.join(processed_dir, f"{hashtag}_document.txt")
        with open(doc_path, "w", encoding="utf-8") as f:
            f.write(document)

        print(f"  - Created document with {len(document)} characters")
    else:
        print(f"  - No English tweets found for {label}")

print(f"\nCreated {len(hashtag_documents)} documents")

Processing ##AYODHYAVERDICT...
  - Loaded 200 tweets
  - Found 128 English tweets out of 200 total
  - Created document with 28997 characters
Processing ##cancelallBlueTicksinIndia...
  - Loaded 198 tweets
  - Found 63 English tweets out of 198 total
  - Created document with 6376 characters
Processing ##HistoryOfAyodhya...
  - Loaded 200 tweets
  - Found 83 English tweets out of 200 total
  - Created document with 17866 characters
Processing ##jnuprotest...
  - Loaded 200 tweets
  - Found 120 English tweets out of 200 total
  - Created document with 27980 characters
Processing ##Kashmir...
  - Loaded 200 tweets
  - Found 172 English tweets out of 200 total
  - Created document with 28219 characters
Processing ##KejriwalMustResign...
  - Loaded 200 tweets
  - Found 73 English tweets out of 200 total
  - Created document with 10098 characters
Processing ##ShivSenaCheatsMaharashtra...
  - Loaded 200 tweets
  - Found 117 English tweets out of 200 total
  - Created document with 22514 char

In [20]:
# Display a preview of each document.
# One constant drives both the truncation test and the slice, so the "..."
# marker is appended only when text was actually cut off.
PREVIEW_CHARS = 1000

for hashtag, doc in hashtag_documents.items():
    preview = doc[:PREVIEW_CHARS] + "..." if len(doc) > PREVIEW_CHARS else doc
    # Keys already start with '#'; only prepend one when it is missing
    label = hashtag if hashtag.startswith("#") else f"#{hashtag}"
    print(f"\n{label} ({len(doc)} chars):")
    print(preview)


##AYODHYAVERDICT (28997 chars):
Ahead of AyodhyaVerdict, appeal to everyone to accept the judgement of Honourable SupremeCourt. Let us continue to live in peace and harmony. The spirit of brotherhood is the hallmark of our secular fabric.
We respect the Hon’ble Supreme Court’s AyodhyaVerdict. It reaffirms the secular values of our country. All should join together in harmony and brotherhood to carry out this immensely important judicial order.
This is the last prayer offered at BabriMasjid Image speaks a thousand words... 💔 AYODHYAVERDICT
"There is a profound reason Maharshi Valmiki titled his immortal work Ramayana, the Journey of Sri Ramachandra. One wonders whether it was an exile or an enduring national pilgrimage of which he was the pioneer. " ------ My essay on the AYODHYAVERDICT
I welcome Hon Supreme Court's AyodhyaVerdict on Sri Ram Janmbhoomi. I appeal to people of all religions to honour the verdict and maintain peace and harmony. We remain committed to the ideals of India. 

# Mixed Language Tweets

When dealing with tweets from multilingual regions like India, the language detection process faces significant challenges with mixed-language content. Here's an explanation of why Hindi phrases might slip through an English filter:

## How Language Detection Works

Most language detection libraries (including `langdetect` used in the notebook) work by analyzing character and word n-grams (sequences) and comparing them against statistical models of different languages. They typically:

1. Look at character and word patterns
2. Calculate probability scores for each language
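3. Choose the language with the highest probability

Because a single label is assigned to the whole tweet, a mostly-English tweet that contains a few Hindi phrases can still be classified as English. This is a frequent situation with the code-mixed text common in the Indian social media context.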

# Generation Function

The summary generation function takes a document and creates a concise summary using the transformer model:

1. It first prepares the input by trimming very long documents and converting text to tokens
2. The document gets processed by the encoder part of the model just once
3. The function then generates the summary one word at a time:
   - It looks at what it's generated so far
   - Predicts what should come next
   - Adds some randomness through "temperature" to make it more creative
   - Adds the new word to the growing summary
   - Continues until it decides it's complete or hits the length limit

This approach is sensible because:
- Running the encoder just once saves a lot of computation time
- Adding words one by one mimics how humans write summaries
- The temperature setting balances between boring, predictable text and completely random text
- The process stops naturally when the model determines the summary is complete

In [22]:
# Token-by-token (autoregressive) summary generation for a single document;
# the encoder runs once and only the decoder is re-run for each new token.
def generate_summary_fast(model, tokenizer, text, max_length=50, temperature=0.7, method="sampling"):
    """Generate a summary of ``text`` by decoding one token at a time.

    Args:
        model: Encoder-decoder model exposing ``create_src_mask``,
            ``create_tgt_mask``, ``encoder``, ``decoder`` and ``final_layer``.
        tokenizer: Wrapper whose ``encode`` returns a mapping with an
            "input_ids" entry, with a ``decode`` method and an inner
            ``.tokenizer`` carrying ``bos_token_id`` / ``eos_token_id``.
        text: Source document. Only the first 10000 characters are used.
        max_length: Maximum number of tokens to generate.
        temperature: Divisor applied to the logits before sampling; lower
            values make sampling more deterministic. A value <= 0 switches
            to greedy decoding instead of dividing by zero.
        method: "sampling" (default) draws each token from the softmax
            distribution; "greedy" always takes the highest-scoring token.
            Any other value falls back to sampling.

    Returns:
        The decoded summary string with special tokens skipped.

    Note:
        Relies on the notebook-level globals ``MAX_INPUT_LENGTH`` and ``device``.
    """
    # Truncate input text if too long to avoid OOM errors
    # (slicing is a no-op for texts that are already shorter)
    text = text[:10000]

    # Tokenize the input text
    encoding = tokenizer.encode(text, max_length=MAX_INPUT_LENGTH, padding="max_length", truncation=True)
    input_ids = torch.tensor([encoding["input_ids"]]).to(device)

    # Greedy decoding when requested explicitly, or when the temperature is not
    # positive (logits / 0 would yield inf/nan probabilities for multinomial).
    use_greedy = method == "greedy" or temperature <= 0

    # Set the model to evaluation mode
    model.eval()

    with torch.no_grad():
        # The decoder input starts as a single BOS token and grows by one token per step
        decoder_input_ids = torch.tensor([[tokenizer.tokenizer.bos_token_id]]).to(device)

        # Create source mask
        src_mask = model.create_src_mask(input_ids)

        # Generate the encoding once; it is reused unchanged at every decoding step
        encoder_output = model.encoder(input_ids, src_mask)

        # Generate tokens step by step
        for _ in range(max_length):
            # The target mask depends on the current decoder length, so rebuild it each step
            tgt_mask = model.create_tgt_mask(decoder_input_ids)

            # Forward pass through the decoder
            decoder_output = model.decoder(decoder_input_ids, encoder_output, src_mask, tgt_mask)

            # Only the last position is projected: it scores the next token
            next_token_logits = model.final_layer(decoder_output[:, -1])

            if use_greedy:
                # keepdim=True gives the same trailing dimension of size 1 as multinomial(probs, 1)
                next_token_id = torch.argmax(next_token_logits, dim=-1, keepdim=True)
            else:
                # Apply temperature, then sample from the resulting distribution
                probs = F.softmax(next_token_logits / temperature, dim=-1)
                next_token_id = torch.multinomial(probs, 1)

            # Add to sequence
            decoder_input_ids = torch.cat([decoder_input_ids, next_token_id], dim=1)

            # Stop if EOS token is generated
            if next_token_id.item() == tokenizer.tokenizer.eos_token_id:
                break

        # Decode the sequence
        output_ids = decoder_input_ids[0].cpu().tolist()
        summary = tokenizer.decode(output_ids, skip_special_tokens=True)

        return summary

# Define optimized document summarization function
def summarize_documents_optimized(hashtag_documents, model, tokenizer, save_path="summaries_optimized"):
    """Summarize every hashtag document, resuming from cached results on disk.

    For each hashtag a "creative" summary is generated with
    ``generate_summary_fast`` and stored in ``<save_path>/<hashtag>_summaries.json``.
    ``<save_path>/progress.json`` records which summaries are finished, so
    re-running the function skips work that is already done.

    Args:
        hashtag_documents: Mapping of hashtag name -> document text.
        model: Model passed through to ``generate_summary_fast``.
        tokenizer: Tokenizer passed through to ``generate_summary_fast``.
        save_path: Directory for the progress file and per-hashtag results.

    Returns:
        Dict mapping every hashtag in ``hashtag_documents`` to its results
        dict (``{"creative": summary}``; empty if generation failed).
    """
    # Imported locally so the function does not depend on another cell importing json
    import json

    os.makedirs(save_path, exist_ok=True)

    # Only parameters
    params = [
        {"name": "creative", "method": "sampling", "max_length": 50, "temperature": 0.7}
    ]
    param = params[0]  # Only one parameter set now
    param_name = param["name"]

    # Check for existing progress
    progress_file = os.path.join(save_path, "progress.json")
    completed = {}
    if os.path.exists(progress_file):
        with open(progress_file, "r") as f:
            completed = json.load(f)

    # Process documents
    all_results = {}
    total = len(hashtag_documents)

    for i, (hashtag, document) in enumerate(hashtag_documents.items()):
        # Hashtag names taken from the file names already start with '#'
        label = hashtag if hashtag.startswith("#") else f"#{hashtag}"
        print(f"\nProcessing {label} ({i+1}/{total})...")

        result_file = os.path.join(save_path, f"{hashtag}_summaries.json")

        # Load whatever was saved for this hashtag on a previous run
        hashtag_results = {}
        if hashtag in completed and os.path.exists(result_file):
            with open(result_file, "r") as f:
                hashtag_results = json.load(f)

        # Skip only when the summary is both recorded as done AND actually
        # present on disk. If the result file is missing or incomplete the
        # summary is regenerated instead of being silently dropped.
        if param_name in completed.get(hashtag, []) and param_name in hashtag_results:
            print(f"  Skipping {label} - already processed")
            all_results[hashtag] = hashtag_results
            continue

        print(f"  Generating {param_name} summary...")

        try:
            # Generate summary
            summary = generate_summary_fast(
                model,
                tokenizer,
                document,
                max_length=param["max_length"],
                temperature=param["temperature"],
                method=param["method"]
            )

            hashtag_results[param_name] = summary
            print(f"    Generated summary: {len(summary.split())} words")

            # Save the results before marking them complete, so an interruption
            # between the two writes can never leave progress.json pointing at
            # a summary that was not stored.
            with open(result_file, "w") as f:
                json.dump(hashtag_results, f, indent=2)

            # Update and save progress
            finished = completed.setdefault(hashtag, [])
            if param_name not in finished:
                finished.append(param_name)
            with open(progress_file, "w") as f:
                json.dump(completed, f)

        except Exception as e:
            # Best effort: report the failure and continue with the next hashtag
            print(f"    Error generating summary: {str(e)}")

        all_results[hashtag] = hashtag_results

    return all_results

In [23]:
# Run the summarizer over every hashtag document; the result maps each
# hashtag to its dict of generated summaries.
print("Starting summarization process...")
summaries_optimized = summarize_documents_optimized(hashtag_documents, model, tokenizer)

Starting summarization process...

Processing ##AYODHYAVERDICT (1/9)...
  Skipping ##AYODHYAVERDICT - already processed

Processing ##cancelallBlueTicksinIndia (2/9)...
  Skipping ##cancelallBlueTicksinIndia - already processed

Processing ##HistoryOfAyodhya (3/9)...
  Skipping ##HistoryOfAyodhya - already processed

Processing ##jnuprotest (4/9)...
  Skipping ##jnuprotest - already processed

Processing ##Kashmir (5/9)...
  Skipping ##Kashmir - already processed

Processing ##KejriwalMustResign (6/9)...
  Skipping ##KejriwalMustResign - already processed

Processing ##ShivSenaCheatsMaharashtra (7/9)...
  Skipping ##ShivSenaCheatsMaharashtra - already processed

Processing ##ShutDownJNU (8/9)...
  Skipping ##ShutDownJNU - already processed

Processing ##WhereIsAmitShah (9/9)...
  Skipping ##WhereIsAmitShah - already processed


In [24]:
def display_optimized_summaries(summaries):
    """Print each hashtag's "creative" summary under a banner header.

    Args:
        summaries: Mapping of hashtag name -> dict of summaries. Hashtags
            without a "creative" entry get a header but no summary body.
    """
    banner = '=' * 50

    for hashtag, hashtag_summaries in summaries.items():
        # Hashtag names may already start with '#'; prepend one only when it
        # is missing so the header never shows a doubled "##".
        label = hashtag if hashtag.startswith("#") else f"#{hashtag}"

        print(f"\n\n{banner}")
        print(f"HASHTAG: {label}")
        print(banner)

        if "creative" in hashtag_summaries:
            summary = hashtag_summaries["creative"]
            print("\n--- Summary ---")
            print(f"{summary}")
            print(f"[Word count: {len(summary.split())}]")

# Print the generated summaries for every hashtag
print("\nGenerated Creative Summaries:")
display_optimized_summaries(summaries_optimized)


def save_combined_summaries(summaries, filename="creative_summaries.txt"):
    """Write every hashtag's "creative" summary into one text report.

    Args:
        summaries: Mapping of hashtag name -> dict of summaries. Hashtags
            without a "creative" entry get a header but no summary body.
        filename: Path of the report file; an existing file is overwritten.
    """
    banner = '=' * 50

    # UTF-8 is explicit because summaries can contain emoji and typographic
    # quotes, which the platform default encoding (e.g. cp1252 on Windows)
    # cannot always encode.
    with open(filename, "w", encoding="utf-8") as f:
        for hashtag, hashtag_summaries in summaries.items():
            # Hashtag names may already start with '#'; avoid a doubled "##"
            label = hashtag if hashtag.startswith("#") else f"#{hashtag}"

            f.write(f"\n\n{banner}\n")
            f.write(f"HASHTAG: {label}\n")
            f.write(f"{banner}\n\n")

            if "creative" in hashtag_summaries:
                summary = hashtag_summaries["creative"]
                f.write("\n--- Creative Summary ---\n")
                f.write(f"{summary}\n")
                f.write(f"[Word count: {len(summary.split())}]\n")

    print(f"All creative summaries saved to {filename}")

# Write all summaries to the default report file (creative_summaries.txt)
save_combined_summaries(summaries_optimized)


Generated Creative Summaries:


HASHTAG: ##AYODHYAVERDICT

--- Summary ---
We continue to live in peace and harmony ; we respect the Hon our able Supreme Court . This can happen only in India . We pray for peace , love and harmony A YO D H Y AV D IC T ' There is a profound reason Mah ar
[Word count: 50]


HASHTAG: ##cancelallBlueTicksinIndia

--- Summary ---
Facebook has been accused of encouraging cast e discrimination by sup pressing the voices of the marginal ised . The trend surfaced in connection with allegations of Twitter encouraging cast e discrimination by sup pressing the voices of the marginal ised . The issue is being brought in in by
[Word count: 50]


HASHTAG: ##HistoryOfAyodhya

--- Summary ---
History Of Ram Mand ir History Of Ay od hy a fought with the M ugh als . After losing his empire in 15 80 AD , Akbar ' s empire divided into 12 sub as . Temple of Sh ri in Ram Hidden , was destroyed by Ma ula
[Word count: 50]


HASHTAG: ##jnuprotest

--- Summary ---
B JP on ads s

# Token Splitting Explanation

The weird spacing in summaries (like "A YO D H Y A" instead of "AYODHYA") happens because:

1. The model doesn't understand words as whole units - it breaks them into pieces called "tokens"
2. Common words like "the" or "and" get their own tokens
3. Uncommon words, especially hashtags like "AYODHYAVERDICT," get broken into smaller chunks
4. When converting the model's output back to text, each token gets separated by a space

This happens because:
- The model has a limited vocabulary of 32,000 tokens
- It can't possibly include every possible word, especially unusual hashtags
- Breaking words into smaller pieces lets it handle any word it encounters
- This approach is more efficient than trying to include every possible word
- The downside is that when these pieces get put back together, they show up with spaces between them

This is particularly noticeable with hashtags and unusual proper nouns that weren't common in the model's training data.

In [26]:
def analyze_optimized_summaries(summaries):
    """Print and return length statistics for the "creative" summaries.

    Args:
        summaries: Mapping of hashtag name -> dict of summaries; entries
            without a "creative" key are ignored.

    Returns:
        Dict with "avg_words", "min_words", "max_words" and "avg_chars".
        All four values are 0 when no creative summary is present.
    """
    # Gather the creative summaries, then measure each in words and characters
    creative_texts = [
        per_hashtag["creative"]
        for per_hashtag in summaries.values()
        if "creative" in per_hashtag
    ]
    word_counts = [len(text.split()) for text in creative_texts]
    char_lengths = [len(text) for text in creative_texts]

    if word_counts:
        avg_words = sum(word_counts) / len(word_counts)
        avg_chars = sum(char_lengths) / len(char_lengths)
        min_words = min(word_counts)
        max_words = max(word_counts)
    else:
        # Nothing to measure: report zeros rather than dividing by zero
        avg_words = avg_chars = min_words = max_words = 0

    # Human-readable report
    print("\nCreative Summary Statistics:")
    print(f"Average Word Count: {avg_words:.1f}")
    print(f"Min Word Count: {min_words}")
    print(f"Max Word Count: {max_words}")
    print(f"Average Character Length: {avg_chars:.1f}")

    return {
        "avg_words": avg_words,
        "min_words": min_words,
        "max_words": max_words,
        "avg_chars": avg_chars
    }

# Compute length statistics only when at least one hashtag was summarized;
# summary_stats is left undefined when summaries_optimized is empty.
if summaries_optimized:
    summary_stats = analyze_optimized_summaries(summaries_optimized)


Creative Summary Statistics:
Average Word Count: 49.4
Min Word Count: 45
Max Word Count: 50
Average Character Length: 225.3
