In [1]:
!pip install pytorch-lightning lightning
!pip install pytorch
!pip install numpy
!pip install lightning
!pip install sumy

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.5.0.post0-py3-none-any.whl.metadata (21 kB)
Collecting lightning
  Downloading lightning-2.5.0.post0-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.4/40.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.6.3-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.14.0-py3-none-any.whl.metadata (5.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.1.0->pytorch-lightning)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from tor

In [2]:
import numpy as np
import lightning as L
import copy as copy
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from sklearn.metrics import confusion_matrix
import pandas as pd
from sklearn.metrics import classification_report


#Decoder

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import copy

def clones(module, N):
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

class SublayerConnection(nn.Module):
    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        self.norm = nn.LayerNorm(size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

def attention(query, key, value, mask=None, dropout=None):
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-1, -2)) / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn

class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        if mask is not None:
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
                             for l, x in zip(self.linears, (query, key, value))]
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)

class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.w_2(self.dropout(F.relu(self.w_1(x))))

class Embeddings(nn.Module):
    def __init__(self, d_model, vocab):
        super(Embeddings, self).__init__()
        self.lut = nn.Embedding(vocab, d_model)
        self.d_model = d_model

    def forward(self, x):
        return self.lut(x) * math.sqrt(self.d_model)

class PositionEncoding(nn.Module):
    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)

class DecoderLayer(nn.Module):
    def __init__(self, size, self_attn, src_attn, feed_forward, dropout):
        super(DecoderLayer, self).__init__()
        self.size = size
        self.self_attn = self_attn
        self.src_attn = src_attn
        self.feed_forward = feed_forward
        self.sublayer = clones(SublayerConnection(size, dropout), 3)

    def forward(self, x, memory=None, src_mask=None, tgt_mask=None):
        # Self-Attention (decoder attends to itself)
        x = self.sublayer[0](x, lambda x: self.self_attn(x, x, x, tgt_mask))

        # Cross-Attention (decoder attends to encoder memory, but we skip this if memory is None)
        if memory is not None:
            x = self.sublayer[1](x, lambda x: self.src_attn(x, memory, memory, src_mask))

        return self.sublayer[2](x, self.feed_forward)

class Decoder(nn.Module):
    def __init__(self, layer, N):
        super(Decoder, self).__init__()
        self.layers = clones(layer, N)
        self.norm = nn.LayerNorm(layer.size)

    def forward(self, x, memory=None, src_mask=None, tgt_mask=None):
        for layer in self.layers:
            x = layer(x, memory, src_mask, tgt_mask)  # memory can be None
        return self.norm(x)

class Generator(nn.Module):
    def __init__(self, d_model, vocab_size):
        super(Generator, self).__init__()
        self.proj = nn.Linear(d_model, vocab_size)

    def forward(self, x):
        return F.log_softmax(self.proj(x), dim=-1)

def make_decoder_model(vocab_size, N=6, d_model=512, d_ff=1024, h=8, dropout=0.1):
    attn = MultiHeadedAttention(h, d_model)
    ff = PositionwiseFeedForward(d_model, d_ff, dropout)
    position = PositionEncoding(d_model, dropout)
    decoder = Decoder(DecoderLayer(d_model, attn, attn, ff, dropout), N)
    embedding = nn.Sequential(Embeddings(d_model, vocab_size), position)
    generator = Generator(d_model, vocab_size)

    model = nn.ModuleDict({
        "decoder": decoder,
        "embedding": embedding,
        "generator": generator
    })

    return model

def generate_summary(model, input_ids, memory, max_len=50):
    output_ids = []
    x = model["embedding"](input_ids)

    for _ in range(max_len):
        x = model["decoder"](x, memory, None, None)
        logits = model["generator"](x[:, -1, :])
        next_word = torch.argmax(logits, dim=-1)
        output_ids.append(next_word.item())
        if next_word.item() == 2:
            break

    return output_ids


#20 newsgroup

In [4]:
!pip install sumy



In [5]:
from sklearn.datasets import fetch_20newsgroups
import re
import nltk
from nltk.tokenize import sent_tokenize
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

nltk.download('punkt')
nltk.download('all')

categories = ['sci.space', 'rec.autos', 'comp.graphics', 'talk.politics.mideast']
dataset = fetch_20newsgroups(subset='train', categories=categories, remove=('headers', 'footers', 'quotes'))
texts = dataset.data

def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s.]', '', text)
    return text.strip()

def extract_first_sentence(text):
    sentences = sent_tokenize(text)
    return sentences[0] if sentences else "No summary available"

def summarize_text(text, num_sentences=2):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join([str(sentence) for sentence in summary])

data_pairs = []
for text in texts:
    text_clean = clean_text(text)
    summary = summarize_text(text_clean)
    data_pairs.append((text_clean, summary))

print("🔹 Original Text:\n", data_pairs[0][0])
print("\n🔹 Generated Summary:\n", data_pairs[0][1])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downlo

🔹 Original Text:

🔹 Generated Summary:


#word to tokeninzir

In [6]:
!pip install torch torchvision transformers datasets


Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [7]:
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader

tokenizer = AutoTokenizer.from_pretrained("t5-small")

MAX_INPUT_LENGTH = 512
MAX_SUMMARY_LENGTH = 150

def encode_text(text, max_length):
    return tokenizer(text,
                     padding="max_length",
                     truncation=True,
                     max_length=max_length,
                     return_tensors="pt")

sample_text = data_pairs[0][0]
sample_summary = data_pairs[0][1]

input_ids = encode_text(sample_text, MAX_INPUT_LENGTH)["input_ids"]
summary_ids = encode_text(sample_summary, MAX_SUMMARY_LENGTH)["input_ids"]

print("🔹 Example Tokenized Input:", input_ids.shape)
print("🔹 Example Tokenized Summary:", summary_ids.shape)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

🔹 Example Tokenized Input: torch.Size([1, 512])
🔹 Example Tokenized Summary: torch.Size([1, 150])


In [8]:
class SummarizationDataset(Dataset):
    def __init__(self, data_pairs, tokenizer, max_input_len, max_summary_len):
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_summary_len = max_summary_len

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        text, summary = self.data_pairs[idx]

        # Tokenize input and summary
        input_enc = self.tokenizer(text,
                                   padding="max_length",
                                   truncation=True,
                                   max_length=self.max_input_len,
                                   return_tensors="pt")

        summary_enc = self.tokenizer(summary,
                                     padding="max_length",
                                     truncation=True,
                                     max_length=self.max_summary_len,
                                     return_tensors="pt")

        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": summary_enc["input_ids"].squeeze(0)
        }

dataset = SummarizationDataset(data_pairs, tokenizer, MAX_INPUT_LENGTH, MAX_SUMMARY_LENGTH)

sample = dataset[0]
print("🔹 Input IDs Shape:", sample["input_ids"].shape)
print("🔹 Labels Shape:", sample["labels"].shape)


🔹 Input IDs Shape: torch.Size([512])
🔹 Labels Shape: torch.Size([150])


In [9]:
BATCH_SIZE = 8

train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

batch = next(iter(train_loader))
print("🔹 Batch Input Shape:", batch["input_ids"].shape)
print("🔹 Batch Labels Shape:", batch["labels"].shape)


🔹 Batch Input Shape: torch.Size([8, 512])
🔹 Batch Labels Shape: torch.Size([8, 150])


#Training and optimization function

##Loss :CrossEntropyLoss white Ifnore Index

In [10]:
import torch.nn as nn
import torch.optim as optim

class LabelSmoothingLoss(nn.Module):
    def __init__(self, smoothing=0.1, vocab_size=50265, ignore_index=0):
        super(LabelSmoothingLoss, self).__init__()
        self.smoothing = smoothing
        self.vocab_size = vocab_size
        self.ignore_index = ignore_index
        self.criterion = nn.KLDivLoss(reduction='batchmean')

    def forward(self, pred, target):
        true_dist = torch.zeros_like(pred).fill_(self.smoothing / (self.vocab_size - 1))
        true_dist.scatter_(1, target.unsqueeze(1), 1.0 - self.smoothing)
        true_dist.masked_fill_((target == self.ignore_index).unsqueeze(1), 0.0)
        return self.criterion(pred, true_dist)

criterion = LabelSmoothingLoss(vocab_size=tokenizer.vocab_size, ignore_index=tokenizer.pad_token_id)


##optimizer Adam white Weight Decay

In [11]:
class NoamScheduler(optim.lr_scheduler.LambdaLR):
    def __init__(self, optimizer, model_size, factor, warmup_steps):
        self.model_size = model_size
        self.factor = factor
        self.warmup_steps = warmup_steps
        super(NoamScheduler, self).__init__(optimizer, self.lr_lambda)

    def lr_lambda(self, step):
        step = max(step, 1)  # Avoid division by zero
        return self.factor * (self.model_size ** -0.5) * min(step ** -0.5, step * self.warmup_steps ** -1.5)

model = make_decoder_model(vocab_size=tokenizer.vocab_size)
optimizer = optim.Adam(model.parameters(), lr=1e-3, betas=(0.9, 0.98), eps=1e-9, weight_decay=1e-4)
scheduler = NoamScheduler(optimizer, model_size=512, factor=1, warmup_steps=4000)


##train

In [14]:
import time
def train_epoch(model, data_loader, criterion, optimizer, scheduler, device):
    model.train()
    total_loss = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()

        # Get embeddings
        embedded_input = model["embedding"](input_ids)

        # Pass through decoder
        outputs = model["decoder"](embedded_input, None, None, None)
        logits = model["generator"](outputs)  # (batch_size, input_seq_len, vocab_size)

        # Ensure logits match labels by truncating to the label sequence length
        logits = logits[:, :labels.shape[1], :]  # Trim to match labels' seq length

        # Debugging print
        # print(f"Logits Shape: {logits.shape}, Labels Shape: {labels.shape}")

        # Compute loss
        loss = criterion(logits.permute(0, 2, 1), labels)

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    return total_loss / len(data_loader)


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

EPOCHS = 5
for epoch in range(EPOCHS):
    start_time = time.time()
    train_loss = train_epoch(model, train_loader, criterion, optimizer, scheduler, device)
    end_time = time.time()

    print(f"Epoch {epoch+1}/{EPOCHS}, Loss: {train_loss:.4f}, Time: {end_time - start_time:.2f}s")


Epoch 1/5, Loss: 632.5149, Time: 87.17s
Epoch 2/5, Loss: 632.2951, Time: 90.23s
Epoch 3/5, Loss: 631.9152, Time: 90.40s
Epoch 4/5, Loss: 631.4798, Time: 90.84s
Epoch 5/5, Loss: 630.6077, Time: 90.97s


##checkpoints

In [16]:
def save_checkpoint(model, optimizer, epoch, path="decoder_checkpoint.pth"):
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "optimizer_state_dict": optimizer.state_dict(),
        "epoch": epoch
    }
    torch.save(checkpoint, path)
    print(f"✅ Model saved at epoch {epoch}")

save_checkpoint(model, optimizer, epoch)


✅ Model saved at epoch 4


#Testing and evaluating the model

## Inference

In [23]:
import torch.nn.functional as F

def generate_summary(model, tokenizer, text, max_length=150, temperature=0.7, top_k=50, device="cuda"):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").input_ids.to(device)

        embedded_input = model["embedding"](input_ids)
        outputs = model["decoder"](embedded_input, None, None, None)
        logits = model["generator"](outputs)  # (batch_size, seq_len, vocab_size)

        # 🔹 Sampling במקום Argmax
        logits = logits[:, -1, :] / temperature
        probs = F.softmax(logits, dim=-1)
        top_k_probs, top_k_indices = torch.topk(probs, top_k, dim=-1)
        sampled_token = top_k_indices[:, torch.multinomial(top_k_probs, 1).squeeze(1)]

        summary = tokenizer.decode(sampled_token[0], skip_special_tokens=True)

    return summary



In [28]:
def generate_summary_with_eos(model, tokenizer, text, max_length=150, device="cuda"):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding="max_length").input_ids.to(device)

        embedded_input = model["embedding"](input_ids)
        outputs = model["decoder"](embedded_input, None, None, None)
        logits = model["generator"](outputs)

        predicted_ids = []
        for i in range(logits.shape[1]):
            next_token_id = torch.argmax(logits[:, i, :], dim=-1).item()
            if next_token_id == tokenizer.eos_token_id:
                break
            predicted_ids.append(next_token_id)

        summary = tokenizer.decode(predicted_ids, skip_special_tokens=True)

    return summary


In [30]:
test_text = """
NASA's James Webb Space Telescope has successfully completed its alignment phase,
and engineers say the telescope's performance is even better than expected.
This marks a significant milestone in the mission, as it paves the way for the
final stages of preparation before the telescope begins its scientific operations.
Scientists anticipate that the telescope will provide unprecedented views of
distant galaxies, exoplanets, and cosmic phenomena.
"""

generated_summary = generate_summary(model, tokenizer, test_text)
print("🔹 **Original Text:**\n", test_text)
print("\n🔹 **Generated Summary:**\n", generated_summary)


🔹 **Original Text:**
 
NASA's James Webb Space Telescope has successfully completed its alignment phase,
and engineers say the telescope's performance is even better than expected. 
This marks a significant milestone in the mission, as it paves the way for the 
final stages of preparation before the telescope begins its scientific operations. 
Scientists anticipate that the telescope will provide unprecedented views of 
distant galaxies, exoplanets, and cosmic phenomena.


🔹 **Generated Summary:**
 outdoors


In [20]:
!pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=b7da36d28869e9b48ef0236cbce94589b7e5081dbcebc9ec5efcc5e86a458df0
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [21]:
from rouge_score import rouge_scorer

def calculate_rouge(reference_summary, generated_summary):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(reference_summary, generated_summary)
    return scores

reference_summary = test_text.split(".")[0]

rouge_scores = calculate_rouge(reference_summary, generated_summary)

print("\n🔹 **ROUGE Score:**")
for metric, score in rouge_scores.items():
    print(f"{metric.upper()}: Precision={score.precision:.2f}, Recall={score.recall:.2f}, F1={score.fmeasure:.2f}")



🔹 **ROUGE Score:**
ROUGE1: Precision=0.00, Recall=0.04, F1=0.00
ROUGE2: Precision=0.00, Recall=0.00, F1=0.00
ROUGEL: Precision=0.00, Recall=0.04, F1=0.00


In [22]:
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer

def summarize_textrank(text, num_sentences=2):
    parser = PlaintextParser.from_string(text, Tokenizer("english"))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, num_sentences)
    return " ".join([str(sentence) for sentence in summary])

textrank_summary = summarize_textrank(test_text)

print("\n🔹 **TextRank Summary:**\n", textrank_summary)

textrank_rouge_scores = calculate_rouge(reference_summary, textrank_summary)

print("\n🔹 **TextRank ROUGE Score:**")
for metric, score in textrank_rouge_scores.items():
    print(f"{metric.upper()}: Precision={score.precision:.2f}, Recall={score.recall:.2f}, F1={score.fmeasure:.2f}")



🔹 **TextRank Summary:**
 NASA's James Webb Space Telescope has successfully completed its alignment phase, and engineers say the telescope's performance is even better than expected. This marks a significant milestone in the mission, as it paves the way for the final stages of preparation before the telescope begins its scientific operations.

🔹 **TextRank ROUGE Score:**
ROUGE1: Precision=0.48, Recall=1.00, F1=0.65
ROUGE2: Precision=0.47, Recall=1.00, F1=0.64
ROUGEL: Precision=0.48, Recall=1.00, F1=0.65


#Fine-Tuning

In [31]:
!pip install transformers datasets




In [32]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_name = "t5-small"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [33]:
from torch.utils.data import Dataset

class SummarizationDataset(Dataset):
    def __init__(self, data_pairs, tokenizer, max_input_len=512, max_summary_len=150):
        self.data_pairs = data_pairs
        self.tokenizer = tokenizer
        self.max_input_len = max_input_len
        self.max_summary_len = max_summary_len

    def __len__(self):
        return len(self.data_pairs)

    def __getitem__(self, idx):
        text, summary = self.data_pairs[idx]

        input_enc = self.tokenizer(
            "summarize: " + text,
            padding="max_length",
            truncation=True,
            max_length=self.max_input_len,
            return_tensors="pt"
        )

        summary_enc = self.tokenizer(
            summary,
            padding="max_length",
            truncation=True,
            max_length=self.max_summary_len,
            return_tensors="pt"
        )

        return {
            "input_ids": input_enc["input_ids"].squeeze(0),
            "attention_mask": input_enc["attention_mask"].squeeze(0),
            "labels": summary_enc["input_ids"].squeeze(0)
        }

dataset = SummarizationDataset(data_pairs, tokenizer)


In [34]:
from torch.utils.data import DataLoader

BATCH_SIZE = 8
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)


In [35]:
import torch
from transformers import AdamW

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = AdamW(model.parameters(), lr=5e-5)

def train_t5(model, data_loader, optimizer, num_epochs=3):
    model.train()

    for epoch in range(num_epochs):
        total_loss = 0

        for batch in data_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{num_epochs}, Loss: {total_loss / len(data_loader):.4f}")

train_t5(model, train_loader, optimizer, num_epochs=3)


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/3, Loss: 1.1253
Epoch 2/3, Loss: 0.3742
Epoch 3/3, Loss: 0.2969


In [36]:
def generate_finetuned_summary(model, tokenizer, text, max_length=150):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(
            "summarize: " + text, return_tensors="pt", max_length=512, truncation=True
        ).input_ids.to(device)

        outputs = model.generate(input_ids, max_length=max_length, num_beams=4, early_stopping=True)
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary

test_text = """
NASA's James Webb Space Telescope has successfully completed its alignment phase,
and engineers say the telescope's performance is even better than expected.
This marks a significant milestone in the mission, as it paves the way for the
final stages of preparation before the telescope begins its scientific operations.
Scientists anticipate that the telescope will provide unprecedented views of
distant galaxies, exoplanets, and cosmic phenomena.
"""

generated_summary = generate_finetuned_summary(model, tokenizer, test_text)
print("🔹 **Generated Summary:**\n", generated_summary)


🔹 **Generated Summary:**
 This marks a significant milestone in the mission, as it paves the way for the final stages of preparation before the telescope begins its scientific operations. Scientists anticipate that the telescope will provide unprecedented views of distant galaxies, exoplanets, and cosmic phenomena.
