In [2]:
!pip install -U transformers
!pip install -U datasets
!pip install tensorboard
!pip install sentencepiece
!pip install accelerate
!pip install evaluate
!pip install rouge_score

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m24.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

## Imports

In [3]:
import torch
import pprint
import evaluate
import numpy as np

from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset

In [4]:
pp = pprint.PrettyPrinter()

## Prepare Dataset

In [106]:
import pandas as pd
dataset = pd.read_csv('/content/drive/MyDrive/news_filtered.csv')

In [107]:
import unicodedata
import re
def clean_words(sentence):
    sentence = str(sentence).lower()
    sentence = unicodedata.normalize('NFKD', sentence).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    # URL'leri temizle
    sentence = re.sub(r"http[s]?://\S+", "", sentence)

    # Özel karakterler ve rakamlar
    sentence = re.sub(r"[\[\]\\0-9()\"$#%/@;:<>{}`+=~|.!?,-]|\bcnn\b", "", sentence)

    # Diğer özel karakterler
    sentence = re.sub(r"[&…•♦◆★☆■□▪▫▶◀▲▼]", "", sentence)

    # Ekstra boşlukları temizle
    sentence = re.sub(r"\s+", " ", sentence)

    # Yeni satırları temizle
    sentence = re.sub(r"\\n", "", sentence)

    # Baş ve sondaki boşlukları kaldır
    sentence = sentence.strip()

    # Artık tokenize etmiyor ve lemma uygulamıyoruz - tokenizer bu işi yapacak
    return sentence
dataset['article'] = dataset['article'].apply(clean_words)
dataset['highlights'] = dataset['highlights'].apply(clean_words)

In [108]:
def max_length(short_texts, long_texts, prct=85):
    """
    Metinlerin kelime sayılarını ve uzunluk dağılımlarını analiz eder.

    Args:
        short_texts: Özet metinleri listesi
        long_texts: Orijinal metin listesi
        prct: Maksimum uzunluk için kullanılacak yüzdelik değer

    Returns:
        tuple: Belirlenen yüzdeliğe karşılık gelen uzunluklar (uzun, kısa)
    """
    # Metinlerin kelime sayılarını hesapla
    length_longs = [len(text.split()) for text in long_texts]
    length_shorts = [len(text.split()) for text in short_texts]

    # İstatistikler
    long_stats = {
        'min': min(length_longs),
        'max': max(length_longs),
        'mean': np.mean(length_longs),
        'median': np.median(length_longs),
        'std': np.std(length_longs)
    }

    short_stats = {
        'min': min(length_shorts),
        'max': max(length_shorts),
        'mean': np.mean(length_shorts),
        'median': np.median(length_shorts),
        'std': np.std(length_shorts)
    }

    # Yüzdelik değerleri
    percentiles = [50, 75, 85, 90, 95, 99]
    for p in percentiles:
        long_stats[f'p{p}'] = np.percentile(length_longs, p)
        short_stats[f'p{p}'] = np.percentile(length_shorts, p)

    # İstenilen yüzdelik değerdeki uzunlukları yazdır
    print(f'Orijinal metinlerin {prct}. yüzdelik uzunluğu: {long_stats[f"p{prct}"]}')
    print(f'En uzun orijinal metin: {long_stats["max"]}')
    print(f'Ortalama orijinal metin uzunluğu: {long_stats["mean"]:.1f}')
    print()
    print(f'Özet metinlerin {prct}. yüzdelik uzunluğu: {short_stats[f"p{prct}"]}')
    print(f'En uzun özet metin: {short_stats["max"]}')
    print(f'Ortalama özet uzunluğu: {short_stats["mean"]:.1f}')
    print()

    # İstenilen yüzdelik değerdeki uzunlukları döndür
    return int(long_stats[f'p{prct}']), int(short_stats[f'p{prct}'])

# Metinlerin uzunluk analizi
max_len_news, max_len_summary = max_length(dataset['highlights'].to_list(), dataset['article'].to_list())

Orijinal metinlerin 85. yüzdelik uzunluğu: 462.0
En uzun orijinal metin: 500
Ortalama orijinal metin uzunluğu: 356.6

Özet metinlerin 85. yüzdelik uzunluğu: 56.0
En uzun özet metin: 1026
Ortalama özet uzunluğu: 43.0



In [109]:
# excel veri tipinde olan dataseti Dataset türüne çevir
dataset = Dataset.from_pandas(dataset)
dataset.info

DatasetInfo(description='', citation='', homepage='', license='', features={'article': Value(dtype='string', id=None), 'highlights': Value(dtype='string', id=None)}, post_processed=None, supervised_keys=None, builder_name=None, dataset_name=None, config_name=None, version=None, splits=None, download_checksums=None, download_size=None, post_processing_size=None, dataset_size=None, size_in_bytes=None)

In [110]:
full_dataset = dataset.train_test_split(test_size=0.2, shuffle=True)

In [111]:
dataset_train = full_dataset['train']
dataset_valid = full_dataset['test']

In [139]:
print(dataset_train)
print(dataset_valid)

Dataset({
    features: ['article', 'highlights'],
    num_rows: 80212
})
Dataset({
    features: ['article', 'highlights'],
    num_rows: 20054
})


In [140]:
dataset_train[1000]

{'article': 'atletico madrid have opened talks with chelsea over a potential deal for german world cup winner andre schurrle schurrle only joined chelsea last summer for m from bayer leverkusen and has failed to hold down an automatic place atletico expressed an interest last week but are pressing for an answer from chelsea jose mourinho though would prefer to sell other foreign players rather than the versatile forward video scroll down for world cup winning andre schurrle walking on water blues bother atletico madrid are chasing chelsea forward andre schurrle and the yearold would appear to want to stay at stamford bridge after stating he hopes to replicate his success at the world cup with chelsea this season the german forward scored three goals to help his country reach the final in brazil where they edged out argentina in extra time playing for keeps chelsea manager jose mourinho would prefer not to part with the german forward german engineering andre schurrle was part of the ge

In [141]:
# dataset_train in highlights sutunundaki textlerin başındaki go kelimesini ve sonundaki stop kelimesini kaldır

# highlights sütunundaki "go" ve "stop" kelimelerini kaldır
dataset_train = dataset_train.map(lambda example: {
    'highlights': example['highlights'].replace('go ', '').replace(' stop', '')
})
dataset_valid = dataset_valid.map(lambda example: {
    'highlights': example['highlights'].replace('go ', '').replace(' stop', '')
})
# Değişikliği kontrol etmek için örnek bir satırı yazdır
print(dataset_train[1000]['highlights'])
print(dataset_valid[1000]['highlights'])

Map:   0%|          | 0/80212 [00:00<?, ? examples/s]

Map:   0%|          | 0/20054 [00:00<?, ? examples/s]

la liga champions atletico madrid are in talks with andre schurrle chelsea forward was part of germanys world cup winning team schurrle arrived at stamford bridge from bayer leverkusen for m jose mourinho would prefer to sell a different foreign player
ancient fishing people in china have built a village on water home to thousands the tanka people named gypsies of the sea live in floating homes and seafood farms


## Dataset Analysis

## Configurations

In [146]:
MODEL = 't5-small'
BATCH_SIZE = 16
NUM_PROCS = 4
EPOCHS = 5
OUT_DIR = 'results_t5base'
MAX_LENGTH = 512 # Maximum context length to consider while preparing dataset.

## Tokenization

In [94]:
tokenizer = T5Tokenizer.from_pretrained(MODEL)

In [143]:
# Function to convert text data into model inputs and targets
def preprocess_function(examples):
    # Set up the tokenizer for inputs
    inputs = [f"summarize: {article}" for article in examples['article']]
    model_inputs = tokenizer(
        inputs,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length'
    )

    # Set up the tokenizer for targets
    targets = [summary for summary in examples['highlights']]
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets,
            max_length=MAX_LENGTH,
            truncation=True,
            padding='max_length'
        )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Apply the function to the whole dataset
tokenized_train = dataset_train.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)
tokenized_valid = dataset_valid.map(
    preprocess_function,
    batched=True,
    num_proc=NUM_PROCS
)

Map (num_proc=4):   0%|          | 0/80212 [00:00<?, ? examples/s]



Map (num_proc=4):   0%|          | 0/20054 [00:00<?, ? examples/s]



## Model

In [144]:
model = T5ForConditionalGeneration.from_pretrained(MODEL)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
# Total parameters and trainable parameters.
total_params = sum(p.numel() for p in model.parameters())
print(f"{total_params:,} total parameters.")
total_trainable_params = sum(
    p.numel() for p in model.parameters() if p.requires_grad)
print(f"{total_trainable_params:,} training parameters.")

60,506,624 total parameters.
60,506,624 training parameters.


In [145]:
def preprocess_logits_for_metrics(logits, labels):
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

## Training

In [147]:
training_args = TrainingArguments(
    output_dir=OUT_DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    fp16=True,  # Karma hassasiyet eğitimi
    gradient_accumulation_steps=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir=OUT_DIR,
    logging_steps=1000,
    eval_strategy='steps',
    eval_steps=1000,
    save_strategy='epoch',
    save_total_limit=2,
    report_to='tensorboard',
    learning_rate=0.0001,
    dataloader_num_workers=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    #compute_metrics=compute_metrics
)

history = trainer.train()

Step,Training Loss,Validation Loss
1000,1.6647,0.201399
2000,0.2208,0.197548
3000,0.2169,0.196106
4000,0.2139,0.195245
5000,0.2122,0.194723
6000,0.2114,0.194424


## Inference

In [149]:
test_dataset = pd.read_excel('/content/drive/MyDrive/InshortsData.xlsx')

# Function to generate summary
def generate_summary(text):
    # Clean text
    cleaned_text = clean_words(text)

    # Prepare input
    input_text = f"summarize: {cleaned_text}"
    encoding = tokenizer(
        input_text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
        return_tensors="pt"
    )

    # Move input to device
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)

    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=150,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    # Decode summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Generate summaries for test data
test_articles = test_dataset['Short'].tolist()
test_headlines = test_dataset['Headline'].tolist()
test_articles = test_articles[:1000]
test_headlines = test_headlines[:1000]
generated_summaries = []

print("Generating summaries for test data...")
for i, article in enumerate(test_articles):
    if i % 10 == 0:
        print(f"Processing {i}/{len(test_articles)}")

    if pd.isna(article):
        generated_summaries.append("")
        continue

    summary = generate_summary(article)
    generated_summaries.append(summary)


# Compute ROUGE metrics
results = rouge.compute(
    predictions=generated_summaries,
    references=test_headlines,
    use_stemmer=True,
    rouge_types=['rouge1', 'rouge2', 'rougeL']
)

# Print results
print("\nEvaluation Results:")
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")

# Save results to file
with open(f"{OUT_DIR}/evaluation_results.txt", "w") as f:
    f.write("Evaluation Results:\n")
    for metric, score in results.items():
        f.write(f"{metric}: {score:.4f}\n")

# Sample outputs
print("\nSample Summaries:")
for i in range(min(5, len(test_articles))):
    if pd.isna(test_articles[i]):
        continue

    print(f"Article: {test_articles[i][:100]}...")
    print(f"Original Headline: {test_headlines[i]}")
    print(f"Generated Summary: {generated_summaries[i]}")
    print("-" * 50)

Generating summaries for test data...
Processing 0/1000
Processing 10/1000
Processing 20/1000
Processing 30/1000
Processing 40/1000
Processing 50/1000
Processing 60/1000
Processing 70/1000
Processing 80/1000
Processing 90/1000
Processing 100/1000
Processing 110/1000
Processing 120/1000
Processing 130/1000
Processing 140/1000
Processing 150/1000
Processing 160/1000
Processing 170/1000
Processing 180/1000
Processing 190/1000
Processing 200/1000
Processing 210/1000
Processing 220/1000
Processing 230/1000
Processing 240/1000
Processing 250/1000
Processing 260/1000
Processing 270/1000
Processing 280/1000
Processing 290/1000
Processing 300/1000
Processing 310/1000
Processing 320/1000
Processing 330/1000
Processing 340/1000
Processing 350/1000
Processing 360/1000
Processing 370/1000
Processing 380/1000
Processing 390/1000
Processing 400/1000
Processing 410/1000
Processing 420/1000
Processing 430/1000
Processing 440/1000
Processing 450/1000
Processing 460/1000
Processing 470/1000
Processing 48

In [153]:
test_dataset = pd.read_csv('/content/drive/MyDrive/bbc_news_dataset.csv')

# Function to generate summary
def generate_summary(text):
    # Clean text
    cleaned_text = clean_words(text)

    # Prepare input
    input_text = f"summarize: {cleaned_text}"
    encoding = tokenizer(
        input_text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding='max_length',
        return_tensors="pt"
    )

    # Move input to device
    input_ids = encoding.input_ids.to(device)
    attention_mask = encoding.attention_mask.to(device)

    # Generate summary
    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=150,
            num_beams=4,
            length_penalty=2.0,
            early_stopping=True
        )

    # Decode summary
    summary = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Generate summaries for test data
test_articles = test_dataset['Articles'].tolist()
test_headlines = test_dataset['Summaries'].tolist()
test_articles = test_articles
test_headlines = test_headlines
generated_summaries = []

print("Generating summaries for test data...")
for i, article in enumerate(test_articles):
    if i % 100 == 0:
        print(f"Processing {i}/{len(test_articles)}")

    if pd.isna(article):
        generated_summaries.append("")
        continue

    summary = generate_summary(article)
    generated_summaries.append(summary)


# Compute ROUGE metrics
results = rouge.compute(
    predictions=generated_summaries,
    references=test_headlines,
    use_stemmer=True,
    rouge_types=['rouge1', 'rouge2', 'rougeL']
)

# Print results
print("\nEvaluation Results:")
for metric, score in results.items():
    print(f"{metric}: {score:.4f}")

# Save results to file
with open(f"{OUT_DIR}/evaluation_results.txt", "w") as f:
    f.write("Evaluation Results:\n")
    for metric, score in results.items():
        f.write(f"{metric}: {score:.4f}\n")

# Sample outputs
print("\nSample Summaries:")
for i in range(min(5, len(test_articles))):
    if pd.isna(test_articles[i]):
        continue

    print(f"Article: {test_articles[i][:100]}...")
    print(f"Original Headline: {test_headlines[i]}")
    print(f"Generated Summary: {generated_summaries[i]}")
    print("-" * 50)

Generating summaries for test data...
Processing 0/1020
Processing 100/1020
Processing 200/1020
Processing 300/1020
Processing 400/1020
Processing 500/1020
Processing 600/1020
Processing 700/1020
Processing 800/1020
Processing 900/1020
Processing 1000/1020

Evaluation Results:
rouge1: 0.3143
rouge2: 0.1998
rougeL: 0.2403

Sample Summaries:
Article: WorldCom trial starts in New York..The trial of Bernie Ebbers, former chief executive of bankrupt US...
Original Headline: The trial of Bernie Ebbers, former chief executive of bankrupt US phone company WorldCom, has started in New York with the selection of the jury.Mr Ebbers, 63, is accused of being the mastermind behind an $11bn (£6bn) accounting fraud that eventually saw the firm collapse in July 2002.Under Mr Ebbers' leadership, WorldCom emerged from Mississippi obscurity to become a $160bn telecoms giant and the darling of late 1990s investors.Mr Ebbers' trial, which is expected to last two months, is the latest in a series of attempts

In [154]:
print("\nSample Summaries:")
for i in range(min(5, len(test_articles))):
    if pd.isna(test_articles[i]):
        continue

    print(f"Article: {test_articles[i+100]}...")
    print(f"Original Headline: {test_headlines[i+100]}")
    print(f"Generated Summary: {generated_summaries[i+100]}")
    print("-" * 50)


Sample Summaries:
Article: Absa and Barclays talks continue..South Africa biggest retail bank Absa has said it is still in talks with UK bank Barclays over the sale of majority stake in the group...In November, Absa said it was close to striking a deal with Barclays. But the group said Barclays is still waiting for the approval of South Africa's banking and competition authorities to make a formal offer. Absa also announced that it expects to see earnings grow by 20-25% in its current financial year..."Discussions with Barclays are continuing, but shareholders are advised that no agreement has been reached as to any offer being made by Barclays to acquire a majority stake in Absa," Absa said in a statement. If Barclays buys a stake in Absa it will be one of the largest foreign investments in South Africa in recent years. Absa currently has a market value of about $8.5bn (£4.4bn). Analysts said Absa's earnings forecast was better than expected. However, the company warned that headline

In [156]:
# Your 5 custom inputs
custom_inputs = [
    "Artificial intelligence is rapidly transforming industries worldwide. In healthcare, AI assists in diagnosing diseases and developing personalized treatments. The financial sector uses AI for fraud detection and algorithmic trading. Transportation is seeing AI integration in autonomous vehicles. While promising, the ethical implications and job market impact of AI are important considerations as its influence grows.",

    "Climate change poses a significant global threat. Rising temperatures lead to higher sea levels and more extreme weather events like hurricanes and droughts. Biodiversity is declining due to habitat loss and changing conditions. Urgent action is needed globally to reduce greenhouse gas emissions by transitioning to renewable energy and adopting sustainable practices to protect the planet.",

    "Recent space exploration discoveries are expanding our understanding of the universe. Powerful new telescopes like the James Webb Space Telescope provide detailed views of early galaxies. Missions to Mars are exploring its potential for life, while probes to moons like Europa suggest subsurface oceans that could harbor life. Private companies are accelerating space access with reusable rockets and ambitious mission plans.",

    "The global economy faces several challenges currently. Inflation is impacting purchasing power in many countries. Supply chain disruptions, caused by the pandemic and geopolitical events, continue to affect the availability and cost of goods. Geopolitical tensions further complicate the economic outlook. Central banks are working to control inflation without causing a recession, as businesses adapt to navigate this complex environment.",

    "The shift towards renewable energy sources is accelerating globally. Solar and wind power are becoming more efficient and cost-effective compared to fossil fuels. Investments in renewable energy infrastructure are increasing, supported by government policies. Advances in battery storage are improving the reliability of renewables. This transition is crucial for reducing emissions, improving air quality, and creating green jobs for a sustainable future."
]
# Generate summaries for custom inputs
print("\nGenerating summaries for custom inputs...")
custom_generated_summaries = []
for i, text in enumerate(custom_inputs):
    print(f"Processing custom input {i+1}/{len(custom_inputs)}")
    summary = generate_summary(text)
    custom_generated_summaries.append(summary)

# Print custom generated summaries
print("\nCustom Generated Summaries:")
for i, summary in enumerate(custom_generated_summaries):
    print(f"Input {i+1}: {custom_inputs[i][:100]}...")
    print(f"Generated Summary: {summary}")
    print("-" * 50)


Generating summaries for custom inputs...
Processing custom input 1/5
Processing custom input 2/5
Processing custom input 3/5
Processing custom input 4/5
Processing custom input 5/5

Custom Generated Summaries:
Input 1: Artificial intelligence is rapidly transforming industries worldwide. In healthcare, AI assists in d...
Generated Summary: ai helps in diagnosing diseases and developing personalized treatments the financial sector uses ai for fraud detection and algorithmic trading transportation is seeing ai integration in autonomous vehicles
--------------------------------------------------
Input 2: Climate change poses a significant global threat. Rising temperatures lead to higher sea levels and ...
Generated Summary: climate change poses a global threat rising temperatures lead to higher sea levels and more extreme weather events like hurricanes and droughts biodiversity is declining due to habitat loss and changing conditions
--------------------------------------------------
I