In [None]:
!pip install transformers

In [None]:
!pip install datasets torch sacrebleu rouge_score py7zr -q

In [None]:
!pip install sentencepiece

In [None]:
from transformers import pipeline, set_seed

In [None]:
import matplotlib.pyplot as plt

import pandas as pd
from datasets import load_metric

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from datasets import load_dataset

from tqdm import tqdm
import torch
from datasets import load_dataset

import os

# Fetch NLTK's "punkt" resource at runtime (presumably needed by the
# `sent_tokenize` import above — TODO confirm it is still used).
nltk.download("punkt")

In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifier handed to both `from_pretrained` calls below.
model_ckpt = "google/pegasus-cnn_dailymail"  # You can use another checkpoint if you prefer

# Load the matching tokenizer and seq2seq model, then move the model to `device`.
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModelForSeq2SeqLM.from_pretrained(model_ckpt)
model = model.to(device)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only). force_remount=True asks
# Colab to remount even when a mount already exists.
drive.mount('/content/drive', force_remount=True)

In [None]:
# Absolute path to the Drive root, matching the '/content/drive' mount point
# passed to drive.mount(). A relative "./drive/MyDrive" only resolves while the
# working directory happens to be /content.
drive_path = "/content/drive/MyDrive"

In [None]:
# Read the write-up pairs from Drive, drop every row containing a missing
# value, renumber the rows, and keep only the two text columns used downstream.
csv_path = os.path.join(drive_path, 'dataset_writeups_original_cleaned.csv')
df = (
    pd.read_csv(csv_path, sep=',')
    .dropna()
    .reset_index()
    .loc[:, ['original', 'cleaned']]
)

# The whole frame is used as-is; this alias is the place to subsample for
# quicker experiments.
writeups = df
writeups.head()

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half.
# The fixed random_state keeps the partition reproducible across runs.
train_writeups, temp_writeups = train_test_split(writeups, test_size=0.2, random_state=42)
test_writeups, validation_writeups = train_test_split(temp_writeups, test_size=1/2, random_state=42)

# preserve_index=False: train_test_split leaves each frame with a shuffled
# (non-range) pandas index, which from_pandas would otherwise store as a stray
# "__index_level_0__" column alongside "original" and "cleaned".
train_dataset = Dataset.from_pandas(train_writeups, preserve_index=False)
test_dataset = Dataset.from_pandas(test_writeups, preserve_index=False)
validation_dataset = Dataset.from_pandas(validation_writeups, preserve_index=False)

# Bundle the three splits under the keys the rest of the notebook indexes by.
writeups_dataset = DatasetDict({"train": train_dataset, "test": test_dataset, "validation": validation_dataset})

print(writeups_dataset)

In [None]:
# Row count of each split, in the DatasetDict's own key order.
split_lengths = [len(split_ds) for split_ds in writeups_dataset.values()]

print(f"Split lengths: {split_lengths}")
print(f"Features: {writeups_dataset['train'].column_names}")

# Show the first raw/cleaned pair of the test split as a sanity check.
first_test_row = writeups_dataset["test"][0]

print("\nOriginal:")
print(first_test_row["original"])

print("\nCleaned:")
print(first_test_row["cleaned"])

In [None]:
def generate_batch_sized_chunks(list_of_elements, batch_size):
    """Lazily cut a sequence into consecutive slices of at most ``batch_size`` items.

    Slices are yielded in order; the final one is shorter whenever the length
    of ``list_of_elements`` is not a multiple of ``batch_size``.
    """
    total = len(list_of_elements)
    for start in range(0, total, batch_size):
        stop = start + batch_size
        yield list_of_elements[start:stop]

def calculate_metric_on_test_ds(dataset, metric, model, tokenizer, 
                               batch_size=4,
                               device=device, 
                               column_text="article", 
                               column_summary="highlights"):
    """Generate a summary for every row of ``dataset`` and score it with ``metric``.

    Args:
        dataset: Indexable by column name; ``dataset[column_text]`` and
            ``dataset[column_summary]`` are equally long sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)``
            and ``compute()``.
        model: Seq2seq model exposing ``generate``, ``eval`` and ``train``.
        tokenizer: Tokenizer matching ``model``.
        batch_size: Number of texts generated per ``model.generate`` call.
        device: Device the input tensors are moved to; it has to be the device
            the model lives on.
        column_text: Name of the column holding the source texts.
        column_summary: Name of the column holding the reference texts.

    Returns:
        Whatever ``metric.compute()`` returns once all batches were added.
    """
    article_batches = list(generate_batch_sized_chunks(dataset[column_text], batch_size))
    target_batches = list(generate_batch_sized_chunks(dataset[column_summary], batch_size))

    # Score with dropout disabled. A model that has just been trained may still
    # be in training mode, which would make the generated text (and therefore
    # the metric) noisy. The previous mode is restored on the way out.
    was_training = model.training
    model.eval()
    try:
        for article_batch, target_batch in tqdm(
            zip(article_batches, target_batches), total=len(article_batches)):

            # Truncate to 1024 tokens and pad only up to the longest text in
            # this batch; padded positions are masked out via attention_mask,
            # so padding every batch to the full 1024 would only cost compute.
            inputs = tokenizer(article_batch, max_length=1024, truncation=True,
                               padding="longest", return_tensors="pt")

            # Beam search with 8 beams, at most 256 generated tokens.
            # length_penalty is the exponent applied to the sequence length
            # when beam scores are normalised; passing it explicitly overrides
            # whatever value the checkpoint's own configuration carries.
            summaries = model.generate(input_ids=inputs["input_ids"].to(device),
                                       attention_mask=inputs["attention_mask"].to(device),
                                       length_penalty=1.0, num_beams=8, max_length=256)

            # Decode the generated ids, turn the "<n>" marker into a space, and
            # hand predictions plus references to the metric.
            decoded_summaries = [tokenizer.decode(s, skip_special_tokens=True,
                                                  clean_up_tokenization_spaces=True)
                                 for s in summaries]

            decoded_summaries = [d.replace("<n>", " ") for d in decoded_summaries]

            metric.add_batch(predictions=decoded_summaries, references=target_batch)
    finally:
        model.train(was_training)

    # Aggregate over every batch added above.
    score = metric.compute()
    return score

In [None]:
# Baseline sanity check: summarise one test write-up with the un-tuned checkpoint.
# Passing the checkpoint name makes pipeline() load its own copy of the weights.
pipe = pipeline('summarization', model = model_ckpt)

# truncation=True: the pipeline does not truncate by default, so a write-up
# longer than the model's maximum input length would otherwise fail inside the
# model. The scoring function truncates its inputs as well.
pipe_out = pipe(writeups_dataset['test'][0]['original'], truncation=True)

print(pipe_out)

In [None]:
# ROUGE scorer plus the variants reported in the result tables.
rouge_metric = load_metric('rouge')
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]

# Baseline: score the checkpoint on the test split before any fine-tuning.
score = calculate_metric_on_test_ds(
    writeups_dataset['test'],
    rouge_metric,
    model,
    tokenizer,
    column_text='original',
    column_summary='cleaned',
    batch_size=4,
)

In [None]:
# Reduce each ROUGE aggregate to its mid F-measure and show it as a one-row table.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
rouge_before = pd.DataFrame(rouge_dict, index=['model'])
display(rouge_before)

In [None]:
# Token counts of every training example, for both the raw and the cleaned text.
train_split = writeups_dataset['train']
text_token_len = [len(ids) for ids in map(tokenizer.encode, train_split['original'])]
cleaned_token_len = [len(ids) for ids in map(tokenizer.encode, train_split['cleaned'])]

# Two histograms side by side on a shared y-axis.
fig, axes = plt.subplots(1, 2, figsize=(10, 4), sharey=True)
panels = [
    (text_token_len, "Original Token Length"),
    (cleaned_token_len, "Cleaned Token Length"),
]
for ax, (lengths, title) in zip(axes, panels):
    ax.hist(lengths, bins=20, color='C0', edgecolor='C0')
    ax.set_title(title)
    ax.set_xlabel("Length")

# The y-axis is shared, so only the left panel carries the label.
axes[0].set_ylabel("Count")
plt.tight_layout()
plt.show()

In [None]:
def convert_examples_to_features(example_batch):
    """Tokenize a batch of write-up pairs into model inputs and labels.

    ``example_batch['original']`` is encoded (truncated to 1024 tokens) into
    ``input_ids`` and ``attention_mask``; ``example_batch['cleaned']`` is
    encoded in target-tokenizer mode (truncated to 256 tokens) and its ids are
    returned as ``labels``.
    """
    source = tokenizer(example_batch['original'], max_length=1024, truncation=True)

    with tokenizer.as_target_tokenizer():
        target = tokenizer(example_batch['cleaned'], max_length=256, truncation=True)

    features = dict(
        input_ids=source['input_ids'],
        attention_mask=source['attention_mask'],
        labels=target['input_ids'],
    )
    return features

# Tokenize every split in batches; the result is a plain dict keyed by split name.
writeups_dataset_pt = {}
for split, ds in writeups_dataset.items():
    writeups_dataset_pt[split] = ds.map(convert_examples_to_features, batched=True)

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that assembles the tokenized examples into batches for the seq2seq model.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="model",
    num_train_epochs=500,  # upper bound only; early stopping is expected to end the run sooner
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=500,
    save_steps=1000,  # a multiple of eval_steps, so every checkpoint coincides with an evaluation
    gradient_accumulation_steps=2,  # effective train batch per device = 4 * 2
    # Mixed precision needs a CUDA device. The notebook falls back to the CPU
    # when none is available, and an unconditional fp16=True fails there.
    fp16=torch.cuda.is_available(),
    load_best_model_at_end=True
)

In [None]:
from transformers import EarlyStoppingCallback

# Patience counts evaluation calls, not epochs: training stops once the tracked
# metric has failed to improve for 10 evaluations in a row.
early_stopping = EarlyStoppingCallback(early_stopping_patience=10)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=writeups_dataset_pt["train"],
    eval_dataset=writeups_dataset_pt["validation"],
    callbacks=[early_stopping],
)

In [None]:
# The CUDA caching allocator expects "<option>:<value>" pairs — a colon, not "=".
# NOTE(review): the setting is presumably only read when CUDA memory is first
# allocated, so it may have no effect once the model already sits on the GPU;
# set it before loading the model to be sure.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:100"

In [None]:
# Fine-tune the model (long-running).
trainer.train()

# Score the fine-tuned model on the same test split as the baseline.
score = calculate_metric_on_test_ds(
    writeups_dataset['test'],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=4,
    column_text='original',
    column_summary='cleaned',
)

# Show the before/after ROUGE F-measures one after the other.
rouge_dict = {rn: score[rn].mid.fmeasure for rn in rouge_names}
rouge_after = pd.DataFrame(rouge_dict, index=['model'])
display(rouge_before, rouge_after)

In [None]:
# Target directory on Drive for the fine-tuned artefacts.
model_path = os.path.join(drive_path, "cleaning_model")

# Persist model and tokenizer side by side in that directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained(model_path)

# Load both back from disk so the inference cells use exactly what was saved.
tuned_model = AutoModelForSeq2SeqLM.from_pretrained(model_path)
tuned_tokenizer = AutoTokenizer.from_pretrained(model_path)

In [None]:
# Generation settings for the inference pipeline, matching the ones used for ROUGE scoring.
gen_kwargs = dict(length_penalty=1.0, num_beams=8, max_length=256)

In [None]:
import random

# Pick one test example at random (randint is inclusive on both ends).
random_index = random.randint(0, len(writeups_dataset["test"]) - 1)
sample = writeups_dataset["test"][random_index]
sample_text = sample["original"]
reference = sample["cleaned"]

# Use your fine-tuned model and tokenizer in the pipeline
pipe = pipeline("summarization", model=tuned_model, tokenizer=tuned_tokenizer)

print(f"Random index: {random_index}\n")

print("Original:")
print(sample_text)

print("\nReference Cleaned:")
print(reference)

print("\nModel Cleaned:")
# truncation=True: the pipeline does not truncate by default, so a write-up
# longer than the model's maximum input length would otherwise fail inside the
# model. The ROUGE scoring function truncates its inputs as well.
print(pipe(sample_text, truncation=True, **gen_kwargs)[0]["summary_text"])