# CSCI 4435/5435: Text Mining & Natural Language Processing
# Assignment 6: Sequence to Sequence Models
## Student: Miguel Guirao
## Aggie ID: 800699208

# Summary
*

## 0. Load datasets

In [17]:
# import the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch

from huggingface_hub import notebook_login

from transformers import AutoTokenizer
from transformers import DataCollatorForSeq2Seq
from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate

In [18]:
torch.cuda.is_available()

True

In [19]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# import dataset News Summary
#summary_df = pd.read_csv('news_summary.csv')
# Load a dataset from a local CSV file
dataset = load_dataset("csv", data_files="news_summary.csv")

In [4]:
summary = dataset['train']
type(summary)

datasets.arrow_dataset.Dataset

In [5]:
summary = summary.train_test_split(test_size=0.2)

In [6]:
summary

DatasetDict({
    train: Dataset({
        features: ['headlines', 'text'],
        num_rows: 78720
    })
    test: Dataset({
        features: ['headlines', 'text'],
        num_rows: 19681
    })
})

In [None]:
# import Language Translation English 2 French
eng2french_df = pd.read_csv('eng_french.csv')
eng2french_df.shape

In [None]:
eng2french_df.head()

## 1. Task 1: Summarization
### (40 pts) Finetuning a T5 model (you can use the Google t5-small pretrained model) for summarization.

### Preprocessing

In [56]:
checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

prefix = "summarize: "

def preprocess_function2(examples):
    inputs = [prefix + doc for doc in examples["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["headlines"], max_length=128, truncation=True, padding='max_length')

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [57]:
def preprocess_function(examples):
    # Tokenize the source (input) texts
    model_inputs = tokenizer(
        examples["source"],
        max_length=128,
        padding="max_length",  # Ensure uniform length for batching
        truncation=True
    )

    # Tokenize the target (label) texts
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            examples["target"],
            max_length=128,
            padding="max_length",  # Ensure labels are also padded
            truncation=True
        )

    # Important for T5: replace pad token id with -100 so they are ignored in loss computation
    labels["input_ids"] = [
        [(label if label != tokenizer.pad_token_id else -100) for label in l]
        for l in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [58]:
# ValueError: 128 is not a valid PaddingStrategy, please select one of ['longest', 'max_length', 'do_not_pad']
tokenized_summary = summary.map(preprocess_function2, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, label_pad_token_id=-100,)

Map:   0%|          | 0/78720 [00:00<?, ? examples/s]

Map:   0%|          | 0/19681 [00:00<?, ? examples/s]

In [59]:
tokenized_summary

DatasetDict({
    train: Dataset({
        features: ['headlines', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 78720
    })
    test: Dataset({
        features: ['headlines', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 19681
    })
})

In [61]:
tokenized_summary['train']['input_ids'][0]

[21603,
 10,
 22842,
 4208,
 961,
 15,
 120,
 18,
 9160,
 11459,
 17462,
 14165,
 23,
 9,
 65,
 2162,
 24,
 8,
 166,
 999,
 2250,
 13,
 165,
 9279,
 18,
 17470,
 7070,
 443,
 25766,
 56,
 281,
 30,
 1048,
 17078,
 16,
 4144,
 37,
 25766,
 19,
 1644,
 12,
 1111,
 44,
 3,
 9,
 1634,
 13,
 300,
 11321,
 3,
 157,
 7656,
 16,
 20527,
 607,
 28,
 3,
 9,
 620,
 13,
 300,
 431,
 4906,
 2280,
 5,
 37,
 5449,
 56,
 3971,
 12,
 3,
 9,
 2411,
 491,
 6592,
 13,
 220,
 2280,
 5,
 1]

In [62]:
rouge = evaluate.load("rouge")

In [63]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [64]:
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [65]:
training_args = Seq2SeqTrainingArguments(
    output_dir="model",
    eval_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_summary["train"],
    eval_dataset=tokenized_summary["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch,Training Loss,Validation Loss


OverflowError: out of range integral type conversion attempted