In [1]:
%run scripts/setup.py

Beginning download...
Starting pariza/bbc-news-summary Download
Completed pariza/bbc-news-summary Download
Starting rmisra/news-category-dataset Download
Completed rmisra/news-category-dataset Download
Download completed!
json converted to DataFrame!
Downloading articles...
Articles downloaded!
Moving and renaming files...
Shutil task completed!


In [2]:
from transformers import AutoTokenizer

checkpoint = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
from scripts.build_features import BuildFeatures

bf = BuildFeatures()
datasets = bf.get_datasets()

In [4]:
datasets['val']

Dataset({
    features: ['text', 'summary'],
    num_rows: 482
})

In [5]:
prefix = "summarize: "


def preprocess_function(input):
    inputs = [prefix + doc for doc in input["text"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=input["summary"], max_length=128, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [6]:
tokenized_train_datasets = datasets['train'].map(preprocess_function, batched=True)
tokenized_val_datasets = datasets['val'].map(preprocess_function, batched=True)

Map:   0%|          | 0/68 [00:00<?, ? examples/s]

Map:   0%|          | 0/482 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

In [8]:
import evaluate

rouge = evaluate.load("rouge")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [10]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [11]:
training_args = Seq2SeqTrainingArguments(
    output_dir="models/my_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
)

In [12]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_datasets,
    eval_dataset=tokenized_val_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()



  0%|          | 0/56 [00:00<?, ?it/s]

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


  0%|          | 0/97 [00:00<?, ?it/s]

{'eval_loss': 2.01261305809021, 'eval_rouge1': 0.1912, 'eval_rouge2': 0.112, 'eval_rougeL': 0.1659, 'eval_rougeLsum': 0.166, 'eval_gen_len': 19.0, 'eval_runtime': 517.8724, 'eval_samples_per_second': 0.931, 'eval_steps_per_second': 0.187, 'epoch': 1.0}


  0%|          | 0/97 [00:00<?, ?it/s]

{'eval_loss': 1.6712892055511475, 'eval_rouge1': 0.1941, 'eval_rouge2': 0.1179, 'eval_rougeL': 0.1693, 'eval_rougeLsum': 0.1695, 'eval_gen_len': 19.0, 'eval_runtime': 491.1517, 'eval_samples_per_second': 0.981, 'eval_steps_per_second': 0.197, 'epoch': 2.0}


  0%|          | 0/97 [00:00<?, ?it/s]

{'eval_loss': 1.5103946924209595, 'eval_rouge1': 0.1964, 'eval_rouge2': 0.1212, 'eval_rougeL': 0.1715, 'eval_rougeLsum': 0.1716, 'eval_gen_len': 19.0, 'eval_runtime': 510.9109, 'eval_samples_per_second': 0.943, 'eval_steps_per_second': 0.19, 'epoch': 3.0}


  0%|          | 0/97 [00:00<?, ?it/s]

{'eval_loss': 1.4624125957489014, 'eval_rouge1': 0.1984, 'eval_rouge2': 0.1237, 'eval_rougeL': 0.1731, 'eval_rougeLsum': 0.1732, 'eval_gen_len': 19.0, 'eval_runtime': 533.9732, 'eval_samples_per_second': 0.903, 'eval_steps_per_second': 0.182, 'epoch': 4.0}
{'train_runtime': 2573.4512, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.022, 'train_loss': 2.1006695883614674, 'epoch': 4.0}


TrainOutput(global_step=56, training_loss=2.1006695883614674, metrics={'train_runtime': 2573.4512, 'train_samples_per_second': 0.106, 'train_steps_per_second': 0.022, 'train_loss': 2.1006695883614674, 'epoch': 4.0})

In [13]:
evaluate = trainer.evaluate()

  0%|          | 0/97 [00:00<?, ?it/s]

In [14]:
evaluate

{'eval_loss': 1.4624125957489014,
 'eval_rouge1': 0.1984,
 'eval_rouge2': 0.1237,
 'eval_rougeL': 0.1731,
 'eval_rougeLsum': 0.1732,
 'eval_gen_len': 19.0,
 'eval_runtime': 519.1856,
 'eval_samples_per_second': 0.928,
 'eval_steps_per_second': 0.187,
 'epoch': 4.0}

In [18]:
trainer.save_pretrained()

AttributeError: 'Seq2SeqTrainer' object has no attribute 'save_pretrained'

In [15]:
text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."

In [20]:
trainer.save_model('models/saved_model')

In [21]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("models/saved_model")
inputs = tokenizer(text, return_tensors="pt").input_ids

In [27]:
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("models/saved_model")
outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)

In [29]:
tokenizer.decode(outputs[0], skip_special_tokens=True)

"the Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in American history. it'll ask the ultra-wealthy and corporations to pay their fair share."

In [None]:
# test_data_tokenized = tokenizer(test_data, padding=True, truncation=True, return_tensors="pt")

# # Generate predictions
# with torch.no_grad():
#     outputs = model.generate(
#         input_ids=test_data_tokenized.input_ids,
#         attention_mask=test_data_tokenized.attention_mask,
#         max_length=50,  # Set the maximum length of the generated sequences
#         num_beams=4,   # Number of beams for beam search
#         early_stopping=True,
#     )

# # Decode the generated sequences
# decoded_predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

# # Print the generated sequences
# for i, text in enumerate(test_data):
#     print(f"Input: {text}")
#     print(f"Generated Output: {decoded_predictions[i]}\n")