This code is based on: https://huggingface.co/docs/transformers/en/tasks/summarization

This was run in Google Colab on a V100.

In [None]:
# install pip dependencies
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# different releases than the ones this notebook was originally run with.

%pip install evaluate rouge_score datasets transformers[torch] accelerate



In [None]:
# import modules

from transformers import DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from google.colab import drive
from datasets import DatasetDict, load_dataset
import evaluate
import numpy as np

In [None]:
# link drive and load datasets

# Mount Google Drive so the JSONL files below are reachable from the runtime.
drive.mount("/content/drive")

# Split name -> JSONL file on Drive (replace with your own directories).
plos_data_files = {
    "train": "/content/drive/MyDrive/CPSC_477/CPSC477_CHAD/biolaysumm2024_data/PLOS_train.jsonl",
    "validation": "/content/drive/MyDrive/CPSC_477/CPSC477_CHAD/biolaysumm2024_data/PLOS_val.jsonl",
}

# Load both splits with the "json" builder.
ds_PLOS = load_dataset("json", data_files=plos_data_files)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# tokenize from pretrained tokenizer

# Checkpoint identifier; the same name is reused later in the notebook when
# building the data collator and loading the model.
checkpoint = "google-t5/t5-small"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# preprocess inputs

prefix = "summarize: "

# Token limits for the article (input) and the lay summary (target).
# change the max input length here
MAX_INPUT_LENGTH = 4096
MAX_TARGET_LENGTH = 256

def preprocess_function(examples):
  """Tokenize a batch of articles and their lay summaries.

  Args:
    examples: a batch as passed by `Dataset.map(batched=True)`; the
      "article" and "lay_summary" columns are read.

  Returns:
    The tokenizer output for the prefixed articles with an extra "labels"
    entry holding the token ids of the lay summaries.
  """
  inputs = [prefix + doc for doc in examples["article"]]

  # Only truncate here; do NOT pad. Padding inside the tokenizer fills the
  # targets with the pad token id, so those positions would be scored by the
  # loss, and it stores every example padded to the longest one in its map
  # batch. DataCollatorForSeq2Seq pads each training batch dynamically and
  # fills padded label positions with -100 instead.
  model_inputs = tokenizer(inputs, max_length=MAX_INPUT_LENGTH, truncation=True)

  labels = tokenizer(text_target=examples["lay_summary"], max_length=MAX_TARGET_LENGTH, truncation=True)

  model_inputs["labels"] = labels["input_ids"]
  return model_inputs

tokenized_dataset = ds_PLOS.map(preprocess_function, batched=True)

# NOTE(review): `model` receives the checkpoint name string here rather than a
# model object — confirm this is intended.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)

Map:   0%|          | 0/24773 [00:00<?, ? examples/s]

Map:   0%|          | 0/1376 [00:00<?, ? examples/s]

In [None]:
# remove unneeded columns

# Raw columns that are not model inputs. They must be dropped by hand because
# the training arguments set remove_unused_columns=False.
# "attention_mask" is deliberately kept: it is the only record of which
# positions are real tokens and which are padding.
RAW_COLUMNS = ("lay_summary", "article", "headings", "keywords", "id")

for split_name in ("train", "validation"):
  split = tokenized_dataset[split_name]
  # Only drop columns that are still present so re-running this cell is harmless.
  to_drop = [name for name in RAW_COLUMNS if name in split.column_names]
  if to_drop:
    tokenized_dataset[split_name] = split.remove_columns(to_drop)

In [None]:
# evaluate metrics

rouge = evaluate.load("rouge")

def compute_metrics(eval_pred):
  """Compute ROUGE scores and the mean generated length for an evaluation run.

  Args:
    eval_pred: pair `(predictions, labels)` of token-id arrays. Positions
      holding -100 are treated as padding in both arrays.

  Returns:
    dict of the ROUGE results plus "gen_len" (mean count of non-pad tokens
    per prediction), each rounded to 4 decimals.
  """
  predictions, labels = eval_pred
  # Some models return a tuple of outputs; the first entry holds the predictions.
  if isinstance(predictions, tuple):
    predictions = predictions[0]

  pad_id = tokenizer.pad_token_id
  # -100 is not a valid token id, so map it to the pad id before decoding.
  # Predictions can contain it as well as labels.
  predictions = np.where(predictions != -100, predictions, pad_id)
  labels = np.where(labels != -100, labels, pad_id)

  decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
  decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

  result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

  # Length of each prediction, not counting padding.
  prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
  result["gen_len"] = np.mean(prediction_lens)

  return {k: round(v, 4) for k, v in result.items()}

In [None]:
# get pretrained model

# Load the pretrained seq2seq model for `checkpoint`; this object is passed to
# the trainer in the training cell.
model_PLOS = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

In [None]:
# now train

training_args = Seq2SeqTrainingArguments(
    output_dir="/content/drive/MyDrive/CPSC_477/CPSC477_CHAD/abstractive_models/PLOS_model_4-24", # your directory
    # Columns are dropped by hand in the "remove unneeded columns" cell.
    remove_unused_columns=False,
    auto_find_batch_size=True,
    # compute_metrics decodes token ids, so evaluation must call generate()
    # instead of handing back logits.
    predict_with_generate=True,
    # Let generated summaries be as long as the label limit (256 tokens).
    generation_max_length=256,
)
# fp16 / fp16_full_eval and a GaLore optimizer were previously present here as
# commented-out arguments; they remain disabled (library defaults apply).

trainer = Seq2SeqTrainer(
    model=model_PLOS,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


NameError: name 'Seq2SeqTrainingArguments' is not defined