In [7]:
from jupyterthemes.stylefx import set_nb_theme
set_nb_theme('chesterish')

In [2]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import numpy as np
import torch

import datasets 

from datasets import load_dataset, load_metric

from transformers import (
    AutoModel,
    AutoModelForMaskedLM,
    AutoModelForSeq2SeqLM,
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    pipeline,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
)

  from .autonotebook import tqdm as notebook_tqdm
2023-02-21 15:40:52.888700: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-21 15:40:53.473104: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-02-21 15:40:53.473149: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [4]:
# Load the pre-trained model and tokenizer
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-small automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [5]:
def preprocess_function(batch):
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length=512)
    outputs = tokenizer(batch["highlights"], padding="max_length", truncation=True, max_length=128)
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["labels"] = outputs.input_ids.copy()
    return batch

# Load the dataset
train_data = load_dataset("cnn_dailymail", "3.0.0", split="train")
val_data = load_dataset("cnn_dailymail", "3.0.0", split="validation[:10%]")

train_ds = train_data.map(
    preprocess_function, 
    batched=True, 
    batch_size=256, 
    remove_columns=["article", "highlights", "id"]
)

val_ds = val_data.map(
    preprocess_function, 
    batched=True, 
    batch_size=256, 
    remove_columns=["article", "highlights", "id"]
)

Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
Found cached dataset cnn_dailymail (/home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1122/1122 [02:06<00:00,  8.88ba/s]
Loading cached processed dataset at /home/mrbean/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/1b3c71476f6d152c31c1730e83ccb08bcf23e348233f4fcc11e182248e6bf7de/cache-2d3b7edd75fb1188.arrow


In [6]:
class MyLightningModule(pl.LightningModule):
    def __init__(self, model_name, learning_rate, weight_decay, batch_size, num_training_steps):
        super().__init__()
        self.model_name = model_name
        self.learning_rate = learning_rate
        self.weight_decay = weight_decay
        self.batch_size = batch_size
        self.num_training_steps = num_training_steps
        
        # Load the pre-trained model and tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_name)

    def forward(self, input_ids, attention_mask, labels=None):
        output = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels,
        )
        return output.loss, output.logits
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        
        loss

# Define the data collator
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Initialize the trainer arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    max_steps=5000,
    weight_decay=1e-4,
    push_to_hub=False,
    evaluation_strategy = "steps",
    eval_steps = 50,
    generation_max_length=128,
    predict_with_generate=True,
    logging_steps=100,
    gradient_accumulation_steps=1,
    fp16=True,
)

# Load the ROUGE metric
metric = load_metric("rouge")

# Define the evaluation function
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    scores = metric.compute(predictions=decoded_preds, references=decoded_labels, rouge_types=["rouge1"])["rouge1"].mid
    return {"rouge1_precision": scores.precision, "rouge1_recall": scores.recall, "rouge1_fmeasure": scores.fmeasure}


# Initialize the trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Start the training
trainer.train()

  metric = load_metric("rouge")
max_steps is given, it will override any value given in num_train_epochs
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: id, article, highlights. If id, article, highlights are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 0
  Num Epochs = 1
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 5000
  Number of trainable parameters = 60506624


IndexError: Invalid key: 90427 is out of bounds for size 0

# Steps:
1. Rewrite code to be more general

a) Data loading should be from disk rather than their load_dataset, and should be on the fly

b) Rewrite to Lightning code, Trainer etc using Lightning, compute metric fine that we use huggingface

In [None]:
!nvidia-smi