Fine-tuning the mT5 model on the entire dataset.

In [1]:
!pip install pydantic==1.10.9 nvidia-ml-py3==7.352.0 pytorch-lightning==2.0.1.post0 transformers==4.28.0 torchvision==0.15.1 rouge-score==0.1.2 tensorboardx==2.6 accelerate==0.18.0 deepspeed==0.9.0 peft==0.2.0

Collecting pydantic==1.10.9
  Downloading pydantic-1.10.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-ml-py3==7.352.0
  Downloading nvidia-ml-py3-7.352.0.tar.gz (19 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pytorch-lightning==2.0.1.post0
  Downloading pytorch_lightning-2.0.1.post0-py3-none-any.whl (718 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m718.6/718.6 kB[0m [31m18.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m48.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torchvision==0.15.1
  Downloading torchvision-0.15.1-cp310-cp310-manylinux1_x86_64.whl (6.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [15]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl


from transformers import (
    AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
def set_seed(seed):
  """Seed the Python, NumPy and PyTorch random number generators with `seed`.

  The CUDA generators are seeded as well, but only when CUDA is available.
  """
  seeders = [random.seed, np.random.seed, torch.manual_seed]
  if torch.cuda.is_available():
    seeders.append(torch.cuda.manual_seed_all)
  for apply_seed in seeders:
    apply_seed(seed)

# Fix every RNG once, before any model or data object is created.
set_seed(42)

In [4]:
class T5FineTuner(pl.LightningModule):
  """LightningModule that fine-tunes a T5-style seq2seq model on text-to-text pairs.

  The hyper-parameter object passed to the constructor must provide at least:
  `model_name_or_path`, `tokenizer_name_or_path`, `learning_rate`,
  `weight_decay`, `adam_epsilon`, `warmup_steps`, `train_batch_size` and
  `eval_batch_size`. The whole object is also forwarded to `get_dataset`.

  Batches are dicts with the keys `source_ids`, `source_mask`, `target_ids`
  and `target_mask`.

  This class targets the PyTorch Lightning 2.x API:
    * hyper-parameters are registered with `save_hyperparameters` because
      `hparams` is a read-only property;
    * metrics are reported through `self.log` instead of returned "log" /
      "progress_bar" dicts and `*_epoch_end` hooks;
    * the LR scheduler is returned from `configure_optimizers` and stepped by
      Lightning, so no `optimizer_step` override is defined.
  """

  def __init__(self, hparams):
    """Load the pretrained model and tokenizer named in `hparams`.

    Args:
      hparams: `argparse.Namespace` (or dict) with the fields listed in the
        class docstring.
    """
    super().__init__()
    # Direct assignment to `self.hparams` raises; this registers the values
    # and exposes them as attributes of `self.hparams`.
    self.save_hyperparameters(hparams)

    self.model = T5ForConditionalGeneration.from_pretrained(self.hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(self.hparams.tokenizer_name_or_path)

    # Populated by configure_optimizers(); kept as attributes for inspection.
    self.opt = None
    self.lr_scheduler = None

  def is_logger(self):
    """Return True on the process whose global rank is 0."""
    return self.trainer.global_rank <= 0

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None,
      labels=None
  ):
    """Run the wrapped model.

    `lm_labels` is the legacy name of the target argument and is kept so
    existing callers continue to work; `labels` takes precedence when both
    are given. The value is passed to the model as `labels`.
    """
    if labels is None:
      labels = lm_labels
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
    )

  def _step(self, batch):
    """Compute the loss for one batch."""
    # Work on a copy so the caller's batch is not modified; -100 replaces the
    # padding positions of the targets before they are used as labels.
    labels = batch["target_ids"].clone()
    labels[labels == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        decoder_attention_mask=batch["target_mask"],
    )

    # The loss is the first element of the model output when labels are given.
    return outputs[0]

  def training_step(self, batch, batch_idx):
    """Compute and log the training loss for one batch."""
    loss = self._step(batch)

    # Per-step value plus an epoch average (the latter replaces the removed
    # training_epoch_end hook).
    self.log("train_loss", loss, on_step=True, on_epoch=False, prog_bar=True)
    self.log("avg_train_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
    return {"loss": loss}

  def validation_step(self, batch, batch_idx):
    """Compute the validation loss for one batch and log its epoch average."""
    loss = self._step(batch)
    # Logged under "val_loss" so callbacks that monitor that key can find it.
    self.log("val_loss", loss, on_step=False, on_epoch=True, prog_bar=True)
    return {"val_loss": loss}

  def configure_optimizers(self):
    """Prepare optimizer and schedule (linear warmup and decay)."""
    model = self.model
    # "layer_norm.weight" covers the parameter naming used by T5 layer norms;
    # the two original patterns are kept for other architectures.
    no_decay = ["bias", "LayerNorm.weight", "layer_norm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = torch.optim.AdamW(
        optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon
    )

    # Let the trainer report how many optimizer steps the run will take so the
    # decay schedule matches the actual devices / accumulation / epochs.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=self.hparams.warmup_steps,
        num_training_steps=self.trainer.estimated_stepping_batches,
    )

    self.opt = optimizer
    self.lr_scheduler = scheduler
    return {
        "optimizer": optimizer,
        # Step the schedule after every optimizer step rather than every epoch.
        "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
    }

  def get_tqdm_dict(self):
    """Legacy helper returning the last logged train loss and learning rate.

    Kept for backward compatibility with code that calls it directly.
    """
    last_loss = self.trainer.callback_metrics.get("train_loss")
    loss_text = "{:.3f}".format(float(last_loss)) if last_loss is not None else "nan"
    if self.lr_scheduler is not None:
      current_lr = self.lr_scheduler.get_last_lr()[-1]
    else:
      current_lr = self.hparams.learning_rate
    return {"loss": loss_text, "lr": current_lr}

  def train_dataloader(self):
    """Build the shuffled training DataLoader from the "train" split."""
    train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    return DataLoader(
        train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4
    )

  def val_dataloader(self):
    """Build the validation DataLoader from the "val" split."""
    val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)


In [5]:
logger = logging.getLogger(__name__)


class LoggingCallback(pl.Callback):
    """Callback that reports `trainer.callback_metrics` after validation and test.

    Only the global-zero process reports, so multi-process runs do not emit
    duplicate lines or race on the results file. The rank check uses
    `trainer.is_global_zero` and therefore does not depend on any helper
    being defined on the LightningModule.
    """

    # Bookkeeping keys that are not metrics and must not be reported.
    _SKIPPED_KEYS = ("log", "progress_bar")

    @classmethod
    def _metric_lines(cls, metrics):
        """Return one "key = value\\n" line per metric, sorted by key."""
        return [
            "{} = {}\n".format(key, str(metrics[key]))
            for key in sorted(metrics)
            if key not in cls._SKIPPED_KEYS
        ]

    def on_validation_end(self, trainer, pl_module):
        """Log the current callback metrics at the end of validation."""
        if not trainer.is_global_zero:
            return

        logger.info("***** Validation results *****")
        for line in self._metric_lines(trainer.callback_metrics):
            logger.info(line)

    def on_test_end(self, trainer, pl_module):
        """Log the callback metrics and write them to `<output_dir>/test_results.txt`."""
        if not trainer.is_global_zero:
            return

        logger.info("***** Test results *****")

        # Create the directory first so opening the results file cannot fail
        # just because the output directory does not exist yet.
        output_dir = pl_module.hparams.output_dir
        os.makedirs(output_dir, exist_ok=True)
        output_test_results_file = os.path.join(output_dir, "test_results.txt")

        with open(output_test_results_file, "w") as writer:
            for line in self._metric_lines(trainer.callback_metrics):
                logger.info(line)
                writer.write(line)

In [6]:
# Path of the dataset CSV (presumably on a mounted Google Drive — no cell in
# view performs the mount; TODO confirm).
file_path = '/content/drive/MyDrive/multitude.csv'
# Load the whole CSV into a DataFrame.
# NOTE(review): `df` is not referenced by any later visible cell;
# MultitudeDataset reads the file again from `file_path`.
df = pd.read_csv(file_path)


In [7]:
# Tokenizer used by the inspection cells below.
# NOTE(review): this loads 'google/mt5-small' while the training configuration
# names 'google/mt5-base' — confirm the two tokenizers are interchangeable.
tokenizer = T5Tokenizer.from_pretrained('google/mt5-small')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

In [8]:
# Encode the two label strings to see how many token ids each one produces
# (presumably '0' = human-written and '1' = machine-generated — TODO confirm).
ids_hm = tokenizer.encode('0')
ids_mc = tokenizer.encode('1')
# Displayed as the cell result; the recorded output is (2, 2), which matches
# the max_length=2 used for targets in MultitudeDataset.
len(ids_hm), len(ids_mc)

(2, 2)

In [9]:
class MultitudeDataset(Dataset):
    """Eagerly tokenized view of one split of a CSV for text-to-text training.

    The CSV at `data_path` is read with pandas and filtered to the rows whose
    `split` column equals `split`. For every kept row the `text` column is the
    model input and `str(label)` is the target string. Both are tokenized in
    the constructor and padded/truncated to fixed lengths.

    Args:
        tokenizer: object exposing `batch_encode_plus(texts, max_length=...,
            padding=..., truncation=..., return_tensors=...)` that returns a
            mapping with "input_ids" and "attention_mask" tensors.
        data_path: path of the CSV file; needs `text`, `label` and `split`
            columns.
        max_len: fixed length of the tokenized inputs.
        split: value of the `split` column to keep.
        target_max_len: fixed length of the tokenized targets. Defaults to 2,
            the value that was previously hard-coded.
    """

    def __init__(self, tokenizer, data_path, max_len=512, split='train', target_max_len=2):
        # Load dataset and keep only the requested split.
        self.data = pd.read_csv(data_path)
        self.data = self.data[self.data['split'] == split]

        self.max_len = max_len
        self.target_max_len = target_max_len
        self.tokenizer = tokenizer
        self.inputs = []
        self.targets = []

        self._build()

    def __len__(self):
        """Number of tokenized examples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the tensors of example `index` as a dict of 1-D tensors."""
        # squeeze(0) removes only the leading batch dimension added by the
        # tokenizer; a bare squeeze() would also collapse a sequence of
        # length 1 into a 0-d tensor.
        source_ids = self.inputs[index]["input_ids"].squeeze(0)
        target_ids = self.targets[index]["input_ids"].squeeze(0)

        src_mask = self.inputs[index]["attention_mask"].squeeze(0)
        target_mask = self.targets[index]["attention_mask"].squeeze(0)

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}

    def _encode(self, text, max_length):
        """Tokenize a single string to exactly `max_length` positions."""
        return self.tokenizer.batch_encode_plus(
            [text], max_length=max_length, padding='max_length', truncation=True, return_tensors="pt"
        )

    def _build(self):
        """Tokenize every row of the selected split."""
        self._build_examples_from_data(self.data)

    def _build_examples_from_data(self, data):
        """Append one tokenized input and one tokenized target per row of `data`."""
        # Iterating the two columns directly avoids building a Series per row
        # (as iterrows does) and keeps each column's own dtype.
        for text, label in zip(data['text'], data['label']):
            target = str(label)  # Adjust if needed for multi-label targets

            self.inputs.append(self._encode(text, self.max_len))
            self.targets.append(self._encode(target, self.target_max_len))

# Build the training split once with the inspection tokenizer so individual
# examples can be examined in the next cell. The training run does not use
# this object: T5FineTuner builds its own datasets through get_dataset().
dataset = MultitudeDataset(tokenizer, file_path, max_len=512, split='train')
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [10]:
# Sanity check: take one tokenized training example and decode its input and
# target ids back to text.
data = dataset[300]
print(tokenizer.decode(data['source_ids']))
print(tokenizer.decode(data['target_ids']))

By Christy George, Aug 24, 2015California is now heading into its fifth year of drought conditions and the state’s fire season is already on track to break a long-standing record for the costliest in the state’s history, according to Cal Fire.While the agency has seen more acres burned so far this season than in past years, the bulk of the estimated $305 million the state’s spent on firefighting is the result of a few high-profile blazes.The most notable of those is the 400-square-mile Butte Fire, which has been burning since Sept. 9 in Amador and Calaveras counties. The blaze consumed a home, several outbuildings and burned at least 10,000 acres. The cost to contain the fire — estimated at 25 percent Sunday — is $63 million.Another blaze, which has burned about 100,000 acres in El Dorado and Amador counties and is only 48 percent contained, has cost the state about $43 million.A massive fire in Lake and Calaveras counties known as the Valley Fire has consumed almost 76,000 acres and f

In [11]:
# Create a local mt5_classification directory (-p: no error if it exists).
# NOTE(review): args.output_dir in the configuration cell points to
# /content/drive/MyDrive/mt5, not to this directory — confirm it is still needed.
!mkdir -p mt5_classification

In [12]:
# Training configuration. Every value a reader might tune lives in args_dict;
# the T5FineTuner module and the dataset helper read them from `args`.
args_dict = dict(
    data_path = '/content/drive/MyDrive/multitude.csv',
    output_dir = '/content/drive/MyDrive/mt5',
    model_name_or_path='google/mt5-base',
    tokenizer_name_or_path='google/mt5-base',
    max_seq_length=512,
    learning_rate=3e-4,
    weight_decay=0.0,
    adam_epsilon=1e-8,
    warmup_steps=0,
    train_batch_size=8,
    eval_batch_size=8,
    num_train_epochs=2,
    gradient_accumulation_steps=16,  # forwarded to the Trainer as accumulate_grad_batches
    n_gpu=1,
    early_stop_callback=False,
    fp_16=False, # True selects precision=16 in train_params below
    opt_level='O1', # apex AMP optimisation level (https://nvidia.github.io/apex/amp.html#opt-levels-and-properties); not read by train_params
    max_grad_norm=1.0, # gradient clipping value; 0.5 is a good default with 16-bit training
    seed=42,
    max_len= 512
)
args = argparse.Namespace(**args_dict)

# Checkpoint callback: keep the five checkpoints with the lowest "val_loss".
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=args.output_dir, filename="checkpoint-{epoch:02d}-{val_loss:.2f}", monitor="val_loss", mode="min", save_top_k=5
)

# Train params passed to pl.Trainer.
train_params = dict(
    max_epochs=args.num_train_epochs,
    devices=args.n_gpu,
    accelerator="gpu",
    precision=16 if args.fp_16 else 32,
    # Without this the configured gradient accumulation is never applied.
    accumulate_grad_batches=args.gradient_accumulation_steps,
    gradient_clip_val=args.max_grad_norm,
    callbacks=[LoggingCallback(), checkpoint_callback],
)

In [13]:
def get_dataset(tokenizer, type_path, args):
    """Build the MultitudeDataset for the split named by `type_path`.

    The CSV location and the input length come from `args.data_path` and
    `args.max_len`.
    """
    dataset_kwargs = dict(
        tokenizer=tokenizer,
        data_path=args.data_path,
        max_len=args.max_len,
        split=type_path,
    )
    return MultitudeDataset(**dataset_kwargs)

In [14]:
# Instantiate the LightningModule; its constructor loads the pretrained model
# and tokenizer named in `args`.
model = T5FineTuner(args)

In [None]:
# Create the Lightning trainer from the keyword arguments collected in train_params.
trainer = pl.Trainer(**train_params)

In [None]:
# Start fine-tuning; the dataloaders are supplied by the module's
# train_dataloader() / val_dataloader() methods.
trainer.fit(model)