<a href="https://colab.research.google.com/github/edmarRod/autowiki/blob/main/trec_car_training_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get dataset from drive

In [None]:
from google.colab import drive
import json

# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')
# Project folder, given relative to the current working directory.
path = r"drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki/"

# JSON text is UTF-8; state the encoding explicitly instead of relying on the
# platform default.
with open(path + 'train.json', 'r', encoding='utf-8') as f:
  data = json.load(f)

Mounted at /content/drive


# Transform to training dataset

In [None]:
# Build (source, target) text pairs from the raw JSON mapping:
#   source = key + ' [SEP] ' + val['abstract']
#   target = the entries of val['sections'] joined with ' [SEP] '
dataset = []
skipped = 0
for key, val in data.items():
  try:
    x = key + ' [SEP] ' + val['abstract']
    y = ' [SEP] '.join(val['sections'])
  except (KeyError, TypeError):
    # Entry lacks 'abstract'/'sections' or holds non-string values: skip it,
    # but only for these expected data problems (no bare `except`).
    skipped += 1
    continue
  if y == '':
    # Nothing to predict for this entry.
    skipped += 1
    continue
  dataset.append((x, y))

# Make the amount of dropped data visible instead of discarding it silently.
print(f'Kept {len(dataset)} examples, skipped {skipped} malformed or empty entries')

# Training

In [None]:
! pip install sacrebleu
! pip install pytorch-lightning
! pip install transformers
! pip install sentencepiece

Collecting sacrebleu
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[?25l[K     |███▋                            | 10 kB 25.6 MB/s eta 0:00:01[K     |███████▏                        | 20 kB 25.1 MB/s eta 0:00:01[K     |██████████▉                     | 30 kB 16.5 MB/s eta 0:00:01[K     |██████████████▍                 | 40 kB 14.4 MB/s eta 0:00:01[K     |██████████████████              | 51 kB 8.6 MB/s eta 0:00:01[K     |█████████████████████▋          | 61 kB 9.0 MB/s eta 0:00:01[K     |█████████████████████████▎      | 71 kB 8.6 MB/s eta 0:00:01[K     |████████████████████████████▉   | 81 kB 9.6 MB/s eta 0:00:01[K     |████████████████████████████████| 90 kB 4.8 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting portalocker
  Downloading portalocker-2.3.2-py2.py3-none-any.whl (15 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.4 portalocker-2.3.2 sacrebleu-2.

In [None]:
# Import every package in one place to avoid duplicate imports throughout the notebook.
import gzip
import nvidia_smi
import os
import pytorch_lightning as pl
import random
import sacrebleu
import torch
import torch.nn.functional as F

from google.colab import drive

from pytorch_lightning.callbacks import ModelCheckpoint

from torch.utils.data import DataLoader
from torch.utils.data import Dataset

from typing import Dict
from typing import List
from typing import Tuple

In [None]:
# Important: Fix seeds so we can replicate results
seed = 123
# Apply the same seed to Python's RNG and to torch's CPU and CUDA generators.
# np.random.seed(seed)
for _seed_fn in (random.seed, torch.random.manual_seed, torch.cuda.manual_seed):
    _seed_fn(seed)

In [None]:
# Report the Lightning version and the GPU this session is attached to.
print(f"Pytorch Lightning Version: {pl.__version__}")
nvidia_smi.nvmlInit()
# Handle for GPU index 0; gpu_usage() reads it as a module-level name.
handle = nvidia_smi.nvmlDeviceGetHandleByIndex(0)
print(f"Device name: {nvidia_smi.nvmlDeviceGetName(handle)}")

def gpu_usage():
    """Return the current utilization of the GPU behind `handle` as text, e.g. '37%'."""
    utilization = nvidia_smi.nvmlDeviceGetUtilizationRates(handle)
    return f"{utilization.gpu}%"

Pytorch Lightning Version: 1.5.2
Device name: b'Tesla P100-PCIE-16GB'


In [None]:
# General configuration
model_name = "t5-small"  # checkpoint name passed to from_pretrained()
batch_size = 64  # examples per DataLoader batch
accumulate_grad_batches = 2  # forwarded to pl.Trainer(accumulate_grad_batches=...)
source_max_length = 128  # tokenizer max_length for source text (truncation + padding)
target_max_length = 128  # tokenizer max_length for target text; also generate()'s max_length
learning_rate = 1e-3  # learning rate given to the Adam optimizer

In [None]:
# Shuffle the pairs in place, then keep the first 80% for training and the
# remaining 20% for validation.
random.shuffle(dataset)

train_len = int(len(dataset)*.8)
x_train, x_val = dataset[:train_len], dataset[train_len:]

In [None]:
from transformers import T5Tokenizer
# Load the tokenizer that belongs to the `model_name` checkpoint; the dataset
# cells below pass it to MyDataset.
tokenizer = T5Tokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/773k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.32M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.17k [00:00<?, ?B/s]

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that tokenizes (source, target) text pairs on access.

    Every item is truncated and padded to a fixed length, so items can be
    stacked into batches by the default DataLoader collation.

    Args:
        text_pairs: list of (source_text, target_text) tuples.
        tokenizer: callable invoked as
            ``tokenizer(text, truncation=True, padding='max_length',
            max_length=..., return_tensors='pt')``; its result must provide
            'input_ids' and 'attention_mask' tensors with a leading axis of
            size 1 (removed here with ``squeeze(0)``).
        source_max_length: max_length used for the source text.
        target_max_length: max_length used for the target text.
    """

    def __init__(self, text_pairs: List[Tuple[str, str]], tokenizer,
                 source_max_length: int = 32, target_max_length: int = 32):
        self.tokenizer = tokenizer
        self.text_pairs = text_pairs
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length

    def __len__(self) -> int:
        """Number of text pairs."""
        return len(self.text_pairs)

    def _encode(self, text: str, max_length: int):
        """Tokenize one string and return (token_ids, attention_mask) without the leading axis."""
        encoded = self.tokenizer(text, truncation=True, padding='max_length',
                                 max_length=max_length, return_tensors='pt')
        return encoded['input_ids'].squeeze(0), encoded['attention_mask'].squeeze(0)

    def __getitem__(self, idx):
        """Return (source_ids, source_mask, target_ids, target_mask, source_text, target_text)."""
        source, target = self.text_pairs[idx]

        source_token_ids, source_mask = self._encode(source, self.source_max_length)
        target_token_ids, target_mask = self._encode(target, self.target_max_length)

        # The raw strings travel with the tensors so callers can score or
        # display predictions against the untokenized text.
        return (source_token_ids, source_mask, target_token_ids, target_mask,
                source, target)

In [None]:
# Smoke test: push a single hand-written pair through MyDataset and a
# DataLoader, then print the resulting tensors and their shapes.
text_pairs = [('we like pizza', 'eu gosto de pizza')]
dataset_debug = MyDataset(text_pairs=text_pairs, tokenizer=tokenizer,
                          source_max_length=source_max_length,
                          target_max_length=target_max_length)
dataloader_debug = DataLoader(dataset_debug, batch_size=10, shuffle=True,
                              num_workers=0)

first_batch = next(iter(dataloader_debug))
source_token_ids, source_mask, target_token_ids, target_mask, _, _ = first_batch

debug_tensors = {
    'source_token_ids': source_token_ids,
    'source_mask': source_mask,
    'target_token_ids': target_token_ids,
    'target_mask': target_mask,
}

# Values first, shapes second -- the same order as separate print calls.
for label, tensor in debug_tensors.items():
    print(f'{label}:\n', tensor)

for label, tensor in debug_tensors.items():
    print(f'{label}.shape:', tensor.shape)

source_token_ids:
 tensor([[  62,  114, 6871,    1,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]])
source_mask:
 tensor([[1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0,

In [None]:
# Tokenizer and length settings shared by the train and validation datasets.
dataset_kwargs = dict(tokenizer=tokenizer,
                      source_max_length=source_max_length,
                      target_max_length=target_max_length)

dataset_train = MyDataset(text_pairs=x_train, **dataset_kwargs)
dataset_val = MyDataset(text_pairs=x_val, **dataset_kwargs)

# Only the training loader shuffles; validation keeps the dataset order.
train_dataloader = DataLoader(dataset_train, batch_size=batch_size,
                              shuffle=True, num_workers=0)
val_dataloader = DataLoader(dataset_val, batch_size=batch_size,
                            shuffle=False, num_workers=0)

#TODO change to real test
# No separate test split exists yet, so the validation loader doubles as the test loader.
test_dataloader = val_dataloader

In [None]:
from sacrebleu.metrics import BLEU

from sklearn.metrics import f1_score
import numpy as np


def compute_f1_score(predicted_list: list, target_list: list) -> float:
    """
    Mean macro F1 between predicted and target '[SEP]'-separated sequences.

    Each sequence is split on '[SEP]' and its parts are stripped; the shorter
    side of every pair is padded with empty strings so both sides have the
    same number of parts before scoring.
    :param predicted_list: list of predicted sequence
    :param target_list: list of target sequence
    :return: f1_score averaged over all pairs
    """

    def _split_parts(sequence):
        # One stripped entry per '[SEP]'-delimited part.
        return [part.strip() for part in sequence.split('[SEP]')]

    per_pair_scores = []
    for predicted_seq, target_seq in zip(predicted_list, target_list):
        predicted_parts = _split_parts(predicted_seq)
        target_parts = _split_parts(target_seq)

        # Pad whichever side is shorter; the multiplier is 0 for the longer side.
        longest = max(len(predicted_parts), len(target_parts))
        predicted_parts.extend([""] * (longest - len(predicted_parts)))
        target_parts.extend([""] * (longest - len(target_parts)))

        per_pair_scores.append(f1_score(target_parts, predicted_parts, average='macro'))

    return np.array(per_pair_scores).mean()

class T5Finetuner(pl.LightningModule):
    """LightningModule that fine-tunes a pretrained T5 checkpoint on text pairs.

    Training minimizes the loss returned by the wrapped model. Validation and
    test generate text, score it with sacreBLEU per batch and also report the
    loss. Example predictions are collected in a W&B table.

    Args:
        model_name: checkpoint name; must contain 't5' (or 'bert-', see the
            review note in ``__init__``).
        learning_rate: learning rate for Adam.
        source_max_length: max_length used when tokenizing source text.
        target_max_length: max_length used when tokenizing target text and
            when generating.
        batch_size: batch size of the dataloaders built by this module.

    The class reads these module-level names from the notebook: ``wandb``,
    ``BLEU``, ``gpu_usage``, ``MyDataset``, ``DataLoader``, ``x_train`` and
    ``x_val``.
    """

    def __init__(self, model_name, learning_rate, source_max_length, target_max_length, batch_size):
        super().__init__()

        if 't5' in model_name:
            from transformers import T5ForConditionalGeneration, T5Tokenizer
            model = T5ForConditionalGeneration.from_pretrained(model_name)
            tokenizer = T5Tokenizer.from_pretrained(model_name)
        elif 'bert-' in model_name:
            # NOTE(review): this loads a TensorFlow pre-training model, while the
            # rest of the class drives `self.model` through a torch optimizer,
            # `labels=` and `generate()`. The branch looks unusable as written --
            # confirm before relying on it.
            from transformers import BertTokenizer, TFBertForPreTraining
            tokenizer = BertTokenizer.from_pretrained(model_name)
            model = TFBertForPreTraining.from_pretrained(model_name)
        else:
            raise NotImplementedError()

        self.learning_rate = learning_rate
        self.source_max_length = source_max_length
        self.target_max_length = target_max_length
        self.batch_size = batch_size
        self.model_name = model_name
        self.tokenizer = tokenizer
        self.model = model

        self.bleu = BLEU()

        # When True, get_bleu() may add example predictions to the W&B table.
        self.log_examples = True

        self.save_hyperparameters()

        self.wandb_table = wandb.Table(columns = ['Epoch', 'Source', 'Target', 'Predicted'])

    @staticmethod
    def _mask_padding(target_token_ids, target_mask):
        """Return labels whose padded positions are -100 so the loss skips them.

        Without this, every padding token up to the fixed target length counts
        as a label and contributes to the loss. If no mask is given the ids are
        returned unchanged.
        """
        if target_mask is None:
            return target_token_ids
        return target_token_ids.masked_fill(target_mask == 0, -100)

    def forward(self, source_token_ids, source_mask, target_token_ids=None,
                target_mask=None):
        """Return the training loss in train mode, generated token ids otherwise."""
        if self.training:
            labels = self._mask_padding(target_token_ids, target_mask)
            return self.model(input_ids=source_token_ids,
                              attention_mask=source_mask,
                              labels=labels).loss

        return self.model.generate(input_ids=source_token_ids,
                                   attention_mask=source_mask,
                                   max_length=self.target_max_length)

    def training_step(self, batch, batch_nb):
        """Compute and log the loss for one training batch."""
        source_token_ids, source_mask, target_token_ids, target_mask, _, _ = batch

        loss = self(source_token_ids, source_mask, target_token_ids, target_mask)

        self.log('train_loss', loss.detach(), on_step=True, on_epoch=True, logger=True)
        # Publish GPU utilization through self.log rather than the legacy
        # 'log' / 'progress_bar' keys of the step result. gpu_usage() returns
        # text such as '37%'; self.log needs a number. Shown in the progress
        # bar only, so the logged metric set is unchanged.
        self.log('gpu_usage', float(gpu_usage().rstrip('%')), prog_bar=True, logger=False)

        return {'loss': loss}

    def validation_step(self, batch, batch_nb):
        """Return BLEU and loss for one validation batch (examples are recorded)."""
        avg_bleu = self.get_bleu(batch, batch_nb, True)
        loss = self.get_loss(batch, batch_nb)
        return {'val_bleu': avg_bleu, 'val_loss': loss}

    def test_step(self, batch, batch_nb):
        """Return BLEU and loss for one test batch (no examples are recorded)."""
        avg_bleu = self.get_bleu(batch, batch_nb, False)
        loss = self.get_loss(batch, batch_nb)
        return {'test_bleu': avg_bleu, 'test_loss': loss}

    def get_bleu(self, batch, batch_nb, is_test):
        """Generate predictions for `batch` and return their sacreBLEU score.

        `is_test` gates example logging. NOTE: despite its name,
        validation_step passes True and test_step passes False, so examples are
        recorded during validation. Expects the module to be in eval mode so
        that calling `self(...)` generates instead of returning a loss.
        """
        source_token_ids, source_mask, target_token_ids, target_mask, original_source, original_target = batch

        generated_ids = self(source_token_ids, source_mask, target_token_ids, target_mask)

        output_seq = self.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

        # One reference stream holding one reference per hypothesis.
        avg_bleu = self.bleu.corpus_score(output_seq, [original_target]).score

        # Plain boolean `and`; both operands are flags, not bit masks.
        if self.log_examples and is_test:
            self.wandb_table.add_data(self.current_epoch, original_source[:1], original_target[:1], output_seq[:1])

        return avg_bleu

    def get_loss(self, batch, batch_nb):
        """Return the model loss for `batch`, ignoring padded label positions."""
        source_token_ids, source_mask, target_token_ids, target_mask, original_source, original_target = batch

        labels = self._mask_padding(target_token_ids, target_mask)
        return self.model(input_ids=source_token_ids,
                          attention_mask=source_mask,
                          labels=labels).loss

    def validation_epoch_end(self, outputs):
        """Log the mean of the per-batch validation BLEU and loss."""
        avg_bleu = sum([x['val_bleu'] for x in outputs]) / len(outputs)
        avg_loss = sum([x['val_loss'] for x in outputs]) / len(outputs)

        self.log("avg_val_bleu", avg_bleu, prog_bar=True)
        self.log("avg_val_loss", avg_loss.detach(), prog_bar=True)

    def test_epoch_end(self, outputs):
        """Log the mean per-batch test BLEU and loss, then upload the example table."""
        avg_bleu = sum([x['test_bleu'] for x in outputs]) / len(outputs)
        avg_loss = sum([x['test_loss'] for x in outputs]) / len(outputs)

        self.log("avg_test_bleu", avg_bleu, prog_bar=True)
        self.log("avg_test_loss", avg_loss.detach(), prog_bar=True)

        wandb.log({'validation_samples' : self.wandb_table})

    def configure_optimizers(self):
        """Adam over the trainable parameters plus a StepLR that leaves the LR unchanged."""
        optimizer = torch.optim.Adam(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)

        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=1.0)  # This is the same as no LR decay.

        return {'optimizer': optimizer, 'lr_scheduler': scheduler, 'monitor': 'avg_val_bleu'}

    def _make_dataloader(self, text_pairs, shuffle):
        """Wrap `text_pairs` in a MyDataset and a DataLoader using this module's settings."""
        #TODO place dataset into own module
        dataset = MyDataset(text_pairs=text_pairs,
                            tokenizer=self.tokenizer,
                            source_max_length=self.source_max_length,
                            target_max_length=self.target_max_length)
        return DataLoader(dataset, batch_size=self.batch_size,
                          shuffle=shuffle, num_workers=0)

    def train_dataloader(self):
        """Shuffled loader over the module-level `x_train` pairs."""
        return self._make_dataloader(x_train, shuffle=True)

    def val_dataloader(self):
        """Ordered loader over the module-level `x_val` pairs."""
        return self._make_dataloader(x_val, shuffle=False)

    def test_dataloader(self):
        """Ordered loader over `x_val`; stands in for a test set for now."""
        # TODO change to real test
        return self._make_dataloader(x_val, shuffle=False)

# Wandb setup

In [None]:
!pip install wandb
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33medrod[0m (use `wandb login --relogin` to force relogin)


In [None]:
from pytorch_lightning.loggers import WandbLogger
import wandb

# Lightning logger targeting the "autowiki" W&B project.
# NOTE(review): the training cell further down assigns `wandb_logger` again with
# the same arguments, so this instance is replaced before the Trainer is built.
wandb_logger = WandbLogger(project="autowiki", log_model="all")

In [None]:
# model = T5Finetuner(model_name=model_name,
#                     learning_rate=learning_rate, 
#                     source_max_length=source_max_length,
#                     target_max_length=target_max_length,
#                     batch_size=batch_size)

# trainer = pl.Trainer(gpus=1,
#                      precision=16, 
#                      checkpoint_callback=False,  # Disable checkpoint saving.
#                      fast_dev_run=False)

# trainer.fit(model)
# trainer.test(model)
# del model  # Avoid running out of GPU memory

In [None]:
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import LearningRateMonitor

max_epochs = 100

# Callbacks: log the learning rate every step and stop once avg_val_loss has
# not improved for 3 validation runs.
lr_monitor = LearningRateMonitor(logging_interval='step')
early_monitor = EarlyStopping(monitor="avg_val_loss", min_delta=0.00, patience=3, mode="min")
wandb_logger = WandbLogger(project="autowiki", log_model="all")

# ModelCheckpoint names its top-k files "epoch=N-step=M.ckpt", so a fixed
# "checkpoints.ckpt" path never exists and the resume branch below could never
# trigger. With save_last=True the callback also keeps "last.ckpt" in the same
# directory, which gives a stable file to resume from.
checkpoint_path = r"drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki/last.ckpt"
checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
os.makedirs(checkpoint_dir, exist_ok=True)
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')
checkpoint_callback = ModelCheckpoint(dirpath=checkpoint_dir,
                                      save_top_k=3, monitor='avg_val_loss', mode='min',
                                      save_last=True)

# Resume only when an earlier run left a "last.ckpt" behind.
resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path

trainer = pl.Trainer(gpus=1,
                     precision=16,
                     max_epochs=max_epochs,
                     check_val_every_n_epoch=1,
                     accumulate_grad_batches=accumulate_grad_batches,
                     callbacks=[checkpoint_callback,lr_monitor,early_monitor],
                     progress_bar_refresh_rate=50,
                     resume_from_checkpoint=resume_from_checkpoint,
                     logger=wandb_logger)

model = T5Finetuner(model_name=model_name,
                    learning_rate=learning_rate, 
                    source_max_length=source_max_length,
                    target_max_length=target_max_length,
                    batch_size=batch_size)

# The model supplies its own train/val dataloaders, so none are passed here.
trainer.fit(model)

Files in /content/drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki: ['sample.json', 'train.json', 'epoch=1-step=2223.ckpt', 'epoch=2-step=3335.ckpt', 'epoch=3-step=4447.ckpt']
Saving checkpoints to /content/drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki


Using 16bit native Automatic Mixed Precision (AMP)
  f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs


Downloading:   0%|          | 0.00/231M [00:00<?, ?B/s]

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[34m[1mwandb[0m: Currently logged in as: [33medrod[0m (use `wandb login --relogin` to force relogin)



  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
121.013   Total estimated model params size (MB)
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Validation sanity check: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Training: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Validating: 0it [00:00, ?it/s]

  "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"


Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

Validating: 0it [00:00, ?it/s]

In [None]:
# Run the test loop with the weights of the best checkpoint kept by the
# ModelCheckpoint callback (it monitors 'avg_val_loss').
trainer.test(ckpt_path='best')

Restoring states from the checkpoint path at /content/drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki/epoch=6-step=7783.ckpt
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Loaded model weights from checkpoint at /content/drive/MyDrive/Unicamp/Pos/ia376e_2021S2/autowiki/epoch=6-step=7783.ckpt


Testing: 0it [00:00, ?it/s]

--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'avg_test_bleu': 25.85979652404785, 'avg_test_loss': 0.2724027633666992}
--------------------------------------------------------------------------------


[{'avg_test_bleu': 25.85979652404785, 'avg_test_loss': 0.2724027633666992}]

In [None]:
# End the active W&B run; the cell output shows the upload and run summary.
wandb.finish()

VBox(children=(Label(value=' 4859.12MB of 4859.12MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, ma…

0,1
avg_test_bleu,▁
avg_test_loss,▁
avg_val_bleu,▄▆▇▇▇▇█▁▁▁
avg_val_loss,▂▁▁▁▁▁▁███
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▆▆▆▆▆▆▆▆▇▇▇▇████
lr-Adam,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss_epoch,█▃▂▁▁
train_loss_step,██▅▄▅▆▄▄▅▅▃▂▅▅▄▃▄▅▂▂▄▁▁▁▂▂▄▃▃▅▃▃
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███

0,1
avg_test_bleu,25.8598
avg_test_loss,0.2724
avg_val_bleu,3.21473
avg_val_loss,0.60397
epoch,7.0
lr-Adam,0.001
train_loss_epoch,
train_loss_step,
trainer/global_step,7784.0
