## Projeto final da disciplina IA376J

#### Treinamento de um modelo T5 Base no dataset DocVQA

In [None]:
# Install the notebook's third-party dependencies (Colab shell commands).
# NOTE(review): pytorch-lightning and ftfy are installed without a version pin,
# while transformers and neptune-client are pinned. The Trainer/ModelCheckpoint
# arguments used further down (e.g. `filepath=`, `prefix=`, `profiler=True`)
# presumably only exist in older pytorch-lightning releases — confirm the
# version this was run with and pin it here.
!pip install transformers==3.5.0
!pip install pytorch-lightning
!pip install ftfy
!pip install neptune-client==0.4.130

In [None]:
import torch

# Pick the compute device. The GPU name is only queried when CUDA is actually
# available: torch.cuda.get_device_name() raises on a CPU-only machine, which
# previously made the CPU fallback branch unusable.
if torch.cuda.is_available():
    dev = "cuda:0"
    device_name = torch.cuda.get_device_name(0)
else:
    dev = "cpu"
    device_name = "no CUDA device available"
print(dev, device_name)
device = torch.device(dev)

cuda:0 Tesla P100-PCIE-16GB


In [None]:
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
import torchvision
from torchvision import transforms, utils
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer
import neptune
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import NeptuneLogger


from random import randrange
import random
import numpy as np
import collections
import os
import glob
import json
from ftfy import fix_encoding

from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageChops
from matplotlib.pyplot import imshow
import matplotlib.pyplot as plt

### Métricas

In [None]:
def normalize_answer(s):
    """Return `s` lower-cased, with every run of whitespace collapsed to one space."""
    lowered = s.lower()
    # str.split() without arguments splits on any whitespace and drops
    # leading/trailing blanks, so re-joining yields single-space separation.
    return ' '.join(lowered.split())

def get_tokens(s):
    """Split `s` into normalized whitespace-delimited tokens; falsy input gives []."""
    return normalize_answer(s).split() if s else []

def compute_exact(a_gold, a_pred):
    """Return 1 when gold and prediction are equal after normalization, else 0."""
    gold_norm = normalize_answer(a_gold)
    pred_norm = normalize_answer(a_pred)
    return 1 if gold_norm == pred_norm else 0

def compute_f1(a_gold, a_pred):
    """Token-level F1 between a gold answer and a predicted answer.

    Both strings are tokenized with `get_tokens`. Tokens are matched as a
    multiset, so a repeated token only counts as often as it appears on both
    sides. Returns 0 or 1 (int) for the degenerate cases and a float otherwise.
    """
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)

    # If either side has no tokens, F1 is 1 only when both are empty.
    if not gold_toks or not pred_toks:
        return int(gold_toks == pred_toks)

    # Counter intersection keeps, per token, the smaller of the two counts.
    overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(overlap.values())
    if num_same == 0:
        return 0

    precision = num_same / len(pred_toks)
    recall = num_same / len(gold_toks)
    return (2 * precision * recall) / (precision + recall)

### DocVQA

In [None]:
# Mount Google Drive inside the Colab runtime; the dataset archives and the
# checkpoint directory used later live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tarfile

# Unpack the train/val/test archives from Drive into the current working
# directory (each one is expected to create a folder named after its split,
# which is where the DocVQA dataset class looks for its files).
for mode in ['train','val','test']:
    # The context manager closes each archive handle as soon as it has been
    # extracted; previously the handles were left open.
    # NOTE(review): extractall() trusts member paths inside the archive; only
    # use it on archives you produced yourself.
    with tarfile.open("/content/drive/MyDrive/OCR_checkpoints/{}.tar.gz".format(mode)) as tf:
        tf.extractall()

In [None]:
class DocVQA(Dataset):
    '''
    DocVQA question-answering dataset built on pre-computed OCR results.

    Each item pairs a question with the OCR text of its document page and one
    answer string, tokenized for a text-to-text model.

    Adapted from the dataset class implemented by Diedre.
    '''

    def __init__(self,
                 mode: str,
                 tokenizer_string: str = 't5-base', #'microsoft/layoutlm-base-uncased',
                 seq_len: int = 512,
                 no_image: bool = False):
        '''
        mode: one of train, val and test. It is also the name of the folder
            (relative to the working directory) that holds `{mode}_v1.0.json`
            and the `ocr_results/` directory.
        tokenizer_string: name passed to AutoTokenizer.from_pretrained.
        seq_len: maximum length, in tokens, of the encoded question + context.
        no_image: stored and printed only; this class never loads images.

        Indexing the dataset returns a dict with:
            input_ids, token_type_ids, attention_mask: encoded
                "Question: ... Context: ..." string, padded/truncated to seq_len.
            input_context: the un-tokenized input string.
            target: answer token ids, padded/truncated to 32 tokens.
            target_text: the answer string.
        '''
        super().__init__()
        assert mode in ["train", "val", "test"]
        with open(f"{mode}/{mode}_v1.0.json", 'r') as data_json_file:
            self.data_json = json.load(data_json_file)

        self.folder = f"{mode}"
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_string)
        self.seq_len = seq_len
        self.mode = mode
        self.no_image = no_image

        print(f"{self.mode} DocVQA folder {self.folder} tokenizer {self.tokenizer} seq_len {self.seq_len} "
              f"no_image {self.no_image}")

    def __len__(self):
        '''Number of question entries in the split.'''
        return len(self.data_json["data"])

    def __getitem__(self, i: int):
        '''Build the tokenized (question + OCR context, answer) pair for entry i.'''
        data = self.data_json["data"][i]

        # OCR results are stored one JSON file per document page.
        ID = data["ucsf_document_id"] + '_' + data["ucsf_document_page_no"]
        ocr_path = os.path.join(self.folder, "ocr_results", ID + ".json")
        with open(ocr_path, 'r') as ocr_file:
            ocr_info = json.load(ocr_file)

        # Extract the page text from the OCR output: every recognized line
        # followed by a single space, in the order the OCR file lists them.
        lines = ocr_info['recognitionResults'][0]['lines']
        input_text = ''.join(line['text'] + ' ' for line in lines)

        input_context = 'Question: ' + data['question'] + ' Context: ' + input_text
        input_tokens = self.tokenizer.encode_plus(input_context, padding='max_length', truncation=True, max_length=self.seq_len, return_tensors='pt',return_token_type_ids=True)

        if self.mode == "train":
            # Sample one of the accepted answers each time the item is drawn.
            target_text = random.choice(data["answers"])
        else:
            # Entries without answers (missing key or empty list) fall back to
            # the "NA" placeholder instead of raising.
            answers = data.get("answers") or ["NA"]
            target_text = answers[0]
        target = self.tokenizer.encode(target_text, padding='max_length', truncation=True, max_length=32, return_tensors='pt')[0]

        return {"input_ids": input_tokens["input_ids"].squeeze(),
                "token_type_ids": input_tokens["token_type_ids"].squeeze(),
                "attention_mask": input_tokens["attention_mask"].squeeze(),
                "input_context": input_context,
                "target": target,
                "target_text": target_text}

In [None]:
class T5Finetuner(pl.LightningModule):

    '''
    PyTorch Lightning module that fine-tunes T5-base to generate answers.

    Based on the implementation by Luiza Amador Pozzobon.
    '''

    def __init__(self, train_dataloader, val_dataloader, test_dataloader, params):
        '''
        train_dataloader, val_dataloader, test_dataloader: loaders returned
            unchanged by the corresponding Lightning dataloader hooks.
        params: dict of hyperparameters; this class reads 'learning_rate' and
            'seq_len' (the latter is the max_length used for generation).
        '''
        super(T5Finetuner, self).__init__()

        self.params = params

        self._train_dataloader = train_dataloader
        self._val_dataloader   = val_dataloader
        self._test_dataloader  = test_dataloader

        self.decoder = T5ForConditionalGeneration.from_pretrained('t5-base')

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        self.learning_rate = params['learning_rate']

    def forward(self, batch):
        '''
        In training mode return the loss for the batch; otherwise return the
        generated token ids.
        '''
        if self.training:
            # Targets arrive padded to a fixed length. Replace pad positions
            # with -100 so they are ignored by the loss instead of training the
            # model to emit padding.
            targets = batch["target"]
            labels = targets.masked_fill(targets == self.tokenizer.pad_token_id, -100)
            return self.decoder(input_ids=batch["input_ids"],
                                attention_mask=batch["attention_mask"],
                                labels=labels)[0]

        # Pass the attention mask so padded input positions are not attended to
        # during generation, matching what the model sees in training.
        return self.decoder.generate(input_ids=batch["input_ids"],
                                     attention_mask=batch["attention_mask"],
                                     max_length=self.params['seq_len'])

    def training_step(self, batch, batch_idx):
        loss = self(batch)
        self.log('loss', loss, on_epoch=True, prog_bar=True)
        return loss

    def _predict_step(self, batch):
        '''Generate answers for a batch and pair them with the target strings.'''
        pred_tokens = self(batch)
        # skip_special_tokens keeps pad/eos markers out of the decoded text so
        # they cannot leak into the exact-match / F1 comparison.
        decoded_pred = [fix_encoding(self.tokenizer.decode(tokens, skip_special_tokens=True))
                        for tokens in pred_tokens]
        return {"pred": decoded_pred, "target": batch["target_text"]}

    def validation_step(self, batch, batch_idx):
        return self._predict_step(batch)

    def test_step(self, batch, batch_idx):
        return self._predict_step(batch)

    def _log_epoch_metrics(self, outputs, prefix, n_samples):
        '''
        Print `n_samples` random target/prediction pairs and log the mean F1
        and exact match over all step outputs as `{prefix}_f1` / `{prefix}_exact`.
        '''
        # Flatten the per-batch targets and predictions into flat lists.
        trues = [target for out in outputs for target in out['target']]
        preds = [pred for out in outputs for pred in out['pred']]

        # Nothing to sample or average (randrange(0) would raise).
        if not trues:
            return

        for _ in range(n_samples):
            n = randrange(len(trues))
            print(f"\nSample Target: {trues[n]}\nPrediction: {preds[n]}\n")

        f1 = np.mean([compute_f1(a_gold=true, a_pred=pred) for true, pred in zip(trues, preds)])
        exact = np.mean([compute_exact(a_gold=true, a_pred=pred) for true, pred in zip(trues, preds)])

        self.log(f"{prefix}_f1", f1, prog_bar=True)
        self.log(f"{prefix}_exact", exact, prog_bar=True)

    def validation_epoch_end(self, outputs):
        self._log_epoch_metrics(outputs, prefix="val", n_samples=1)

    def test_epoch_end(self, outputs):
        self._log_epoch_metrics(outputs, prefix="test", n_samples=5)

    def configure_optimizers(self):
        return torch.optim.Adam(
            [p for p in self.parameters() if p.requires_grad],
            lr=self.learning_rate, eps=1e-08)

    def train_dataloader(self):
        return self._train_dataloader

    def val_dataloader(self):
        return self._val_dataloader

    def test_dataloader(self):
        return self._test_dataloader

In [None]:
# Hyperparameters, exposed as Colab form fields through the `#@param` markers.
learning_rate =  5e-5#@param {type: "number"} 5e-5
batch_size =    4#@param {type: "integer"} 32
# Stored below as params['seq_len'], which T5Finetuner.forward passes to
# generate() as max_length. It is separate from the 512-token input length
# used when the datasets are built.
sequence =  32#@param {type: "integer"}
patience =  5#@param {type: "integer"}
max_epochs =  5#@param {type: "integer"}


# Single dictionary handed to both the model and the Neptune logger.
params = {    
    'batch_size': batch_size,
    'seq_len': sequence,
    'learning_rate': learning_rate,
    'max_epochs': max_epochs,
    'patience': patience,
    'monitor_variable': 'val_f1'
}

In [None]:
tokenizer_string = 't5-base'
seq_len = 512

# Build the three datasets in train / val / test order.
# NOTE(review): `test` is built from the 'val' split as well — presumably
# because the official test split has no public answers; confirm.
train, val, test = [DocVQA(split, tokenizer_string, seq_len)
                    for split in ('train', 'val', 'val')]

# Options shared by every loader; only the training loader shuffles.
loader_options = dict(batch_size=params['batch_size'], num_workers=4)

train_loader = DataLoader(train, shuffle=True, **loader_options)
val_loader = DataLoader(val, **loader_options)
test_loader = DataLoader(test, **loader_options)

print('Seq_len = ', seq_len)

train DocVQA folder train tokenizer PreTrainedTokenizer(name_or_path='t5-base', vocab_size=32100, model_max_len=512, is_fast=False, padding_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>

In [None]:
# Checkpoint to resume from, if it exists on Drive.
# Earlier experiment (variable font type): img2text-teste_2-epoch=19-val_exact=0.25
checkpoint_path = '/content/drive/MyDrive/OCR_checkpoints/DocVQA-T5-epoch=4-val_f1=0.18-val_exact=0.11.ckpt'

checkpoint_dir = os.path.dirname(os.path.abspath(checkpoint_path))
print(f'Files in {checkpoint_dir}: {os.listdir(checkpoint_dir)}')
print(f'Saving checkpoints to {checkpoint_dir}')


checkpoint_callback = ModelCheckpoint(prefix="DocVQA-T5",
                                      filepath="/content/drive/MyDrive/OCR_checkpoints/{epoch}-{val_f1:.2f}-{val_exact:.2f}",
                                      save_top_k=-1)  # -1 = Keeps all checkpoints, 1 = save best

resume_from_checkpoint = None
if os.path.exists(checkpoint_path):
    print(f'Restoring checkpoint: {checkpoint_path}')
    resume_from_checkpoint = checkpoint_path

# Early stopping is configured here but is not passed to the Trainer below
# (callbacks=None), so it currently has no effect.
callbacks = [pl.callbacks.EarlyStopping(monitor=params['monitor_variable'],
                                        patience=params["patience"],
                                        mode='max')]

# Log results to Neptune.
# The API token is read from the NEPTUNE_API_TOKEN environment variable so the
# secret never lives in the notebook. The token that used to be embedded here
# has been exposed and should be revoked in the Neptune account settings.
neptune_logger = pl.loggers.neptune.NeptuneLogger(
    api_key=os.environ.get('NEPTUNE_API_TOKEN'),
    project_name='guilhermemr04/sandbox',
    params=params,)


trainer = pl.Trainer(gpus=1,
                     logger=neptune_logger,
                     precision=32,
                     log_gpu_memory=True,
                     # Taken from the hyperparameter dict so the value logged
                     # to Neptune is the one actually used for training.
                     max_epochs=params['max_epochs'],
                     check_val_every_n_epoch=1,
                     profiler=True,
                     callbacks=None,  # pass `callbacks` to enable early stopping
                     accumulate_grad_batches=8,
                     checkpoint_callback= checkpoint_callback,
                     progress_bar_refresh_rate=100,
                     resume_from_checkpoint=resume_from_checkpoint)


model = T5Finetuner(train_dataloader=train_loader,
                    val_dataloader=val_loader,
                    test_dataloader=test_loader,
                    params=params)

Files in /content/drive/MyDrive/OCR_checkpoints: ['img2text-basic-epoch=0-val_exact=0.59.ckpt', 'img2text-basic-epoch=0-val_exact=0.70.ckpt', 'img2text-intermediate-epoch=8-val_exact=0.36.ckpt', 'img2text-final-epoch=47-val_exact=0.37.ckpt', 'img2text-final-epoch=74-val_exact=0.44.ckpt', 'img2text-Nota_Fiscal-epoch=88-val_exact=0.27.ckpt', 'img2text-Nota_Fiscal-epoch=95-val_exact=0.27.ckpt', 'img2text-teste-epoch=13-val_exact=0.33.ckpt', 'img2text-teste_2-epoch=19-val_exact=0.25.ckpt', 'img2text-teste_2-epoch=21-val_exact=0.28.ckpt', 'img2text-teste_2-epoch=22-val_exact=0.30.ckpt', 'img2text-teste_3-epoch=31-val_exact=0.10.ckpt', 'img2text-teste_3-epoch=32-val_exact=0.09.ckpt', 'img2text-SROIE-epoch=43-val_exact=0.22.ckpt', 'img2text-SROIE-epoch=51-val_exact=0.22.ckpt', 'img2text-SROIE_v2-epoch=40-val_exact=0.43.ckpt', 'img2text-SROIE_v2-epoch=47-val_exact=0.42.ckpt', 'img2text-SROIE_v3-epoch=41-val_exact=0.44.ckpt', 'img2text-SROIE_v3-epoch=46-val_exact=0.46.ckpt', 'test.tar.gz', 'val

NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


In [None]:
# Run fine-tuning. No loaders are passed here: the model supplies them through
# its train_dataloader()/val_dataloader() hooks.
trainer.fit(model)


  | Name    | Type                       | Params
-------------------------------------------------------
0 | decoder | T5ForConditionalGeneration | 222 M 
-------------------------------------------------------
222 M     Trainable params
0         Non-trainable params
222 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…


Sample Target: Paul
Prediction: False



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Sample Target: FIRST AMERICAN NATIONAL BANK
Prediction: First American National Bank



Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Sample Target: YEAR
Prediction: % of calories as Sat



Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Sample Target: 209.54
Prediction: 208.81



Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Sample Target: Smoking too much
Prediction: Maintaining right weight



Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…


Sample Target: 1957
Prediction: 1954



Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.




Failed to send channel value.
Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/channels/channels_values_sender.py", line 156, in _send_values
    self._experiment._send_channels_values(channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/experiments.py", line 1167, in _send_channels_values
    self._backend.send_channels_values(self, channels_with_values)
  File "/usr/local/lib/python3.6/dist-packages/neptune/utils.py", line 211, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/neptune/internal/backends/hosted_neptune_backend.py", line 571, in send_channels_values
    raise ChannelsValuesSendBatchError(experiment.id, batch_errors)
neptune.api_exceptions.ChannelsValuesSendBatchError: Received batch errors sending channels' values to experiment SAN-4. Cause: Error(code=400, message='X-coordinates must be strictly increasing for channel: d8721e50-dae0-49fe-8ec9-dfa74cdcc7f2.

1

In [None]:
# Evaluate with the loader returned by model.test_dataloader(); in this
# notebook that loader was built from the 'val' split.
trainer.test(model)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


Sample Target: 112,000
Prediction: 45,000


Sample Target: 209
Prediction: 209


Sample Target: 2-inch
Prediction: 1-inch heels


Sample Target: Optimized diametrical clearance promotes true fluid film lubrication
Prediction: OPTIMIZED DIAMETRICAL CLEARANCE


Sample Target: Al-Adil, K.M
Prediction: Abbas, H.


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_exact': 0.5049541970461768,
 'test_f1': 0.5887844795574804,
 'val_exact': 0.5049541970461768,
 'val_f1': 0.5887844795574804}
--------------------------------------------------------------------------------


[{'test_exact': 0.5049541970461768,
  'test_f1': 0.5887844795574804,
  'val_exact': 0.5049541970461768,
  'val_f1': 0.5887844795574804}]