<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/DQVA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!nvidia-smi

Wed Jan  6 09:39:02 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    23W / 300W |     10MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [13]:
!tar -xf /content/drive/MyDrive/Colab\ Notebooks/Final-project/train.tar.gz 
!tar -xf /content/drive/MyDrive/Colab\ Notebooks/Final-project/val.tar.gz 
!tar -xf /content/drive/MyDrive/Colab\ Notebooks/Final-project/test.tar.gz

In [7]:
!pip install -q "transformers<4.0.0"
!pip install -q sentence_transformers

In [15]:
# Python / Básics
import os
import gc
import glob
import time
import json
import random
import logging
import numpy as np
import collections
import seaborn as sns
from pathlib import Path
import matplotlib.pyplot as plt

# PIL 
from PIL import Image

# Torch
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import transforms
from collections import OrderedDict
import torchvision.models as models
from torch.utils.data import DataLoader, Dataset

# Transformers / Sentence Transformers
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from sentence_transformers import SentenceTransformer

In [14]:
# ==================================== #
# === Função que carregas as seeds === #
# ==================================== #

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
manual_seed = 2357 # only primers ;)

def deterministic(rep=True):
    if rep:
        np.random.seed(manual_seed)
        torch.manual_seed(manual_seed)
        if torch.cuda.is_available():
            torch.cuda.manual_seed(manual_seed)
            torch.cuda.manual_seed_all(manual_seed)
        torch.backends.cudnn.enabled = False 
        torch.backends.cudnn.benchmark = False
        torch.backends.cudnn.deterministic = True
        print(f'Experimento deterministico, seed: {manual_seed} -- ', end = '')
        print(f'Existe {torch.cuda.device_count()} GPU\
 {torch.cuda.get_device_name(0)} disponível.')
    else:
        print('Experimento randomico')
deterministic()        

Experimento deterministico, seed: 2357 -- Existe 1 GPU Tesla V100-SXM2-16GB disponível.


In [21]:
# ======================================================= #
# === Função que cria embeddings para OCR e Perguntas === #
# ======================================================= #

def create_embeddings(model, path_json, mode='ocr', phase='val', thrs=48, emb_size=512):
    if mode == 'ocr':
        
        with open(path_json, 'rb') as handle:
            dataset = json.loads(handle.read())

        ocrs_list = []
        for i, d in enumerate(dataset['data']):
            ocr_file = d['image'].replace('documents', 'ocr_results').replace('.png', '.json')
            with open(phase+'/'+ ocr_file, 'rb') as f:
                ocr = json.loads(f.read())

            lines = ocr['recognitionResults'][0]['lines']

            text = ' '.join([w['text'] for l in lines for w in l['words']])
            ocrs_list.append(lines)

        doc_list_item = []
        Embeddings = np.zeros((len(ocrs_list), thrs, emb_size), dtype=np.float32)
        for i, doc in enumerate(ocrs_list):
            test_list_item = []
            for item in doc:
                test_list_item.append(item['text'])


            if len(test_list_item) > thrs:          
                test_list_item = test_list_item[:thrs]
                t = model.encode(test_list_item)
            else:
                t = model.encode(test_list_item)
            if t.size == 0:
                continue
            Embeddings[i, :t.shape[0], :] = t[:,:]

        return Embeddings
    
    else:
        with open(path_json, 'rb') as handle:
            dataset = json.loads(handle.read())
        
        questions = []
        for d in dataset['data']:
            questions.append(d['question'])

        Embeddings_q = np.zeros((len(questions), thrs, emb_size), dtype=np.float32)

        for i, q in enumerate(questions):
            q_emb = model.encode(q)
            Embeddings_q[i, 0, :] = q_emb
        return  Embeddings_q           

# ---------------------------------------------------------------------------------------
model = SentenceTransformer('msmarco-distilroberta-base-v2')
emb_size = model.get_sentence_embedding_dimension()
model.max_seq_length = 256
path_train = 'train/train_v1.0.json'
path_val   = 'val/val_v1.0.json'

start = time. time()
Embeddings_OCR_train = create_embeddings(model, path_train, mode='ocr', phase='train', thrs=48, emb_size=emb_size)
end = time. time()
print(f'Embeddings_OCR_train:       {Embeddings_OCR_train.shape} -- Time: {(end - start)/60: .4}min')

start = time. time()
Embeddings_Q_train = create_embeddings(model, path_train, mode='q', phase='train', thrs=1, emb_size=emb_size)
end = time. time()
print(f'Embeddings_Questions_train: {Embeddings_Q_train.shape}  -- Time: {(end - start)/60: .4}min')

start = time. time()
Embeddings_OCR_val = create_embeddings(model, path_val, mode='ocr', phase='val', thrs=48, emb_size=emb_size)
end = time. time()
print(f'Embeddings_OCR_val:         {Embeddings_OCR_val.shape}  -- Time: {(end - start)/60: .4}min')

start = time. time()
Embeddings_Q_val = create_embeddings(model, path_val, mode='q', phase='val', thrs=1, emb_size=emb_size)
end = time. time()
print(f'Embeddings_Questions_val:   {Embeddings_Q_val.shape}   -- Time: {(end - start)/60: .4}min')

Embeddings_OCR_train:       (39463, 48, 768) -- Time: 790.7727513313293
Embeddings_Questions_train: (39463, 1, 768) -- Time: 351.23548913002014
Embeddings_OCR_val:         (5349, 48, 768) -- Time: 107.45399713516235
Embeddings_Questions_val:   (5349, 1, 768) -- Time: 47.866716623306274


In [29]:
# ====================== #
# === Classe Dataset === #
# ====================== #

class DVQADataset(Dataset):
    def __init__(self, path_dir, path_data, tokenizer, embeddings_ocr, embeddings_questions, target_max_len, transform):
        super().__init__()

        with open(path_data, 'rb') as handle:
            dataset = json.loads(handle.read())
        
        self.data_dir = path_dir
        self.data = dataset['data']

        self.tokenizer = tokenizer
        self.embeddings_ocr = embeddings_ocr
        self.embeddings_questions = embeddings_questions
        self.target_max_len = target_max_len
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def _gen_image(self, idx):
        img_file = Path(self.data[idx]['image'])
        image = Image.open(self.data_dir / img_file).convert('RGB')
        return image

    def _gen_answer(self, idx):
        answer = random.choice(self.data[idx]['answers'])
        tokenization = self.tokenizer(
            text=answer,
            max_length=self.target_max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        answer_ids = tokenization['input_ids'].squeeze(0)

        return answer_ids, answer

    def __getitem__(self, idx):
        img = self._gen_image(idx)
        img = self.transform(img)
        
        ocr_embeddings = self.embeddings_ocr[idx]        
        questions_embeddings = self.embeddings_questions[idx]
        
        answer_ids, answer = self._gen_answer(idx)

        return img, ocr_embeddings, questions_embeddings, answer_ids, answer

In [47]:
# ============================== #
# === Datasets e Dataloaders === #
# ============================== #
 
BATCH_SZ = 10
TARGET_MAX_LEN = 10
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
transforms_img = transforms.Compose([transforms.Resize((640, 480)), transforms.ToTensor()])

ds_train = DVQADataset(
    path_dir='train/', 
    path_data=path_train,
    tokenizer=t5_tokenizer,
    embeddings_ocr=Embeddings_OCR_train,
    embeddings_questions=Embeddings_Q_train, 
    target_max_len=TARGET_MAX_LEN,
    transform=transforms_img, 
)

ds_val = DVQADataset(
    path_dir='val/',
    path_data=path_val,
    tokenizer=t5_tokenizer,
    embeddings_ocr=Embeddings_OCR_val,
    embeddings_questions=Embeddings_Q_val, 
    target_max_len=TARGET_MAX_LEN,
    transform=transforms_img, 
)

dataloaders = {
     'train': DataLoader(
         ds_train,
         batch_size=BATCH_SZ,
         shuffle=True,
         num_workers=4,
         pin_memory=True
         ),
               
     'val': DataLoader(
         ds_val,
         batch_size=BATCH_SZ,
         num_workers=4,
         pin_memory=True
         )
}
 
# Verificando os tamanhos dos dataloaders
_ = {x: len(dataloaders[x]) for x in dataloaders.keys()}
_

{'train': 3947, 'val': 535}

In [48]:
# =========================== #
# === Teste do Dataloader === #
# =========================== #
 
img, ocr_embeddings, questions_embeddings, answer_ids, answer = next(iter(dataloaders['train']))
 
print('image.shape:                ', img.shape)
print('ocr_embeddings.shape:       ', ocr_embeddings.shape)
print('questions_embeddings.shape: ', questions_embeddings.shape)
print('answer_ids.shape:           ', answer_ids.shape)
print('answer:                     ', answer)

image.shape:                 torch.Size([10, 3, 640, 480])
ocr_embeddings.shape:        torch.Size([10, 48, 768])
questions_embeddings.shape:  torch.Size([10, 1, 768])
answer_ids.shape:            torch.Size([10, 10])
answer:                      ['CG1 - 1990 Flue Cured', 'John Quist', '17.9', 'Cincinnati, Ohio', '02/15/2009', '328.7', 'Paul Saltman', 'John E. Kilpatrick', '51,794', '410']


In [49]:
# ================ #
# === Métricas === #
# ================ #
 
def normalize_answer(s):
    def white_space_fix(text):
        return ' '.join(text.split())
 
    def lower(text):
        return text.lower()
 
    return white_space_fix(lower(s))
 
def get_tokens(s):
    if not s: return []
    return normalize_answer(s).split()
 
def compute_exact(a_gold, a_pred):
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))
 
def compute_f1(a_gold, a_pred):
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())
    if len(gold_toks) == 0 or len(pred_toks) == 0:
        return int(gold_toks == pred_toks)
    if num_same == 0:
        return 0
    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)
    f1 = (2 * precision * recall) / (precision + recall)
    return f1

In [50]:
path_save_model = '/content/drive/MyDrive/Colab Notebooks/Final-project/saved_epochs/'

In [51]:
# ===================================== #
# === Funções de Treino e Validação === #
# ===================================== #
 
def train_one_epoch(model, batch, device, optimizer):
    batch_device = tuple(t.to(device) if type(t) != list else t for t in batch)
    model.zero_grad()
    loss = model(batch_device)
    loss.backward()
    optimizer.step()
    return loss.item()
 
def train_model(model, device, dataloader, optimizer):
    model.train()
    total_loss = 0
    for batch in dataloader:
        total_loss += train_one_epoch(model, batch, device, optimizer)
    avg_loss = total_loss / len(dataloader)
    return avg_loss
 
# ------------------------------------------------------------------------------------
 
def val_one_epoch(model, batch, device, tokenizer, compute_exact, compute_f1):
    batch_device = tuple(t.to(device) if type(t) != list else t for t in batch)
    img, ocr_embeddings, questions_embeddings, answer_ids, answers = batch  
    
    predicted_ids = model(batch_device)
    preds = [tokenizer.decode(ids) for ids in predicted_ids]
    
    batch_size = img.shape[0]
    exact = sum([compute_exact(ans, pred) for ans, pred in zip(answers, preds)]) / batch_size
    f1 = sum([compute_f1(ans, pred) for ans, pred in zip(answers, preds)]) / batch_size
 
    return exact, f1
 
def val_model(model, device, tokenizer, dataloader, compute_exact, compute_f1, epoch_i):
    model.eval()
    total_exact, total_f1 = [],[]
    for batch in dataloader:
        exact, f1 = val_one_epoch(model, batch, device, tokenizer, compute_exact, compute_f1)
        total_exact.append(exact)
        total_f1.append(f1)
    avg_exact = np.array(total_exact).mean()
    avg_f1 = np.array(total_f1).mean()
    
    torch.save(model.state_dict(), path_save_model+'V8_'+str(epoch_i))
 
    return avg_exact, avg_f1

In [52]:
# =========================== #
# === Modelo: Resnet + T5 === #
# =========================== #
 
class DQVA_Model(nn.Module):
    def __init__(self, convnet, nlp_model, target_max_len):
        super().__init__()
 
        self.convnet = convnet
        self.target_max_len = target_max_len
 
        self.t5 = T5ForConditionalGeneration.from_pretrained(nlp_model)

    def forward(self, batch):
        img, ocr_embeddings, questions_embeddings, answer_ids, answers = batch  
        
        img_embeds = self.convnet(img)
        inputs_embeds = torch.cat((img_embeds, ocr_embeddings, questions_embeddings), dim=1)

        if self.training:
            outputs = self.t5(
                inputs_embeds=inputs_embeds, 
                labels=answer_ids,
                return_dict=True
            )
            return outputs.loss
 
        else:
            return self.generate(inputs_embeds)
 
    def generate(self, inputs_embeds):
        decoded_ids = torch.full(
            size=(inputs_embeds.shape[0], 1), 
            fill_value=self.t5.config.decoder_start_token_id, 
            dtype=torch.long
        ).to(inputs_embeds.device)
 
        encoder_hidden_states = self.t5.get_encoder()(
            inputs_embeds=inputs_embeds
        )
 
        for step in range(self.target_max_len):
            logits = self.t5(decoder_input_ids=decoded_ids,
                             encoder_outputs=encoder_hidden_states)[0]
            next_token_logits = logits[:, -1, :]
            next_token_id = next_token_logits.argmax(1).unsqueeze(-1)
            if torch.eq(next_token_id[:, -1], self.tokenizer.eos_token_id).all():
                break
 
            decoded_ids = torch.cat([decoded_ids, next_token_id], dim=-1)
        return decoded_ids
 
class EncoderResnet(nn.Module):
    def __init__(self, cnn, channels, embed_size):
        super(EncoderResnet, self).__init__()
        self.cnn = nn.Sequential(*list(cnn.children())[:-2])
        self.conv1 = nn.Conv2d(channels, embed_size, 1)
        self.adaptive = nn.AdaptiveAvgPool2d((12, 9))
        self.embed_size = embed_size
        
    def forward(self, x):
        output = self.cnn(x) # N, C=2048 H, W
        output = self.conv1(output)
        output = self.adaptive(output)
        output = output.view(output.size(0), self.embed_size, -1)  
        output = output.permute(0, 2, 1) 
        return output
 
    def freeze(self):
        for p in self.cnn.parameters():
            p.requires_grad = False
        for c in list(self.cnn.children()):
            for p in c.parameters():
                p.requires_grad = False

In [53]:
try:
    del model
    del resnet
    gc.collect()
    torch.cuda.empty_cache()
except:
    pass
 
lr = 5e-5
resnet = models.resnet50(pretrained=True)
resnet_model = EncoderResnet(resnet, 4 * emb_size, emb_size).to(device)
resnet_model.freeze()
 
model = DQVA_Model(
    convnet=resnet_model, 
    nlp_model='t5-base', 
    target_max_len=TARGET_MAX_LEN
    ).to(device)
 
optimizer = AdamW([p for p in model.parameters() if p.requires_grad], lr=lr)

In [54]:
# ========================= #
# === Número de params. === #
# ========================= #
 
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)
 
print('\n','#' * 54, f'\n # Número de params. {count_parameters(model):,}' \
       ' trainable parameters #\n', '#' * 54,'\n')  
# model


 ###################################################### 
 # Número de params. 223,953,024 trainable parameters #
 ###################################################### 



In [None]:
# =================== #
# === Treinamento === #
# =================== #
 
LOAD = False
N_EPOCHS = 100
deterministic()
 
if LOAD:
    model.load_state_dict(torch.load(path_save_model+'V8_'+str(30), 
                                     map_location=torch.device(device)))
 
# ---------------------------------------------------------------------------------
training_stats = []
for epoch_i in range(1, N_EPOCHS+1):
    loss_train =  train_model(model, device, dataloaders['train'], optimizer)
    exact, f1 = val_model(model, device, t5_tokenizer, dataloaders['val'], 
                          compute_exact, compute_f1, epoch_i)
 
    print(f'EPOCH_{epoch_i}: LOSS TRAIN: {loss_train:.4} -- EXACT_MATCH: {exact:.4}  --  F1: {f1:.4}')
 
    training_stats.append(
        {
            'epoch': epoch_i,
            'Training Loss': loss_train,
            'Exact match': exact,
            'F1 score': f1,
        }
    )

Experimento deterministico, seed: 2357 -- Existe 1 GPU Tesla V100-SXM2-16GB disponível.


In [None]:
df_stats = pd.DataFrame(data=training_stats)
df_stats = df_stats.set_index('epoch')
pd.set_option('precision', 3)
df_stats

In [None]:
df_stats[['Exact match', 'F1 score']].plot(
    figsize=(12, 5), 
    lw=2, 
    ylabel="Score", 
    fontsize=16, 
    grid=True, 
    title='Scores em VAL split'
    )