<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Kfold_banking77.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!nvidia-smi

Fri Aug 19 11:48:30 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P8    11W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
%%capture
!pip install transformers

In [8]:
import os
import gc
import numpy as np
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader


from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import AdamW, get_linear_schedule_with_warmup

device = 'cuda'


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [10]:
path_base = '/content/drive/MyDrive/Artigos/BACEN/data_translated/'

df = pd.read_parquet(path_base+'banking77_ptbr_train')

# ajustando o label de str para int
df = df.assign(Label = df.Label.apply(lambda x: np.int64(x.replace('.', ''))))
df = df.sample(frac=1).reset_index(drop=True)

# - - - - -
print(df.shape)
df

(10003, 2)


Unnamed: 0,Data,Label
0,Não recebi meu dinheiro mais cedo e diz que a ...,46
1,Preciso de informações sobre ativar meu cartão?,0
2,Como posso configurar o Google Pay no meu disp...,2
3,Como é que quando tentei pagar sem contato no ...,23
4,Preciso fazer meu PIN de cartão um número dife...,21
...,...,...
9998,"Não faço ideia do que está acontecendo, mas to...",27
9999,Acho que fui injustamente acusado em uma trans...,15
10000,"Mandei um cheque para o sistema, mas ainda não...",6
10001,Por que não consegui dinheiro do caixa eletrôn...,26


#Build Kfold Loop

In [11]:
BSIZE = 16
MAX_LEN = 32
path_model = 'neuralmind/bert-base-portuguese-cased'
tokenizer = BertTokenizerFast.from_pretrained(path_model)

Downloading tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [50]:
def get_banking77_tokens(data, label, maxlen):
    ids, labels = [], []
    for text, label in zip(data, label):
        tokens = tokenizer.encode_plus(
            text=text,
            truncation=True, 
            max_length=maxlen,
            padding='max_length',
            return_tensors='pt'
        )
        labels.append(int(label))
        ids.append(tokens['input_ids'])

    ids = torch.vstack(ids)
    return ids, torch.tensor(labels)

def train_one_epoch(model, dataloader, optim, device='cpu'):
    loss_sum = 0
    model.train() 
    for batch in dataloader:
        model.zero_grad()
        batch_ids, batch_label = (b.to(device) for b in batch)
        outs = model(input_ids=batch_ids, labels=batch_label)
        loss = outs['loss']
        loss_sum += loss.item()
        loss.backward()
        optim.step()
    return loss_sum/len(dataloader)

def evaluate(model, loader, device):
    total_acc, total_f1, total_loss = 0,0,0
    model.eval()
    for batch in loader:
        model.zero_grad()
        
        batch_ids, batch_label = (b.to(device) for b in batch)
        
        with torch.no_grad():
            outs = model(input_ids=batch_ids, labels=batch_label)
        
        loss = outs['loss']
        total_loss += loss.item()
        
        logits = torch.nn.functional.softmax(outs['logits'].cpu().detach(), dim=-1).numpy()
        y_pred = np.argmax(logits, axis=1)
        total_f1  += f1_score(y_true=batch_label.cpu(), y_pred=y_pred, average='macro')
        total_acc += accuracy_score(y_true=batch_label.cpu(), y_pred=y_pred)
    
    avg_loss = total_loss/len(loader)
    avg_f1 = total_f1/len(loader)
    avg_acc = total_acc/len(loader)

    return avg_loss, avg_f1, avg_acc


def train(model, train_loader, valid_loader, optim, n_epochs=10, device='cpu'):
    train_losses, valid_losses = [], []
    f1s, accs = [], []
    best_f1 = -1
    best_epoch = 0
    for epoch in range(n_epochs):
        train_loss = train_one_epoch(model, train_loader, optim, device)
        valid_loss, f1Score, accScore = evaluate(model, valid_loader, device)

        # Accumulating train and validation losses
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        f1s.append(f1Score)
        accs.append(accScore)

        if f1Score > best_f1:
            best_f1 = f1Score
            best_epoch = epoch + 1

        print(f'ep: [{epoch+1}/{n_epochs}] -- T: {train_loss:.3} -- V: {valid_loss:.3} -- F1: {f1Score:.3} -- ACC: {accScore:.3}')

    return best_f1, best_epoch

def perform_kfold_cv(df, k_folds, n_epochs=10, random_state=341, device='cpu', batch_size=BSIZE):
    skf = StratifiedKFold(n_splits=k_folds, random_state=random_state, shuffle=True)
    folds_f1s = []
    trained_models = []

    for fold, (train_index, eval_index) in enumerate(skf.split(np.zeros(df.shape[0]), df.Label)):
        print(f'--- K Fold [{fold+1}/{k_folds}] ---')
        model = BertForSequenceClassification.from_pretrained(
            path_model, 
            num_labels=df.Label.nunique(), 
            return_dict=True,
        ).to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

        X_train, y_train = df.Data[train_index], df.Label[train_index]
        X_eval, y_eval = df.Data[eval_index], df.Label[eval_index]
        
        train_texts, train_labels = get_banking77_tokens(X_train.to_list(), y_train.to_list(), MAX_LEN)
        eval_texts, eval_labels = get_banking77_tokens(X_eval.to_list(), y_eval.to_list(), MAX_LEN)

        dataset_train = TensorDataset(train_texts, train_labels)        
        dataset_eval  = TensorDataset(eval_texts, eval_labels)

        train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(dataset=dataset_eval, batch_size=batch_size, shuffle=False)                        
        
        best_f1, best_epoch = train(model, train_loader, valid_loader, optim, n_epochs, device=device)
        print(f'Fold {fold+1} got F1 = {best_f1:.3} at epoch {best_epoch}')
        folds_f1s.append((best_f1, best_epoch, train_index, eval_index))
        trained_models.append(model)

    return folds_f1s, trained_models

In [51]:
try:
    del model
    gc.collect()
    torch.cuda.empty_cache()
except:
    pass

k_folds = 3
folds_f1s = perform_kfold_cv(df, k_folds, n_epochs=3, random_state=341, device=device, batch_size=BSIZE)

--- K Fold [1/3] ---


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

ep: [1/3] -- T: 4.25 -- V: 3.41 -- F1: 0.163 -- ACC: 0.257
ep: [2/3] -- T: 1.7 -- V: 0.776 -- F1: 0.732 -- ACC: 0.831
ep: [3/3] -- T: 0.531 -- V: 0.507 -- F1: 0.787 -- ACC: 0.872
Fold 1 got F1 = 0.787 at epoch 3
--- K Fold [2/3] ---


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

ep: [1/3] -- T: 4.31 -- V: 4.24 -- F1: 0.0148 -- ACC: 0.0278
ep: [2/3] -- T: 3.41 -- V: 1.55 -- F1: 0.523 -- ACC: 0.672
ep: [3/3] -- T: 0.968 -- V: 0.625 -- F1: 0.758 -- ACC: 0.853
Fold 2 got F1 = 0.758 at epoch 3
--- K Fold [3/3] ---


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

ep: [1/3] -- T: 4.31 -- V: 4.24 -- F1: 0.0189 -- ACC: 0.0357
ep: [2/3] -- T: 4.19 -- V: 4.11 -- F1: 0.0583 -- ACC: 0.0762
ep: [3/3] -- T: 2.57 -- V: 1.06 -- F1: 0.637 -- ACC: 0.767
Fold 3 got F1 = 0.637 at epoch 3


# Inference TopK

In [59]:
sent = 'quando chega meu cartao'

def evaluate(df, model, tokenizer, sent, device, K=3):
    tokens = tokenizer.encode_plus(sent, return_tensors='pt').to(device)
    model.eval()
    with torch.no_grad():
        outs = model(tokens['input_ids'])
        
    logits = torch.nn.functional.softmax(outs['logits'], dim=-1)
    topK = torch.topk(logits, K)
    
    topK_probs = topK.values.cpu().detach().numpy()[0]
    topK_preds = topK.indices.cpu().detach().numpy()[0]

    return list(topK_probs), list(topK_preds)

topK_probs, topK_preds = evaluate(df, folds_f1s[1][0], tokenizer, sent, device, K=3)    

print(f'top3 probs: {topK_probs} -- top3 preds: {topK_preds}')

top3 probs: [0.6328591, 0.27050778, 0.017768664] -- top3 preds: [11, 12, 9]
