<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/Kfold_banking77.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Print the GPU attached to this runtime (model, driver/CUDA version, memory in use).
!nvidia-smi

Fri Sep  2 23:08:05 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
%%capture
# Install transformers and magic_timer; %%capture hides pip's output.
# NOTE(review): both installs are unpinned, so a fresh runtime may pull releases
# newer than the ones this notebook was written against -- consider pinning
# exact versions once they are confirmed.
!pip install transformers
!pip install magic_timer

In [9]:
import os
import gc
import numpy as np
import pandas as pd

import torch
from torch.utils.data import TensorDataset, DataLoader

from magic_timer import MagicTimer

from transformers import BertForSequenceClassification, BertTokenizerFast
from transformers import AdamW, get_linear_schedule_with_warmup

# Use the GPU when one is available; otherwise fall back to the CPU so the
# notebook still runs (slowly) on a runtime without CUDA instead of crashing
# on the first `.to(device)`.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score

In [4]:
# Folder holding the translated Banking77 files.
# NOTE: the path is on Google Drive, so Drive must already be mounted at
# /content/drive before this cell runs.
path_base = '/content/drive/MyDrive/Artigos/BACEN/data_translated/'

df = pd.read_parquet(path_base+'banking77_ptbr_train')

# Convert the label from str to int (any '.' in the raw string is dropped first).
df = df.assign(Label = df.Label.apply(lambda x: np.int64(x.replace('.', ''))))

# Shuffle the rows once. The seed is fixed so the row order -- and therefore the
# content of the folds built from it later -- is identical on every
# Restart & Run All (an unseeded shuffle gave a different order each run).
SHUFFLE_SEED = 341
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# - - - - -
print(df.shape)
df

(10003, 2)


Unnamed: 0,Data,Label
0,Fiz uma transação em uma conta errada!,8
1,Quando posso esperar a entrega?,12
2,Posso configurar a conta para um reabastecimen...,4
3,Quanto tempo até o dinheiro transferido aparecer?,5
4,O aplicativo mostra um pagamento que eu nunca ...,16
...,...,...
9998,"Se eu perdi meu telefone no hotel, ainda posso...",42
9999,Passos de ativação de cartão,0
10000,Como posso substituir meu cartão antes de expi...,9
10001,Faz uma semana que não manda meu cartão e aind...,11


# Build the K-fold loop

In [5]:
# Mini-batch size; also the default `batch_size` of perform_kfold_cv.
BSIZE = 16
# Token length passed to get_banking77_tokens: every sentence is truncated or
# padded to exactly this many tokens.
MAX_LEN = 32
# Pretrained checkpoint id, used both for the tokenizer below and for the
# classifier built in perform_kfold_cv.
path_model = 'neuralmind/bert-base-portuguese-cased'
# Module-level tokenizer; get_banking77_tokens reads it as a global.
tokenizer = BertTokenizerFast.from_pretrained(path_model)

Downloading tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/205k [00:00<?, ?B/s]

Downloading added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/647 [00:00<?, ?B/s]

In [19]:
def get_banking77_tokens(data, label, maxlen):
    """Tokenize sentences into fixed-length id tensors paired with their labels.

    Relies on the module-level `tokenizer`.

    Args:
        data: iterable of sentences (str).
        label: iterable of class ids aligned with `data`; each one is cast with int().
        maxlen: every sentence is truncated / padded to exactly this many tokens.

    Returns:
        (ids, labels): `ids` is an integer tensor of shape (n, maxlen) holding the
        token ids, `labels` a 1-D tensor with the n integer labels. Empty input
        yields empty tensors of shape (0, maxlen) and (0,).

    NOTE(review): only `input_ids` are returned, so callers that feed them to the
    model without an attention mask do not mask the padding positions.
    """
    # zip() keeps the original pairing rule: stop at the shorter of the two inputs.
    pairs = list(zip(data, label))
    if not pairs:
        # Nothing to tokenize: return well-shaped empty tensors rather than
        # failing while stacking an empty list.
        return (torch.empty((0, maxlen), dtype=torch.long),
                torch.empty((0,), dtype=torch.long))

    texts = [text for text, _ in pairs]
    labels = [int(y) for _, y in pairs]

    # One batched tokenizer call instead of one encode_plus() per sentence:
    # it produces the same (n, maxlen) id matrix without the Python-level loop
    # and the final vstack.
    tokens = tokenizer(
        texts,
        truncation=True,
        max_length=maxlen,
        padding='max_length',
        return_tensors='pt',
    )
    return tokens['input_ids'], torch.tensor(labels)

def train_one_epoch(model, dataloader, optim, device='cpu'):
    """Run one optimisation pass over `dataloader` and return the mean batch loss.

    Args:
        model: module called as model(input_ids=..., labels=...) whose output
            exposes the training loss under the key 'loss'.
        dataloader: iterable yielding (input_ids, labels) pairs of tensors.
        optim: optimizer stepped once per batch.
        device: device each batch is moved to before the forward pass.

    Returns:
        Mean of the per-batch loss values (numpy float).
    """
    model.train()
    batch_losses = []
    for input_ids, targets in dataloader:
        # Clear the gradients left by the previous step before the new forward pass.
        model.zero_grad()
        output = model(input_ids=input_ids.to(device), labels=targets.to(device))
        batch_loss = output['loss']
        batch_loss.backward()
        optim.step()
        batch_losses.append(batch_loss.item())

    return np.mean(batch_losses)

def evaluate(model, loader, device):
    """Score `model` on `loader` with macro-F1 and accuracy.

    Args:
        model: classifier called as model(input_ids=...) whose output exposes
            the class scores under the key 'logits'.
        loader: iterable yielding (input_ids, labels) pairs of tensors.
        device: device the input ids are moved to before the forward pass.

    Returns:
        (f1, acc, trues, preds): macro-averaged F1, accuracy, and the lists of
        true / predicted class ids (plain Python ints) in loader order.
    """
    trues, preds = [], []
    model.eval()
    # Pure inference: nothing in this loop needs autograd, and evaluation must
    # not touch gradient state (the previous version called model.zero_grad()
    # on every batch). Labels are not passed to the model because the loss it
    # would compute is never used here.
    with torch.no_grad():
        for batch_ids, batch_label in loader:
            logits = model(input_ids=batch_ids.to(device))['logits']
            preds.extend(logits.argmax(-1).cpu().tolist())
            trues.extend(batch_label.tolist())

    acc = accuracy_score(trues, preds)
    f1 = f1_score(trues, preds, average='macro')

    return f1, acc, trues, preds


def train(model, train_loader, valid_loader, optim, n_epochs, timer, device='cpu'):
    """Train for `n_epochs`, validate after each epoch, and keep the best weights.

    After every epoch the model is scored on `valid_loader`. Whenever the macro-F1
    improves, a CPU copy of the weights is stored; when training ends those
    weights are loaded back, so the model left behind is the one that achieved
    the returned `best_f1` rather than whatever the last epoch produced.

    Args:
        model: model to optimise (modified in place).
        train_loader: batches used by train_one_epoch.
        valid_loader: batches used by evaluate.
        optim: optimizer stepped by train_one_epoch.
        n_epochs: number of passes over `train_loader`.
        timer: object interpolated into the progress line as the elapsed time.
        device: device the batches are moved to.

    Returns:
        (best_f1, best_epoch): best validation macro-F1 and the 1-based epoch it
        was reached at; (-1, 0) when `n_epochs` is 0.
    """
    best_f1 = -1
    best_epoch = 0
    best_state = None
    for epoch in range(n_epochs):
        train_loss = train_one_epoch(model, train_loader, optim, device)
        f1Score, accScore, _, _ = evaluate(model, valid_loader, device)

        if f1Score > best_f1:
            best_f1 = f1Score
            best_epoch = epoch + 1
            # Snapshot on the CPU (forced copy) so later optimizer steps cannot
            # alter it and no extra accelerator memory is held.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }

        print(f'ep: [{epoch+1}/{n_epochs}] -- T: {train_loss:.3}  -- F1: {f1Score:.3} -- ACC: {accScore:.3} -- time elapsed: {timer}')

    # Roll back to the best epoch unless the final epoch already was the best.
    if best_state is not None and best_epoch != n_epochs:
        model.load_state_dict(best_state)

    return best_f1, best_epoch

def perform_kfold_cv(df, k_folds, n_epochs=10, random_state=341, device='cpu', batch_size=BSIZE):
    """Fine-tune a fresh classifier on each fold of a stratified K-fold split.

    Args:
        df: DataFrame with a `Data` column (sentences) and a `Label` column
            (integer class ids). Any index is accepted; rows are selected by
            position.
        k_folds: number of stratified folds.
        n_epochs: epochs trained per fold.
        random_state: seed of the StratifiedKFold shuffle.
        device: device the model and the batches are placed on.
        batch_size: mini-batch size of both data loaders.

    Returns:
        (folds_f1s, trained_models): `folds_f1s` holds one
        (best_f1, best_epoch, train_index, eval_index) tuple per fold;
        `trained_models` holds each fold's model as train() left it, still on
        `device`.
    """
    skf = StratifiedKFold(n_splits=k_folds, random_state=random_state, shuffle=True)
    folds_f1s = []
    trained_models = []
    # One output unit per distinct label found in the data.
    # NOTE(review): assumes the labels are the contiguous ids 0..n-1 -- confirm.
    n_labels = df.Label.nunique()

    for fold, (train_index, eval_index) in enumerate(skf.split(np.zeros(df.shape[0]), df.Label)):
        timer = MagicTimer()
        print(f'--- K Fold [{fold+1}/{k_folds}] ---')
        model = BertForSequenceClassification.from_pretrained(
            path_model,
            num_labels=n_labels,
            return_dict=True,
        ).to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=5e-5)

        # skf.split yields *positions*, so select with .iloc. Plain `series[idx]`
        # is a label lookup and only matches when the index is exactly 0..n-1.
        X_train, y_train = df.Data.iloc[train_index], df.Label.iloc[train_index]
        X_eval, y_eval = df.Data.iloc[eval_index], df.Label.iloc[eval_index]

        train_texts, train_labels = get_banking77_tokens(X_train.to_list(), y_train.to_list(), MAX_LEN)
        eval_texts, eval_labels = get_banking77_tokens(X_eval.to_list(), y_eval.to_list(), MAX_LEN)

        dataset_train = TensorDataset(train_texts, train_labels)
        dataset_eval = TensorDataset(eval_texts, eval_labels)

        # Shuffle only the training batches; validation order stays fixed.
        train_loader = DataLoader(dataset=dataset_train, batch_size=batch_size, shuffle=True)
        valid_loader = DataLoader(dataset=dataset_eval, batch_size=batch_size, shuffle=False)

        best_f1, best_epoch = train(model, train_loader, valid_loader, optim, n_epochs, timer, device=device)
        print(f'Fold {fold+1} got F1 = {best_f1:.3} at epoch {best_epoch}')
        folds_f1s.append((best_f1, best_epoch, train_index, eval_index))
        trained_models.append(model)

    return folds_f1s, trained_models

In [None]:
# Release whatever a previous run left behind before starting a new one.
# Only a missing global `model` is tolerated; it must not prevent the garbage
# collection and CUDA cache release below (with the old bare `except:` a
# NameError from `del model` skipped both and hid any other error).
try:
    del model
except NameError:
    pass
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

k_folds = 3
# NOTE: perform_kfold_cv returns a 2-tuple (per-fold stats, trained models). It is
# stored un-unpacked, so folds_f1s[0] is the stats list and folds_f1s[1] the list
# of trained models.
folds_f1s = perform_kfold_cv(df, k_folds, n_epochs=30, random_state=341, device=device, batch_size=BSIZE)

--- K Fold [1/3] ---


Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the

ep: [1/30] -- T: 4.32  -- F1: 0.0304 -- ACC: 0.0387 -- time elapsed: 40 seconds
ep: [2/30] -- T: 4.22  -- F1: 0.0488 -- ACC: 0.0546 -- time elapsed: 1.3 minutes
ep: [3/30] -- T: 3.6  -- F1: 0.553 -- ACC: 0.642 -- time elapsed: 1.9 minutes
ep: [4/30] -- T: 1.01  -- F1: 0.849 -- ACC: 0.856 -- time elapsed: 2.5 minutes
ep: [5/30] -- T: 0.382  -- F1: 0.881 -- ACC: 0.88 -- time elapsed: 3.1 minutes
ep: [6/30] -- T: 0.209  -- F1: 0.891 -- ACC: 0.89 -- time elapsed: 3.7 minutes
ep: [7/30] -- T: 0.131  -- F1: 0.895 -- ACC: 0.894 -- time elapsed: 4.3 minutes
ep: [8/30] -- T: 0.0902  -- F1: 0.892 -- ACC: 0.891 -- time elapsed: 4.9 minutes
ep: [9/30] -- T: 0.0734  -- F1: 0.888 -- ACC: 0.887 -- time elapsed: 5.5 minutes
ep: [10/30] -- T: 0.0584  -- F1: 0.882 -- ACC: 0.883 -- time elapsed: 6.1 minutes
ep: [11/30] -- T: 0.0693  -- F1: 0.884 -- ACC: 0.884 -- time elapsed: 6.7 minutes
ep: [12/30] -- T: 0.0602  -- F1: 0.864 -- ACC: 0.863 -- time elapsed: 7.3 minutes


# Inference TopK

In [None]:
sent = 'quando chega meu cartao'

# Named `predict_topk` rather than `evaluate`: reusing the name `evaluate` here
# would replace the validation function that train() calls, so re-running the
# training cell after this one would fail with a TypeError.
def predict_topk(df, model, tokenizer, sent, device, K=3):
    """Return the K most probable classes for a single sentence.

    Args:
        df: unused; kept so the positional call shape stays the same.
        model: classifier whose output exposes the class scores under 'logits'.
        tokenizer: tokenizer used to encode `sent`.
        sent: sentence to classify.
        device: device the encoded sentence is moved to (must match the model's).
        K: number of classes to return; capped at the number of classes.

    Returns:
        (topK_probs, topK_preds): softmax probabilities and class ids of the K
        highest-scoring classes, as plain Python lists, best first.
    """
    tokens = tokenizer.encode_plus(sent, return_tensors='pt').to(device)
    model.eval()
    with torch.no_grad():
        outs = model(tokens['input_ids'])

    # Softmax turns the raw logits into probabilities over the classes.
    probs = torch.nn.functional.softmax(outs['logits'], dim=-1)
    # torch.topk raises when K exceeds the number of classes, so cap it.
    K = min(K, probs.shape[-1])
    topK = torch.topk(probs, K)

    # Row 0 is the only row: a single sentence was encoded.
    topK_probs = topK.values[0].cpu().tolist()
    topK_preds = topK.indices[0].cpu().tolist()

    return topK_probs, topK_preds

# `folds_f1s` holds the un-unpacked (per-fold stats, trained models) pair returned
# by perform_kfold_cv; the first fold's model is used for the demo.
fold_stats, trained_models = folds_f1s
topK_probs, topK_preds = predict_topk(df, trained_models[0], tokenizer, sent, device, K=3)

print(f'top3 probs: {topK_probs} -- top3 preds: {topK_preds}')