<a href="https://colab.research.google.com/github/finardi/tutos/blob/master/FAQ%20-BERTa%C3%BA%20pairWise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# BERTaú PairWise FAQ example
> #### This notebook contains end-to-end code to run the BERTaú pairwise FAQ task on a slice of the full FAQ dataset. The FAQ dataset used in this experiment is public.

In [1]:
# Shell escape: print the NVIDIA driver's report for the GPU attached to this
# runtime (device name, driver/CUDA versions, memory usage, running processes).
!nvidia-smi

Wed Jan 27 23:27:51 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16130MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
# Fetch the project repository that the next cell appends to `sys.path`.
# The clone is skipped when the `bertau` directory already exists, so re-running
# this cell does not print git's "destination path already exists" error.
! test -d bertau || git clone https://github.com/vfcarida/bertau
# `%pip` installs into the environment of the running kernel, whereas `!pip`
# runs whichever `pip` the subshell finds first on PATH.
%pip install -q transformers

Cloning into 'bertau'...
remote: Enumerating objects: 17, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (15/15), done.[K
remote: Total 17 (delta 2), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (17/17), done.
[K     |████████████████████████████████| 1.8MB 12.9MB/s 
[K     |████████████████████████████████| 890kB 48.7MB/s 
[K     |████████████████████████████████| 2.9MB 62.7MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone


In [None]:
import gc
import sys
sys.path.append('/content/bertau/')

import time
import torch
import torch.cuda.amp as amp

from train import run
from data import Data
from dataset import FinalPrep, FAQDataset
from utils import deterministic, get_device
from BERTau_pairwise import BERTauPairwise, pairwise_loss, hinge_loss

from transformers import BertTokenizer, get_linear_schedule_with_warmup

# Make the experiment deterministic
# NOTE(review): this rebinds the name `deterministic` from the imported helper
# to whatever that helper returns. Re-running the cell still works only because
# the `from utils import deterministic` line above re-binds the function first.
deterministic = deterministic() 

# Device of experiment: CPU or GPU
# The returned value is later passed to `.to(device)` and to `run(device=...)`.
device = get_device()

# Basic configurations of the experiment
# Each key is read further down in this cell; the comments say where.
config = {
    'K-CANDS': 45,        # passed as `K` to make_pairwise_dataset, _get_cands and run
    'TRAIN_SIZE': 0.90,   # passed as `train_size` to FinalPrep.split_data
    'BATCH_SZ': 12,       # passed as `batch_size` to make_pairwise_dataLoader
    'MAX_LEN': 196,       # passed as `max_seq_len` (dataset) and `max_len` (run)
    'N_EPOCHS': 1,        # passed as `n_epochs` to run; also scales `total_steps`
    'LR': 5e-5,           # learning rate given to torch.optim.AdamW
    'TOP_K': 10,          # NOTE(review): not read anywhere in this notebook
}

# ---- Step 1: load the public FAQ CSV shipped with the cloned repository ----
print('Getting the FAQ data ...', end=' ')
path = '/content/bertau/public_itau-unibanco-faq.csv'
df = Data(path)
df_faq = df.data_prep()
print('DONE!')

# ---- Step 2: build labels / lookup tables, then split into train and valid ----
print('Preparing the data ...', end=' ')
data = FinalPrep(df_faq)

# `make_labels` yields four objects that the dataset builders and `run` consume.
(df_docs, qid_to_text, docid_to_text, labels) = data.make_labels()

# `split_data` yields a (targets, dataframe, labels) triple for each partition;
# the share that goes to training comes from config['TRAIN_SIZE'].
(
    qid_target_train, train_dataframe, labels_train,
    qid_target_valid, valid_dataframe, labels_valid,
) = data.split_data(train_size=config['TRAIN_SIZE'])
print('DONE!')

# Dataset and Dataloader -----------------------------
print('Building Datasets and Dataloaders ...', end=' ')

# Path model at Hugging Face hub (used for the tokenizer here and, further
# down, for the model weights and as `path_vocab` in `run`).
path_model = 'Itau-Unibanco/BERTau'

# `do_lower_case` is the BertTokenizer option that lower-cases the input text.
# The previous keyword, `lowercase`, is not a BertTokenizer argument: it was
# accepted as an extra kwarg and had no effect on tokenization.
# NOTE(review): confirm lower-casing matches how the checkpoint's vocabulary was
# built; the original code clearly intended it, hence the explicit flag.
tokenizer = BertTokenizer.from_pretrained(path_model, do_lower_case=True)

# Train dataset and dataloader
train = FAQDataset()

# Pairwise training set built from the training targets, using K candidates
# per query and the tokenizer / text lookups prepared above.
ds_train = train.make_pairwise_dataset(
    df=qid_target_train,
    df_docs=df_docs,
    tokenizer=tokenizer,
    qid_to_text=qid_to_text,
    docid_to_text=docid_to_text,
    max_seq_len=config['MAX_LEN'],
    K=config['K-CANDS'],
)
train_loader = train.make_pairwise_dataLoader(
    ds_train,
    batch_size=config['BATCH_SZ'],
    phase='train',
)

# Valid dataset and dataloader
valid = FAQDataset()

# Only the candidate lists are computed for validation; they are handed to
# `run` as `qid_target`. `_get_cands` is a private helper by naming convention.
valid_data = valid._get_cands(
    df=qid_target_valid,
    df_docs=df_docs,
    K=config['K-CANDS'],
)
print('DONE!')

# Running Train and Eval -----------------------------
# When this cell is re-run in a live kernel, drop the model left over from the
# previous run and release its memory before a new one is created below.
try:
    del model
except NameError:
    # Fresh kernel: no previous model exists, so there is nothing to release.
    # Only NameError is expected here; any other exception (including
    # KeyboardInterrupt) is allowed to propagate instead of being swallowed.
    pass
else:
    gc.collect()
    torch.cuda.empty_cache()

print('\nRunning the Train Loop ...')

# Put the model on device
model = BERTauPairwise(path_model).to(device)

# Config the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=config['LR'])

# Number of total steps to be used in Warm Up: batches per epoch times epochs
# (assumes `run` advances the scheduler once per batch -- TODO confirm).
total_steps = len(train_loader) * config['N_EPOCHS']

# 3% of Warm Up. The scheduler API documents `num_warmup_steps` as an int, so
# the fraction is truncated to a whole number of steps instead of being passed
# as a float.
WARMUP_FRACTION = 0.03
warmup_steps = int(total_steps * WARMUP_FRACTION)

# Config the Learning Rate scheduler
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

# Gradient scaler from torch.cuda.amp, to perform the experiment in FP16
scaler = amp.GradScaler()

# Start train time
start = time.time()

# Call run function to perform the train and valid batches.
# `hinge_loss` is the loss used here; `pairwise_loss` is imported but unused.
# `path_save=None` is passed, presumably so no checkpoint is written -- confirm
# against `train.run`.
df_stats = run(
    model=model,
    loss_fn=hinge_loss,
    train_loader=train_loader,
    qid_target=valid_data,
    path_vocab=path_model,
    qid_to_text=qid_to_text,
    docid_to_text=docid_to_text,
    ground_truths=labels_valid,
    max_len=config['MAX_LEN'],
    K=config['K-CANDS'],
    optimizer=optimizer,
    device=device,
    scheduler=scheduler,
    scaler=scaler,
    path_save=None,
    n_epochs=config['N_EPOCHS'],
)

# Count the training time (wall-clock seconds, reported in whole minutes)
end = time.time()
elapsed = end - start

print(f'DONE! Elapsed time: {round(elapsed/60)} min.')

Getting the FAQ data ... DONE!
Preparing the data ... DONE!
Building Datasets and Dataloaders ... DONE!

Running the Train Loop ...


# The End