In [None]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig
from training import train, validate, evaluate

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# Hugging Face model identifier of the pretrained checkpoint; used below to
# load both the tokenizer and the sequence-classification model.
BASE_BERT = 'dccuchile/bert-base-spanish-wwm-cased'

In [None]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset.
data_es = load_dataset('head_qa', 'es' )

In [None]:
# Pull the three predefined splits out of the loaded dataset.
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [None]:
# Convert each split with parse_dataset (imported from utils_data). The
# resulting items are later read through their 'question', 'answer' and
# 'label' keys by encode().
training_instances = parse_dataset(training)
validation_instances = parse_dataset(validation)
testing_instances = parse_dataset(testing)

# NOTE(review): random_oversamplig is defined in utils_data and not visible
# here; presumably it rebalances the label classes of the training set by
# duplicating instances - confirm against its implementation.
oversampled_training = random_oversamplig(training_instances)

In [None]:
# Tokenizer matching the pretrained checkpoint; lower-casing is disabled
# because the checkpoint is a cased one.
tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

In [None]:
def pad_seq(x, seq_len=110, right_padding = False):
    """Return `x` as an int32 array of exactly `seq_len` entries.

    At most the first `seq_len` items of `x` are kept; the remaining slots
    are filled with zeros.

    Args:
        x: sequence of integer ids.
        seq_len: length of the returned array.
        right_padding: when True the kept items sit at the start and the
            zeros follow them; when False (default) the zeros come first
            and the kept items sit at the end.

    Returns:
        numpy.ndarray of shape (seq_len,) and dtype int32.
    """
    kept = min(seq_len, len(x))
    padded = np.zeros(seq_len, dtype=np.int32)
    # Offset of the first kept item inside the padded array.
    start = 0 if right_padding else seq_len - kept
    padded[start:start + kept] = x[:kept]
    return padded

In [None]:
# Inspect the first parsed test instance.
testing_instances[0]

In [None]:
# Example of the text form used for encoding: question and answer joined by
# a literal ' [SEP] ' marker.
sent = testing_instances[0]['question'] +' [SEP] ' + testing_instances[0]['answer']
sent

In [None]:
# Token ids of the example sentence, with the tokenizer's special tokens added.
tokenizer.encode(sent, add_special_tokens=True)

In [None]:
def encode(samples, tokenizer, seq_len=30):
    """Turn question/answer samples into fixed-length BERT inputs.

    For each sample the question and the answer are joined with a literal
    ' [SEP] ' marker, encoded with ``tokenizer.encode`` (special tokens
    added) and then truncated or zero-padded to ``seq_len`` ids.

    Padding is appended on the right so that the first encoded token stays
    at position 0: BERT uses absolute position embeddings and its
    classification head reads the first position, so left-padding would
    feed it a padding token instead.

    Args:
        samples: iterable of mappings with 'question', 'answer' and 'label'
            entries.
        tokenizer: object whose ``encode(text, add_special_tokens=True)``
            returns a sequence of integer token ids.
        seq_len: fixed length of every encoded sequence. Defaults to 30,
            the value that used to be hard-coded here.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)`` of three parallel
        lists: int32 arrays of length ``seq_len``, lists of 0/1 ints of
        length ``seq_len`` (1 marks a real token), and the sample labels.
    """
    input_ids, attention_masks, labels = [], [], []
    for item in samples:
        sent = item['question'] + ' [SEP] ' + item['answer']
        encoded_sent = tokenizer.encode(sent, add_special_tokens=True)

        # NOTE(review): ids beyond seq_len are dropped from the end, so for a
        # long question the answer (and the closing special token) can be cut
        # off entirely - consider calling this with a larger seq_len.
        input_ids.append(pad_seq(encoded_sent, seq_len=seq_len, right_padding=True))

        # Derive the mask from the real length instead of from `id > 0`, so
        # it does not depend on which vocabulary entry happens to have id 0.
        n_real = min(seq_len, len(encoded_sent))
        attention_masks.append([1] * n_real + [0] * (seq_len - n_real))

        labels.append(item['label'])

    return input_ids, attention_masks, labels

In [None]:
# Encode every split into (token ids, attention masks, labels). Training uses
# the oversampled instances; validation and test use the parsed instances
# as they are.
train_inputs, train_masks, train_labels = encode(oversampled_training, tokenizer)
valid_inputs, valid_masks, valid_labels = encode(validation_instances, tokenizer)
test_inputs, test_masks, test_labels = encode(testing_instances, tokenizer)

In [None]:
# Convert the encoded splits to tensors.
# The id sequences are lists of numpy arrays: stack them into one 2-D array
# first, because torch.tensor() on a list of ndarrays is very slow and emits
# a UserWarning. They are stored as int64 (long), the index dtype that is
# requested per batch further down anyway.
train_inputs = torch.tensor(np.asarray(train_inputs), dtype=torch.long)
valid_inputs = torch.tensor(np.asarray(valid_inputs), dtype=torch.long)
test_inputs = torch.tensor(np.asarray(test_inputs), dtype=torch.long)

train_labels = torch.tensor(np.asarray(train_labels))
valid_labels = torch.tensor(np.asarray(valid_labels))
test_labels = torch.tensor(np.asarray(test_labels))

train_masks = torch.tensor(np.asarray(train_masks))
valid_masks = torch.tensor(np.asarray(valid_masks))
test_masks = torch.tensor(np.asarray(test_masks))

In [None]:
batch_size = 8

# Bundle ids, attention masks and labels of each split into one dataset.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
valid_data = TensorDataset(valid_inputs, valid_masks, valid_labels)
test_data = TensorDataset(test_inputs, test_masks, test_labels)

# Training batches are drawn in random order; validation and test batches
# keep the dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

# One loader per split, all with the same batch size.
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, sampler=valid_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
import time
import copy
import spacy
import pickle
import collections
from tqdm import tqdm_notebook, trange
from collections import Counter

In [None]:
# Pretrained encoder with a 2-label sequence-classification head; attention
# weights and hidden states are not requested in the model outputs.
model = BertForSequenceClassification.from_pretrained(BASE_BERT, num_labels=2, output_attentions=False, 
                                                      output_hidden_states=False)

In [None]:
# Print name and shape of a few parameter groups as a quick sanity check of
# the loaded architecture.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (section header, slice of the named-parameter list shown under it)
param_sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for section_title, section_params in param_sections:
    print(section_title)
    for param_name, param_tensor in section_params:
        print("{:<55} {:>12}".format(param_name, str(tuple(param_tensor.size()))))

In [None]:
from transformers import get_linear_schedule_with_warmup

# AdamW over all model parameters.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # learning rate
                  eps = 1e-8 # Adam epsilon
                )

# The schedule length is derived from `epochs`: one scheduler step per
# training batch, over `epochs` passes through the training loader. Training
# must run for the same number of epochs for the schedule to complete.
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # no warmup phase
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of class indices; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_hits = np.sum(predicted_classes == expected_classes)
    return n_hits / len(expected_classes)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as the string form of a
    datetime.timedelta, e.g. 3661.4 -> '1:01:01'.

    The duration is rounded to a whole number of seconds first.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
def train_model(model, train_dataloader, valid_dataloader, epochs):
    """Fine-tune ``model`` for ``epochs`` passes over ``train_dataloader``.

    After every epoch the model is scored with ``valid_model`` on
    ``valid_dataloader``, and ``evaluate`` is applied to the labels and
    predictions it returns.

    Note: parameter updates go through the notebook-level ``optimizer`` and
    ``scheduler`` objects. They are not arguments of this function and must
    exist before it is called.

    Args:
        model: called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``; element 0 of its output is
            used as the loss.
        train_dataloader: yields batches indexed as (input ids, attention
            mask, labels).
        valid_dataloader: forwarded unchanged to ``valid_model``.
        epochs: number of training epochs.

    Returns:
        A list with one ``[avg_train_loss, valid_acc, p, r, f1]`` entry per
        epoch.
    """
    import random

    # Seed every random number generator in play with the same fixed value.
    seed_val = 42
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                    torch.cuda.manual_seed_all):
        seed_fn(seed_val)

    loss_values = []       # average training loss of each epoch
    epochs_results = []    # per-epoch metrics handed back to the caller

    for epoch_number in range(1, epochs + 1):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_number, epochs))
        print('Training...')

        epoch_start = time.time()
        running_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            # Progress line every 40 batches, skipping the very first batch.
            if step != 0 and step % 40 == 0:
                elapsed = format_time(time.time() - epoch_start)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            input_ids, attention_mask, labels = batch[0].long(), batch[1], batch[2]

            # Clear gradients left over from the previous step.
            model.zero_grad()
            outputs = model(input_ids,
                            token_type_ids=None,
                            attention_mask=attention_mask,
                            labels=labels)
            batch_loss = outputs[0]
            running_loss += batch_loss.item()

            batch_loss.backward()
            # Clip the gradient norm to 1.0 before the update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

        avg_train_loss = running_loss / len(train_dataloader)
        loss_values.append(avg_train_loss)

        valid_acc, y_real, y_pred = valid_model(model, valid_dataloader)
        p, r, f1 = evaluate(y_real, y_pred)
        epochs_results.append([avg_train_loss, valid_acc, p, r, f1])

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - epoch_start)))

    print("")
    print("Training complete!")
    return epochs_results

In [None]:
def valid_model(model, validation_dataloader):
    """Evaluate ``model`` on ``validation_dataloader`` without gradients.

    Accuracy is computed over all examples (correct predictions divided by
    the number of labels) rather than by averaging per-batch accuracies, so
    a shorter final batch is not over-weighted.

    Args:
        model: called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)``; element 0 of its output is used as the
            logits, with one row per example.
        validation_dataloader: yields ``(input_ids, attention_mask,
            labels)`` batches of tensors.

    Returns:
        Tuple ``(accuracy, y_true, y_pred)`` where ``accuracy`` is a float
        (0.0 when the loader yields no examples) and ``y_true`` / ``y_pred``
        are lists holding one label tensor / predicted-class tensor per
        batch.
    """
    print("Running Validation...")
    y_true = []
    y_pred = []

    t0 = time.time()
    model.eval()

    n_correct, n_examples = 0, 0

    for b_input_ids, b_input_mask, b_labels in validation_dataloader:
        with torch.no_grad():
            outputs = model(b_input_ids.long(),
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]

        # Index of the highest logit in each row is the predicted class.
        pred = torch.max(logits, dim=1)[1]

        # Labels are flattened so a (batch, 1) label tensor cannot broadcast
        # against the (batch,) predictions.
        n_correct += (pred == b_labels.flatten()).sum().item()
        n_examples += b_labels.numel()

        y_true.append(b_labels)
        y_pred.append(pred)

    # Guard against an empty loader instead of dividing by zero.
    accuracy = n_correct / n_examples if n_examples else 0.0

    print("  Accuracy: {0:.2f}".format(accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    return accuracy, y_true, y_pred

In [None]:
def test_model(model, test_dataloader):
    model.eval()
    predictions , true_labels = [], []

    for batch in test_dataloader:
        #batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.long()
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        #logits = logits.detach().cpu().numpy()
        #label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels
        pred = torch.max(logits, dim=1)[1]
        predictions.append(pred)
        true_labels.append(b_labels)
    print('    DONE.')
    return true_labels, predictions

In [None]:
# Train for the same number of epochs that was used to size the learning-rate
# schedule (total_steps = len(train_dataloader) * epochs); a shorter run would
# stop before the linear decay has completed.
epochs_results = train_model(model, train_dataloader, valid_dataloader, epochs=epochs)