In [1]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import evaluate, train_ir, validate_ir, evaluator_ir

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Hugging Face model id; used below for both the tokenizer and the BertModel weights.
BASE_BERT = 'dccuchile/bert-base-spanish-wwm-cased'

In [3]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset.
data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\CLAUDIA\.cache\huggingface\datasets\head_qa\es\1.1.0\d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2)


In [4]:
# Split handles of the loaded dataset. In the visible cells only the commented-out
# parsing code reads them; the instances actually used are loaded from pickles.
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [5]:
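# One-off preprocessing, kept for reference; the next cell loads same-named
# variables from pickle files instead of recomputing them.
# (The helper imported at the top of the notebook is `parse_ir_dataset`.)
# training_instances = parse_ir_dataset(training)
# validation_instances = parse_ir_dataset(validation)
# testing_instances = parse_ir_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)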

In [6]:
# Load the pre-parsed instances for each split, plus the oversampled training set.
# NOTE(review): presumably these files are read with `pickle`; only load files you
# produced yourself, since unpickling untrusted data can execute arbitrary code.
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [7]:
# Tokenizer for the same checkpoint as the model; the checkpoint name says "cased",
# so lower-casing is switched off.
tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

In [8]:
def pad_seq(x, seq_len=110, right_padding = False):
    """Return ``x`` as a fixed-length ``int32`` array, zero-padded or truncated.

    Args:
        x: sequence of integer ids.
        seq_len: length of the returned array.
        right_padding: when True the ids sit at the start and zeros follow;
            otherwise (default) zeros come first and the ids sit at the end.

    Returns:
        ``np.ndarray`` of shape ``(seq_len,)``. If ``x`` is longer than
        ``seq_len`` only its first ``seq_len`` items are kept.
    """
    kept = min(seq_len, len(x))
    head = x[0:kept]
    padded = np.zeros(seq_len, dtype=np.int32)
    if not right_padding:
        # Left padding: the ids occupy the tail of the array.
        padded[(seq_len - kept):] = head
    else:
        padded[:kept] = head
    return padded

In [9]:
# Peek at one parsed instance to see which fields are available.
testing_instances[0]

{'question': 'Forma fibras extracelulares con gran resistencia a la tensión:',
 'answer': 'Fibronectina.',
 'tok_qtext': ['Forma',
  'fibras',
  'extracelulares',
  'con',
  'gran',
  'resistencia',
  'a',
  'la',
  'tensión',
  ':'],
 'tok_atext': ['Fibronectina', '.'],
 'label': 0,
 'category': 'biology'}

In [10]:
def encode_ir(samples, tokenizer, seq_len=30):
    """Encode question/answer pairs into fixed-length id arrays and attention masks.

    Each text is tokenized with ``tokenizer.encode(..., add_special_tokens=True)``,
    truncated to its first ``seq_len`` ids and padded on the RIGHT. Right padding
    keeps the first token ([CLS] for BERT tokenizers) at index 0, which is where
    BERT's pooled output is read from, and keeps absolute positions aligned with
    the ones seen in pre-training.

    Padding uses ``tokenizer.pad_token_id`` (0 if the tokenizer does not define
    one). The attention mask is built from the number of real tokens rather than
    from the id values, so a genuine token whose id happens to be 0 is not masked.

    Args:
        samples: iterable of dicts with ``'question'``, ``'answer'`` and ``'label'`` keys.
        tokenizer: object exposing ``encode(text, add_special_tokens=True)``.
        seq_len: fixed length of every encoded sequence (default 30).

    Returns:
        ``(input_ids_0, attention_masks_0, input_ids_1, attention_masks_1, labels)``.
        The id lists hold ``np.int32`` arrays of shape ``(seq_len,)``, the mask
        lists hold lists of 0/1 ints, and ``labels`` holds each sample's label.
        Index 0 refers to the question and index 1 to the answer.
    """
    pad_id = getattr(tokenizer, 'pad_token_id', None)
    if pad_id is None:
        pad_id = 0

    def _encode(text):
        # Keep at most seq_len ids; longer inputs lose their tail.
        token_ids = tokenizer.encode(text, add_special_tokens=True)[:seq_len]
        n_real = len(token_ids)
        padded = np.full(seq_len, pad_id, dtype=np.int32)
        padded[:n_real] = token_ids
        mask = [1] * n_real + [0] * (seq_len - n_real)
        return padded, mask

    input_ids_0, attention_masks_0 = [], []
    input_ids_1, attention_masks_1 = [], []
    labels = []
    for item in samples:
        ids_q, mask_q = _encode(item['question'])
        ids_a, mask_a = _encode(item['answer'])
        input_ids_0.append(ids_q)
        attention_masks_0.append(mask_q)
        input_ids_1.append(ids_a)
        attention_masks_1.append(mask_a)
        labels.append(item['label'])

    return input_ids_0, attention_masks_0, input_ids_1, attention_masks_1, labels

In [11]:
# Encode each split; the training arrays come from the oversampled instances.
# Suffix _0 is the question side, suffix _1 the answer side.
train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels = encode_ir(oversampled_training, tokenizer)
valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels = encode_ir(validation_instances, tokenizer)
test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels = encode_ir(testing_instances, tokenizer)

In [12]:
# Turn every encoded list into a tensor, rebinding the same variable names.
train_inputs_0, valid_inputs_0, test_inputs_0 = map(
    torch.tensor, (train_inputs_0, valid_inputs_0, test_inputs_0))

train_masks_0, valid_masks_0, test_masks_0 = map(
    torch.tensor, (train_masks_0, valid_masks_0, test_masks_0))

train_inputs_1, valid_inputs_1, test_inputs_1 = map(
    torch.tensor, (train_inputs_1, valid_inputs_1, test_inputs_1))

train_masks_1, valid_masks_1, test_masks_1 = map(
    torch.tensor, (train_masks_1, valid_masks_1, test_masks_1))

train_labels, valid_labels, test_labels = map(
    torch.tensor, (train_labels, valid_labels, test_labels))

In [13]:
batch_size = 8

# Bundle the five tensors of each split (question ids/mask, answer ids/mask, labels).
train_data = TensorDataset(train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels)
valid_data = TensorDataset(valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels)
test_data = TensorDataset(test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels)

# Training batches are sampled randomly; validation and test use a sequential sampler.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

# One DataLoader per split, all with the same batch size.
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)
valid_dataloader = DataLoader(dataset=valid_data, batch_size=batch_size, sampler=valid_sampler)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size, sampler=test_sampler)

In [14]:
import time
import copy
import spacy
import pickle
import collections
from tqdm import tqdm_notebook, trange
from collections import Counter

In [None]:
# model = BertForSequenceClassification.from_pretrained(BASE_BERT, num_labels=2, output_attentions=False, 
#                                                       output_hidden_states=False)

In [17]:
class BERTSimilarity(torch.nn.Module):
    """Shared BERT encoder applied to two inputs (e.g. a question and an answer).

    ``forward`` runs the same ``BertModel`` over both inputs and returns the two
    raw encoder outputs. The ``dropout``, ``classifier`` and ``cosine`` layers are
    created here but are not used by ``forward``; callers that want a score have
    to combine the two outputs themselves.
    """
    def __init__(self, pretrained_model = 'bert-base-uncased'):
        """Load the encoder from ``pretrained_model`` and build the extra layers."""
        super(BERTSimilarity, self).__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        config = self.bert.config
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, 1)
        nn.init.xavier_normal_(self.classifier.weight)
        self.cosine = nn.CosineSimilarity(dim=1)

    def forward(self, input_ids_0, attention_mask_0, input_ids_1, attention_mask_1, labels=None, output_hidden_states=True):
        """Encode both inputs with the shared encoder.

        Args:
            input_ids_0, attention_mask_0: ids and mask of the first input.
            input_ids_1, attention_mask_1: ids and mask of the second input.
            labels: accepted for call compatibility; not used.
            output_hidden_states: forwarded to the encoder for both inputs.

        Returns:
            ``(outputs_0, outputs_1)``: whatever ``self.bert`` returns for the
            first and the second input respectively.
        """
        outputs_0 = self.bert(input_ids=input_ids_0, attention_mask=attention_mask_0, output_hidden_states=output_hidden_states)
        outputs_1 = self.bert(input_ids=input_ids_1, attention_mask=attention_mask_1, output_hidden_states=output_hidden_states)
        return outputs_0, outputs_1

    def freeze_bert_encoder(self):
        """Stop gradient updates for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Re-enable gradient updates for every encoder parameter."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [18]:
# Build the two-input model on top of the Spanish checkpoint named in BASE_BERT.
model = BERTSimilarity(pretrained_model=BASE_BERT)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [None]:
# Smoke test: push a single training batch through the model and report shapes.
# The batch layout follows the TensorDataset built above:
# (question ids, question mask, answer ids, answer mask, labels).
for batch in train_dataloader:
    ids_0, mask_0, ids_1, mask_1, batch_labels = batch
    with torch.no_grad():
        out_0, out_1 = model(ids_0.long(), mask_0, ids_1.long(), mask_1)
    # Element 1 of each encoder output is taken to be the pooled output.
    print(out_0[1].shape, out_1[1].shape, batch_labels.shape)
    break

In [None]:
# List a few named parameters of the model with their shapes.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# NOTE(review): the slice boundaries assume a particular parameter ordering;
# confirm that the headings still describe what each slice contains for this model.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)
for heading, group in sections:
    print(heading)
    for p in group:
        name, tensor = p[0], p[1]
        print("{:<55} {:>12}".format(name, str(tuple(tensor.size()))))

In [None]:
from transformers import get_linear_schedule_with_warmup

# AdamW over every parameter of `model`. `train_model` reads `optimizer` and
# `scheduler` as module-level names, so this cell has to run before training.
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )

# The schedule length is derived from `epochs`; the value passed to `train_model`
# should match it, otherwise the schedule and the training loop disagree on length.
epochs = 4
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [None]:
def flat_accuracy(preds, labels):
    """Fraction of rows whose highest-scoring column equals the label.

    Args:
        preds: 2-D array of scores, one row per example.
        labels: array of integer labels; flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    expected = labels.ravel()
    chosen = np.argmax(preds, axis=1).ravel()
    n_hits = np.sum(chosen == expected)
    return n_hits / len(expected)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as text, e.g. 3661.4 -> "1:01:01".

    The value is rounded to the nearest whole second and formatted by
    `datetime.timedelta`.
    '''
    whole_seconds = int(round(elapsed))
    delta = datetime.timedelta(seconds=whole_seconds)
    return str(delta)

In [None]:
def train_model(model, train_dataloader, valid_dataloader, epochs):
    """Train ``model`` for ``epochs`` passes and validate after each one.

    Args:
        model: called as ``model(input_ids, token_type_ids=None,
            attention_mask=..., labels=...)``; element 0 of its result is used
            as the loss.
        train_dataloader: iterable of batches; elements 0, 1 and 2 of each batch
            are read as input ids, attention mask and labels.
        valid_dataloader: passed unchanged to ``valid_model``.
        epochs: number of passes over ``train_dataloader``.

    Returns:
        A list with one ``[avg_train_loss, valid_acc, p, r, f1]`` entry per epoch
        (``p``, ``r``, ``f1`` come from ``evaluate``; presumably precision, recall
        and F1 -- confirm against the ``training`` module).

    NOTE(review): the loop steps the module-level ``optimizer`` and ``scheduler``;
    they are not parameters and must exist before this function is called.
    NOTE(review): this notebook builds batches of five tensors (question ids/mask,
    answer ids/mask, labels), and ``BERTSimilarity.forward`` neither accepts
    ``token_type_ids`` nor returns a loss. With that dataloader ``batch[2]`` is
    the answer ids rather than the labels, and the model call below raises a
    TypeError. The loop matches a single-input classification model that returns
    its loss first; a loss for the two-input model still has to be chosen.
    """
    import random
    # Fix every random source so that runs are repeatable.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    epochs_results = []

    for epoch_i in range(0, epochs):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()
        total_loss = 0
        model.train()
        for step, batch in enumerate(train_dataloader):
            # Progress line every 40 batches (skipping the very first one).
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            b_input_ids = batch[0].long()
            b_input_mask = batch[1]
            b_labels = batch[2]
            # Clear gradients left over from the previous step.
            model.zero_grad()
            outputs = model(b_input_ids, 
                        token_type_ids=None, 
                        attention_mask=b_input_mask, 
                        labels=b_labels)
            loss = outputs[0]
            total_loss += loss.item()
            loss.backward()
            # Clip the gradient norm to 1.0 before the optimizer update.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
        avg_train_loss = total_loss / len(train_dataloader)  
        loss_values.append(avg_train_loss)
        # Score the model on the validation data after each epoch.
        valid_acc, y_real, y_pred = valid_model(model, valid_dataloader)
        p, r, f1 = evaluate(y_real, y_pred)
        epochs_results.append([avg_train_loss, valid_acc, p, r, f1])

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        # The reported duration also covers the validation pass above.
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))
    print("")
    print("Training complete!")
    return epochs_results

In [None]:
def valid_model(model, validation_dataloader):
    """Score ``model`` on ``validation_dataloader`` and collect its predictions.

    Accuracy is computed over all examples (correct predictions divided by the
    number of labels), so a smaller final batch does not get extra weight. An
    empty dataloader yields an accuracy of 0.0 instead of a division by zero.

    Args:
        model: called as ``model(input_ids, token_type_ids=None, attention_mask=...)``;
            element 0 of its result is read as per-class scores of shape
            (batch, classes).
        validation_dataloader: iterable of ``(input_ids, attention_mask, labels)``
            batches.

    Returns:
        ``(accuracy, y_true, y_pred)`` where ``y_true`` and ``y_pred`` are lists
        holding one tensor per batch.

    NOTE(review): this notebook's dataloaders yield five tensors per batch and
    ``BERTSimilarity.forward`` takes no ``token_type_ids``; this function expects
    a single-input classification model and three-tensor batches.
    """
    print("Running Validation...")
    y_true = []
    y_pred = []

    t0 = time.time()
    model.eval()

    n_correct, n_examples = 0, 0

    for batch in validation_dataloader:
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.long()
        with torch.no_grad():
            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask)
        logits = outputs[0]
        # Index of the highest score in each row.
        pred = torch.max(logits, dim=1)[1]
        # Count hits with torch ops so this also works for tensors that are not on the CPU.
        n_correct += (pred == b_labels.reshape(-1)).sum().item()
        n_examples += b_labels.numel()
        y_true.append(b_labels)
        y_pred.append(pred)

    accuracy = n_correct / n_examples if n_examples else 0.0
    print("  Accuracy: {0:.2f}".format(accuracy))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
    return accuracy, y_true, y_pred

In [None]:
def test_model(model, test_dataloader):
    model.eval()
    predictions , true_labels = [], []

    for batch in test_dataloader:
        #batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        b_input_ids = b_input_ids.long()
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None, 
                          attention_mask=b_input_mask)
        logits = outputs[0]
        # Move logits and labels to CPU
        #logits = logits.detach().cpu().numpy()
        #label_ids = b_labels.to('cpu').numpy()
        # Store predictions and true labels
        pred = torch.max(logits, dim=1)[1]
        predictions.append(pred)
        true_labels.append(b_labels)
    print('    DONE.')
    return true_labels, predictions

In [None]:
# Train for the same number of epochs that was used to size the learning-rate
# schedule (`total_steps = len(train_dataloader) * epochs`). A different value here
# would leave the linear decay unfinished or run past its end.
epochs_results = train_model(model, train_dataloader, valid_dataloader, epochs=epochs)