In [1]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import evaluate, train_ir, validate_ir, evaluator_bert_ir, encode_bert_ir, encoder_bert_ir_instance

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM

from unsupervised_models import BERTSimilarity


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
# Checkpoint identifier shared by the tokenizer and the BERT encoder loaded
# later in this notebook (both are created with `from_pretrained(BASE_BERT)`).
BASE_BERT = 'dccuchile/bert-base-spanish-wwm-cased'

In [3]:
from datasets import load_dataset

# Fetch the 'es' configuration of the head_qa dataset; the result is indexed
# by split name ('train', 'validation', 'test') in the following cell.
data_es = load_dataset(
    'head_qa',
    'es',
)

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [4]:
# Bind each split of the loaded dataset to its own name.
training = data_es['train']
validation = data_es['validation']
testing = data_es['test']

In [5]:
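# One-off preprocessing kept for reference. Presumably this is how the pickles
# loaded in the next cell were produced -- TODO confirm.
# NOTE: the parser is imported above as `parse_ir_dataset`.
# training_instances = parse_ir_dataset(training)
# validation_instances = parse_ir_dataset(validation)
# testing_instances = parse_ir_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)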

In [6]:
# Load the four pre-parsed instance lists from disk, in the same order as the
# names on the left-hand side.
# NOTE(review): these are pickle files; only load them from a trusted source.
(
    training_instances,
    validation_instances,
    testing_instances,
    oversampled_training,
) = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '../data/training_ir.pickle',
        '../data/validation_ir.pickle',
        '../data/testing_ir.pickle',
        '../data/oversampled_training_ir.pickle',
    )
)

In [7]:
# Tokenizer matching the BASE_BERT checkpoint. Lower-casing is disabled because
# the checkpoint name indicates a cased model.
tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

In [8]:
# Peek at the first parsed test instance to check the record layout.
testing_instances[0]

{'question': 'Forma fibras extracelulares con gran resistencia a la tensión:',
 'answer': 'Fibronectina.',
 'tok_qtext': ['Forma',
  'fibras',
  'extracelulares',
  'con',
  'gran',
  'resistencia',
  'a',
  'la',
  'tensión',
  ':'],
 'tok_atext': ['Fibronectina', '.'],
 'label': 0,
 'category': 'biology'}

In [9]:
# Encode every split with the shared tokenizer. Each call yields five parallel
# sequences: ids and mask for side 0, ids and mask for side 1, and the labels.
# Training uses the oversampled instances.
(
    train_inputs_0, train_masks_0,
    train_inputs_1, train_masks_1,
    train_labels,
) = encode_bert_ir(oversampled_training, tokenizer)

(
    valid_inputs_0, valid_masks_0,
    valid_inputs_1, valid_masks_1,
    valid_labels,
) = encode_bert_ir(validation_instances, tokenizer)

(
    test_inputs_0, test_masks_0,
    test_inputs_1, test_masks_1,
    test_labels,
) = encode_bert_ir(testing_instances, tokenizer)

In [10]:
# Convert every encoded sequence to a torch tensor, one group (train, valid,
# test) per field. Each name is rebound to its tensor version.
train_inputs_0, valid_inputs_0, test_inputs_0 = map(
    torch.tensor, (train_inputs_0, valid_inputs_0, test_inputs_0)
)

train_masks_0, valid_masks_0, test_masks_0 = map(
    torch.tensor, (train_masks_0, valid_masks_0, test_masks_0)
)

train_inputs_1, valid_inputs_1, test_inputs_1 = map(
    torch.tensor, (train_inputs_1, valid_inputs_1, test_inputs_1)
)

train_masks_1, valid_masks_1, test_masks_1 = map(
    torch.tensor, (train_masks_1, valid_masks_1, test_masks_1)
)

train_labels, valid_labels, test_labels = map(
    torch.tensor, (train_labels, valid_labels, test_labels)
)

In [11]:
batch_size = 8


def _make_loader(sampler_cls, *tensors):
    """Wrap parallel tensors in a TensorDataset and a batched DataLoader.

    Returns the dataset, the sampler and the loader so that all three remain
    available as notebook variables.
    """
    dataset = TensorDataset(*tensors)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training set: batches are drawn in random order.
train_data, train_sampler, train_dataloader = _make_loader(
    RandomSampler,
    train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels,
)

# Validation set: batches are read sequentially.
valid_data, valid_sampler, valid_dataloader = _make_loader(
    SequentialSampler,
    valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels,
)

# Test set: batches are read sequentially.
test_data, test_sampler, test_dataloader = _make_loader(
    SequentialSampler,
    test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels,
)

In [12]:
import time
import copy
import spacy
import pickle
import collections
from tqdm import tqdm_notebook, trange
from collections import Counter

In [13]:
class BERTSimilarity(torch.nn.Module):
    """Score question/answer pairs by cosine similarity of their BERT encodings.

    Both sequences of a pair are passed through the same ``BertModel``. The
    last hidden state of every token position is flattened into one vector per
    example, and the cosine similarity between the question vector and the
    answer vector is returned. No trainable layer is added on top of BERT.

    NOTE(review): this definition shadows the ``BERTSimilarity`` imported from
    ``unsupervised_models`` earlier in the notebook -- confirm which one is
    intended.
    """

    def __init__(self, pretrained_model='bert-base-uncased'):
        """Load the pretrained encoder.

        Args:
            pretrained_model: Identifier passed to ``BertModel.from_pretrained``.
        """
        super().__init__()
        self.bert = BertModel.from_pretrained(pretrained_model)
        self.config = self.bert.config
        self.cosine = nn.CosineSimilarity(dim=1)

    def forward(self, input_ids_0, attention_mask_0, input_ids_1, attention_mask_1, labels=None, output_hidden_states=True):
        """Return one cosine similarity per example in the batch.

        Args:
            input_ids_0: Token ids of the first sequence (the question).
            attention_mask_0: Attention mask for ``input_ids_0``.
            input_ids_1: Token ids of the second sequence (the answer).
            attention_mask_1: Attention mask for ``input_ids_1``.
            labels: Accepted for interface compatibility; not used.
            output_hidden_states: Forwarded to the BERT encoder.

        Returns:
            A 1-D tensor with one similarity value per batch row.

        Both sequences must be padded to the same length, otherwise the
        flattened vectors differ in size and cannot be compared. The attention
        masks are only passed to the encoder; hidden states at padded positions
        are still part of the flattened vectors.
        """
        question_outputs = self.bert(input_ids_0, attention_mask=attention_mask_0, output_hidden_states=output_hidden_states)
        answer_outputs = self.bert(input_ids_1, attention_mask=attention_mask_1, output_hidden_states=output_hidden_states)

        # Element 0 of the encoder output is the last hidden state.
        question_hidden = question_outputs[0]
        answer_hidden = answer_outputs[0]

        # Flatten everything after the batch dimension. The sequence length is
        # taken from the tensors themselves instead of assuming 30 tokens, so
        # any padding length works.
        question_vector = question_hidden.flatten(start_dim=1)
        answer_vector = answer_hidden.flatten(start_dim=1)

        return self.cosine(question_vector, answer_vector)

    def freeze_bert_encoder(self):
        """Disable gradient computation for every BERT parameter."""
        for param in self.bert.parameters():
            param.requires_grad = False

    def unfreeze_bert_encoder(self):
        """Enable gradient computation for every BERT parameter."""
        for param in self.bert.parameters():
            param.requires_grad = True

In [14]:
# Build the similarity model on top of the checkpoint named in BASE_BERT.
model = BERTSimilarity(pretrained_model=BASE_BERT)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [15]:
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# Each entry pairs the heading to print with the slice of parameters listed
# under it.
sections = [
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
]

for heading, group in sections:
    print(heading)
    for p in group:
        name, weights = p
        print("{:<55} {:>12}".format(name, str(tuple(weights.size()))))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (31002, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [18]:
# Score the model on the raw validation split; the cell displays the
# resulting (accuracy, points) pair.
acc, points = evaluate(model, validation, encoder_bert_ir_instance, evaluator_bert_ir)
acc, points

(tensor(0.2189), -170)

In [19]:
# Same evaluation on the raw test split. Note that this rebinds `acc` and
# `points`, replacing the validation results from the previous cell.
acc, points = evaluate(model, testing, encoder_bert_ir_instance, evaluator_bert_ir)
acc, points

(tensor(0.2360), -154)