In [73]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import evaluate, evaluate_better, train_ir, validate_ir, evaluator_bert_ir, encoder_bert_ir, encoder_bert_ir_instance

import transformers
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM

from unsupervised_models import BERTSimilarity


%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Name of the pretrained checkpoint; used by both the tokenizer and the BERTSimilarity model.
BASE_BERT = 'dccuchile/bert-base-spanish-wwm-cased'

In [3]:
# NOTE(review): this import lives outside the notebook's main import cell; consider moving it there.
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset (served from the local cache when available).
data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [4]:
# Pull the three splits out of the loaded dataset in one pass.
training, validation, testing = (
    data_es[split_name] for split_name in ('train', 'validation', 'test')
)

In [5]:
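# Kept for reference: parsing the raw splits from scratch. The notebook loads
# pre-parsed validation/test instances from pickle files instead.
# training_instances = parse_ir_dataset(training)
# validation_instances = parse_ir_dataset(validation)
# testing_instances = parse_ir_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)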

In [6]:
# Load the pre-parsed IR instances for the validation and test splits.
# NOTE(review): presumably these files are unpickled by the helper; only load pickles from a trusted source.
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')

In [7]:
# Tokenizer matching the cased checkpoint; do_lower_case=False keeps the input text's casing.
tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

In [8]:
# Sanity check: display the first parsed test instance.
testing_instances[0]

{'question': 'Forma fibras extracelulares con gran resistencia a la tensión:',
 'answer': 'Fibronectina.',
 'tok_qtext': ['Forma',
  'fibras',
  'extracelulares',
  'con',
  'gran',
  'resistencia',
  'a',
  'la',
  'tensión',
  ':'],
 'tok_atext': ['Fibronectina', '.'],
 'label': 0,
 'category': 'biology'}

In [9]:
# Encode each split into two (inputs, masks) pairs plus the labels.
# NOTE(review): presumably the _0 pair is the question side and the _1 pair the
# answer side; confirm against training.encoder_bert_ir.
(
    valid_inputs_0, valid_masks_0,
    valid_inputs_1, valid_masks_1,
    valid_labels,
) = encoder_bert_ir(validation_instances, tokenizer)

(
    test_inputs_0, test_masks_0,
    test_inputs_1, test_masks_1,
    test_labels,
) = encoder_bert_ir(testing_instances, tokenizer)

In [10]:
# Convert every encoded field to a tensor. The variable names are kept because
# the DataLoader cell reads them. torch.as_tensor (unlike torch.tensor) returns
# an existing tensor unchanged, so re-running this cell is a no-op instead of
# copy-constructing each tensor again and emitting a warning.
valid_inputs_0 = torch.as_tensor(valid_inputs_0)
test_inputs_0 = torch.as_tensor(test_inputs_0)

valid_masks_0 = torch.as_tensor(valid_masks_0)
test_masks_0 = torch.as_tensor(test_masks_0)

valid_inputs_1 = torch.as_tensor(valid_inputs_1)
test_inputs_1 = torch.as_tensor(test_inputs_1)

valid_masks_1 = torch.as_tensor(valid_masks_1)
test_masks_1 = torch.as_tensor(test_masks_1)

valid_labels = torch.as_tensor(valid_labels)
test_labels = torch.as_tensor(test_labels)

In [11]:
batch_size = 8


def _sequential_loader(*tensors, batch_size):
    """Bundle aligned tensors into a dataset that is read in order.

    Returns a tuple of (TensorDataset, SequentialSampler, DataLoader), where
    the loader yields `batch_size` rows at a time using that sampler.
    """
    dataset = TensorDataset(*tensors)
    sampler = SequentialSampler(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Validation split: dataset, sampler and loader.
valid_data, valid_sampler, valid_dataloader = _sequential_loader(
    valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels,
    batch_size=batch_size,
)

# Test split: dataset, sampler and loader.
test_data, test_sampler, test_dataloader = _sequential_loader(
    test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels,
    batch_size=batch_size,
)

In [12]:
# Instantiate the project's BERTSimilarity model on top of the pretrained checkpoint.
model = BERTSimilarity(pretrained_model=BASE_BERT)

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [13]:
# Summarise the model's named parameters: total count, then name and shape for
# three slices of the parameter list.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, parameters) pairs, printed in this order.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, group in sections:
    print(header)
    for name, weights in group:
        shape_text = str(tuple(weights.size()))
        print("{:<55} {:>12}".format(name, shape_text))

The BERT model has 199 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (31002, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [14]:
# Score the model on the validation split and display the (acc, points) pair.
# NOTE(review): the scoring rule behind `points` is defined in training.evaluate; check it there.
acc, points = evaluate(model, validation, encoder_bert_ir_instance, evaluator_bert_ir)
acc, points

(tensor(0.2189), -170)

In [15]:
# Same evaluation on the test split; this rebinds acc and points, replacing the validation values.
acc, points = evaluate(model, testing, encoder_bert_ir_instance, evaluator_bert_ir)
acc, points

(tensor(0.2360), -154)

In [74]:
# Validation split via evaluate_better, which returns four values: acc, points, acc_list and points_list.
# NOTE(review): presumably each list holds one entry per exam in the split; confirm in training.evaluate_better.
acc, points, acc_list, points_list = evaluate_better(model, validation, encoder_bert_ir_instance, evaluator_bert_ir)

In [75]:
# Display the validation results; in the saved output acc and points equal the means of the two lists.
acc, points, acc_list, points_list

(0.21881276,
 -28.333333333333332,
 [tensor(0.2257),
  tensor(0.2522),
  tensor(0.2133),
  tensor(0.2165),
  tensor(0.1991),
  tensor(0.2061)],
 [-22, 2, -33, -31, -46, -40])

In [76]:
# Test split via evaluate_better; this rebinds all four names, replacing the validation results.
acc, points, acc_list, points_list = evaluate_better(model, testing, encoder_bert_ir_instance, evaluator_bert_ir)

NameError: name 'points_lis' is not defined

In [77]:
# Display the test results; in the saved output acc and points equal the means of the two lists.
acc, points, acc_list, points_list

(0.23575836,
 -12.833333333333334,
 [tensor(0.2105),
  tensor(0.2332),
  tensor(0.2018),
  tensor(0.2543),
  tensor(0.2522),
  tensor(0.2597),
  tensor(0.2168),
  tensor(0.2586),
  tensor(0.2140),
  tensor(0.2814),
  tensor(0.1911),
  tensor(0.2555)],
 [-36, -15, -44, 4, 2, 9, -30, 8, -33, 29, -53, 5])