In [4]:
# Mount Google Drive inside the Colab VM at /content/gdrive.
from google.colab import drive
drive.mount('/content/gdrive')

# Put the cloned repository's code directory first on the import path so the
# project modules imported later in the notebook (presumably `training` and
# `ir_models` live there) can be found.
import sys
sys.path.insert(0,'/content/head-qa-afi/code')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [5]:
!rm -r /content/head-qa-afi/
!git clone https://github.com/claudiaqw/head-qa-afi.git

Cloning into 'head-qa-afi'...
remote: Enumerating objects: 603, done.[K
remote: Counting objects: 100% (603/603), done.[K
remote: Compressing objects: 100% (401/401), done.[K
remote: Total 603 (delta 351), reused 440 (delta 197), pack-reused 0[K
Receiving objects: 100% (603/603), 226.49 MiB | 27.29 MiB/s, done.
Resolving deltas: 100% (351/351), done.
Checking out files: 100% (201/201), done.


In [6]:
!pip install transformers
!pip install datasets
!pip install pickle5

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.1 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 45.7 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 55.1 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 76.6 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [7]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import copy
import spacy
import pickle5 as pickle
import collections
from tqdm import tqdm_notebook, trange


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM
from transformers import get_linear_schedule_with_warmup

import training
from training import flat_accuracy, format_time, train_model, valid_model
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from ir_models import BERT_QA


# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Enable IPython's autoreload extension in mode 2.
# NOTE(review): mode 2 is expected to re-import edited modules (such as the
# project's own) before each cell executes -- confirm against the IPython docs.
%load_ext autoreload
%autoreload 2

Downloading:   0%|          | 0.00/242k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/134 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/364 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/648 [00:00<?, ?B/s]

In [8]:
def load_dataset_from_pickle(filename):
    """Deserialize and return the object stored in the pickle file ``filename``.

    The file is opened in binary mode and is closed again before returning.
    Unpickling can execute arbitrary code, so only use this on trusted files.
    """
    with open(filename, mode='rb') as pickle_file:
        dataset = pickle.load(pickle_file)
    return dataset

def filter_by_category(dataset, category):
    """Return a list of the instances in ``dataset`` whose 'category' equals ``category``.

    ``dataset`` is iterated once and the original order is preserved. Every
    instance must support ``instance['category']``.
    """
    return [instance for instance in dataset if instance['category'] == category]

In [9]:
# HEAD-QA category this run is restricted to; it selects the instances kept by
# filter_by_category and is embedded in the names of the saved model/results files.
CATEGORY = 'chemistry'
# Hugging Face checkpoint used for both the tokenizer and the BERT_QA model.
BASE_BERT = 'dccuchile/bert-base-spanish-wwm-cased'

# Seed the random number generators once, up front, so that batch shuffling and
# the initialisation of new model weights are repeatable across Restart & Run All.
# (Best effort: some GPU kernels can still be non-deterministic.)
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

In [10]:
from datasets import load_dataset

# Spanish configuration of the HEAD-QA dataset from the Hugging Face hub.
data_es = load_dataset('head_qa', 'es')
# The train split is bound to `training_split`, not `training`: the notebook
# imports a project module named `training`, and reusing that name here would
# replace the module object in the notebook namespace.
training_split, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Downloading:   0%|          | 0.00/2.28k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading and preparing dataset head_qa/es (download: 1.77 MiB, generated: 2.82 MiB, post-processed: Unknown size, total: 4.59 MiB) to /root/.cache/huggingface/datasets/head_qa/es/1.1.0/d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2...


Downloading:   0%|          | 0.00/1.86M [00:00<?, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset head_qa downloaded and prepared to /root/.cache/huggingface/datasets/head_qa/es/1.1.0/d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2. Subsequent calls will reuse this data.


In [11]:
# IR instances for each split, read from the pickles inside the cloned repository.
training_instances, validation_instances, testing_instances = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '/content/head-qa-afi/data/training_ir.pickle',
        '/content/head-qa-afi/data/validation_ir.pickle',
        '/content/head-qa-afi/data/testing_ir.pickle',
    )
)

# Resampled variants of the training instances (over- and under-sampled).
oversampled_training, undersampled_training = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '/content/head-qa-afi/data/oversampled_training_ir.pickle',
        '/content/head-qa-afi/data/undersampled_training_ir.pickle',
    )
)

In [12]:
# Keep only the CATEGORY instances of the pickled IR splits; training uses the
# undersampled variant.
training_categ, validation_categ, testing_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (undersampled_training, validation_instances, testing_instances)
)

# Same filter applied to the Hugging Face validation/test splits.
dev_categ, test_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (validation, testing)
)

In [14]:
# Tokenizer that matches the BASE_BERT checkpoint. Lower-casing is switched off;
# the checkpoint name says "cased", so the text is presumably meant to keep its casing.
tokenizer = BertTokenizer.from_pretrained(BASE_BERT, do_lower_case=False)

In [15]:
# Encode each category-filtered split with the project's encoder_bert_ir helper.
# Every call returns five sequences: two (inputs, masks) pairs plus the labels.
# NOTE(review): pair 0 and pair 1 presumably correspond to two different text
# segments of an instance -- confirm in training.encoder_bert_ir.
train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels = encoder_bert_ir(training_categ, tokenizer)
valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels = encoder_bert_ir(validation_categ, tokenizer)
test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels = encoder_bert_ir(testing_categ, tokenizer)

In [16]:
# Convert every encoded sequence to a torch tensor, rebinding the same names.
# torch.as_tensor is used instead of torch.tensor so the cell is idempotent:
# on a re-run the values are already tensors and are returned unchanged, whereas
# torch.tensor(<tensor>) would copy them again and emit a UserWarning.
train_inputs_0, valid_inputs_0, test_inputs_0 = (
    torch.as_tensor(encoded) for encoded in (train_inputs_0, valid_inputs_0, test_inputs_0)
)

train_masks_0, valid_masks_0, test_masks_0 = (
    torch.as_tensor(encoded) for encoded in (train_masks_0, valid_masks_0, test_masks_0)
)

train_inputs_1, valid_inputs_1, test_inputs_1 = (
    torch.as_tensor(encoded) for encoded in (train_inputs_1, valid_inputs_1, test_inputs_1)
)

train_masks_1, valid_masks_1, test_masks_1 = (
    torch.as_tensor(encoded) for encoded in (train_masks_1, valid_masks_1, test_masks_1)
)

train_labels, valid_labels, test_labels = (
    torch.as_tensor(encoded) for encoded in (train_labels, valid_labels, test_labels)
)

In [17]:
batch_size = 64


def _build_loader(tensors, sampler_cls):
    """Wrap ``tensors`` in a TensorDataset and return (dataset, sampler, loader).

    The sampler is an instance of ``sampler_cls`` built over the dataset, and the
    loader yields batches of ``batch_size`` items in the order the sampler dictates.
    """
    dataset = TensorDataset(*tensors)
    sampler = sampler_cls(dataset)
    loader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)
    return dataset, sampler, loader


# Training set: batches are drawn in random order.
train_data, train_sampler, train_dataloader = _build_loader(
    (train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels),
    RandomSampler,
)

# Validation set: batches are read in their stored order.
valid_data, valid_sampler, valid_dataloader = _build_loader(
    (valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels),
    SequentialSampler,
)

# Test set: batches are read in their stored order.
test_data, test_sampler, test_dataloader = _build_loader(
    (test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels),
    SequentialSampler,
)

In [18]:
# Instantiate the project's BERT_QA model on top of the BASE_BERT checkpoint
# (downloaded from the Hugging Face hub on first use).
model = BERT_QA(pretrained_model=BASE_BERT)

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at dccuchile/bert-base-spanish-wwm-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.bi

In [19]:
# Sanity check: list the name and shape of a few of the model's parameters.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (heading, slice of the parameter list) for each group that gets printed.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for heading, section_params in sections:
    print(heading)
    for param_name, param in section_params:
        shape_text = str(tuple(param.size()))
        print("{:<55} {:>12}".format(param_name, shape_text))

The BERT model has 203 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (31002, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              (

In [20]:
# Number of passes over the training set, and the resulting total number of
# optimizer steps (one per training batch per epoch) that the schedule must span.
epochs = 3
total_steps = epochs * len(train_dataloader)

# AdamW with learning rate 2e-5 (args.learning_rate; the reference default is
# 5e-5) and epsilon 1e-8 (args.adam_epsilon; same as the reference default).
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Linear learning-rate schedule without warmup (0 warmup steps is the default
# value in run_glue.py).
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [22]:
# Fine-tune the model with the project's train_model helper, passing valid_model
# as the validation routine. The returned value is kept so it can be saved later.
# NOTE(review): epochs_results presumably holds per-epoch metrics -- confirm in
# training.train_model.
epochs_results = train_model(model, train_dataloader, valid_dataloader, valid_model, epochs, scheduler, optimizer)


Training...




Running Validation...
  Accuracy: 34.90
  Validation took: 0:01:34

  Average training loss: 0.69
  Training epoch took: 0:04:13

Training...
Running Validation...
  Accuracy: 32.97
  Validation took: 0:01:34

  Average training loss: 0.68
  Training epoch took: 0:04:12

Training...
Running Validation...
  Accuracy: 32.03
  Validation took: 0:01:34

  Average training loss: 0.68
  Training epoch took: 0:04:12

Training complete!


In [23]:
# Persist the fine-tuned weights (state_dict only) under a category-specific name.
model_path = f'/content/head-qa-afi/code/trained_models/bert_qa_{CATEGORY}_big'
# Make sure the target directory exists: a fresh clone only contains it if the
# repository tracks files in it, and torch.save does not create directories.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)

In [24]:
# Score the fine-tuned model on the category-filtered dev and test questions and
# report how long the whole evaluation took.
t0 = time.time()
model.eval()

# model.eval() only switches the layers to inference behaviour; gradient
# tracking must be disabled separately. no_grad avoids building autograd graphs
# during evaluation and is harmless if evaluate() already does the same.
with torch.no_grad():
    acc, points = evaluate(model, dev_categ, encoder_bert_ir_instance, evaluator_bert_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')

with torch.no_grad():
    acc, points = evaluate(model, test_categ, encoder_bert_ir_instance, evaluator_bert_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print(format_time(time.time() - t0))



DEV Dominio: chemistry
accuracy: tensor([0.2895]), points: 36
----------
TEST Dominio: chemistry
accuracy: tensor([0.2358]), points: -26
0:06:13


In [25]:
# Store the value returned by train_model in a category-specific pickle file,
# using the highest pickle protocol available.
results_path = f'/content/head-qa-afi/data/train_results_bert_qa_{CATEGORY}_sig_big.pickle'
with open(results_path, 'wb') as handle:
    pickle.dump(epochs_results, handle, protocol=pickle.HIGHEST_PROTOCOL)