In [None]:
# Mount Google Drive so files stored there are reachable from the Colab VM.
from google.colab import drive
drive.mount('/content/gdrive')

import sys

# Put the project's code directory first on the import path so modules located
# there can be imported. The membership check keeps this cell idempotent:
# re-running it does not pile up duplicate sys.path entries.
if '/content/head-qa-afi/code' not in sys.path:
    sys.path.insert(0, '/content/head-qa-afi/code')

In [None]:
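# One-time setup: uncomment and run once to clone the project repository
# (presumably the source of the code directory added to sys.path above).
# !git clone https://github.com/claudiaqw/head-qa-afi.git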

In [None]:
!pip install transformers
!pip install datasets
!python -m spacy download es_core_news_sm
!pip install pickle5
# !python -m spacy download es_core_news_lg

In [None]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pickle5 as pickle

import copy
import spacy
import pickle5 as pickle
import collections
from tqdm import tqdm_notebook, trange


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM
from transformers import get_linear_schedule_with_warmup

import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir
from training import flat_accuracy, format_time, train_model, valid_model

from ir_models import BERT_QA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and use mode 2, so imported modules are reloaded
# before each cell runs (edits to project .py files apply without a kernel restart).
%load_ext autoreload
%autoreload 2

In [None]:
def load_dataset_from_pickle(filename):
    """Deserialize and return the object stored in the pickle file `filename`.

    The file is opened in binary mode and is closed before the function returns.

    NOTE: unpickling can execute arbitrary code; only load files you trust.
    """
    with open(filename, 'rb') as pickle_file:
        loaded = pickle.load(pickle_file)
    return loaded

def filter_by_category(dataset, category):
    """Return a list of the instances in `dataset` whose 'category' equals `category`.

    `dataset` is iterated once and each instance is indexed with the key
    'category'. Input order is preserved; the result is always a new list.
    """
    return [item for item in dataset if item['category'] == category]

In [None]:
# Value of the 'category' field used to filter every split in this notebook.
CATEGORY = "biology"
# Pretrained checkpoint name handed to both the tokenizer and the model.
BASE_BERT = "dccuchile/bert-base-spanish-wwm-cased"

In [None]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset and pull out its three splits.
data_es = load_dataset('head_qa', 'es')

# NOTE(review): `training` rebinds the name of the `training` module imported
# earlier in the notebook; the functions imported from it by name are unaffected.
training = data_es['train']
validation = data_es['validation']
testing = data_es['test']

In [None]:
# Load the pickled instance sets for the three splits, in train/validation/test order.
training_instances, validation_instances, testing_instances = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '/content/head-qa-afi/data/training_ir.pickle',
        '/content/head-qa-afi/data/validation_ir.pickle',
        '/content/head-qa-afi/data/testing_ir.pickle',
    )
)

# Oversampled variant of the training set; this is the one filtered for training below.
oversampled_training = load_dataset_from_pickle(
    '/content/head-qa-afi/data/oversampled_training_ir.pickle'
)

In [None]:
# Restrict the pickled instance sets to CATEGORY (training uses the oversampled set).
training_categ = filter_by_category(oversampled_training, CATEGORY)
validation_categ = filter_by_category(validation_instances, CATEGORY)
testing_categ = filter_by_category(testing_instances, CATEGORY)

# Same restriction applied to the raw HEAD-QA validation and test splits.
dev_categ = filter_by_category(validation, CATEGORY)
test_categ = filter_by_category(testing, CATEGORY)

In [None]:
# Tokenizer matching the pretrained checkpoint; lower-casing is disabled
# (the checkpoint name indicates a cased model).
tokenizer = BertTokenizer.from_pretrained(
    BASE_BERT,
    do_lower_case=False,
)

In [None]:
# Encode each category-filtered split. encoder_bert_ir returns five values per split,
# unpacked here as two (inputs, masks) pairs plus the labels.
(train_inputs_0, train_masks_0,
 train_inputs_1, train_masks_1,
 train_labels) = encoder_bert_ir(training_categ, tokenizer)

(valid_inputs_0, valid_masks_0,
 valid_inputs_1, valid_masks_1,
 valid_labels) = encoder_bert_ir(validation_categ, tokenizer)

(test_inputs_0, test_masks_0,
 test_inputs_1, test_masks_1,
 test_labels) = encoder_bert_ir(testing_categ, tokenizer)

In [None]:
# Convert every encoded field to a torch tensor, one split at a time.
# Each name is rebound to its tensor version.
train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels = (
    torch.tensor(field)
    for field in (train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels)
)

valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels = (
    torch.tensor(field)
    for field in (valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels)
)

test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels = (
    torch.tensor(field)
    for field in (test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels)
)

In [None]:
batch_size = 8

# Wrap each split's five tensors in a TensorDataset.
train_data = TensorDataset(train_inputs_0, train_masks_0, train_inputs_1, train_masks_1, train_labels)
valid_data = TensorDataset(valid_inputs_0, valid_masks_0, valid_inputs_1, valid_masks_1, valid_labels)
test_data = TensorDataset(test_inputs_0, test_masks_0, test_inputs_1, test_masks_1, test_labels)

# Training batches are drawn in random order; validation and test keep dataset order.
train_sampler = RandomSampler(train_data)
valid_sampler = SequentialSampler(valid_data)
test_sampler = SequentialSampler(test_data)

# One DataLoader per split, all sharing the same batch size.
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)
valid_dataloader = DataLoader(valid_data, batch_size=batch_size, sampler=valid_sampler)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:
# Instantiate the project's BERT_QA model (from ir_models) on top of the same
# pretrained checkpoint used for the tokenizer.
# NOTE(review): no explicit device placement happens here — confirm train_model handles it.
model = BERT_QA(pretrained_model=BASE_BERT)

In [None]:
# Print the name and shape of selected groups of the model's named parameters.
params = list(model.named_parameters())

print('The BERT model has {:} different named parameters.\n'.format(len(params)))

# (header, parameter slice) pairs, printed in order.
# NOTE(review): the slice boundaries and labels look like they assume a plain BERT
# parameter layout — confirm they line up with BERT_QA's parameters.
sections = (
    ('==== Embedding Layer ====\n', params[0:5]),
    ('\n==== First Transformer ====\n', params[5:21]),
    ('\n==== Output Layer ====\n', params[-4:]),
)

for header, section_params in sections:
    print(header)
    for param_name, param_tensor in section_params:
        shape_text = str(tuple(param_tensor.size()))
        print("{:<55} {:>12}".format(param_name, shape_text))

In [None]:
# Schedule length: one scheduler step per training batch, for every epoch.
epochs = 4
total_steps = len(train_dataloader) * epochs

# AdamW over all model parameters.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5,   # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8,  # args.adam_epsilon  - default is 1e-8.
)

# Linear learning-rate schedule with no warmup phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps,
)

In [None]:
# Fine-tune the model. All arguments are passed positionally, in the order the call
# already listed them: the previous `epochs=4` keyword followed by positional
# arguments is a SyntaxError, so the cell could not run at all.
# `epochs` is the same variable used to size the scheduler (total_steps), which keeps
# the number of training steps and the schedule length in sync.
# NOTE(review): assumes train_model's fourth positional parameter is the epoch
# count — confirm against training.train_model.
epochs_results = train_model(model, train_dataloader, valid_dataloader, epochs, scheduler, optimizer, valid_model)

In [None]:
# Report accuracy and points for CATEGORY on the dev and test splits.
# The category-filtered splits (dev_categ / test_categ) are evaluated so the numbers
# match the 'Dominio: {CATEGORY}' label; the full `validation` / `testing` splits were
# being scored here before, while the filtered ones were never used.
# NOTE(review): the model was trained on encoder_bert_ir output, yet evaluation uses
# encoder_bert / evaluator_bert — confirm whether the *_ir variants are intended.
acc, points = evaluate(model, dev_categ, encoder_bert, evaluator_bert)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')

acc, points = evaluate(model, test_categ, encoder_bert, evaluator_bert)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')