In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension; mode 2 reloads imported modules before each cell runs,
# so edits to the local helper modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Question category used by every `filter_by_category` call and embedded in the
# filenames of the saved models in this notebook.
CATEGORY = 'biology'

In [3]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset and bind each split to its own name.
# NOTE(review): the name `training` rebinds the `training` module imported in the first
# cell. The functions imported from it remain usable, but `training.<attr>` no longer
# refers to the module once this cell has run.
data_es = load_dataset('head_qa', 'es')
training = data_es['train']
validation = data_es['validation']
testing = data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [28]:
# Load the pre-parsed instance lists for the three splits, in train/validation/test order.
training_instances, validation_instances, testing_instances = map(
    load_dataset_from_pickle,
    ('../data/training.pickle', '../data/validation.pickle', '../data/testing.pickle'),
)

# Separately pickled oversampled variant of the training instances.
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [37]:
# Restrict the pickled instance lists to CATEGORY; training uses the oversampled set.
training_categ, validation_categ, testing_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (oversampled_training, validation_instances, testing_instances)
)

# Apply the same category filter to the raw validation/test splits loaded with
# `load_dataset`; these are the inputs later handed to `evaluate`.
dev_categ, test_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (validation, testing)
)

In [30]:
# Build the vectorizer from the category-filtered training instances and expose
# its two vocabularies under short names for the cells below.
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [31]:
# Wrap each category-filtered split in a HeadQA dataset sharing the same vectorizer
# and the same padding/length settings.
trainset, validset, testset = (
    HeadQA(instances=split, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split in (training_categ, validation_categ, testing_categ)
)

In [32]:
batch_size = 32

# One loader per split. `drop_last=True` discards the final incomplete batch, so up to
# batch_size - 1 samples per split are never seen (including in validation/test).
# NOTE(review): no `shuffle=True` is passed, so the training loader iterates in dataset
# order on every epoch - confirm this is intended, given the oversampled training set.
train_dt, valid_dt, test_dt = (
    DataLoader(split, batch_size=batch_size, drop_last=True)
    for split in (trainset, validset, testset)
)

#### Logistic Regressor

In [64]:
# Logistic regressor constructed from the dataset's max_length and the constant 1.
logistic_regressor = LogisticRegression(
    trainset.max_length,
    1,
)
# `wd` is presumably weight decay - confirm against `training.get_optimizer`.
optimizer = get_optimizer(logistic_regressor, lr=0.01, wd=1e-5)

In [65]:
# Train the logistic regressor for 30 epochs, passing the validation loader and `validate`.
training_results = train(
    logistic_regressor,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=30,
)

Epoch 0 train loss  47.6459 valid loss 0.974 and accuracy 0.7344
Epoch 1 train loss  48.1597 valid loss 1.036 and accuracy 0.7333
Epoch 2 train loss  48.6966 valid loss 1.036 and accuracy 0.7377
Epoch 3 train loss  48.7087 valid loss 1.036 and accuracy 0.7377
Epoch 4 train loss  48.7415 valid loss 1.036 and accuracy 0.7377
Epoch 5 train loss  48.7144 valid loss 1.036 and accuracy 0.7400
Epoch 6 train loss  48.7127 valid loss 1.036 and accuracy 0.7377
Epoch 7 train loss  48.7169 valid loss 1.036 and accuracy 0.7400
Epoch 8 train loss  48.6782 valid loss 1.036 and accuracy 0.7377
Epoch 9 train loss  48.8057 valid loss 1.036 and accuracy 0.7377
Epoch 10 train loss  48.7660 valid loss 1.036 and accuracy 0.7400
Epoch 11 train loss  48.7922 valid loss 1.036 and accuracy 0.7377
Epoch 12 train loss  48.7376 valid loss 1.036 and accuracy 0.7411
Epoch 13 train loss  48.6322 valid loss 1.036 and accuracy 0.7411
Epoch 14 train loss  48.6297 valid loss 1.036 and accuracy 0.7411
Epoch 15 train loss 

In [66]:
# Report accuracy and points on the category-filtered DEV and TEST splits,
# separated by a dashed line.
for split_name, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
    if split_name == 'TEST':
        print('----------')
    acc, points = evaluate(logistic_regressor, split_data, trainset.encode, evaluator)
    print(f'{split_name} Dominio: {CATEGORY}')
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3009]), points: 46
----------
TEST Dominio: biology
accuracy: tensor([0.2357]), points: -26


In [67]:
# Persist only the model's state_dict under <cwd>/trained_models/.
model_path = f'{os.getcwd()}/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [69]:
# Basic LSTM built from the sentence vocabulary size and the dataset's max_length.
lstm = BasicLSTM(
    len(vocab),
    64,
    trainset.max_length,
    1,
    embedding_dim=100,
)
# `wd` is presumably weight decay - confirm against `training.get_optimizer`.
optimizer = get_optimizer(lstm, lr=0.001, wd=1e-5)

In [70]:
# Train the basic LSTM for 50 epochs, passing the validation loader and `validate`.
training_results = train(
    lstm,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=50,
)

Epoch 0 train loss  1.7352 valid loss 0.033 and accuracy 0.7500
Epoch 1 train loss  1.7340 valid loss 0.033 and accuracy 0.7500
Epoch 2 train loss  1.7332 valid loss 0.033 and accuracy 0.7500
Epoch 3 train loss  1.7330 valid loss 0.033 and accuracy 0.7500
Epoch 4 train loss  1.7317 valid loss 0.033 and accuracy 0.7500
Epoch 5 train loss  1.7314 valid loss 0.033 and accuracy 0.7500
Epoch 6 train loss  1.7299 valid loss 0.033 and accuracy 0.7500
Epoch 7 train loss  1.7284 valid loss 0.033 and accuracy 0.7500
Epoch 8 train loss  1.7259 valid loss 0.033 and accuracy 0.7500
Epoch 9 train loss  1.7222 valid loss 0.033 and accuracy 0.7500
Epoch 10 train loss  1.7179 valid loss 0.033 and accuracy 0.7500
Epoch 11 train loss  1.7118 valid loss 0.033 and accuracy 0.7500
Epoch 12 train loss  1.6974 valid loss 0.033 and accuracy 0.7500
Epoch 13 train loss  1.6848 valid loss 0.033 and accuracy 0.7500
Epoch 14 train loss  1.6711 valid loss 0.033 and accuracy 0.7500
Epoch 15 train loss  1.6545 valid l

In [71]:
# Report accuracy and points on the category-filtered DEV and TEST splits,
# separated by a dashed line.
for position, (split_name, split_data) in enumerate((('DEV', dev_categ), ('TEST', test_categ))):
    if position > 0:
        print('----------')
    acc, points = evaluate(lstm, split_data, trainset.encode, evaluator)
    print(f'{split_name} Dominio: {CATEGORY}')
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3053]), points: 50
----------
TEST Dominio: biology
accuracy: tensor([0.3150]), points: 118


In [73]:
# Persist only the model's state_dict under <cwd>/trained_models/.
model_path = f'{os.getcwd()}/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [74]:
# Path of the FastText vectors file handed to `make_embedding_matrix`.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"

# Pickled word-to-index mapping and word vectors stored next to the vectors file.
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

# Build the embedding matrix for the words of the sentence vocabulary.
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

In [78]:
# BiLSTM initialised from the pretrained embedding matrix; the first two positional
# arguments are the matrix's second and first dimensions, in that order.
bilstm = BiLSTM_model(
    embedding_matrix.shape[1],
    embedding_matrix.shape[0],
    1,
    pretrained_embeddings=embedding_matrix,
    max_length=trainset.max_length,
)
# `wd` is presumably weight decay - confirm against `training.get_optimizer`.
optimizer = get_optimizer(bilstm, lr=0.01, wd=1e-5)

In [79]:
# Train the BiLSTM for 100 epochs, passing the validation loader and `validate`.
training_results = train(
    bilstm,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=100,
)

Epoch 0 train loss  1.7386 valid loss 0.033 and accuracy 0.7500
Epoch 1 train loss  1.7355 valid loss 0.033 and accuracy 0.7500
Epoch 2 train loss  1.7338 valid loss 0.033 and accuracy 0.7500
Epoch 3 train loss  1.7336 valid loss 0.033 and accuracy 0.7500
Epoch 4 train loss  1.7343 valid loss 0.033 and accuracy 0.7500
Epoch 5 train loss  1.7322 valid loss 0.033 and accuracy 0.7500
Epoch 6 train loss  1.7339 valid loss 0.033 and accuracy 0.7500
Epoch 7 train loss  1.7333 valid loss 0.033 and accuracy 0.7500
Epoch 8 train loss  1.7330 valid loss 0.033 and accuracy 0.7500
Epoch 9 train loss  1.7315 valid loss 0.033 and accuracy 0.7500
Epoch 10 train loss  1.7312 valid loss 0.033 and accuracy 0.7500
Epoch 11 train loss  1.7296 valid loss 0.033 and accuracy 0.7500
Epoch 12 train loss  1.7298 valid loss 0.033 and accuracy 0.7500
Epoch 13 train loss  1.7289 valid loss 0.033 and accuracy 0.7500
Epoch 14 train loss  1.7287 valid loss 0.033 and accuracy 0.7500
Epoch 15 train loss  1.7262 valid l

In [80]:
# Report accuracy and points on the category-filtered DEV and TEST splits,
# separated by a dashed line.
for split_name, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
    if split_name == 'TEST':
        print('----------')
    acc, points = evaluate(bilstm, split_data, trainset.encode, evaluator)
    print(f'{split_name} Dominio: {CATEGORY}')
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3009]), points: 46
----------
TEST Dominio: biology
accuracy: tensor([0.2445]), points: -10


In [81]:
# Persist only the model's state_dict under <cwd>/trained_models/.
model_path = f'{os.getcwd()}/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [4]:
# Load the IR instance lists for the three splits. These rebind the names used
# by the purely supervised section above.
training_instances, validation_instances, testing_instances = map(
    load_dataset_from_pickle,
    ('../data/training_ir.pickle', '../data/validation_ir.pickle', '../data/testing_ir.pickle'),
)
# Separately pickled oversampled variant of the IR training instances.
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [5]:
# Restrict the IR instance lists to CATEGORY; training uses the oversampled set.
# NOTE(review): in the cells visible below, the IR datasets are built from the
# unfiltered `*_instances` lists, so these three names appear unused - confirm.
training_categ, validation_categ, testing_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (oversampled_training, validation_instances, testing_instances)
)

# Category-filtered raw validation/test splits, later handed to `evaluate`.
dev_categ, test_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (validation, testing)
)

In [6]:
# Build the IR vectorizer and expose its two vocabularies under short names.
# NOTE(review): it is built from the full `oversampled_training`, not from the
# category-filtered `training_categ` - confirm this is intended.
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [7]:
# Wrap each split in a HeadQA_IR dataset sharing the same vectorizer and settings.
# NOTE(review): these use the unfiltered, non-oversampled `*_instances` lists rather
# than `training_categ` / `validation_categ` / `testing_categ`, unlike the HeadQA
# datasets of the purely supervised section - confirm this is intended.
trainset, validset, testset = (
    HeadQA_IR(instances=split, vectorizer=vectorizer, right_padding=False, max_length=15)
    for split in (training_instances, validation_instances, testing_instances)
)

In [8]:
batch_size = 32

# One loader per split. `drop_last=True` discards the final incomplete batch.
# NOTE(review): no `shuffle=True` is passed, so the training loader iterates in
# dataset order on every epoch - confirm this is intended.
train_dt, valid_dt, test_dt = (
    DataLoader(split, batch_size=batch_size, drop_last=True)
    for split in (trainset, validset, testset)
)

In [9]:
# Path of the FastText vectors file handed to `make_embedding_matrix`.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"

# Pickled IR-specific word-to-index mapping and word vectors.
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

# Build the embedding matrix for the words of the IR sentence vocabulary.
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

#### LSTM-QA

In [10]:
# LSTM-QA model initialised with the pretrained embedding matrix.
lstm_qa = LSTM_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)
# `wd` is presumably weight decay - confirm against `training.get_optimizer`.
optimizer = get_optimizer(lstm_qa, lr=0.001, wd=1e-5)

Loading pretrained embeddings...




In [11]:
# Train LSTM-QA for 100 epochs with the IR training loop and `validate_ir`.
training_results = train_ir(
    lstm_qa,
    optimizer,
    train_dt,
    valid_dt,
    validate_ir,
    epochs=100,
)

Epoch 0 train loss  0.7191 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7126 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.6871 valid loss 0.005 and accuracy 0.7500
Epoch 3 train loss  0.6478 valid loss 0.006 and accuracy 0.7500
Epoch 4 train loss  0.6068 valid loss 0.006 and accuracy 0.7500
Epoch 5 train loss  0.5797 valid loss 0.006 and accuracy 0.7494
Epoch 6 train loss  0.5524 valid loss 0.006 and accuracy 0.7502
Epoch 7 train loss  0.5298 valid loss 0.006 and accuracy 0.7504
Epoch 8 train loss  0.5217 valid loss 0.006 and accuracy 0.7498
Epoch 9 train loss  0.5087 valid loss 0.006 and accuracy 0.7496
Epoch 10 train loss  0.4903 valid loss 0.005 and accuracy 0.7496
Epoch 11 train loss  0.4891 valid loss 0.006 and accuracy 0.7500
Epoch 12 train loss  0.4853 valid loss 0.005 and accuracy 0.7493
Epoch 13 train loss  0.4785 valid loss 0.006 and accuracy 0.7502
Epoch 14 train loss  0.4729 valid loss 0.006 and accuracy 0.7498
Epoch 15 train loss  0.4658 valid l

In [13]:
# Report accuracy and points on the category-filtered DEV and TEST splits,
# separated by a dashed line.
for position, (split_name, split_data) in enumerate((('DEV', dev_categ), ('TEST', test_categ))):
    if position > 0:
        print('----------')
    acc, points = evaluate(lstm_qa, split_data, trainset.encode, evaluator_ir)
    print(f'{split_name} Dominio: {CATEGORY}')
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2699]), points: 18
----------
TEST Dominio: biology
accuracy: tensor([0.2819]), points: 58


In [14]:
# Persist only the model's state_dict under <cwd>/trained_models/.
model_path = f'{os.getcwd()}/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [15]:
# LSTM-CNN-QA model initialised with the pretrained embedding matrix.
lstm_cnn_qa = LSTM_CNN_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)
# `wd` is presumably weight decay - confirm against `training.get_optimizer`.
optimizer = get_optimizer(lstm_cnn_qa, lr=0.001, wd=1e-5)

Loading pretrained embeddings...


In [16]:
# Train LSTM-CNN-QA for 100 epochs with the IR training loop and `validate_ir`.
training_results = train_ir(
    lstm_cnn_qa,
    optimizer,
    train_dt,
    valid_dt,
    validate_ir,
    epochs=100,
)

Epoch 0 train loss  0.7188 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7115 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.6807 valid loss 0.005 and accuracy 0.7500
Epoch 3 train loss  0.6336 valid loss 0.006 and accuracy 0.7498
Epoch 4 train loss  0.5947 valid loss 0.006 and accuracy 0.7500
Epoch 5 train loss  0.5663 valid loss 0.006 and accuracy 0.7494
Epoch 6 train loss  0.5511 valid loss 0.007 and accuracy 0.7498
Epoch 7 train loss  0.5284 valid loss 0.006 and accuracy 0.7496
Epoch 8 train loss  0.5172 valid loss 0.006 and accuracy 0.7498
Epoch 9 train loss  0.5034 valid loss 0.006 and accuracy 0.7504
Epoch 10 train loss  0.4937 valid loss 0.006 and accuracy 0.7502
Epoch 11 train loss  0.4882 valid loss 0.006 and accuracy 0.7494
Epoch 12 train loss  0.4818 valid loss 0.006 and accuracy 0.7509
Epoch 13 train loss  0.4759 valid loss 0.006 and accuracy 0.7502
Epoch 14 train loss  0.4729 valid loss 0.006 and accuracy 0.7498
Epoch 15 train loss  0.4672 valid l

In [17]:
# Report accuracy and points on the category-filtered DEV and TEST splits,
# separated by a dashed line.
for split_name, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
    if split_name == 'TEST':
        print('----------')
    acc, points = evaluate(lstm_cnn_qa, split_data, trainset.encode, evaluator_ir)
    print(f'{split_name} Dominio: {CATEGORY}')
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3274]), points: 70
----------
TEST Dominio: biology
accuracy: tensor([0.3018]), points: 94


In [18]:
# Persist only the LSTM-CNN-QA model's state_dict under <cwd>/trained_models/.
# The model saved here must be `lstm_cnn_qa`: saving `lstm_qa` under this filename
# would store the wrong network's weights and leave the CNN variant unsaved.
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)