In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
CATEGORY = 'biology'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [4]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [5]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [6]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  49.9159 valid loss 0.806 and accuracy 0.7288
Epoch 1 train loss  48.5355 valid loss 0.921 and accuracy 0.7377
Epoch 2 train loss  48.7181 valid loss 0.921 and accuracy 0.7366
Epoch 3 train loss  48.6935 valid loss 0.921 and accuracy 0.7366
Epoch 4 train loss  48.6484 valid loss 0.921 and accuracy 0.7355
Epoch 5 train loss  48.5818 valid loss 0.921 and accuracy 0.7377
Epoch 6 train loss  24.1746 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  50.4228 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  50.4230 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2566]), points: 6
----------
TEST Dominio: biology
accuracy: tensor([0.2467]), points: -6


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6168 valid loss 0.041 and accuracy 0.2500
Epoch 1 train loss  0.7854 valid loss 0.027 and accuracy 0.2500
Epoch 2 train loss  0.7089 valid loss 0.026 and accuracy 0.2500
Epoch 3 train loss  0.7011 valid loss 0.026 and accuracy 0.2500
Epoch 4 train loss  0.6954 valid loss 0.026 and accuracy 0.2489
Epoch 5 train loss  0.6871 valid loss 0.027 and accuracy 0.2511
Epoch 6 train loss  0.6716 valid loss 0.027 and accuracy 0.2511
Epoch 7 train loss  0.6325 valid loss 0.030 and accuracy 0.2623
Epoch 8 train loss  0.5893 valid loss 0.032 and accuracy 0.3170
Epoch 9 train loss  0.5181 valid loss 0.031 and accuracy 0.3638
Epoch 10 train loss  0.4560 valid loss 0.029 and accuracy 0.5502
Epoch 11 train loss  0.3794 valid loss 0.029 and accuracy 0.5993
Epoch 12 train loss  0.3390 valid loss 0.030 and accuracy 0.6350
Epoch 13 train loss  0.3025 valid loss 0.031 and accuracy 0.6328
Epoch 14 train loss  0.2782 valid loss 0.036 and accuracy 0.5971
Epoch 15 train loss  0.2568 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2965]), points: 42
----------
TEST Dominio: biology
accuracy: tensor([0.2974]), points: 86


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [17]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.3888 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  50.4464 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2257]), points: -22
----------
TEST Dominio: biology
accuracy: tensor([0.2247]), points: -46


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [22]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [23]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [24]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [25]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [26]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [27]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [30]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [31]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5016 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4944 valid loss 0.003 and accuracy 0.7498
Epoch 2 train loss  0.4683 valid loss 0.003 and accuracy 0.7265
Epoch 3 train loss  0.4232 valid loss 0.004 and accuracy 0.6739
Epoch 4 train loss  0.3587 valid loss 0.004 and accuracy 0.6822
Epoch 5 train loss  0.2965 valid loss 0.005 and accuracy 0.6520
Epoch 6 train loss  0.2583 valid loss 0.006 and accuracy 0.6820
Epoch 7 train loss  0.1954 valid loss 0.008 and accuracy 0.6864
Epoch 8 train loss  0.1719 valid loss 0.007 and accuracy 0.6879
Epoch 9 train loss  0.1450 valid loss 0.007 and accuracy 0.6746
Epoch 10 train loss  0.1233 valid loss 0.008 and accuracy 0.6842
Epoch 11 train loss  0.0967 valid loss 0.006 and accuracy 0.6746
Epoch 12 train loss  0.0820 valid loss 0.007 and accuracy 0.6572
Epoch 13 train loss  0.0828 valid loss 0.007 and accuracy 0.6465
Epoch 14 train loss  0.0711 valid loss 0.006 and accuracy 0.6631
Epoch 15 train loss  0.0591 valid l

In [32]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3009]), points: 46
----------
TEST Dominio: biology
accuracy: tensor([0.2952]), points: 82


In [33]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [34]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [35]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5018 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4938 valid loss 0.004 and accuracy 0.7502
Epoch 2 train loss  0.4670 valid loss 0.004 and accuracy 0.7401
Epoch 3 train loss  0.4160 valid loss 0.005 and accuracy 0.7145
Epoch 4 train loss  0.3632 valid loss 0.006 and accuracy 0.6610
Epoch 5 train loss  0.3058 valid loss 0.007 and accuracy 0.6803
Epoch 6 train loss  0.2571 valid loss 0.006 and accuracy 0.6537
Epoch 7 train loss  0.2124 valid loss 0.008 and accuracy 0.7094
Epoch 8 train loss  0.1791 valid loss 0.011 and accuracy 0.7145
Epoch 9 train loss  0.1458 valid loss 0.011 and accuracy 0.6888
Epoch 10 train loss  0.1334 valid loss 0.011 and accuracy 0.6518
Epoch 11 train loss  0.1155 valid loss 0.010 and accuracy 0.6390
Epoch 12 train loss  0.0895 valid loss 0.014 and accuracy 0.6849
Epoch 13 train loss  0.0848 valid loss 0.012 and accuracy 0.6456
Epoch 14 train loss  0.0811 valid loss 0.012 and accuracy 0.6557
Epoch 15 train loss  0.0680 valid l

In [36]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2478]), points: -2
----------
TEST Dominio: biology
accuracy: tensor([0.2885]), points: 70


In [37]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa:{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)