In [4]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle

from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [5]:
CATEGORY = 'medicine'

In [6]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [7]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [8]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [9]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [10]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [11]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [12]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  49.5839 valid loss 0.921 and accuracy 0.7500
Epoch 1 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 2 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 3 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 4 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 5 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 6 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 7 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 8 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 9 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 10 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 11 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 12 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 13 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 14 train loss  49.2188 valid loss 0.921 and accuracy 0.7500
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.7489]), points: 461
----------
TEST Dominio: medicine
accuracy: tensor([0.7559]), points: 937


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6859 valid loss 0.026 and accuracy 0.7500
Epoch 1 train loss  0.7086 valid loss 0.026 and accuracy 0.7500
Epoch 2 train loss  0.7003 valid loss 0.026 and accuracy 0.7500
Epoch 3 train loss  0.6972 valid loss 0.026 and accuracy 0.7500
Epoch 4 train loss  0.6966 valid loss 0.026 and accuracy 0.7500
Epoch 5 train loss  0.6952 valid loss 0.026 and accuracy 0.7500
Epoch 6 train loss  0.6952 valid loss 0.025 and accuracy 0.7500
Epoch 7 train loss  0.6932 valid loss 0.025 and accuracy 0.7500
Epoch 8 train loss  0.6892 valid loss 0.025 and accuracy 0.7444
Epoch 9 train loss  0.6873 valid loss 0.025 and accuracy 0.7444
Epoch 10 train loss  0.6750 valid loss 0.025 and accuracy 0.7333
Epoch 11 train loss  0.6572 valid loss 0.025 and accuracy 0.7121
Epoch 12 train loss  0.6265 valid loss 0.025 and accuracy 0.6931
Epoch 13 train loss  0.6007 valid loss 0.025 and accuracy 0.6920
Epoch 14 train loss  0.5746 valid loss 0.025 and accuracy 0.6942
Epoch 15 train loss  0.4957 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.6017]), points: 325
----------
TEST Dominio: medicine
accuracy: tensor([0.5961]), points: 641


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [14]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [15]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.3882 valid loss 1.175 and accuracy 0.2500
Epoch 1 train loss  5.0803 valid loss 0.995 and accuracy 0.2500
Epoch 2 train loss  3.5764 valid loss 0.635 and accuracy 0.2500
Epoch 3 train loss  1.8625 valid loss 0.361 and accuracy 0.2500
Epoch 4 train loss  4.7184 valid loss 0.415 and accuracy 0.2500
Epoch 5 train loss  1.2986 valid loss 0.271 and accuracy 0.2500
Epoch 6 train loss  0.9460 valid loss 0.322 and accuracy 0.2500
Epoch 7 train loss  0.9120 valid loss 0.283 and accuracy 0.2500
Epoch 8 train loss  1.0054 valid loss 0.098 and accuracy 0.2500
Epoch 9 train loss  0.7150 valid loss 0.385 and accuracy 0.2500
Epoch 10 train loss  1.1712 valid loss 0.310 and accuracy 0.2500
Epoch 11 train loss  1.0082 valid loss 0.349 and accuracy 0.2500
Epoch 12 train loss  1.2337 valid loss 0.123 and accuracy 0.2500
Epoch 13 train loss  0.7399 valid loss 0.295 and accuracy 0.2500
Epoch 14 train loss  0.7538 valid loss 0.390 and accuracy 0.2500
Epoch 15 train loss  1.1738 valid l

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.6320]), points: 353
----------
TEST Dominio: medicine
accuracy: tensor([0.6199]), points: 685


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [18]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [19]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [20]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [21]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [22]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [23]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [24]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5015 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4955 valid loss 0.003 and accuracy 0.7502
Epoch 2 train loss  0.4701 valid loss 0.004 and accuracy 0.7465
Epoch 3 train loss  0.4179 valid loss 0.003 and accuracy 0.6996
Epoch 4 train loss  0.3643 valid loss 0.005 and accuracy 0.7031
Epoch 5 train loss  0.3063 valid loss 0.005 and accuracy 0.6820
Epoch 6 train loss  0.2611 valid loss 0.004 and accuracy 0.6559
Epoch 7 train loss  0.2121 valid loss 0.007 and accuracy 0.6960
Epoch 8 train loss  0.1595 valid loss 0.008 and accuracy 0.7007
Epoch 9 train loss  0.1566 valid loss 0.007 and accuracy 0.6515
Epoch 10 train loss  0.1363 valid loss 0.007 and accuracy 0.6399
Epoch 11 train loss  0.1202 valid loss 0.006 and accuracy 0.6643
Epoch 12 train loss  0.0949 valid loss 0.006 and accuracy 0.6820
Epoch 13 train loss  0.0782 valid loss 0.007 and accuracy 0.6511
Epoch 14 train loss  0.0643 valid loss 0.007 and accuracy 0.6897
Epoch 15 train loss  0.0675 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2511]), points: 1
----------
TEST Dominio: medicine
accuracy: tensor([0.2462]), points: -7


In [31]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [25]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [33]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5017 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4952 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4746 valid loss 0.004 and accuracy 0.7458
Epoch 3 train loss  0.4207 valid loss 0.004 and accuracy 0.7031
Epoch 4 train loss  0.3649 valid loss 0.004 and accuracy 0.7145
Epoch 5 train loss  0.2930 valid loss 0.005 and accuracy 0.6368
Epoch 6 train loss  0.2489 valid loss 0.006 and accuracy 0.7022
Epoch 7 train loss  0.2024 valid loss 0.007 and accuracy 0.7026
Epoch 8 train loss  0.1730 valid loss 0.007 and accuracy 0.7256
Epoch 9 train loss  0.1519 valid loss 0.008 and accuracy 0.7325
Epoch 10 train loss  0.1427 valid loss 0.007 and accuracy 0.7057
Epoch 11 train loss  0.0933 valid loss 0.006 and accuracy 0.6564
Epoch 12 train loss  0.0968 valid loss 0.010 and accuracy 0.7046
Epoch 13 train loss  0.0844 valid loss 0.009 and accuracy 0.7072
Epoch 14 train loss  0.0739 valid loss 0.008 and accuracy 0.7029
Epoch 15 train loss  0.0693 valid l

In [34]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2294]), points: -19
----------
TEST Dominio: medicine
accuracy: tensor([0.2311]), points: -35


In [35]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [16]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}', 
         os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}',         
         os.getcwd() + f'/trained_models/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models/logistic_regressor_medicine




DEV
Accuracy media 0.74891776
Puntos media 461.0
[tensor(0.7489)]
[461]
---------
TEST
Accuracy media 0.75594306
Puntos media 468.5
[tensor(0.7543), tensor(0.7576)]
[468, 469]
---------

DEV
Accuracy media 0.6017316
Puntos media 325.0
[tensor(0.6017)]
[325]
---------
TEST
Accuracy media 0.5960871
Puntos media 320.5
[tensor(0.6078), tensor(0.5844)]
[332, 309]
---------

DEV
Accuracy media 0.63203466
Puntos media 353.0
[tensor(0.6320)]
[353]
---------
TEST
Accuracy media 0.6198873
Puntos media 342.5
[tensor(0.6121), tensor(0.6277)]
[336, 349]
---------



In [26]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}',
         os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.25108224
Puntos media 1.0
[tensor(0.2511)]
[1]
---------
TEST
Accuracy media 0.24619345
Puntos media -3.5
[tensor(0.2586), tensor(0.2338)]
[8, -15]
---------

DEV
Accuracy media 0.22943723
Puntos media -19.0
[tensor(0.2294)]
[-19]
---------
TEST
Accuracy media 0.23113525
Puntos media -17.5
[tensor(0.2155), tensor(0.2468)]
[-32, -3]
---------

