In [29]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle

from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
CATEGORY = 'medicine'

In [31]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [72]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [73]:
training_categ = filter_by_category(mixed_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [74]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [75]:
vectorizer.label_vocab.vocab2index = {1:1, 0:0}
vectorizer.label_vocab.index2vocab = {0:0, 1:1}

In [76]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [77]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [38]:
torch.random.manual_seed(42)
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [39]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)

Epoch 0 train loss  31.6583 valid loss 2.302 and accuracy 0.3181
Epoch 1 train loss  56.2315 valid loss 2.283 and accuracy 0.2690
Epoch 2 train loss  57.1161 valid loss 2.762 and accuracy 0.2533
Epoch 3 train loss  57.4692 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss 

In [40]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2511]), points: 1
----------
TEST Dominio: medicine
accuracy: tensor([0.2441]), points: -11


In [41]:
model_path = os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [42]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [43]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6786 valid loss 0.025 and accuracy 0.2500
Epoch 1 train loss  0.7070 valid loss 0.025 and accuracy 0.2500
Epoch 2 train loss  0.6992 valid loss 0.025 and accuracy 0.2500
Epoch 3 train loss  0.6947 valid loss 0.024 and accuracy 0.2500
Epoch 4 train loss  0.6899 valid loss 0.024 and accuracy 0.2500
Epoch 5 train loss  0.6880 valid loss 0.024 and accuracy 0.2545
Epoch 6 train loss  0.6851 valid loss 0.024 and accuracy 0.2634
Epoch 7 train loss  0.6805 valid loss 0.024 and accuracy 0.2768
Epoch 8 train loss  0.6745 valid loss 0.024 and accuracy 0.2902
Epoch 9 train loss  0.6691 valid loss 0.024 and accuracy 0.3036
Epoch 10 train loss  0.6583 valid loss 0.024 and accuracy 0.3538
Epoch 11 train loss  0.6398 valid loss 0.024 and accuracy 0.3895
Epoch 12 train loss  0.6112 valid loss 0.024 and accuracy 0.4007
Epoch 13 train loss  0.5772 valid loss 0.024 and accuracy 0.4286
Epoch 14 train loss  0.5486 valid loss 0.025 and accuracy 0.4364
Epoch 15 train loss  0.5069 valid l

In [44]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2771]), points: 25
----------
TEST Dominio: medicine
accuracy: tensor([0.2052]), points: -83


In [45]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [78]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [47]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [48]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.4452 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  57.4495 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [49]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2511]), points: 1
----------
TEST Dominio: medicine
accuracy: tensor([0.2441]), points: -11


In [50]:
model_path = os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [80]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [81]:
training_categ = filter_by_category(mixed_training_ir, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [82]:
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

vectorizer.label_vocab.vocab2index = {1:1, 0:0}
vectorizer.label_vocab.index2vocab = {0:0, 1:1}

In [83]:
trainset = HeadQA_IR(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=15)

In [84]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [85]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [57]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [58]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4378 valid loss 0.209 and accuracy 0.2500
Epoch 1 train loss  0.8067 valid loss 0.112 and accuracy 0.2500
Epoch 2 train loss  0.8706 valid loss 0.030 and accuracy 0.2500
Epoch 3 train loss  0.7028 valid loss 0.032 and accuracy 0.2500
Epoch 4 train loss  0.6776 valid loss 0.034 and accuracy 0.2500
Epoch 5 train loss  0.6911 valid loss 0.028 and accuracy 0.2500
Epoch 6 train loss  0.7019 valid loss 0.026 and accuracy 0.2500
Epoch 7 train loss  0.6948 valid loss 0.027 and accuracy 0.2500
Epoch 8 train loss  0.7041 valid loss 0.026 and accuracy 0.2500
Epoch 9 train loss  0.7091 valid loss 0.026 and accuracy 0.2500
Epoch 10 train loss  0.7020 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.7149 valid loss 0.025 and accuracy 0.2500
Epoch 12 train loss  0.7059 valid loss 0.025 and accuracy 0.2500
Epoch 13 train loss  0.7016 valid loss 0.026 and accuracy 0.2500
Epoch 14 train loss  0.7071 valid loss 0.025 and accuracy 0.2500
Epoch 15 train loss  0.7059 valid l

In [59]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2727]), points: 21
----------
TEST Dominio: medicine
accuracy: tensor([0.2441]), points: -11


In [60]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [61]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [62]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4453 valid loss 0.180 and accuracy 0.2500
Epoch 1 train loss  0.8463 valid loss 0.074 and accuracy 0.2500
Epoch 2 train loss  0.7834 valid loss 0.038 and accuracy 0.2500
Epoch 3 train loss  0.6805 valid loss 0.048 and accuracy 0.2500
Epoch 4 train loss  0.6757 valid loss 0.038 and accuracy 0.2500
Epoch 5 train loss  0.7092 valid loss 0.028 and accuracy 0.2500
Epoch 6 train loss  0.6992 valid loss 0.027 and accuracy 0.2500
Epoch 7 train loss  0.7149 valid loss 0.027 and accuracy 0.2500
Epoch 8 train loss  0.7075 valid loss 0.027 and accuracy 0.2500
Epoch 9 train loss  0.7060 valid loss 0.027 and accuracy 0.2500
Epoch 10 train loss  0.7129 valid loss 0.027 and accuracy 0.2500
Epoch 11 train loss  0.7187 valid loss 0.027 and accuracy 0.2500
Epoch 12 train loss  0.7188 valid loss 0.026 and accuracy 0.2500
Epoch 13 train loss  0.7135 valid loss 0.026 and accuracy 0.2500
Epoch 14 train loss  0.7064 valid loss 0.026 and accuracy 0.2500
Epoch 15 train loss  0.7027 valid l

In [63]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2424]), points: -7
----------
TEST Dominio: medicine
accuracy: tensor([0.2549]), points: 9


In [64]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [79]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}', 
         os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}',         
         os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models_v2/logistic_regressor_medicine
DEV
Accuracy media 0.25108224
Puntos media 1.0
[tensor(0.2511)]
[1]
---------
TEST
Accuracy media 0.24405695
Puntos media -5.5
[tensor(0.2457), tensor(0.2424)]
[-4, -7]
---------

DEV
Accuracy media 0.27705628
Puntos media 25.0
[tensor(0.2771)]
[25]
---------
TEST
Accuracy media 0.20515189
Puntos media -41.5
[tensor(0.2198), tensor(0.1905)]
[-28, -55]
---------

DEV
Accuracy media 0.25108224
Puntos media 1.0
[tensor(0.2511)]
[1]
---------
TEST
Accuracy media 0.24405695
Puntos media -5.5
[tensor(0.2457), tensor(0.2424)]
[-4, -7]
---------



In [86]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.27272728
Puntos media 21.0
[tensor(0.2727)]
[21]
---------
TEST
Accuracy media 0.2440103
Puntos media -5.5
[tensor(0.2672), tensor(0.2208)]
[16, -27]
---------

DEV
Accuracy media 0.24242425
Puntos media -7.0
[tensor(0.2424)]
[-7]
---------
TEST
Accuracy media 0.25490746
Puntos media 4.5
[tensor(0.2328), tensor(0.2771)]
[-16, 25]
---------

