In [15]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle

from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
# Category this notebook runs on: passed as `category=` to every
# filter_by_category call and interpolated into the saved model file names.
CATEGORY = 'medicine'

In [17]:
from datasets import load_dataset

# Load the 'es' configuration of the head_qa dataset and keep one handle per split.
data_es = load_dataset('head_qa', 'es')
training = data_es['train']
validation = data_es['validation']
testing = data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [50]:
# Pre-parsed instances for the purely supervised models, one pickle per split.
training_instances, validation_instances, testing_instances = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '../data/training.pickle',
        '../data/validation.pickle',
        '../data/testing.pickle',
    )
)

# Separate pickle holding the "mixed oversampling" training instances.
mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [51]:
# Restrict the pickled instance lists to the selected category; training uses
# the mixed-oversampling instances.
training_categ, validation_categ, testing_categ = (
    filter_by_category(instances, category=CATEGORY)
    for instances in (mixed_training, validation_instances, testing_instances)
)

# Same filter applied to the validation/test splits loaded with load_dataset;
# these are what the evaluate(...) calls receive.
dev_categ, test_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (validation, testing)
)

In [52]:
# Build the vectorizer from the filtered training instances and expose its two vocabularies.
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [53]:
# Wrap each filtered split in a HeadQA dataset with identical settings
# (shared vectorizer, right_padding=False, max_length=30).
trainset, validset, testset = (
    HeadQA(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split_instances in (training_categ, validation_categ, testing_categ)
)

In [54]:
# One DataLoader per split, all with the same batch size.
# NOTE(review): drop_last=True discards the final incomplete batch for the
# validation and test loaders too — confirm that is intended.
batch_size = 32
train_dt, valid_dt, test_dt = (
    DataLoader(split_dataset, batch_size=batch_size, drop_last=True)
    for split_dataset in (trainset, validset, testset)
)

#### Logistic Regressor

In [23]:
# Fix the PyTorch RNG seed before the model is created.
torch.manual_seed(42)

# Model is sized from the dataset's max_length; the optimizer comes from the
# project helper with lr=0.01 and wd=1e-5.
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr=0.01, wd=1e-5)

In [24]:
# Train the logistic regressor for 30 epochs on train_dt; valid_dt and the
# `validate` function are handed to train() (presumably for the per-epoch
# validation metrics it logs — confirm in training.py).
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)

Epoch 0 train loss  42.5911 valid loss 0.921 and accuracy 0.7478
Epoch 1 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 2 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 3 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 4 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 5 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 6 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 7 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 8 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 9 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 10 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 11 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 12 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 13 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 14 train loss  42.5189 valid loss 0.921 and accuracy 0.7478
Epoch 15 train loss 

In [25]:
# Evaluate the logistic regressor on the category-filtered dev and test splits
# and print accuracy/points for each, with a separator line between the two.
split_reports = (
    (f'DEV Dominio: {CATEGORY}', dev_categ),
    (f'TEST Dominio: {CATEGORY}', test_categ),
)
for position, (header, split_data) in enumerate(split_reports):
    if position > 0:
        print('----------')
    acc, points = evaluate(logistic_regressor, split_data, trainset.encode, evaluator)
    print(header)
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.7489]), points: 461
----------
TEST Dominio: medicine
accuracy: tensor([0.7559]), points: 937


In [26]:
# Save the trained weights under <cwd>/trained_models_v2/.
model_path = os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [27]:
# Seed PyTorch before building the model so its random initialisation does not
# depend on which cells were executed earlier in the session.
torch.random.manual_seed(42)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [28]:
# Train the LSTM for 50 epochs on train_dt; valid_dt and `validate` are handed
# to train() (presumably for the per-epoch validation metrics it logs — confirm
# in training.py).
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6680 valid loss 0.025 and accuracy 0.7500
Epoch 1 train loss  0.7053 valid loss 0.024 and accuracy 0.7500
Epoch 2 train loss  0.6950 valid loss 0.024 and accuracy 0.7500
Epoch 3 train loss  0.6910 valid loss 0.024 and accuracy 0.7500
Epoch 4 train loss  0.6896 valid loss 0.024 and accuracy 0.7500
Epoch 5 train loss  0.6867 valid loss 0.024 and accuracy 0.7500
Epoch 6 train loss  0.6844 valid loss 0.024 and accuracy 0.7500
Epoch 7 train loss  0.6804 valid loss 0.024 and accuracy 0.7500
Epoch 8 train loss  0.6749 valid loss 0.024 and accuracy 0.7489
Epoch 9 train loss  0.6679 valid loss 0.024 and accuracy 0.7444
Epoch 10 train loss  0.6602 valid loss 0.024 and accuracy 0.7388
Epoch 11 train loss  0.6364 valid loss 0.024 and accuracy 0.7243
Epoch 12 train loss  0.6113 valid loss 0.025 and accuracy 0.6853
Epoch 13 train loss  0.5835 valid loss 0.026 and accuracy 0.6685
Epoch 14 train loss  0.5496 valid loss 0.026 and accuracy 0.6440
Epoch 15 train loss  0.5124 valid l

In [29]:
# Evaluate the LSTM on the category-filtered dev and test splits and print
# accuracy/points for each, with a separator line between the two.
split_reports = (
    (f'DEV Dominio: {CATEGORY}', dev_categ),
    (f'TEST Dominio: {CATEGORY}', test_categ),
)
for position, (header, split_data) in enumerate(split_reports):
    if position > 0:
        print('----------')
    acc, points = evaluate(lstm, split_data, trainset.encode, evaluator)
    print(header)
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.5584]), points: 285
----------
TEST Dominio: medicine
accuracy: tensor([0.6026]), points: 653


In [30]:
# Save the trained weights under <cwd>/trained_models_v2/.
model_path = os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [55]:
# Pretrained embedding resources: the raw .vec file plus two pickled lookups.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

# Build the embedding matrix for the words of the current sentence vocabulary.
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

In [32]:
# Seed PyTorch before building the model so the randomly initialised layers do
# not depend on which cells were executed earlier in the session.
torch.random.manual_seed(42)
# The first two arguments are taken from the embedding matrix's shape[1] and shape[0].
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [33]:
# Train the BiLSTM for 50 epochs on train_dt; valid_dt and `validate` are handed
# to train() (presumably for the per-epoch validation metrics it logs — confirm
# in training.py).
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.4419 valid loss 1.022 and accuracy 0.2500
Epoch 1 train loss  1.3515 valid loss 0.715 and accuracy 0.2500
Epoch 2 train loss  1.4661 valid loss 0.412 and accuracy 0.2500
Epoch 3 train loss  1.5565 valid loss 0.347 and accuracy 0.2500
Epoch 4 train loss  1.2589 valid loss 0.426 and accuracy 0.2500
Epoch 5 train loss  1.2791 valid loss 0.658 and accuracy 0.2500
Epoch 6 train loss  1.7317 valid loss 0.247 and accuracy 0.2500
Epoch 7 train loss  0.8928 valid loss 0.028 and accuracy 0.4944
Epoch 8 train loss  0.5455 valid loss 0.398 and accuracy 0.2500
Epoch 9 train loss  1.2190 valid loss 0.197 and accuracy 0.2500
Epoch 10 train loss  0.7375 valid loss 0.517 and accuracy 0.2500
Epoch 11 train loss  1.2416 valid loss 0.043 and accuracy 0.3471
Epoch 12 train loss  0.6681 valid loss 0.550 and accuracy 0.2500
Epoch 13 train loss  1.2698 valid loss 0.107 and accuracy 0.2556
Epoch 14 train loss  0.6219 valid loss 0.360 and accuracy 0.2500
Epoch 15 train loss  1.6044 valid l

In [34]:
# Evaluate the BiLSTM on the category-filtered dev and test splits and print
# accuracy/points for each, with a separator line between the two.
split_reports = (
    (f'DEV Dominio: {CATEGORY}', dev_categ),
    (f'TEST Dominio: {CATEGORY}', test_categ),
)
for position, (header, split_data) in enumerate(split_reports):
    if position > 0:
        print('----------')
    acc, points = evaluate(bilstm, split_data, trainset.encode, evaluator)
    print(header)
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.5758]), points: 301
----------
TEST Dominio: medicine
accuracy: tensor([0.5983]), points: 645


In [35]:
# Save the trained weights under <cwd>/trained_models_v2/.
model_path = os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [57]:
# Pre-parsed IR instances, one pickle per split. These rebind the
# *_instances names that the supervised section used.
training_instances, validation_instances, testing_instances = (
    load_dataset_from_pickle(pickle_path)
    for pickle_path in (
        '../data/training_ir.pickle',
        '../data/validation_ir.pickle',
        '../data/testing_ir.pickle',
    )
)
# Separate pickle holding the "mixed oversampling" IR training instances.
mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [58]:
# Restrict the IR instance lists to the selected category; training uses the
# mixed-oversampling IR instances.
training_categ, validation_categ, testing_categ = (
    filter_by_category(instances, category=CATEGORY)
    for instances in (mixed_training_ir, validation_instances, testing_instances)
)

# Same filter applied to the validation/test splits loaded with load_dataset;
# these are what the evaluate(...) calls receive.
dev_categ, test_categ = (
    filter_by_category(split, category=CATEGORY)
    for split in (validation, testing)
)

In [59]:
# Build the IR vectorizer from the filtered IR training instances and expose its
# two vocabularies (rebinding the names used by the supervised section).
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [60]:
# Wrap each filtered IR split in a HeadQA_IR dataset with identical settings
# (shared vectorizer, right_padding=False, max_length=15).
trainset, validset, testset = (
    HeadQA_IR(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
    for split_instances in (training_categ, validation_categ, testing_categ)
)

In [61]:
# One DataLoader per IR split, all with the same batch size.
# NOTE(review): drop_last=True discards the final incomplete batch for the
# validation and test loaders too — confirm that is intended.
batch_size = 32
train_dt, valid_dt, test_dt = (
    DataLoader(split_dataset, batch_size=batch_size, drop_last=True)
    for split_dataset in (trainset, validset, testset)
)

In [62]:
# Pretrained embedding resources for the IR vocabulary: the raw .vec file plus
# the two IR-specific pickled lookups.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

# Build the embedding matrix for the words of the current (IR) sentence vocabulary.
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

#### LSTM-QA

In [42]:
# Seed PyTorch before building the model so the randomly initialised layers do
# not depend on which cells were executed earlier in the session.
torch.random.manual_seed(42)
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [43]:
# Train LSTM-QA for 50 epochs with the IR training loop; valid_dt and
# `validate_ir` are handed to train_ir() (presumably for the per-epoch
# validation metrics it logs — confirm in training.py).
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4577 valid loss 0.178 and accuracy 0.2500
Epoch 1 train loss  0.8042 valid loss 0.087 and accuracy 0.2500
Epoch 2 train loss  0.7410 valid loss 0.054 and accuracy 0.2500
Epoch 3 train loss  0.7714 valid loss 0.028 and accuracy 0.7500
Epoch 4 train loss  0.7098 valid loss 0.028 and accuracy 0.7500
Epoch 5 train loss  0.7119 valid loss 0.026 and accuracy 0.7500
Epoch 6 train loss  0.7036 valid loss 0.027 and accuracy 0.7500
Epoch 7 train loss  0.7092 valid loss 0.027 and accuracy 0.7500
Epoch 8 train loss  0.7135 valid loss 0.027 and accuracy 0.7500
Epoch 9 train loss  0.7121 valid loss 0.027 and accuracy 0.7500
Epoch 10 train loss  0.7146 valid loss 0.027 and accuracy 0.7500
Epoch 11 train loss  0.7173 valid loss 0.026 and accuracy 0.7500
Epoch 12 train loss  0.7091 valid loss 0.026 and accuracy 0.7500
Epoch 13 train loss  0.7131 valid loss 0.026 and accuracy 0.7500
Epoch 14 train loss  0.7111 valid loss 0.026 and accuracy 0.7500
Epoch 15 train loss  0.7080 valid l

In [44]:
# Evaluate LSTM-QA on the category-filtered dev and test splits with the IR
# evaluator and print accuracy/points for each, with a separator line between.
split_reports = (
    (f'DEV Dominio: {CATEGORY}', dev_categ),
    (f'TEST Dominio: {CATEGORY}', test_categ),
)
for position, (header, split_data) in enumerate(split_reports):
    if position > 0:
        print('----------')
    acc, points = evaluate(lstm_qa, split_data, trainset.encode, evaluator_ir)
    print(header)
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2641]), points: 13
----------
TEST Dominio: medicine
accuracy: tensor([0.3002]), points: 93


In [45]:
# Save the trained weights under <cwd>/trained_models_v2/.
model_path = os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [46]:
# Seed PyTorch before building the model so the randomly initialised layers do
# not depend on which cells were executed earlier in the session.
torch.random.manual_seed(42)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [47]:
# Train LSTM-QA/CNN for 50 epochs with the IR training loop; valid_dt and
# `validate_ir` are handed to train_ir() (presumably for the per-epoch
# validation metrics it logs — confirm in training.py).
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4405 valid loss 0.204 and accuracy 0.2500
Epoch 1 train loss  0.8792 valid loss 0.065 and accuracy 0.2500
Epoch 2 train loss  0.7152 valid loss 0.055 and accuracy 0.2500
Epoch 3 train loss  0.7579 valid loss 0.032 and accuracy 0.3806
Epoch 4 train loss  0.7071 valid loss 0.027 and accuracy 0.7500
Epoch 5 train loss  0.6977 valid loss 0.027 and accuracy 0.7500
Epoch 6 train loss  0.6966 valid loss 0.027 and accuracy 0.7500
Epoch 7 train loss  0.6941 valid loss 0.026 and accuracy 0.7500
Epoch 8 train loss  0.7020 valid loss 0.026 and accuracy 0.7500
Epoch 9 train loss  0.7075 valid loss 0.025 and accuracy 0.7500
Epoch 10 train loss  0.7085 valid loss 0.025 and accuracy 0.7500
Epoch 11 train loss  0.7026 valid loss 0.025 and accuracy 0.7500
Epoch 12 train loss  0.6977 valid loss 0.025 and accuracy 0.7500
Epoch 13 train loss  0.6952 valid loss 0.025 and accuracy 0.7500
Epoch 14 train loss  0.6965 valid loss 0.025 and accuracy 0.7500
Epoch 15 train loss  0.7024 valid l

In [48]:
# Evaluate LSTM-QA/CNN on the category-filtered dev and test splits with the IR
# evaluator and print accuracy/points for each, with a separator line between.
split_reports = (
    (f'DEV Dominio: {CATEGORY}', dev_categ),
    (f'TEST Dominio: {CATEGORY}', test_categ),
)
for position, (header, split_data) in enumerate(split_reports):
    if position > 0:
        print('----------')
    acc, points = evaluate(lstm_cnn_qa, split_data, trainset.encode, evaluator_ir)
    print(header)
    print(f'accuracy: {acc}, points: {points}')

DEV Dominio: medicine
accuracy: tensor([0.2727]), points: 21
----------
TEST Dominio: medicine
accuracy: tensor([0.2873]), points: 69


In [49]:
# Save the trained weights under <cwd>/trained_models_v2/.
model_path = os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}'
# torch.save does not create missing parent directories; make sure the target
# folder exists so a fresh checkout does not fail after training has finished.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [56]:
# Rebuild the supervised-section artefacts locally instead of relying on the
# notebook-level `trainset`, `vocab` and `embedding_matrix`. At this position in
# the notebook those names are bound to the IR variants (HeadQA_IR with
# max_length=15, IR vocabulary and embedding matrix), which do not match the
# checkpoints saved by the supervised models, so a top-to-bottom run would build
# models of the wrong size and encode inputs with the wrong dataset.
# Distinct `sup_*` names leave the IR objects untouched for the IR evaluation.
sup_training_categ = filter_by_category(mixed_training, category=CATEGORY)
sup_vectorizer = Vectorizer.vectorize_training(sup_training_categ)
sup_vocab = sup_vectorizer.sentence_vocab
sup_trainset = HeadQA(instances=sup_training_categ, vectorizer=sup_vectorizer,
                      right_padding=False, max_length=30)
sup_embedding_matrix = make_embedding_matrix(
    "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec",
    list(sup_vocab.vocab2index.keys()),
    load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle'),
    load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle'),
)

# Fresh instances with the same constructor arguments used at training time, so
# the saved state dicts can be loaded into them.
logistic_regressor = LogisticRegression(sup_trainset.max_length, 1)
lstm = BasicLSTM(len(sup_vocab), 64, sup_trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(sup_embedding_matrix.shape[1], sup_embedding_matrix.shape[0], 1,
                      pretrained_embeddings=sup_embedding_matrix, max_length=sup_trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}']

# For each model: load its checkpoint, switch to eval mode, then report the mean
# accuracy/points and the per-item lists returned by evaluate_better on the dev
# and test splits.
for model, weights_path in zip(models, paths):
    model.load_state_dict(torch.load(weights_path))
    model.eval()
    for split_label, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
        acc, points, acc_list, points_list = evaluate_better(
            model, split_data, sup_trainset.encode, evaluator)
        print(split_label)
        print('Accuracy media', acc)
        print('Puntos media', points)
        print(acc_list)
        print(points_list)
        print('---------')
    print()

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models_v2/logistic_regressor_medicine
DEV
Accuracy media 0.74891776
Puntos media 461.0
[tensor(0.7489)]
[461]
---------
TEST
Accuracy media 0.75594306
Puntos media 468.5
[tensor(0.7543), tensor(0.7576)]
[468, 469]
---------

DEV
Accuracy media 0.5584416
Puntos media 285.0
[tensor(0.5584)]
[285]
---------
TEST
Accuracy media 0.60252464
Puntos media 326.5
[tensor(0.6336), tensor(0.5714)]
[356, 297]
---------

DEV
Accuracy media 0.57575756
Puntos media 301.0
[tensor(0.5758)]
[301]
---------
TEST
Accuracy media 0.59827024
Puntos media 322.5
[tensor(0.5991), tensor(0.5974)]
[324, 321]
---------



In [63]:
# Fresh instances of both IR architectures, built with the same constructor
# arguments used at training time so the saved state dicts can be loaded.
lstm_qa = LSTM_QA(
    vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1,
    embedding_size=300, pretrained_embeddings=embedding_matrix,
)
lstm_cnn_qa = LSTM_CNN_QA(
    vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1,
    embedding_size=300, pretrained_embeddings=embedding_matrix,
)

models = [lstm_qa, lstm_cnn_qa]
paths = [
    os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}',
    os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}',
]

# For each model: load its checkpoint, switch to eval mode, then report the mean
# accuracy/points and the per-item lists returned by evaluate_better on the dev
# and test splits.
for model, weights_path in zip(models, paths):
    model.load_state_dict(torch.load(weights_path))
    model.eval()
    for split_label, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
        acc, points, acc_list, points_list = evaluate_better(
            model, split_data, trainset.encode, evaluator_ir)
        print(split_label)
        print('Accuracy media', acc)
        print('Puntos media', points)
        print(acc_list)
        print(points_list)
        print('---------')
    print()

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.26406926
Puntos media 13.0
[tensor(0.2641)]
[13]
---------
TEST
Accuracy media 0.30016607
Puntos media 46.5
[tensor(0.3233), tensor(0.2771)]
[68, 25]
---------

DEV
Accuracy media 0.27272728
Puntos media 21.0
[tensor(0.2727)]
[21]
---------
TEST
Accuracy media 0.28723502
Puntos media 34.5
[tensor(0.2974), tensor(0.2771)]
[44, 25]
---------

