In [4]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle

from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [5]:
CATEGORY = 'pharmacology'

In [6]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [7]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [8]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [9]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [10]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [11]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [12]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  51.6431 valid loss 0.806 and accuracy 0.6864
Epoch 1 train loss  51.3075 valid loss 0.921 and accuracy 0.7467
Epoch 2 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 3 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 4 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 5 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 6 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 7 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 8 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 9 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 10 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 11 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 12 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 13 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 14 train loss  50.7432 valid loss 0.921 and accuracy 0.7467
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: pharmacology
accuracy: tensor([0.2089]), points: -37
----------
TEST Dominio: pharmacology
accuracy: tensor([0.2407]), points: -17


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6605 valid loss 0.029 and accuracy 0.2500
Epoch 1 train loss  0.7199 valid loss 0.026 and accuracy 0.2500
Epoch 2 train loss  0.7003 valid loss 0.026 and accuracy 0.2500
Epoch 3 train loss  0.6956 valid loss 0.026 and accuracy 0.2500
Epoch 4 train loss  0.6922 valid loss 0.026 and accuracy 0.2500
Epoch 5 train loss  0.6884 valid loss 0.026 and accuracy 0.2500
Epoch 6 train loss  0.6815 valid loss 0.026 and accuracy 0.2500
Epoch 7 train loss  0.6690 valid loss 0.027 and accuracy 0.2533
Epoch 8 train loss  0.6297 valid loss 0.027 and accuracy 0.2868
Epoch 9 train loss  0.5625 valid loss 0.029 and accuracy 0.3393
Epoch 10 train loss  0.5025 valid loss 0.025 and accuracy 0.5424
Epoch 11 train loss  0.4381 valid loss 0.025 and accuracy 0.5714
Epoch 12 train loss  0.3862 valid loss 0.025 and accuracy 0.5904
Epoch 13 train loss  0.3464 valid loss 0.025 and accuracy 0.6138
Epoch 14 train loss  0.3158 valid loss 0.026 and accuracy 0.6261
Epoch 15 train loss  0.2922 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: pharmacology
accuracy: tensor([0.1911]), points: -53
----------
TEST Dominio: pharmacology
accuracy: tensor([0.2319]), points: -33


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [14]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [15]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.3722 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  49.2457 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: pharmacology
accuracy: tensor([0.2489]), points: -1
----------
TEST Dominio: pharmacology
accuracy: tensor([0.2101]), points: -73


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [25]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [26]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [27]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [28]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [29]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [30]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [31]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5021 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4939 valid loss 0.003 and accuracy 0.7507
Epoch 2 train loss  0.4668 valid loss 0.003 and accuracy 0.7449
Epoch 3 train loss  0.4160 valid loss 0.003 and accuracy 0.6735
Epoch 4 train loss  0.3501 valid loss 0.003 and accuracy 0.6267
Epoch 5 train loss  0.3087 valid loss 0.004 and accuracy 0.6535
Epoch 6 train loss  0.2404 valid loss 0.004 and accuracy 0.6816
Epoch 7 train loss  0.1908 valid loss 0.006 and accuracy 0.7033
Epoch 8 train loss  0.1823 valid loss 0.005 and accuracy 0.6526
Epoch 9 train loss  0.1488 valid loss 0.005 and accuracy 0.6449
Epoch 10 train loss  0.1085 valid loss 0.005 and accuracy 0.6575
Epoch 11 train loss  0.1040 valid loss 0.008 and accuracy 0.6691
Epoch 12 train loss  0.0998 valid loss 0.007 and accuracy 0.6754
Epoch 13 train loss  0.0904 valid loss 0.006 and accuracy 0.6660
Epoch 14 train loss  0.0773 valid loss 0.010 and accuracy 0.6875
Epoch 15 train loss  0.0661 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: pharmacology
accuracy: tensor([0.2133]), points: -33
----------
TEST Dominio: pharmacology
accuracy: tensor([0.2473]), points: -5


In [31]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [32]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [33]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5017 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4944 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4704 valid loss 0.004 and accuracy 0.7305
Epoch 3 train loss  0.4199 valid loss 0.003 and accuracy 0.7066
Epoch 4 train loss  0.3674 valid loss 0.004 and accuracy 0.6358
Epoch 5 train loss  0.3099 valid loss 0.005 and accuracy 0.6781
Epoch 6 train loss  0.2714 valid loss 0.004 and accuracy 0.6612
Epoch 7 train loss  0.2186 valid loss 0.005 and accuracy 0.6305
Epoch 8 train loss  0.1704 valid loss 0.006 and accuracy 0.6950
Epoch 9 train loss  0.1544 valid loss 0.008 and accuracy 0.6840
Epoch 10 train loss  0.1417 valid loss 0.006 and accuracy 0.6425
Epoch 11 train loss  0.1163 valid loss 0.008 and accuracy 0.6254
Epoch 12 train loss  0.0957 valid loss 0.005 and accuracy 0.6733
Epoch 13 train loss  0.0822 valid loss 0.006 and accuracy 0.6588
Epoch 14 train loss  0.0717 valid loss 0.007 and accuracy 0.6572
Epoch 15 train loss  0.0704 valid l

In [34]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: pharmacology
accuracy: tensor([0.2622]), points: 11
----------
TEST Dominio: pharmacology
accuracy: tensor([0.2495]), points: -1


In [37]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [16]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}', 
         os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}',         
         os.getcwd() + f'/trained_models/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models/logistic_regressor_pharmacology




DEV
Accuracy media 0.20888889
Puntos media -37.0
[tensor(0.2089)]
[-37]
---------
TEST
Accuracy media 0.24068221
Puntos media -8.5
[tensor(0.2325), tensor(0.2489)]
[-16, -1]
---------

DEV
Accuracy media 0.19111112
Puntos media -53.0
[tensor(0.1911)]
[-53]
---------
TEST
Accuracy media 0.23200604
Puntos media -16.5
[tensor(0.2588), tensor(0.2052)]
[8, -41]
---------

DEV
Accuracy media 0.2488889
Puntos media -1.0
[tensor(0.2489)]
[-1]
---------
TEST
Accuracy media 0.21006665
Puntos media -36.5
[tensor(0.2105), tensor(0.2096)]
[-36, -37]
---------



In [33]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}',
         os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print()  

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.21333334
Puntos media -33.0
[tensor(0.2133)]
[-33]
---------
TEST
Accuracy media 0.24725159
Puntos media -2.5
[tensor(0.2412), tensor(0.2533)]
[-8, 3]
---------

DEV
Accuracy media 0.26222223
Puntos media 11.0
[tensor(0.2622)]
[11]
---------
TEST
Accuracy media 0.24952118
Puntos media -0.5
[tensor(0.2807), tensor(0.2183)]
[28, -29]
---------

