In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
CATEGORY = 'psychology'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [36]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [37]:
training_categ = filter_by_category(mixed_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [38]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [39]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [40]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
torch.random.manual_seed(42)
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  43.0091 valid loss 0.921 and accuracy 0.7422
Epoch 1 train loss  42.7286 valid loss 0.921 and accuracy 0.7422
Epoch 2 train loss  42.7286 valid loss 0.921 and accuracy 0.7422
Epoch 3 train loss  42.7286 valid loss 0.921 and accuracy 0.7422
Epoch 4 train loss  42.7286 valid loss 0.921 and accuracy 0.7422
Epoch 5 train loss  42.7286 valid loss 0.921 and accuracy 0.7422
Epoch 6 train loss  42.7285 valid loss 0.921 and accuracy 0.7422
Epoch 7 train loss  42.7285 valid loss 0.921 and accuracy 0.7422
Epoch 8 train loss  42.7285 valid loss 0.921 and accuracy 0.7422
Epoch 9 train loss  42.7285 valid loss 0.921 and accuracy 0.7422
Epoch 10 train loss  42.7285 valid loss 0.921 and accuracy 0.7422
Epoch 11 train loss  42.7284 valid loss 0.921 and accuracy 0.7422
Epoch 12 train loss  42.7284 valid loss 0.921 and accuracy 0.7422
Epoch 13 train loss  42.7284 valid loss 0.921 and accuracy 0.7422
Epoch 14 train loss  42.7284 valid loss 0.921 and accuracy 0.7422
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: psychology
accuracy: tensor([0.2611]), points: 10
----------
TEST Dominio: psychology
accuracy: tensor([0.2418]), points: -15


In [12]:
model_path = os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6869 valid loss 0.025 and accuracy 0.2500
Epoch 1 train loss  0.6977 valid loss 0.024 and accuracy 0.2500
Epoch 2 train loss  0.6913 valid loss 0.024 and accuracy 0.2500
Epoch 3 train loss  0.6887 valid loss 0.024 and accuracy 0.2500
Epoch 4 train loss  0.6866 valid loss 0.024 and accuracy 0.2511
Epoch 5 train loss  0.6852 valid loss 0.024 and accuracy 0.2556
Epoch 6 train loss  0.6807 valid loss 0.024 and accuracy 0.2600
Epoch 7 train loss  0.6759 valid loss 0.023 and accuracy 0.2857
Epoch 8 train loss  0.6709 valid loss 0.023 and accuracy 0.2924
Epoch 9 train loss  0.6600 valid loss 0.023 and accuracy 0.3337
Epoch 10 train loss  0.6402 valid loss 0.023 and accuracy 0.3929
Epoch 11 train loss  0.6060 valid loss 0.023 and accuracy 0.4342
Epoch 12 train loss  0.5654 valid loss 0.023 and accuracy 0.4609
Epoch 13 train loss  0.5134 valid loss 0.023 and accuracy 0.4777
Epoch 14 train loss  0.4632 valid loss 0.024 and accuracy 0.4810
Epoch 15 train loss  0.4276 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: psychology
accuracy: tensor([0.3009]), points: 46
----------
TEST Dominio: psychology
accuracy: tensor([0.2484]), points: -3


In [16]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [41]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.4343 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  57.1970 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: psychology
accuracy: tensor([0.2478]), points: -2
----------
TEST Dominio: psychology
accuracy: tensor([0.2418]), points: -15


In [21]:
model_path = os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [43]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [44]:
training_categ = filter_by_category(mixed_training_ir, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [45]:
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [46]:
trainset = HeadQA_IR(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=15)

In [47]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [48]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [28]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4379 valid loss 0.180 and accuracy 0.2500
Epoch 1 train loss  0.7679 valid loss 0.093 and accuracy 0.2500
Epoch 2 train loss  0.6700 valid loss 0.095 and accuracy 0.2500
Epoch 3 train loss  0.7141 valid loss 0.067 and accuracy 0.2500
Epoch 4 train loss  0.7683 valid loss 0.032 and accuracy 0.2500
Epoch 5 train loss  0.7264 valid loss 0.027 and accuracy 0.2500
Epoch 6 train loss  0.7112 valid loss 0.028 and accuracy 0.2500
Epoch 7 train loss  0.7231 valid loss 0.027 and accuracy 0.2500
Epoch 8 train loss  0.7147 valid loss 0.027 and accuracy 0.2500
Epoch 9 train loss  0.7074 valid loss 0.027 and accuracy 0.2500
Epoch 10 train loss  0.7006 valid loss 0.027 and accuracy 0.2500
Epoch 11 train loss  0.7026 valid loss 0.027 and accuracy 0.2500
Epoch 12 train loss  0.6916 valid loss 0.028 and accuracy 0.2500
Epoch 13 train loss  0.6823 valid loss 0.027 and accuracy 0.2500
Epoch 14 train loss  0.6836 valid loss 0.027 and accuracy 0.2500
Epoch 15 train loss  0.7086 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: psychology
accuracy: tensor([0.2743]), points: 22
----------
TEST Dominio: psychology
accuracy: tensor([0.2484]), points: -3


In [31]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [32]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [33]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4318 valid loss 0.179 and accuracy 0.2500
Epoch 1 train loss  0.7051 valid loss 0.136 and accuracy 0.2500
Epoch 2 train loss  0.7541 valid loss 0.109 and accuracy 0.2500
Epoch 3 train loss  0.8325 valid loss 0.030 and accuracy 0.2500
Epoch 4 train loss  0.7264 valid loss 0.026 and accuracy 0.2500
Epoch 5 train loss  0.7142 valid loss 0.025 and accuracy 0.2500
Epoch 6 train loss  0.7022 valid loss 0.026 and accuracy 0.2500
Epoch 7 train loss  0.6929 valid loss 0.025 and accuracy 0.2500
Epoch 8 train loss  0.6995 valid loss 0.026 and accuracy 0.2500
Epoch 9 train loss  0.7016 valid loss 0.026 and accuracy 0.2500
Epoch 10 train loss  0.7073 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.7025 valid loss 0.026 and accuracy 0.2500
Epoch 12 train loss  0.7064 valid loss 0.025 and accuracy 0.2500
Epoch 13 train loss  0.7028 valid loss 0.025 and accuracy 0.2500
Epoch 14 train loss  0.6986 valid loss 0.025 and accuracy 0.2500
Epoch 15 train loss  0.6987 valid l

In [34]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: psychology
accuracy: tensor([0.2389]), points: -10
----------
TEST Dominio: psychology
accuracy: tensor([0.2813]), points: 57


In [35]:
model_path = os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [42]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}', 
         os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}',         
         os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models_v2/logistic_regressor_psychology
DEV
Accuracy media 0.26106194
Puntos media 10.0
[tensor(0.2611)]
[10]
---------
TEST
Accuracy media 0.24173912
Puntos media -7.5
[tensor(0.2435), tensor(0.2400)]
[-6, -9]
---------

DEV
Accuracy media 0.30088496
Puntos media 46.0
[tensor(0.3009)]
[46]
---------
TEST
Accuracy media 0.24811596
Puntos media -1.5
[tensor(0.2696), tensor(0.2267)]
[18, -21]
---------

DEV
Accuracy media 0.24778761
Puntos media -2.0
[tensor(0.2478)]
[-2]
---------
TEST
Accuracy media 0.24173912
Puntos media -7.5
[tensor(0.2435), tensor(0.2400)]
[-6, -9]
---------



In [49]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print() 

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.27433628
Puntos media 22.0
[tensor(0.2743)]
[22]
---------
TEST
Accuracy media 0.24816425
Puntos media -1.5
[tensor(0.2652), tensor(0.2311)]
[14, -17]
---------

DEV
Accuracy media 0.23893805
Puntos media -10.0
[tensor(0.2389)]
[-10]
---------
TEST
Accuracy media 0.28111112
Puntos media 28.5
[tensor(0.3000), tensor(0.2622)]
[46, 11]
---------

