In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
CATEGORY = 'biology'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [38]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [39]:
training_categ = filter_by_category(mixed_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [40]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [41]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [42]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
torch.random.manual_seed(42)
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  39.7331 valid loss 2.185 and accuracy 0.5145
Epoch 1 train loss  57.6679 valid loss 1.913 and accuracy 0.4922
Epoch 2 train loss  51.0371 valid loss 2.647 and accuracy 0.3683
Epoch 3 train loss  66.4870 valid loss 1.611 and accuracy 0.6451
Epoch 4 train loss  44.4319 valid loss 1.422 and accuracy 0.6663
Epoch 5 train loss  40.6669 valid loss 1.671 and accuracy 0.5737
Epoch 6 train loss  47.9072 valid loss 1.381 and accuracy 0.6674
Epoch 7 train loss  43.4086 valid loss 1.266 and accuracy 0.7054
Epoch 8 train loss  41.7250 valid loss 1.266 and accuracy 0.7221
Epoch 9 train loss  41.5001 valid loss 1.266 and accuracy 0.7154
Epoch 10 train loss  41.2636 valid loss 1.381 and accuracy 0.7121
Epoch 11 train loss  41.6210 valid loss 1.381 and accuracy 0.7076
Epoch 12 train loss  41.6338 valid loss 1.381 and accuracy 0.7121
Epoch 13 train loss  41.9512 valid loss 1.106 and accuracy 0.7299
Epoch 14 train loss  41.6915 valid loss 1.266 and accuracy 0.7299
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2301]), points: -18
----------
TEST Dominio: biology
accuracy: tensor([0.2533]), points: 6


In [12]:
model_path = f'trained_models_v2/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6329 valid loss 0.029 and accuracy 0.2500
Epoch 1 train loss  0.7263 valid loss 0.025 and accuracy 0.2500
Epoch 2 train loss  0.6946 valid loss 0.025 and accuracy 0.2500
Epoch 3 train loss  0.6906 valid loss 0.025 and accuracy 0.2500
Epoch 4 train loss  0.6866 valid loss 0.024 and accuracy 0.2500
Epoch 5 train loss  0.6824 valid loss 0.024 and accuracy 0.2500
Epoch 6 train loss  0.6765 valid loss 0.024 and accuracy 0.2522
Epoch 7 train loss  0.6659 valid loss 0.024 and accuracy 0.3136
Epoch 8 train loss  0.6482 valid loss 0.024 and accuracy 0.3371
Epoch 9 train loss  0.6114 valid loss 0.025 and accuracy 0.3940
Epoch 10 train loss  0.5620 valid loss 0.025 and accuracy 0.5301
Epoch 11 train loss  0.5115 valid loss 0.026 and accuracy 0.5737
Epoch 12 train loss  0.4500 valid loss 0.027 and accuracy 0.6150
Epoch 13 train loss  0.3888 valid loss 0.027 and accuracy 0.6105
Epoch 14 train loss  0.3549 valid loss 0.028 and accuracy 0.6429
Epoch 15 train loss  0.3264 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3009]), points: 46
----------
TEST Dominio: biology
accuracy: tensor([0.2797]), points: 54


In [16]:
model_path = f'trained_models_v2/lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [17]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.4436 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  19.7950 valid loss 2.468 and accuracy 0.2500
Epoch 2 train loss  13.1380 valid loss 1.054 and accuracy 0.2500
Epoch 3 train loss  1.4101 valid loss 1.568 and accuracy 0.2500
Epoch 4 train loss  2.1352 valid loss 0.311 and accuracy 0.2500
Epoch 5 train loss  1.1977 valid loss 0.934 and accuracy 0.2489
Epoch 6 train loss  1.3837 valid loss 0.231 and accuracy 0.2489
Epoch 7 train loss  1.0271 valid loss 2.453 and accuracy 0.2500
Epoch 8 train loss  2.4974 valid loss 0.331 and accuracy 0.2500
Epoch 9 train loss  1.1671 valid loss 1.065 and accuracy 0.2500
Epoch 10 train loss  1.3254 valid loss 0.270 and accuracy 0.2500
Epoch 11 train loss  1.0261 valid loss 1.356 and accuracy 0.2500
Epoch 12 train loss  2.0137 valid loss 2.438 and accuracy 0.2500
Epoch 13 train loss  7.4514 valid loss 1.059 and accuracy 0.2500
Epoch 14 train loss  2.0131 valid loss 2.439 and accuracy 0.2500
Epoch 15 train loss  8.2253 valid

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2965]), points: 42
----------
TEST Dominio: biology
accuracy: tensor([0.2467]), points: -6


In [21]:
model_path = f'trained_models_v2/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [44]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')

mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [45]:
training_categ = filter_by_category(mixed_training_ir, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [46]:
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [47]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [48]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [49]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [29]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [30]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5014 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4968 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4832 valid loss 0.004 and accuracy 0.7476
Epoch 3 train loss  0.4524 valid loss 0.004 and accuracy 0.7406
Epoch 4 train loss  0.4209 valid loss 0.004 and accuracy 0.7301
Epoch 5 train loss  0.3816 valid loss 0.004 and accuracy 0.7077
Epoch 6 train loss  0.3492 valid loss 0.005 and accuracy 0.6915
Epoch 7 train loss  0.3077 valid loss 0.005 and accuracy 0.7033
Epoch 8 train loss  0.2803 valid loss 0.005 and accuracy 0.6579
Epoch 9 train loss  0.2766 valid loss 0.006 and accuracy 0.7068
Epoch 10 train loss  0.2559 valid loss 0.006 and accuracy 0.7059
Epoch 11 train loss  0.2371 valid loss 0.007 and accuracy 0.7153
Epoch 12 train loss  0.2365 valid loss 0.007 and accuracy 0.7015
Epoch 13 train loss  0.2183 valid loss 0.006 and accuracy 0.6868
Epoch 14 train loss  0.2035 valid loss 0.007 and accuracy 0.6903
Epoch 15 train loss  0.2087 valid l

In [31]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.2655]), points: 14
----------
TEST Dominio: biology
accuracy: tensor([0.2577]), points: 14


In [32]:
model_path = f'trained_models_v2/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [33]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [34]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5018 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4972 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4877 valid loss 0.003 and accuracy 0.7487
Epoch 3 train loss  0.4633 valid loss 0.004 and accuracy 0.7272
Epoch 4 train loss  0.4308 valid loss 0.004 and accuracy 0.6982
Epoch 5 train loss  0.3933 valid loss 0.004 and accuracy 0.6768
Epoch 6 train loss  0.3573 valid loss 0.004 and accuracy 0.6888
Epoch 7 train loss  0.3251 valid loss 0.004 and accuracy 0.7063
Epoch 8 train loss  0.2972 valid loss 0.005 and accuracy 0.6522
Epoch 9 train loss  0.2880 valid loss 0.004 and accuracy 0.6847
Epoch 10 train loss  0.2664 valid loss 0.004 and accuracy 0.6879
Epoch 11 train loss  0.2568 valid loss 0.004 and accuracy 0.6741
Epoch 12 train loss  0.2373 valid loss 0.004 and accuracy 0.6432
Epoch 13 train loss  0.2275 valid loss 0.005 and accuracy 0.6632
Epoch 14 train loss  0.2143 valid loss 0.005 and accuracy 0.6831
Epoch 15 train loss  0.2121 valid l

In [35]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: biology
accuracy: tensor([0.3230]), points: 66
----------
TEST Dominio: biology
accuracy: tensor([0.2335]), points: -30


In [36]:
model_path = f'trained_models_v2/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [43]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [f'trained_models_v2/logistic_regressor_{CATEGORY}', 
         f'trained_models_v2/lstm_{CATEGORY}',         
         f'trained_models_v2/bilstm_{CATEGORY}']

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print()    

DEV
Accuracy media 0.2300885
Puntos media -18.0
[tensor(0.2301)]
[-18]
---------
TEST
Accuracy media 0.25326037
Puntos media 3.0
[tensor(0.2632), tensor(0.2434)]
[12, -6]
---------

DEV
Accuracy media 0.30088496
Puntos media 46.0
[tensor(0.3009)]
[46]
---------
TEST
Accuracy media 0.2796538
Puntos media 27.0
[tensor(0.2982), tensor(0.2611)]
[44, 10]
---------

DEV
Accuracy media 0.29646018
Puntos media 42.0
[tensor(0.2965)]
[42]
---------
TEST
Accuracy media 0.24670082
Puntos media -3.0
[tensor(0.2456), tensor(0.2478)]
[-4, -2]
---------



In [50]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [f'trained_models_v2/lstm_qa_{CATEGORY}',
         f'trained_models_v2/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print()   

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.26548672
Puntos media 14.0
[tensor(0.2655)]
[14]
---------
TEST
Accuracy media 0.2578016
Puntos media 7.0
[tensor(0.2368), tensor(0.2788)]
[-12, 26]
---------

DEV
Accuracy media 0.32300884
Puntos media 66.0
[tensor(0.3230)]
[66]
---------
TEST
Accuracy media 0.23360115
Puntos media -15.0
[tensor(0.2061), tensor(0.2611)]
[-40, 10]
---------

