In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
CATEGORY = 'nursery'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [43]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [44]:
training_categ = filter_by_category(mixed_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [45]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [46]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [47]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [10]:
torch.random.manual_seed(42)
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [11]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  45.9939 valid loss 0.921 and accuracy 0.7176
Epoch 1 train loss  41.5816 valid loss 0.806 and accuracy 0.6373
Epoch 2 train loss  48.7944 valid loss 2.431 and accuracy 0.3984
Epoch 3 train loss  52.9548 valid loss 2.762 and accuracy 0.2924
Epoch 4 train loss  56.2437 valid loss 2.762 and accuracy 0.2578
Epoch 5 train loss  57.0490 valid loss 2.762 and accuracy 0.2511
Epoch 6 train loss  57.1488 valid loss 2.762 and accuracy 0.2511
Epoch 7 train loss  57.1488 valid loss 2.762 and accuracy 0.2511
Epoch 8 train loss  57.1485 valid loss 2.762 and accuracy 0.2511
Epoch 9 train loss  57.1485 valid loss 2.762 and accuracy 0.2511
Epoch 10 train loss  57.0547 valid loss 2.762 and accuracy 0.2511
Epoch 11 train loss  57.0546 valid loss 2.762 and accuracy 0.2511
Epoch 12 train loss  57.0545 valid loss 2.762 and accuracy 0.2511
Epoch 13 train loss  57.0544 valid loss 2.762 and accuracy 0.2511
Epoch 14 train loss  57.0535 valid loss 2.762 and accuracy 0.2511
Epoch 15 train loss 

In [12]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2440]), points: -11


In [13]:
model_path = f'trained_models_v2/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [14]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [15]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.7161 valid loss 0.025 and accuracy 0.2500
Epoch 1 train loss  0.7027 valid loss 0.025 and accuracy 0.2500
Epoch 2 train loss  0.6951 valid loss 0.025 and accuracy 0.2500
Epoch 3 train loss  0.6930 valid loss 0.025 and accuracy 0.2500
Epoch 4 train loss  0.6900 valid loss 0.025 and accuracy 0.2511
Epoch 5 train loss  0.6879 valid loss 0.025 and accuracy 0.2556
Epoch 6 train loss  0.6838 valid loss 0.025 and accuracy 0.2667
Epoch 7 train loss  0.6784 valid loss 0.025 and accuracy 0.2746
Epoch 8 train loss  0.6769 valid loss 0.024 and accuracy 0.2801
Epoch 9 train loss  0.6662 valid loss 0.024 and accuracy 0.2879
Epoch 10 train loss  0.6531 valid loss 0.024 and accuracy 0.3147
Epoch 11 train loss  0.6303 valid loss 0.023 and accuracy 0.4062
Epoch 12 train loss  0.5987 valid loss 0.022 and accuracy 0.4565
Epoch 13 train loss  0.5464 valid loss 0.022 and accuracy 0.4721
Epoch 14 train loss  0.5042 valid loss 0.021 and accuracy 0.5123
Epoch 15 train loss  0.4577 valid l

In [16]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2703]), points: 37


In [17]:
model_path = f'trained_models_v2/lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [48]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [19]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [20]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.4468 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  57.1429 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [21]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2440]), points: -11


In [22]:
model_path = f'trained_models_v2/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [50]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [51]:
training_categ = filter_by_category(mixed_training_ir, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [52]:
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [53]:
trainset = HeadQA_IR(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=15)

In [54]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [55]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [29]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [30]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4733 valid loss 0.138 and accuracy 0.2500
Epoch 1 train loss  0.8241 valid loss 0.080 and accuracy 0.2500
Epoch 2 train loss  0.7956 valid loss 0.038 and accuracy 0.2500
Epoch 3 train loss  0.7441 valid loss 0.029 and accuracy 0.2500
Epoch 4 train loss  0.7352 valid loss 0.027 and accuracy 0.2500
Epoch 5 train loss  0.7297 valid loss 0.027 and accuracy 0.2500
Epoch 6 train loss  0.7274 valid loss 0.026 and accuracy 0.2500
Epoch 7 train loss  0.7223 valid loss 0.026 and accuracy 0.2500
Epoch 8 train loss  0.7252 valid loss 0.026 and accuracy 0.2500
Epoch 9 train loss  0.7186 valid loss 0.026 and accuracy 0.2500
Epoch 10 train loss  0.7092 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.7044 valid loss 0.026 and accuracy 0.2500
Epoch 12 train loss  0.7038 valid loss 0.026 and accuracy 0.2500
Epoch 13 train loss  0.7003 valid loss 0.026 and accuracy 0.2500
Epoch 14 train loss  0.6996 valid loss 0.025 and accuracy 0.2500
Epoch 15 train loss  0.7039 valid l

In [31]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2478]), points: -2
----------
TEST Dominio: nursery
accuracy: tensor([0.2659]), points: 29


In [32]:
model_path = f'trained_models_v2/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [33]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [34]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.4700 valid loss 0.156 and accuracy 0.2500
Epoch 1 train loss  0.8508 valid loss 0.068 and accuracy 0.2500
Epoch 2 train loss  0.7906 valid loss 0.036 and accuracy 0.2500
Epoch 3 train loss  0.7671 valid loss 0.027 and accuracy 0.2500
Epoch 4 train loss  0.7221 valid loss 0.027 and accuracy 0.2500
Epoch 5 train loss  0.7170 valid loss 0.027 and accuracy 0.2500
Epoch 6 train loss  0.7242 valid loss 0.027 and accuracy 0.2500
Epoch 7 train loss  0.7213 valid loss 0.026 and accuracy 0.2500
Epoch 8 train loss  0.7193 valid loss 0.026 and accuracy 0.2500
Epoch 9 train loss  0.7137 valid loss 0.026 and accuracy 0.2500
Epoch 10 train loss  0.7035 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.7028 valid loss 0.026 and accuracy 0.2500
Epoch 12 train loss  0.7012 valid loss 0.026 and accuracy 0.2500
Epoch 13 train loss  0.7032 valid loss 0.026 and accuracy 0.2500
Epoch 14 train loss  0.7193 valid loss 0.026 and accuracy 0.2500
Epoch 15 train loss  0.7138 valid l

In [35]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.1957]), points: -50
----------
TEST Dominio: nursery
accuracy: tensor([0.2505]), points: 1


In [36]:
model_path = f'trained_models_v2/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluacion

In [49]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [f'trained_models_v2/logistic_regressor_{CATEGORY}', 
         f'trained_models_v2/lstm_{CATEGORY}',         
         f'trained_models_v2/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print()    

trained_models_v2/logistic_regressor_nursery
DEV
Accuracy media 0.20869565
Puntos media -38.0
[tensor(0.2087)]
[-38]
---------
TEST
Accuracy media 0.2437471
Puntos media -5.5
[tensor(0.2332), tensor(0.2543)]
[-15, 4]
---------

DEV
Accuracy media 0.20869565
Puntos media -38.0
[tensor(0.2087)]
[-38]
---------
TEST
Accuracy media 0.270218
Puntos media 18.5
[tensor(0.2646), tensor(0.2759)]
[13, 24]
---------

DEV
Accuracy media 0.20869565
Puntos media -38.0
[tensor(0.2087)]
[-38]
---------
TEST
Accuracy media 0.2437471
Puntos media -5.5
[tensor(0.2332), tensor(0.2543)]
[-15, 4]
---------



In [56]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)

models = [lstm_qa, lstm_cnn_qa]

paths = [f'trained_models_v2/lstm_qa_{CATEGORY}',
         f'trained_models_v2/lstm_cnn_qa_{CATEGORY}'
        ]

for i, model in enumerate(models):
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    acc, points, acc_list, points_list = evaluate_better(model, dev_categ, trainset.encode, evaluator_ir)
    print('DEV')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    acc, points, acc_list, points_list = evaluate_better(model, test_categ, trainset.encode, evaluator_ir)
    print('TEST')
    print('Accuracy media', acc)
    print('Puntos media', points)
    print(acc_list)
    print(points_list)
    print('---------')
    print()  

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.24782608
Puntos media -2.0
[tensor(0.2478)]
[-2]
---------
TEST
Accuracy media 0.2659077
Puntos media 14.5
[tensor(0.2646), tensor(0.2672)]
[13, 16]
---------

DEV
Accuracy media 0.19565217
Puntos media -50.0
[tensor(0.1957)]
[-50]
---------
TEST
Accuracy media 0.25064752
Puntos media 0.5
[tensor(0.2556), tensor(0.2457)]
[5, -4]
---------

