In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
CATEGORY = 'nursery'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [4]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [5]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [6]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  48.6619 valid loss 1.151 and accuracy 0.7433
Epoch 1 train loss  48.8185 valid loss 1.151 and accuracy 0.7411
Epoch 2 train loss  48.8107 valid loss 1.154 and accuracy 0.7377
Epoch 3 train loss  48.8342 valid loss 1.266 and accuracy 0.7333
Epoch 4 train loss  48.8600 valid loss 0.921 and accuracy 0.7500
Epoch 5 train loss  48.8625 valid loss 0.921 and accuracy 0.7500
Epoch 6 train loss  48.8601 valid loss 0.921 and accuracy 0.7500
Epoch 7 train loss  48.8574 valid loss 0.921 and accuracy 0.7500
Epoch 8 train loss  48.8426 valid loss 0.921 and accuracy 0.7467
Epoch 9 train loss  48.8267 valid loss 0.921 and accuracy 0.7455
Epoch 10 train loss  48.9786 valid loss 0.806 and accuracy 0.7455
Epoch 11 train loss  48.8590 valid loss 1.266 and accuracy 0.7411
Epoch 12 train loss  48.8388 valid loss 1.266 and accuracy 0.7377
Epoch 13 train loss  48.8405 valid loss 1.239 and accuracy 0.7388
Epoch 14 train loss  48.9744 valid loss 1.266 and accuracy 0.7455
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2043]), points: -42
----------
TEST Dominio: nursery
accuracy: tensor([0.2571]), points: 13


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6973 valid loss 0.027 and accuracy 0.2500
Epoch 1 train loss  0.7080 valid loss 0.026 and accuracy 0.2500
Epoch 2 train loss  0.6991 valid loss 0.025 and accuracy 0.2500
Epoch 3 train loss  0.6953 valid loss 0.025 and accuracy 0.2511
Epoch 4 train loss  0.6920 valid loss 0.025 and accuracy 0.2511
Epoch 5 train loss  0.6884 valid loss 0.025 and accuracy 0.2533
Epoch 6 train loss  0.6829 valid loss 0.025 and accuracy 0.2667
Epoch 7 train loss  0.6707 valid loss 0.025 and accuracy 0.2812
Epoch 8 train loss  0.6512 valid loss 0.025 and accuracy 0.2991
Epoch 9 train loss  0.6131 valid loss 0.024 and accuracy 0.3895
Epoch 10 train loss  0.5656 valid loss 0.024 and accuracy 0.4799
Epoch 11 train loss  0.5294 valid loss 0.024 and accuracy 0.5112
Epoch 12 train loss  0.4841 valid loss 0.023 and accuracy 0.5681
Epoch 13 train loss  0.4357 valid loss 0.023 and accuracy 0.5826
Epoch 14 train loss  0.4068 valid loss 0.024 and accuracy 0.5882
Epoch 15 train loss  0.3885 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2217]), points: -26
----------
TEST Dominio: nursery
accuracy: tensor([0.2747]), points: 45


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [17]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.3906 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  51.0638 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2440]), points: -11


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [22]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [23]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [24]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [25]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [26]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [27]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [28]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5015 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4944 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4647 valid loss 0.003 and accuracy 0.7463
Epoch 3 train loss  0.4109 valid loss 0.004 and accuracy 0.7213
Epoch 4 train loss  0.3513 valid loss 0.004 and accuracy 0.7048
Epoch 5 train loss  0.2835 valid loss 0.005 and accuracy 0.6818
Epoch 6 train loss  0.2420 valid loss 0.006 and accuracy 0.6756
Epoch 7 train loss  0.1999 valid loss 0.007 and accuracy 0.7050
Epoch 8 train loss  0.1714 valid loss 0.007 and accuracy 0.7132
Epoch 9 train loss  0.1326 valid loss 0.010 and accuracy 0.7132
Epoch 10 train loss  0.1103 valid loss 0.008 and accuracy 0.6739
Epoch 11 train loss  0.1076 valid loss 0.010 and accuracy 0.6912
Epoch 12 train loss  0.0944 valid loss 0.009 and accuracy 0.6790
Epoch 13 train loss  0.0749 valid loss 0.009 and accuracy 0.6835
Epoch 14 train loss  0.0745 valid loss 0.010 and accuracy 0.6743
Epoch 15 train loss  0.0750 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2478]), points: -2
----------
TEST Dominio: nursery
accuracy: tensor([0.2681]), points: 33


In [31]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [32]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [33]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5017 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4945 valid loss 0.003 and accuracy 0.7502
Epoch 2 train loss  0.4681 valid loss 0.003 and accuracy 0.7349
Epoch 3 train loss  0.4263 valid loss 0.003 and accuracy 0.6965
Epoch 4 train loss  0.3666 valid loss 0.004 and accuracy 0.6980
Epoch 5 train loss  0.3041 valid loss 0.004 and accuracy 0.6985
Epoch 6 train loss  0.2525 valid loss 0.004 and accuracy 0.6601
Epoch 7 train loss  0.1999 valid loss 0.005 and accuracy 0.6188
Epoch 8 train loss  0.1738 valid loss 0.006 and accuracy 0.6362
Epoch 9 train loss  0.1419 valid loss 0.006 and accuracy 0.6733
Epoch 10 train loss  0.1242 valid loss 0.009 and accuracy 0.7018
Epoch 11 train loss  0.0903 valid loss 0.009 and accuracy 0.6956
Epoch 12 train loss  0.0896 valid loss 0.009 and accuracy 0.7169
Epoch 13 train loss  0.0846 valid loss 0.008 and accuracy 0.6886
Epoch 14 train loss  0.0671 valid loss 0.008 and accuracy 0.6811
Epoch 15 train loss  0.0644 valid l

In [34]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2522]), points: 2
----------
TEST Dominio: nursery
accuracy: tensor([0.2330]), points: -31


In [35]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_sig_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)