In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
CATEGORY = 'nursery'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [4]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [5]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [6]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)

Epoch 0 train loss  47.7163 valid loss 1.036 and accuracy 0.7478
Epoch 1 train loss  47.1762 valid loss 1.036 and accuracy 0.7489
Epoch 2 train loss  47.2937 valid loss 1.036 and accuracy 0.7489
Epoch 3 train loss  47.2867 valid loss 1.036 and accuracy 0.7489
Epoch 4 train loss  47.2958 valid loss 1.036 and accuracy 0.7489
Epoch 5 train loss  47.2825 valid loss 1.036 and accuracy 0.7489
Epoch 6 train loss  47.2959 valid loss 1.036 and accuracy 0.7489
Epoch 7 train loss  47.3773 valid loss 1.036 and accuracy 0.7489
Epoch 8 train loss  47.4878 valid loss 1.036 and accuracy 0.7478
Epoch 9 train loss  47.5163 valid loss 1.036 and accuracy 0.7478
Epoch 10 train loss  47.5163 valid loss 1.036 and accuracy 0.7478
Epoch 11 train loss  47.5164 valid loss 1.036 and accuracy 0.7478
Epoch 12 train loss  47.5165 valid loss 1.036 and accuracy 0.7478
Epoch 13 train loss  47.5165 valid loss 1.036 and accuracy 0.7478
Epoch 14 train loss  47.5166 valid loss 1.036 and accuracy 0.7478
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2462]), points: -7


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  1.7144 valid loss 0.033 and accuracy 0.7500
Epoch 1 train loss  1.7124 valid loss 0.033 and accuracy 0.7500
Epoch 2 train loss  1.7114 valid loss 0.033 and accuracy 0.7500
Epoch 3 train loss  1.7113 valid loss 0.033 and accuracy 0.7500
Epoch 4 train loss  1.7106 valid loss 0.033 and accuracy 0.7500
Epoch 5 train loss  1.7105 valid loss 0.033 and accuracy 0.7500
Epoch 6 train loss  1.7087 valid loss 0.033 and accuracy 0.7500
Epoch 7 train loss  1.7086 valid loss 0.033 and accuracy 0.7500
Epoch 8 train loss  1.7072 valid loss 0.033 and accuracy 0.7500
Epoch 9 train loss  1.7057 valid loss 0.033 and accuracy 0.7500
Epoch 10 train loss  1.7028 valid loss 0.033 and accuracy 0.7500
Epoch 11 train loss  1.7005 valid loss 0.033 and accuracy 0.7500
Epoch 12 train loss  1.6972 valid loss 0.033 and accuracy 0.7500
Epoch 13 train loss  1.6926 valid loss 0.033 and accuracy 0.7500
Epoch 14 train loss  1.6867 valid loss 0.033 and accuracy 0.7500
Epoch 15 train loss  1.6816 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2217]), points: -26
----------
TEST Dominio: nursery
accuracy: tensor([0.2637]), points: 25


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [17]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  1.7210 valid loss 0.033 and accuracy 0.7500
Epoch 1 train loss  1.7147 valid loss 0.033 and accuracy 0.7500
Epoch 2 train loss  1.7136 valid loss 0.033 and accuracy 0.7500
Epoch 3 train loss  1.7141 valid loss 0.033 and accuracy 0.7500
Epoch 4 train loss  1.7132 valid loss 0.033 and accuracy 0.7500
Epoch 5 train loss  1.7130 valid loss 0.033 and accuracy 0.7500
Epoch 6 train loss  1.7145 valid loss 0.033 and accuracy 0.7500
Epoch 7 train loss  1.7129 valid loss 0.033 and accuracy 0.7500
Epoch 8 train loss  1.7134 valid loss 0.033 and accuracy 0.7500
Epoch 9 train loss  1.7126 valid loss 0.033 and accuracy 0.7500
Epoch 10 train loss  1.7124 valid loss 0.033 and accuracy 0.7500
Epoch 11 train loss  1.7119 valid loss 0.033 and accuracy 0.7500
Epoch 12 train loss  1.7112 valid loss 0.033 and accuracy 0.7500
Epoch 13 train loss  1.7113 valid loss 0.033 and accuracy 0.7500
Epoch 14 train loss  1.7111 valid loss 0.033 and accuracy 0.7500
Epoch 15 train loss  1.7098 valid l

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.2087]), points: -38
----------
TEST Dominio: nursery
accuracy: tensor([0.2264]), points: -43


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [22]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [23]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [24]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [25]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [26]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [27]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [28]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.7189 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7130 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.6916 valid loss 0.005 and accuracy 0.7500
Epoch 3 train loss  0.6468 valid loss 0.005 and accuracy 0.7500
Epoch 4 train loss  0.6078 valid loss 0.006 and accuracy 0.7496
Epoch 5 train loss  0.5801 valid loss 0.006 and accuracy 0.7496
Epoch 6 train loss  0.5552 valid loss 0.007 and accuracy 0.7494
Epoch 7 train loss  0.5307 valid loss 0.006 and accuracy 0.7489
Epoch 8 train loss  0.5199 valid loss 0.006 and accuracy 0.7485
Epoch 9 train loss  0.5048 valid loss 0.006 and accuracy 0.7494
Epoch 10 train loss  0.4978 valid loss 0.006 and accuracy 0.7494
Epoch 11 train loss  0.4840 valid loss 0.006 and accuracy 0.7476
Epoch 12 train loss  0.4814 valid loss 0.007 and accuracy 0.7491
Epoch 13 train loss  0.4781 valid loss 0.006 and accuracy 0.7482
Epoch 14 train loss  0.4716 valid loss 0.006 and accuracy 0.7483
Epoch 15 train loss  0.4667 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: nursery
accuracy: tensor([0.1913]), points: -54
----------
TEST Dominio: nursery
accuracy: tensor([0.2615]), points: 21


In [31]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [34]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

TypeError: super(type, obj): obj must be an instance or subtype of type

In [None]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

In [None]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

In [None]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)