In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
CATEGORY = 'chemistry'

In [3]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [4]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [5]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [6]:
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=training_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_categ, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_categ, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

#### Logistic Regressor

In [9]:
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(logistic_regressor, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(logistic_regressor, optimizer, train_dt, valid_dt, validate, epochs=30)



Epoch 0 train loss  51.2619 valid loss 0.921 and accuracy 0.7489
Epoch 1 train loss  50.4455 valid loss 0.921 and accuracy 0.7500
Epoch 2 train loss  50.4435 valid loss 0.921 and accuracy 0.7489
Epoch 3 train loss  50.4303 valid loss 0.921 and accuracy 0.7467
Epoch 4 train loss  50.4188 valid loss 0.921 and accuracy 0.7467
Epoch 5 train loss  50.4054 valid loss 0.921 and accuracy 0.7467
Epoch 6 train loss  50.7387 valid loss 0.921 and accuracy 0.7455
Epoch 7 train loss  50.3909 valid loss 0.921 and accuracy 0.7455
Epoch 8 train loss  50.3906 valid loss 0.921 and accuracy 0.7455
Epoch 9 train loss  50.3903 valid loss 0.921 and accuracy 0.7478
Epoch 10 train loss  50.3899 valid loss 0.921 and accuracy 0.7478
Epoch 11 train loss  50.3894 valid loss 0.921 and accuracy 0.7478
Epoch 12 train loss  50.4187 valid loss 0.921 and accuracy 0.7500
Epoch 13 train loss  50.3921 valid loss 0.921 and accuracy 0.7500
Epoch 14 train loss  50.3917 valid loss 0.921 and accuracy 0.7500
Epoch 15 train loss 

In [11]:
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: chemistry
accuracy: tensor([0.2149]), points: -32
----------
TEST Dominio: chemistry
accuracy: tensor([0.2424]), points: -14


In [12]:
model_path = os.getcwd() + f'/trained_models/logistic_regressor_{CATEGORY}'
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
lstm = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(lstm, lr = 0.001, wd = 1e-5)

In [14]:
training_results = train(lstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.6326 valid loss 0.037 and accuracy 0.2500
Epoch 1 train loss  0.7450 valid loss 0.027 and accuracy 0.2500
Epoch 2 train loss  0.7008 valid loss 0.026 and accuracy 0.2500
Epoch 3 train loss  0.6944 valid loss 0.026 and accuracy 0.2500
Epoch 4 train loss  0.6887 valid loss 0.026 and accuracy 0.2511
Epoch 5 train loss  0.6829 valid loss 0.026 and accuracy 0.2545
Epoch 6 train loss  0.6671 valid loss 0.027 and accuracy 0.2556
Epoch 7 train loss  0.6441 valid loss 0.027 and accuracy 0.2746
Epoch 8 train loss  0.6027 valid loss 0.028 and accuracy 0.3371
Epoch 9 train loss  0.5487 valid loss 0.029 and accuracy 0.3705
Epoch 10 train loss  0.4892 valid loss 0.028 and accuracy 0.4531
Epoch 11 train loss  0.4379 valid loss 0.027 and accuracy 0.5636
Epoch 12 train loss  0.3871 valid loss 0.029 and accuracy 0.5647
Epoch 13 train loss  0.3538 valid loss 0.029 and accuracy 0.5882
Epoch 14 train loss  0.3214 valid loss 0.031 and accuracy 0.5960
Epoch 15 train loss  0.2987 valid l

In [15]:
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: chemistry
accuracy: tensor([0.2632]), points: 12
----------
TEST Dominio: chemistry
accuracy: tensor([0.3057]), points: 102


In [16]:
model_path = os.getcwd() + f'/trained_models/basic_lstm_{CATEGORY}'
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [17]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [18]:
bilstm = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(bilstm, lr = 0.01, wd = 1e-5)



In [19]:
training_results = train(bilstm, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.3688 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  49.5652 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: chemistry
accuracy: tensor([0.2061]), points: -40
----------
TEST Dominio: chemistry
accuracy: tensor([0.2402]), points: -18


In [21]:
model_path = os.getcwd() + f'/trained_models/bilstm_{CATEGORY}'
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [22]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [23]:
training_categ = filter_by_category(oversampled_training, category=CATEGORY)
validation_categ = filter_by_category(validation_instances, category=CATEGORY)
testing_categ = filter_by_category(testing_instances, category=CATEGORY)

dev_categ = filter_by_category(validation, category=CATEGORY)
test_categ = filter_by_category(testing, category=CATEGORY)

In [24]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [25]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [26]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [27]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

#### LSTM-QA

In [28]:
lstm_qa = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [29]:
training_results = train_ir(lstm_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5016 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4948 valid loss 0.003 and accuracy 0.7498
Epoch 2 train loss  0.4721 valid loss 0.003 and accuracy 0.7441
Epoch 3 train loss  0.4231 valid loss 0.003 and accuracy 0.6813
Epoch 4 train loss  0.3735 valid loss 0.004 and accuracy 0.6792
Epoch 5 train loss  0.3168 valid loss 0.004 and accuracy 0.6816
Epoch 6 train loss  0.2699 valid loss 0.005 and accuracy 0.7031
Epoch 7 train loss  0.2033 valid loss 0.006 and accuracy 0.6472
Epoch 8 train loss  0.1693 valid loss 0.006 and accuracy 0.6551
Epoch 9 train loss  0.1365 valid loss 0.007 and accuracy 0.6728
Epoch 10 train loss  0.1184 valid loss 0.007 and accuracy 0.6822
Epoch 11 train loss  0.1045 valid loss 0.008 and accuracy 0.6824
Epoch 12 train loss  0.0855 valid loss 0.008 and accuracy 0.6858
Epoch 13 train loss  0.0809 valid loss 0.009 and accuracy 0.6969
Epoch 14 train loss  0.0741 valid loss 0.010 and accuracy 0.7085
Epoch 15 train loss  0.0651 valid l

In [30]:
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: chemistry
accuracy: tensor([0.2544]), points: 4
----------
TEST Dominio: chemistry
accuracy: tensor([0.2445]), points: -10


In [31]:
model_path = os.getcwd() + f'/trained_models/lstm_qa_{CATEGORY}'
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [32]:
lstm_cnn_qa = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(lstm_cnn_qa, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [33]:
training_results = train_ir(lstm_cnn_qa, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.5020 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4957 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.4693 valid loss 0.003 and accuracy 0.7430
Epoch 3 train loss  0.4211 valid loss 0.004 and accuracy 0.6958
Epoch 4 train loss  0.3601 valid loss 0.004 and accuracy 0.7085
Epoch 5 train loss  0.2905 valid loss 0.005 and accuracy 0.6991
Epoch 6 train loss  0.2380 valid loss 0.005 and accuracy 0.5949
Epoch 7 train loss  0.2209 valid loss 0.005 and accuracy 0.6691
Epoch 8 train loss  0.1688 valid loss 0.007 and accuracy 0.7044
Epoch 9 train loss  0.1455 valid loss 0.009 and accuracy 0.7221
Epoch 10 train loss  0.1297 valid loss 0.010 and accuracy 0.7182
Epoch 11 train loss  0.1143 valid loss 0.010 and accuracy 0.6983
Epoch 12 train loss  0.1113 valid loss 0.008 and accuracy 0.6489
Epoch 13 train loss  0.0968 valid loss 0.007 and accuracy 0.6730
Epoch 14 train loss  0.0856 valid loss 0.008 and accuracy 0.6513
Epoch 15 train loss  0.0746 valid l

In [34]:
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(f'DEV Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')
print('----------')
acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(f'TEST Dominio: {CATEGORY}')
print(f'accuracy: {acc}, points: {points}')

DEV Dominio: chemistry
accuracy: tensor([0.2851]), points: 32
----------
TEST Dominio: chemistry
accuracy: tensor([0.2729]), points: 42


In [35]:
model_path = os.getcwd() + f'/trained_models/lstm_cnn_qa_{CATEGORY}'
torch.save(lstm_cnn_qa.state_dict(), model_path)