In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import  Vocabulary, Vectorizer, HeadQA, HeadQA_IR
from utils_data import parse_dataset, parse_ir_dataset, random_oversamplig, random_undersampling
from utils_data import filter_by_category, save_dataset_to_pickle, load_dataset_from_pickle
import training
from training import get_optimizer, train, train_ir, validate, validate_ir, evaluator, evaluator_ir, evaluate
from training import load_embeddings_from_file, make_embedding_matrix
from training import pad_seq, encoder_bert, encoder_bert_ir, encoder_bert_instance, encoder_bert_ir_instance
from training import evaluator_bert, evaluator_bert_ir, evaluate_better

from supervised_models import LogisticRegression, BasicLSTM, BiLSTM_model
from ir_models import LSTM_QA, LSTM_CNN_QA, BERT_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
# HEAD-QA category used to filter every dataset and to name every saved
# checkpoint in this notebook.
CATEGORY = "chemistry"

In [3]:
from datasets import load_dataset

# Load the Spanish ('es') configuration of the 'head_qa' dataset and pull out
# its three splits.
data_es = load_dataset('head_qa', 'es')
# NOTE(review): `training` here rebinds the name of the `training` module
# imported in the first cell. The helpers were imported with
# `from training import ...`, so they stay reachable, but `training.<attr>`
# would no longer refer to the module after this cell.
training, validation, testing = (
    data_es[split_name] for split_name in ('train', 'validation', 'test')
)

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


### Modelos supervisados puros

In [36]:
# Load the pre-parsed instance lists for the purely supervised models.
# NOTE(review): these are pickle files; only load files produced by this
# project, since unpickling untrusted data can execute arbitrary code.
training_instances, validation_instances, testing_instances = map(
    load_dataset_from_pickle,
    (
        '../data/training.pickle',
        '../data/validation.pickle',
        '../data/testing.pickle',
    ),
)

# Training set actually used below (presumably the oversampled variant,
# judging by the file name - confirm against the data preparation notebook).
mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [37]:
# Keep only the instances that belong to CATEGORY.
# The *_categ instance lists feed the Dataset objects used for training.
training_categ, validation_categ, testing_categ = (
    filter_by_category(instances, category=CATEGORY)
    for instances in (mixed_training, validation_instances, testing_instances)
)

# dev_categ / test_categ are filtered from the raw dataset splits
# (`validation`, `testing`) and are the inputs of the final evaluation calls.
dev_categ, test_categ = (
    filter_by_category(raw_split, category=CATEGORY)
    for raw_split in (validation, testing)
)

In [38]:
# Build the vectorizer from the category-filtered training instances and
# expose its two vocabularies under short names.
vectorizer = Vectorizer.vectorize_training(training_categ)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [39]:
# Wrap each category-filtered split in a HeadQA dataset. All three share the
# training vectorizer and the same padding settings (max_length=30).
trainset, validset, testset = (
    HeadQA(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split_instances in (training_categ, validation_categ, testing_categ)
)

In [40]:
# Batch the three datasets with identical loader settings.
# NOTE(review): drop_last=True is also applied to the validation and test
# loaders, so a trailing incomplete batch is skipped when validating; and no
# shuffling is requested for the training loader. Confirm both are intended.
batch_size = 32
train_dt, valid_dt, test_dt = (
    DataLoader(dataset, batch_size=batch_size, drop_last=True)
    for dataset in (trainset, validset, testset)
)

#### Logistic Regressor

In [9]:
# Logistic regression baseline, sized from the dataset's max_length
# (presumably the input dimension; the trailing 1 is presumably the output
# size - confirm in supervised_models.LogisticRegression).
logistic_regressor = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(
    logistic_regressor,
    lr=0.01,
    wd=1e-5,
)

In [10]:
# Train the logistic regressor for 30 epochs; `validate` is the callback run
# against the validation loader.
training_results = train(
    logistic_regressor,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=30,
)



Epoch 0 train loss  44.8732 valid loss 1.151 and accuracy 0.7422
Epoch 1 train loss  30.8754 valid loss 2.729 and accuracy 0.3650
Epoch 2 train loss  55.4212 valid loss 2.532 and accuracy 0.2600
Epoch 3 train loss  57.4654 valid loss 2.532 and accuracy 0.2645
Epoch 4 train loss  57.6747 valid loss 2.762 and accuracy 0.2567
Epoch 5 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 6 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 7 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 8 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 9 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 10 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 11 train loss  57.7071 valid loss 2.762 and accuracy 0.2567
Epoch 12 train loss  57.8242 valid loss 2.532 and accuracy 0.2634
Epoch 13 train loss  57.5479 valid loss 2.532 and accuracy 0.2634
Epoch 14 train loss  57.5547 valid loss 2.532 and accuracy 0.2656
Epoch 15 train loss 

In [11]:
# Report accuracy and exam points of the logistic regressor on the
# category-filtered DEV split, then on the TEST split.
acc, points = evaluate(logistic_regressor, dev_categ, trainset.encode, evaluator)
print(
    f'DEV Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    '----------',
    sep='\n',
)

acc, points = evaluate(logistic_regressor, test_categ, trainset.encode, evaluator)
print(
    f'TEST Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    sep='\n',
)

DEV Dominio: chemistry
accuracy: tensor([0.2105]), points: -36
----------
TEST Dominio: chemistry
accuracy: tensor([0.2358]), points: -26


In [12]:
# Persist the trained logistic regressor's weights under
# <cwd>/trained_models_v2/logistic_regressor_<CATEGORY>.
# The path is built with os.path.join so separators are consistent on every
# platform, and the directory is created first because torch.save does not
# create missing parent directories.
models_dir = os.path.join(os.getcwd(), 'trained_models_v2')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, f'logistic_regressor_{CATEGORY}')
torch.save(logistic_regressor.state_dict(), model_path)

#### LSTM

In [13]:
# Basic LSTM classifier built over the training vocabulary, with 100-d
# embeddings. NOTE(review): the positional 64 is presumably the hidden size
# and the 1 the number of outputs - confirm in supervised_models.BasicLSTM.
lstm = BasicLSTM(
    len(vocab),
    64,
    trainset.max_length,
    1,
    embedding_dim=100,
)
optimizer = get_optimizer(lstm, lr=0.001, wd=1e-5)

In [14]:
# Train the LSTM for 50 epochs, validating with `validate` on valid_dt.
training_results = train(
    lstm,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=50,
)

Epoch 0 train loss  0.6541 valid loss 0.027 and accuracy 0.2500
Epoch 1 train loss  0.7101 valid loss 0.025 and accuracy 0.2500
Epoch 2 train loss  0.6955 valid loss 0.025 and accuracy 0.2500
Epoch 3 train loss  0.6936 valid loss 0.025 and accuracy 0.2500
Epoch 4 train loss  0.6913 valid loss 0.025 and accuracy 0.2489
Epoch 5 train loss  0.6884 valid loss 0.025 and accuracy 0.2556
Epoch 6 train loss  0.6857 valid loss 0.025 and accuracy 0.2612
Epoch 7 train loss  0.6804 valid loss 0.025 and accuracy 0.2645
Epoch 8 train loss  0.6713 valid loss 0.025 and accuracy 0.2701
Epoch 9 train loss  0.6575 valid loss 0.025 and accuracy 0.3237
Epoch 10 train loss  0.6210 valid loss 0.026 and accuracy 0.3795
Epoch 11 train loss  0.5796 valid loss 0.026 and accuracy 0.4364
Epoch 12 train loss  0.5199 valid loss 0.027 and accuracy 0.5089
Epoch 13 train loss  0.4638 valid loss 0.029 and accuracy 0.5569
Epoch 14 train loss  0.4080 valid loss 0.031 and accuracy 0.5558
Epoch 15 train loss  0.3836 valid l

In [15]:
# Report accuracy and exam points of the LSTM on the category-filtered DEV
# split, then on the TEST split.
acc, points = evaluate(lstm, dev_categ, trainset.encode, evaluator)
print(
    f'DEV Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    '----------',
    sep='\n',
)

acc, points = evaluate(lstm, test_categ, trainset.encode, evaluator)
print(
    f'TEST Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    sep='\n',
)

DEV Dominio: chemistry
accuracy: tensor([0.2675]), points: 16
----------
TEST Dominio: chemistry
accuracy: tensor([0.3210]), points: 130


In [16]:
# Persist the trained LSTM's weights under
# <cwd>/trained_models_v2/lstm_<CATEGORY>.
# os.path.join keeps separators consistent across platforms; the directory is
# created first because torch.save does not create missing parent directories.
models_dir = os.path.join(os.getcwd(), 'trained_models_v2')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, f'lstm_{CATEGORY}')
torch.save(lstm.state_dict(), model_path)

#### BiLSTM

In [41]:
# Build the pretrained embedding matrix for the supervised vocabulary.
# The .vec file path plus the two pickled lookups (word -> index, vectors)
# are handed to make_embedding_matrix together with the vocabulary's words.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

In [18]:
# BiLSTM initialised from the pretrained embedding matrix. The first two
# arguments are the matrix's second and first dimensions, in that order.
# NOTE(review): the training log recorded for this model shows the train loss
# jumping to ~57.6 after the first epoch and staying there; lr=0.01 may be too
# high for this model - confirm.
bilstm = BiLSTM_model(
    embedding_matrix.shape[1],
    embedding_matrix.shape[0],
    1,
    pretrained_embeddings=embedding_matrix,
    max_length=trainset.max_length,
)
optimizer = get_optimizer(bilstm, lr=0.01, wd=1e-5)



In [19]:
# Train the BiLSTM for 50 epochs, validating with `validate` on valid_dt.
training_results = train(
    bilstm,
    optimizer,
    train_dt,
    valid_dt,
    validate,
    epochs=50,
)

Epoch 0 train loss  0.4231 valid loss 2.762 and accuracy 0.2500
Epoch 1 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 2 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 3 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 4 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 5 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 6 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 7 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 8 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 9 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 10 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 11 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 12 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 13 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 14 train loss  57.5758 valid loss 2.762 and accuracy 0.2500
Epoch 15 train loss  

In [20]:
# Report accuracy and exam points of the BiLSTM on the category-filtered DEV
# split, then on the TEST split.
acc, points = evaluate(bilstm, dev_categ, trainset.encode, evaluator)
print(
    f'DEV Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    '----------',
    sep='\n',
)

acc, points = evaluate(bilstm, test_categ, trainset.encode, evaluator)
print(
    f'TEST Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    sep='\n',
)

DEV Dominio: chemistry
accuracy: tensor([0.2061]), points: -40
----------
TEST Dominio: chemistry
accuracy: tensor([0.2402]), points: -18


In [21]:
# Persist the trained BiLSTM's weights under
# <cwd>/trained_models_v2/bilstm_<CATEGORY>.
# os.path.join keeps separators consistent across platforms; the directory is
# created first because torch.save does not create missing parent directories.
models_dir = os.path.join(os.getcwd(), 'trained_models_v2')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, f'bilstm_{CATEGORY}')
torch.save(bilstm.state_dict(), model_path)

### Modelos supervisados IR

In [43]:
# Load the pre-parsed IR instance lists.
# NOTE: training_instances / validation_instances / testing_instances are
# rebound here, replacing the supervised instance lists loaded earlier in the
# notebook.
training_instances, validation_instances, testing_instances = map(
    load_dataset_from_pickle,
    (
        '../data/training_ir.pickle',
        '../data/validation_ir.pickle',
        '../data/testing_ir.pickle',
    ),
)
mixed_training_ir = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [44]:
# Keep only the IR instances that belong to CATEGORY. The *_categ names are
# rebound here, replacing the supervised lists built earlier in the notebook.
training_categ, validation_categ, testing_categ = (
    filter_by_category(instances, category=CATEGORY)
    for instances in (mixed_training_ir, validation_instances, testing_instances)
)

# Evaluation inputs are again filtered from the raw dataset splits.
dev_categ, test_categ = (
    filter_by_category(raw_split, category=CATEGORY)
    for raw_split in (validation, testing)
)

In [45]:
# Build the IR vectorizer from the category-filtered IR training instances.
# `vectorizer`, `vocab` and `label_vocab` now refer to the IR versions.
vectorizer = Vectorizer.vectorize_ir_dataset(training_categ)
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [46]:
# Wrap each IR split in a HeadQA_IR dataset (max_length=15 here, versus 30 in
# the supervised section). `trainset` / `validset` / `testset` are rebound.
trainset, validset, testset = (
    HeadQA_IR(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
    for split_instances in (training_categ, validation_categ, testing_categ)
)

In [47]:
# Batch the three IR datasets with identical loader settings.
# NOTE(review): drop_last=True is also applied to the validation and test
# loaders, so a trailing incomplete batch is skipped when validating - confirm
# this is intended.
batch_size = 32
train_dt, valid_dt, test_dt = (
    DataLoader(dataset, batch_size=batch_size, drop_last=True)
    for dataset in (trainset, validset, testset)
)

In [48]:
# Build the pretrained embedding matrix for the IR vocabulary, using the
# IR-specific pickled lookups. `embedding_matrix` is rebound to the IR version.
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(
    embedding_file,
    list(words),
    word_to_idx,
    embeddings,
)

#### LSTM-QA

In [28]:
# LSTM-QA model over the IR vocabulary, initialised with the pretrained
# 300-d embedding matrix.
lstm_qa = LSTM_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)
optimizer = get_optimizer(lstm_qa, lr=0.001, wd=1e-5)

Loading pretrained embeddings...


In [29]:
# Train LSTM-QA for 50 epochs with the IR training loop and IR validation
# callback.
training_results = train_ir(
    lstm_qa,
    optimizer,
    train_dt,
    valid_dt,
    validate_ir,
    epochs=50,
)

Epoch 0 train loss  0.4358 valid loss 0.212 and accuracy 0.2500
Epoch 1 train loss  0.8573 valid loss 0.086 and accuracy 0.2500
Epoch 2 train loss  0.7966 valid loss 0.037 and accuracy 0.2500
Epoch 3 train loss  0.7037 valid loss 0.030 and accuracy 0.2500
Epoch 4 train loss  0.6803 valid loss 0.029 and accuracy 0.2500
Epoch 5 train loss  0.6845 valid loss 0.028 and accuracy 0.2500
Epoch 6 train loss  0.6865 valid loss 0.027 and accuracy 0.2500
Epoch 7 train loss  0.6879 valid loss 0.027 and accuracy 0.2500
Epoch 8 train loss  0.6827 valid loss 0.027 and accuracy 0.2500
Epoch 9 train loss  0.6833 valid loss 0.027 and accuracy 0.2500
Epoch 10 train loss  0.6872 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.6905 valid loss 0.027 and accuracy 0.2500
Epoch 12 train loss  0.6866 valid loss 0.033 and accuracy 0.2500
Epoch 13 train loss  0.6696 valid loss 0.031 and accuracy 0.2500
Epoch 14 train loss  0.6623 valid loss 0.030 and accuracy 0.2500
Epoch 15 train loss  0.6888 valid l

In [30]:
# Report accuracy and exam points of LSTM-QA on the category-filtered DEV
# split, then on the TEST split, using the IR evaluator.
acc, points = evaluate(lstm_qa, dev_categ, trainset.encode, evaluator_ir)
print(
    f'DEV Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    '----------',
    sep='\n',
)

acc, points = evaluate(lstm_qa, test_categ, trainset.encode, evaluator_ir)
print(
    f'TEST Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    sep='\n',
)

DEV Dominio: chemistry
accuracy: tensor([0.2500]), points: 0
----------
TEST Dominio: chemistry
accuracy: tensor([0.2009]), points: -90


In [31]:
# Persist the trained LSTM-QA weights under
# <cwd>/trained_models_v2/lstm_qa_<CATEGORY>.
# os.path.join keeps separators consistent across platforms; the directory is
# created first because torch.save does not create missing parent directories.
models_dir = os.path.join(os.getcwd(), 'trained_models_v2')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, f'lstm_qa_{CATEGORY}')
torch.save(lstm_qa.state_dict(), model_path)

#### LSTM-QA/CNN

In [32]:
# LSTM-QA/CNN model over the IR vocabulary, initialised with the pretrained
# 300-d embedding matrix (same hyperparameters as LSTM-QA).
lstm_cnn_qa = LSTM_CNN_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)
optimizer = get_optimizer(lstm_cnn_qa, lr=0.001, wd=1e-5)

Loading pretrained embeddings...


In [33]:
# Train LSTM-QA/CNN for 50 epochs with the IR training loop and IR validation
# callback.
training_results = train_ir(
    lstm_cnn_qa,
    optimizer,
    train_dt,
    valid_dt,
    validate_ir,
    epochs=50,
)

Epoch 0 train loss  0.4387 valid loss 0.208 and accuracy 0.2500
Epoch 1 train loss  0.8319 valid loss 0.106 and accuracy 0.2500
Epoch 2 train loss  0.8092 valid loss 0.043 and accuracy 0.2500
Epoch 3 train loss  0.7443 valid loss 0.029 and accuracy 0.2500
Epoch 4 train loss  0.6813 valid loss 0.028 and accuracy 0.2500
Epoch 5 train loss  0.6797 valid loss 0.027 and accuracy 0.2500
Epoch 6 train loss  0.6848 valid loss 0.027 and accuracy 0.2500
Epoch 7 train loss  0.6916 valid loss 0.026 and accuracy 0.2500
Epoch 8 train loss  0.6899 valid loss 0.026 and accuracy 0.2500
Epoch 9 train loss  0.7004 valid loss 0.026 and accuracy 0.2500
Epoch 10 train loss  0.7016 valid loss 0.026 and accuracy 0.2500
Epoch 11 train loss  0.7006 valid loss 0.026 and accuracy 0.2500
Epoch 12 train loss  0.7029 valid loss 0.026 and accuracy 0.2500
Epoch 13 train loss  0.7053 valid loss 0.026 and accuracy 0.2500
Epoch 14 train loss  0.7138 valid loss 0.026 and accuracy 0.2500
Epoch 15 train loss  0.7013 valid l

In [34]:
# Report accuracy and exam points of LSTM-QA/CNN on the category-filtered DEV
# split, then on the TEST split, using the IR evaluator.
acc, points = evaluate(lstm_cnn_qa, dev_categ, trainset.encode, evaluator_ir)
print(
    f'DEV Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    '----------',
    sep='\n',
)

acc, points = evaluate(lstm_cnn_qa, test_categ, trainset.encode, evaluator_ir)
print(
    f'TEST Dominio: {CATEGORY}',
    f'accuracy: {acc}, points: {points}',
    sep='\n',
)

DEV Dominio: chemistry
accuracy: tensor([0.2632]), points: 12
----------
TEST Dominio: chemistry
accuracy: tensor([0.3057]), points: 102


In [35]:
# Persist the trained LSTM-QA/CNN weights under
# <cwd>/trained_models_v2/lstm_cnn_qa_<CATEGORY>.
# os.path.join keeps separators consistent across platforms; the directory is
# created first because torch.save does not create missing parent directories.
models_dir = os.path.join(os.getcwd(), 'trained_models_v2')
os.makedirs(models_dir, exist_ok=True)
model_path = os.path.join(models_dir, f'lstm_cnn_qa_{CATEGORY}')
torch.save(lstm_cnn_qa.state_dict(), model_path)

### Evaluación

In [42]:
# Reload the three purely supervised checkpoints (logistic regression, LSTM,
# BiLSTM) and report mean accuracy / points on the DEV and TEST splits.
#
# At this position in the notebook `training_categ`, `vectorizer`, `vocab`,
# `trainset` and `embedding_matrix` have already been rebound to their IR
# versions (HeadQA_IR with max_length=15, IR vocabulary and embeddings).
# Building the supervised models from those globals would give them shapes
# that differ from the saved checkpoints, so the supervised artefacts are
# rebuilt here under their own names. `mixed_training` is never rebound by the
# IR section, so it is still the supervised training set.
sup_training_categ = filter_by_category(mixed_training, category=CATEGORY)
sup_vectorizer = Vectorizer.vectorize_training(sup_training_categ)
sup_vocab = sup_vectorizer.sentence_vocab
sup_trainset = HeadQA(instances=sup_training_categ, vectorizer=sup_vectorizer,
                      right_padding=False, max_length=30)

# Same embedding sources as the BiLSTM training cell (non-IR lookups).
sup_word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
sup_embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')
sup_embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
sup_embedding_matrix = make_embedding_matrix(
    sup_embedding_file,
    list(sup_vocab.vocab2index.keys()),
    sup_word_to_idx,
    sup_embeddings,
)

# Re-create the architectures with the hyperparameters used at training time.
logistic_regressor = LogisticRegression(sup_trainset.max_length, 1)
lstm = BasicLSTM(len(sup_vocab), 64, sup_trainset.max_length, 1, embedding_dim=100)
bilstm = BiLSTM_model(sup_embedding_matrix.shape[1], sup_embedding_matrix.shape[0], 1,
                     pretrained_embeddings=sup_embedding_matrix, max_length=sup_trainset.max_length)

models = [logistic_regressor, lstm, bilstm]
paths = [os.getcwd() + f'/trained_models_v2/logistic_regressor_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/lstm_{CATEGORY}',
         os.getcwd() + f'/trained_models_v2/bilstm_{CATEGORY}']

print(paths[0])

for i, model in enumerate(models):
    # Restore the trained weights and switch to inference mode.
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    # dev_categ / test_categ come from the raw dataset splits and are the same
    # in both sections of the notebook.
    for split_label, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
        acc, points, acc_list, points_list = evaluate_better(
            model, split_data, sup_trainset.encode, evaluator)
        print(split_label)
        print('Accuracy media', acc)
        print('Puntos media', points)
        print(acc_list)
        print(points_list)
        print('---------')
    print()

C:\Users\tec005m\mds\TFM\head-qa-afi\code/trained_models_v2/logistic_regressor_chemistry
DEV
Accuracy media 0.21052632
Puntos media -36.0
[tensor(0.2105)]
[-36]
---------
TEST
Accuracy media 0.23548257
Puntos media -13.0
[tensor(0.2727), tensor(0.1982)]
[21, -47]
---------

DEV
Accuracy media 0.26754385
Puntos media 16.0
[tensor(0.2675)]
[16]
---------
TEST
Accuracy media 0.3207754
Puntos media 65.0
[tensor(0.3420), tensor(0.2996)]
[85, 45]
---------

DEV
Accuracy media 0.20614035
Puntos media -40.0
[tensor(0.2061)]
[-40]
---------
TEST
Accuracy media 0.23981157
Puntos media -9.0
[tensor(0.2814), tensor(0.1982)]
[29, -47]
---------



In [49]:
# Reload the two IR checkpoints (LSTM-QA and LSTM-QA/CNN) and report mean
# accuracy / points on the DEV and TEST splits with the IR evaluator.
# The architectures are re-created with the hyperparameters used at training
# time so the saved state dicts can be loaded into them.
lstm_qa = LSTM_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)
lstm_cnn_qa = LSTM_CNN_QA(
    vocab_size=len(vocab),
    hidden_size=64,
    x_size=trainset.max_length,
    n_classes=1,
    embedding_size=300,
    pretrained_embeddings=embedding_matrix,
)

models = [lstm_qa, lstm_cnn_qa]

paths = [
    os.getcwd() + f'/trained_models_v2/lstm_qa_{CATEGORY}',
    os.getcwd() + f'/trained_models_v2/lstm_cnn_qa_{CATEGORY}',
]

for i, model in enumerate(models):
    # Restore the trained weights and switch to inference mode.
    model.load_state_dict(torch.load(paths[i]))
    model.eval()
    for split_label, split_data in (('DEV', dev_categ), ('TEST', test_categ)):
        acc, points, acc_list, points_list = evaluate_better(
            model, split_data, trainset.encode, evaluator_ir)
        print(split_label)
        print('Accuracy media', acc)
        print('Puntos media', points)
        print(acc_list)
        print(points_list)
        print('---------')
    print()

Loading pretrained embeddings...
Loading pretrained embeddings...
DEV
Accuracy media 0.25
Puntos media 0.0
[tensor(0.2500)]
[0]
---------
TEST
Accuracy media 0.20085055
Puntos media -45.0
[tensor(0.2035), tensor(0.1982)]
[-43, -47]
---------

DEV
Accuracy media 0.2631579
Puntos media 12.0
[tensor(0.2632)]
[12]
---------
TEST
Accuracy media 0.3056239
Puntos media 51.0
[tensor(0.3117), tensor(0.2996)]
[57, 45]
---------

