In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle 
from training import train, validate, evaluate, evaluate_better, make_embedding_matrix, make_embedding_matrix, evaluator, evaluator_ir
from training import get_optimizer

from supervised_models import BiLSTM_model

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=oversampled_training, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [9]:
# embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
# word_to_idx, embeddings = load_embeddings_from_file(embedding_file)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/word_to_index.pickle', word_to_idx)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/wordvectors.pickle', embeddings)

In [10]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

In [11]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [12]:
torch.random.manual_seed(42)
model = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)



In [13]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=50)



Epoch 0 train loss  0.3214 valid loss 0.455 and accuracy 0.2500
Epoch 1 train loss  50.0188 valid loss 0.455 and accuracy 0.2500
Epoch 2 train loss  50.0188 valid loss 0.455 and accuracy 0.2500
Epoch 3 train loss  50.0188 valid loss 0.455 and accuracy 0.2500
Epoch 4 train loss  22.4972 valid loss 0.052 and accuracy 0.2500
Epoch 5 train loss  0.3729 valid loss 0.043 and accuracy 0.2500
Epoch 6 train loss  0.3801 valid loss 0.044 and accuracy 0.2500
Epoch 7 train loss  0.3943 valid loss 0.044 and accuracy 0.2500
Epoch 8 train loss  0.4127 valid loss 0.046 and accuracy 0.2500
Epoch 9 train loss  0.4573 valid loss 0.043 and accuracy 0.2500
Epoch 10 train loss  0.4597 valid loss 0.044 and accuracy 0.2500
Epoch 11 train loss  0.4833 valid loss 0.049 and accuracy 0.2500
Epoch 12 train loss  0.4711 valid loss 0.047 and accuracy 0.2500
Epoch 13 train loss  0.4544 valid loss 0.047 and accuracy 0.2500
Epoch 14 train loss  0.4586 valid loss 0.067 and accuracy 0.2500
Epoch 15 train loss  0.4870 val

In [14]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)
acc, points

(tensor([0.2291]), -114)

In [15]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)
acc, points

(tensor([0.2480]), -22)

In [16]:
save_dataset_to_pickle('../data/train_results_bilstm.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_bilstm.pickle')

In [17]:
model_path = os.getcwd() + '/trained_models/bilstm'
torch.save(model.state_dict(), model_path)

In [19]:
model = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
model.load_state_dict(torch.load(model_path))
model.eval()

BiLSTM_model(
  (emb): Embedding(20403, 300, padding_idx=0)
  (dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(300, 64, batch_first=True, dropout=0.5, bidirectional=True)
  (attn): Linear(in_features=128, out_features=30, bias=True)
  (linear): Linear(in_features=3840, out_features=1, bias=True)
)

In [20]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.22909921,
 -19.0,
 [tensor(0.2389),
  tensor(0.2043),
  tensor(0.2044),
  tensor(0.2597),
  tensor(0.2522),
  tensor(0.2149)],
 [-10, -42, -41, 9, 2, -32])

In [21]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.24795495,
 -1.8333333333333333,
 [tensor(0.2719),
  tensor(0.2377),
  tensor(0.2149),
  tensor(0.2414),
  tensor(0.2391),
  tensor(0.2727),
  tensor(0.2389),
  tensor(0.2543),
  tensor(0.2576),
  tensor(0.2468),
  tensor(0.2578),
  tensor(0.2423)],
 [20, -11, -32, -8, -10, 21, -10, 4, 7, -3, 7, -7])