In [13]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle 
from training import train, validate, evaluate, evaluate_better, make_embedding_matrix, make_embedding_matrix, evaluator, evaluator_ir
from training import get_optimizer

from supervised_models import BiLSTM_model

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [6]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [7]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)

vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [8]:
trainset = HeadQA(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [10]:
# embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
# word_to_idx, embeddings = load_embeddings_from_file(embedding_file)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/word_to_index.pickle', word_to_idx)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/wordvectors.pickle', embeddings)

In [10]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

In [11]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [12]:
model = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)



NameError: name 'get_optimizer' is not defined

In [14]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=50)



Epoch 0 train loss  0.5028 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.5005 valid loss 0.003 and accuracy 0.7500
Epoch 2 train loss  0.5004 valid loss 0.003 and accuracy 0.7500
Epoch 3 train loss  0.4996 valid loss 0.003 and accuracy 0.7500
Epoch 4 train loss  0.4986 valid loss 0.003 and accuracy 0.7500
Epoch 5 train loss  0.4956 valid loss 0.003 and accuracy 0.7498
Epoch 6 train loss  0.4929 valid loss 0.003 and accuracy 0.7493
Epoch 7 train loss  0.4889 valid loss 0.003 and accuracy 0.7454
Epoch 8 train loss  0.4820 valid loss 0.004 and accuracy 0.7460
Epoch 9 train loss  0.4759 valid loss 0.003 and accuracy 0.7438
Epoch 10 train loss  0.4667 valid loss 0.003 and accuracy 0.7375
Epoch 11 train loss  0.4525 valid loss 0.004 and accuracy 0.7320
Epoch 12 train loss  0.4433 valid loss 0.004 and accuracy 0.6991
Epoch 13 train loss  0.4274 valid loss 0.004 and accuracy 0.7399
Epoch 14 train loss  0.4130 valid loss 0.004 and accuracy 0.7053
Epoch 15 train loss  0.4011 valid l

In [15]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)
acc, points

(tensor([0.2482]), -10)

In [16]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)
acc, points

(tensor([0.2659]), 174)

In [17]:
save_dataset_to_pickle('../data/train_results_bilstm_sig.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_bilstm.pickle')

In [18]:
model_path = os.getcwd() + '/trained_models/bilstm_sig'
torch.save(model.state_dict(), model_path)

In [15]:
model = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
model.load_state_dict(torch.load(os.getcwd() + '/trained_models/bilstm_sig'))
model.eval()

BiLSTM_model(
  (emb): Embedding(20403, 300, padding_idx=0)
  (dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(300, 64, batch_first=True, dropout=0.5, bidirectional=True)
  (attn): Linear(in_features=128, out_features=30, bias=True)
  (linear): Linear(in_features=3840, out_features=1, bias=True)
)

In [16]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator)
acc, points, acc_list, points_list



(0.24831116,
 -1.6666666666666667,
 [tensor(0.2920),
  tensor(0.1957),
  tensor(0.2222),
  tensor(0.2424),
  tensor(0.2788),
  tensor(0.2588)],
 [38, -50, -25, -7, 26, 8])

In [17]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.2658792,
 14.5,
 [tensor(0.2763),
  tensor(0.2287),
  tensor(0.2544),
  tensor(0.2284),
  tensor(0.2565),
  tensor(0.2987),
  tensor(0.2522),
  tensor(0.2802),
  tensor(0.2751),
  tensor(0.2165),
  tensor(0.2711),
  tensor(0.3524)],
 [24, -19, 4, -20, 6, 45, 2, 28, 23, -31, 19, 93])