In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle 
from training import train, validate, evaluate, make_embedding_matrix, make_embedding_matrix, evaluator, evaluator_ir

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)

vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [9]:
class BiLSTM_model(nn.Module):
    def __init__(self, embedding_size, num_embeddings, num_classes, hidden_size=64,
                 pretrained_embeddings=None, padding_idx=0, max_length = 110):
        super(BiLSTM_model, self).__init__()

        self.embedding_size = embedding_size
        self.num_embeddings = num_embeddings
        self.hidden_size = hidden_size
        self.max_length = max_length
        
        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim=self.embedding_size,num_embeddings=self.num_embeddings,
                                    padding_idx=padding_idx)
        else:
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=self.embedding_size, num_embeddings=self.num_embeddings,
                                    padding_idx=padding_idx, _weight=pretrained_embeddings)
            self.emb.weight.requires_grad = False
        self.dropout = nn.Dropout(0.3)            
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True, dropout = 0.5,bidirectional = True)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.linear = nn.Linear(self.hidden_size*2*self.max_length, num_classes) 
            
    def forward(self, x):
        x = self.emb(x)
        x = self.dropout(x)
        out, (ht, ct) = self.lstm(x)
        attn = self.attn(out)
        attn_weights = F.softmax(torch.tanh(attn), dim=1)
        attn_applied = torch.bmm(attn_weights, out)
        attn_applied = attn_applied.flatten(1) 
        return F.softmax(self.linear(attn_applied), dim = 0)

In [10]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [11]:
# embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
# word_to_idx, embeddings = load_embeddings_from_file(embedding_file)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/word_to_index.pickle', word_to_idx)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/wordvectors.pickle', embeddings)

In [12]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors.pickle')

In [13]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [14]:
model = BiLSTM_model(embedding_matrix.shape[1], embedding_matrix.shape[0], 1, 
                     pretrained_embeddings=embedding_matrix, max_length=trainset.max_length)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)



In [15]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=100)

Epoch 0 train loss  0.7191 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7182 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.7177 valid loss 0.005 and accuracy 0.7500
Epoch 3 train loss  0.7161 valid loss 0.005 and accuracy 0.7500
Epoch 4 train loss  0.7143 valid loss 0.005 and accuracy 0.7500
Epoch 5 train loss  0.7113 valid loss 0.005 and accuracy 0.7500
Epoch 6 train loss  0.7065 valid loss 0.005 and accuracy 0.7500
Epoch 7 train loss  0.7032 valid loss 0.005 and accuracy 0.7500
Epoch 8 train loss  0.6982 valid loss 0.005 and accuracy 0.7500
Epoch 9 train loss  0.6897 valid loss 0.005 and accuracy 0.7500
Epoch 10 train loss  0.6806 valid loss 0.006 and accuracy 0.7500
Epoch 11 train loss  0.6689 valid loss 0.006 and accuracy 0.7500
Epoch 12 train loss  0.6632 valid loss 0.006 and accuracy 0.7500
Epoch 13 train loss  0.6516 valid loss 0.006 and accuracy 0.7500
Epoch 14 train loss  0.6425 valid loss 0.006 and accuracy 0.7500
Epoch 15 train loss  0.6328 valid l

In [16]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)
acc, points

(tensor([0.2218]), -154)

In [17]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)
acc, points

(tensor([0.2713]), 234)

In [18]:
save_dataset_to_pickle('../data/train_results_bilstm.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_bilstm.pickle')

In [19]:
model_path = os.getcwd() + '/trained_models/bilstm'
torch.save(model.state_dict(), model_path)