In [1]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from utils_data import random_undersampling
from training import evaluate, train_ir, validate_ir, evaluator_ir, load_embeddings_from_file, make_embedding_matrix


import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM

from ir_models import LSTM_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
#training_instances = parse_ir_dataset(training)
#validation_instances = parse_ir_dataset(validation)
#testing_instances = parse_ir_dataset(testing)

#oversampled_training = random_oversamplig(training_instances)
#undersampled_training = random_undersampling(training_instances)

#save_dataset_to_pickle('../data/training_ir.pickle', training_instances)
#save_dataset_to_pickle('../data/validation_ir.pickle', validation_instances)
#save_dataset_to_pickle('../data/testing_ir.pickle', testing_instances)
#save_dataset_to_pickle('../data/oversampled_training_ir.pickle', oversampled_training)
# save_dataset_to_pickle('../data/undersampled_training_ir.pickle', undersampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')
undersampled_training = load_dataset_from_pickle('../data/undersampled_training_ir.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)

In [7]:
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [8]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [10]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [11]:
# embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
# word_to_idx, embeddings = load_embeddings_from_file(embedding_file)

# save_dataset_to_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle', word_to_idx)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle', embeddings)

In [12]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

In [13]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [14]:
model = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...




In [15]:
def validate_ir(model, dataloader):
    model.eval()
    loss, right, total = 0, 0, 0
    y_true, y_preds = [], []
    for x_0, x_1, y in dataloader:
        batch = y.shape[0]
        out = model(x_0.long(), x_1.long())
        loss = F.binary_cross_entropy(out, y.float())
        loss += batch*(loss.item())
        total += batch
        # pred = torch.max(out, dim=1)[1]
        pred = torch.where(out > 0.4, 1, 0)
        y_true.append(y)
        y_preds.append(pred)
        right += (pred == y).float().sum().item()
    return loss/total, right/total, y_true, y_preds

In [16]:
training_results = train_ir(model, optimizer, train_dt, valid_dt, validate_ir, epochs=50)



Epoch 0 train loss  0.5017 valid loss 0.003 and accuracy 0.7500
Epoch 1 train loss  0.4927 valid loss 0.003 and accuracy 0.7496
Epoch 2 train loss  0.4625 valid loss 0.004 and accuracy 0.7294
Epoch 3 train loss  0.4077 valid loss 0.005 and accuracy 0.6996
Epoch 4 train loss  0.3547 valid loss 0.005 and accuracy 0.7066
Epoch 5 train loss  0.2838 valid loss 0.005 and accuracy 0.6776
Epoch 6 train loss  0.2543 valid loss 0.005 and accuracy 0.6524
Epoch 7 train loss  0.1998 valid loss 0.005 and accuracy 0.6452
Epoch 8 train loss  0.1666 valid loss 0.007 and accuracy 0.7204
Epoch 9 train loss  0.1435 valid loss 0.009 and accuracy 0.6851
Epoch 10 train loss  0.1217 valid loss 0.009 and accuracy 0.6562
Epoch 11 train loss  0.1068 valid loss 0.007 and accuracy 0.6301
Epoch 12 train loss  0.0974 valid loss 0.008 and accuracy 0.6557
Epoch 13 train loss  0.0780 valid loss 0.009 and accuracy 0.6564
Epoch 14 train loss  0.0770 valid loss 0.008 and accuracy 0.6404
Epoch 15 train loss  0.0686 valid l

In [17]:
acc, points = evaluate(model, validation, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2584]), 46)

In [18]:
acc, points = evaluate(model, testing, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2659]), 174)

In [19]:
save_dataset_to_pickle('../data/train_results_lstm_qa_sig.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_lstm_qa_sig.pickle')

In [20]:
model_path = os.getcwd() + '/trained_models/lstm_qa_sig'
torch.save(model.state_dict(), model_path)