In [1]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import train, validate, evaluate, evaluator_ir, train_ir, validate_ir, load_embeddings_from_file, make_embedding_matrix
from training import get_optimizer, evaluate_better

import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM

from ir_models import LSTM_CNN_QA

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training_ir.pickle')

In [5]:
vectorizer = Vectorizer.vectorize_ir_dataset(mixed_training)

In [6]:
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA_IR(instances=mixed_training, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [9]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

In [10]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [11]:
torch.random.manual_seed(42)
model = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...




In [12]:
training_results = train_ir(model, optimizer, train_dt, valid_dt, validate_ir, epochs=50)



Epoch 0 train loss  0.3662 valid loss 0.058 and accuracy 0.2500
Epoch 1 train loss  0.4419 valid loss 0.040 and accuracy 0.2500
Epoch 2 train loss  0.4381 valid loss 0.035 and accuracy 0.2500
Epoch 3 train loss  0.4143 valid loss 0.035 and accuracy 0.2500
Epoch 4 train loss  0.4194 valid loss 0.034 and accuracy 0.2500
Epoch 5 train loss  0.4207 valid loss 0.034 and accuracy 0.2500
Epoch 6 train loss  0.4175 valid loss 0.032 and accuracy 0.2500
Epoch 7 train loss  0.4179 valid loss 0.033 and accuracy 0.2500
Epoch 8 train loss  0.4154 valid loss 0.036 and accuracy 0.2500
Epoch 9 train loss  0.4214 valid loss 0.036 and accuracy 0.2500
Epoch 10 train loss  0.4289 valid loss 0.034 and accuracy 0.2500
Epoch 11 train loss  0.4270 valid loss 0.034 and accuracy 0.2500
Epoch 12 train loss  0.4217 valid loss 0.034 and accuracy 0.2500
Epoch 13 train loss  0.4192 valid loss 0.037 and accuracy 0.2500
Epoch 14 train loss  0.4260 valid loss 0.034 and accuracy 0.2500
Epoch 15 train loss  0.4228 valid l

In [13]:
acc, points = evaluate(model, testing, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2538]), 42)

In [14]:
acc, points = evaluate(model, validation, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2540]), 22)

In [15]:
save_dataset_to_pickle('results_v2/train_results_lstm_cnn_qa.pickle', training_results)
training_results = load_dataset_from_pickle('results_v2/train_results_lstm_cnn_qa.pickle')

In [16]:
model_path = os.getcwd() + '/trained_models_v2/lstm_cnn_qa'
torch.save(model.state_dict(), model_path)

In [17]:
torch.random.manual_seed(42)
model = LSTM_CNN_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
model.load_state_dict(torch.load(model_path))
model.eval()

Loading pretrained embeddings...


LSTM_CNN_QA(
  (emb): Embedding(28821, 300, padding_idx=0)
  (dropout): Dropout(p=0.5, inplace=False)
  (lstm): LSTM(300, 64, batch_first=True, dropout=0.5, bidirectional=True)
  (conv): Conv1d(2, 10, kernel_size=(3,), stride=(1,))
  (cosine): CosineSimilarity()
  (linear): Linear(in_features=128, out_features=64, bias=True)
  (linear1): Linear(in_features=64, out_features=1, bias=True)
)

In [18]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator_ir)
acc, points, acc_list, points_list

(0.25407282,
 3.6666666666666665,
 [tensor(0.2832),
  tensor(0.2652),
  tensor(0.2267),
  tensor(0.2251),
  tensor(0.2655),
  tensor(0.2588)],
 [30, 14, -21, -23, 14, 8])

In [19]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator_ir)
acc, points, acc_list, points_list

(0.2538183,
 3.5,
 [tensor(0.2412),
  tensor(0.2825),
  tensor(0.2675),
  tensor(0.2672),
  tensor(0.2435),
  tensor(0.3030),
  tensor(0.2124),
  tensor(0.2371),
  tensor(0.2183),
  tensor(0.2641),
  tensor(0.2622),
  tensor(0.2467)],
 [-8, 29, 16, 16, -6, 49, -34, -12, -29, 13, 11, -3])