In [76]:
import os
import time
import datetime
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data import Dataset, DataLoader, TensorDataset, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vectorizer, HeadQA, HeadQA_IR, clean_words, parse_dataset, parse_ir_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import train, validate, evaluate, evaluator_ir, train_ir, validate_ir, load_embeddings_from_file, make_embedding_matrix


import transformers
from transformers.optimization import AdamW
from transformers import BertForSequenceClassification, BertConfig, BertTokenizer, BertModel, BertForMaskedLM

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
#training_instances = parse_ir_dataset(training)
#validation_instances = parse_ir_dataset(validation)
#testing_instances = parse_ir_dataset(testing)

#oversampled_training = random_oversamplig(training_instances)

#save_dataset_to_pickle('../data/training_ir.pickle', training_instances)
#save_dataset_to_pickle('../data/validation_ir.pickle', validation_instances)
#save_dataset_to_pickle('../data/testing_ir.pickle', testing_instances)
#save_dataset_to_pickle('../data/oversampled_training_ir.pickle', oversampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training_ir.pickle')
validation_instances = load_dataset_from_pickle('../data/validation_ir.pickle')
testing_instances = load_dataset_from_pickle('../data/testing_ir.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training_ir.pickle')

In [7]:
oversampled_training[0]

{'question': 'Los potenciales postsinápticos excitadores:',
 'answer': 'Son de tipo todo o nada.',
 'tok_qtext': ['Los', 'potenciales', 'postsinápticos', 'excitadores', ':'],
 'tok_atext': ['Son', 'de', 'tipo', 'todo', 'o', 'nada', '.'],
 'label': 0,
 'category': 'biology'}

In [6]:
vectorizer = Vectorizer.vectorize_ir_dataset(oversampled_training)

In [7]:
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [8]:
trainset = HeadQA_IR(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
validset = HeadQA_IR(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=15)
testset = HeadQA_IR(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=15)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [66]:
class LSTM_CNN_QA(torch.nn.Module):
    def __init__(self, vocab_size, hidden_size, x_size, n_classes, embedding_size=300,
                 padding_idx=0, pretrained_embeddings=None): 
        super(LSTM_CNN_QA, self).__init__()
        self.embedding_size = embedding_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_classes = n_classes
        
        if pretrained_embeddings is None:
            self.emb = nn.Embedding(embedding_dim=self.embedding_size,num_embeddings=self.vocab_size,
                                    padding_idx=padding_idx)
        else:
            print('Loading pretrained embeddings...')
            pretrained_embeddings = torch.from_numpy(pretrained_embeddings).float()
            self.emb = nn.Embedding(embedding_dim=self.embedding_size, num_embeddings=self.vocab_size,
                                    padding_idx=padding_idx, _weight=pretrained_embeddings)
            self.emb.weight.requires_grad = False
        
        self.dropout = nn.Dropout(0.5)
        self.lstm = nn.LSTM(self.embedding_size, self.hidden_size, batch_first=True, dropout=0.5,bidirectional=True)
        
        self.conv = nn.Conv1d(in_channels=2, out_channels=10, kernel_size=3)   
        
#         self.conv_3 = nn.Conv1d(in_channels=self.embedding_size, out_channels= self.out_channels, kernel_size=3)
#         self.conv_4 = nn.Conv1d(in_channels=self.embedding_size, out_channels= self.out_channels, kernel_size=4)
#         self.conv_5 = nn.Conv1d(in_channels=self.embedding_size, out_channels= self.out_channels, kernel_size=5)
        
        self.cosine = nn.CosineSimilarity(dim=1)
        self.linear = nn.Linear(self.hidden_size*2, 64)  
        self.linear1 = nn.Linear(64, self.n_classes)
        
        
    def forward(self, x_0, x_1):
        x_0 = self.emb(x_0)
        x_1 = self.emb(x_1)
        out_0, (ht_0, ct_0) = self.lstm(x_0)
        out_1, (ht_1, ct_1) = self.lstm(x_1) 
        ht_0 = ht_0.transpose(0, 1)
        ht_1 = ht_1.transpose(0, 1)
        ht_0 = self.conv(ht_0)
        ht_1 = self.conv(ht_1)
        x = self.cosine(out_0, out_1)
        x = self.linear(x)
        x = self.linear1(x)
        x = F.softmax(x, dim=0)
        return x

In [67]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [68]:
# embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
# word_to_idx, embeddings = load_embeddings_from_file(embedding_file)

# save_dataset_to_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle', word_to_idx)
# save_dataset_to_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle', embeddings)

In [69]:
word_to_idx = load_dataset_from_pickle('trained_models/biomedical_embeddings/word_to_index_ir.pickle')
embeddings = load_dataset_from_pickle('trained_models/biomedical_embeddings/wordvectors_ir.pickle')

In [70]:
embedding_file = "trained_models/biomedical_embeddings/Scielo_wiki_FastText300.vec"
words = vocab.vocab2index.keys()
embedding_matrix = make_embedding_matrix(embedding_file, list(words), word_to_idx, embeddings)

In [71]:
model = LSTM_QA(vocab_size=len(vocab), hidden_size=64, x_size=trainset.max_length, n_classes=1, embedding_size=300,
               pretrained_embeddings=embedding_matrix)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

Loading pretrained embeddings...


In [72]:
for x_0, x_1, y in train_dt:
    out = model(x_0.long(), x_1.long())
    print(out.shape)
    break;

torch.Size([32, 1])


In [73]:
def validate_ir(model, dataloader):
    model.eval()
    loss, right, total = 0, 0, 0
    y_true, y_preds = [], []
    for x_0, x_1, y in dataloader:
        batch = y.shape[0]
        out = model(x_0.long(), x_1.long())
        loss = F.binary_cross_entropy(out, y.float())
        loss += batch*(loss.item())
        total += batch
        pred = torch.where(out > 0.4, 1, 0)
        y_true.append(y)
        y_preds.append(pred)
        right += (pred == y).float().sum().item()
    return loss/total, right/total, y_true, y_preds

In [74]:
training_results = train_ir(model, optimizer, train_dt, valid_dt, validate_ir, epochs=50)

Epoch 0 train loss  0.7188 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7116 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.6854 valid loss 0.006 and accuracy 0.7500
Epoch 3 train loss  0.6437 valid loss 0.006 and accuracy 0.7500
Epoch 4 train loss  0.6075 valid loss 0.006 and accuracy 0.7498
Epoch 5 train loss  0.5735 valid loss 0.006 and accuracy 0.7491
Epoch 6 train loss  0.5488 valid loss 0.007 and accuracy 0.7493
Epoch 7 train loss  0.5308 valid loss 0.007 and accuracy 0.7489
Epoch 8 train loss  0.5194 valid loss 0.007 and accuracy 0.7498
Epoch 9 train loss  0.5082 valid loss 0.007 and accuracy 0.7487
Epoch 10 train loss  0.4978 valid loss 0.007 and accuracy 0.7489
Epoch 11 train loss  0.4862 valid loss 0.007 and accuracy 0.7494
Epoch 12 train loss  0.4813 valid loss 0.007 and accuracy 0.7485
Epoch 13 train loss  0.4789 valid loss 0.006 and accuracy 0.7498
Epoch 14 train loss  0.4693 valid loss 0.006 and accuracy 0.7496
Epoch 15 train loss  0.4689 valid l

In [77]:
acc, points = evaluate(model, testing, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2750]), 274)

In [79]:
acc, points = evaluate(model, validation, trainset.encode, evaluator_ir)
acc, points

(tensor([0.2643]), 78)

In [81]:
model_path = os.getcwd() + '/trained_models/lstm_cnn_qa'
torch.save(model.state_dict(), model_path)