In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import train, validate, evaluate, evaluator

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [17]:
training_instances[0]

{'question': 'Los potenciales postsinápticos excitadores:',
 'answer': 'Son de tipo todo o nada.',
 'label': 0,
 'sample_tok': ['Los',
  'potenciales',
  'postsinápticos',
  'excitadores',
  ':',
  '[SEP]',
  'Son',
  'de',
  'tipo',
  'todo',
  'o',
  'nada',
  '.'],
 'category': 'biology'}

In [5]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)

In [7]:
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [8]:
trainset = HeadQA(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [10]:
class BasicLSTM(torch.nn.Module):
    def __init__(self, vocab_size, hidden_dim, x_size, n_classes, embedding_dim=300): 
        super(BasicLSTM, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(0.5)
        self.n_classes = n_classes
        
    def forward(self, x):
        x = self.embeddings(x)
        x = self.dropout(x)
        out, (ht, ct) = self.lstm(x)        
        x = self.linear(ht[-1])
        return F.softmax(x, dim=0)

In [11]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [12]:
model = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

In [13]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=50)

Epoch 0 train loss  0.7195 valid loss 0.005 and accuracy 0.7500
Epoch 1 train loss  0.7177 valid loss 0.005 and accuracy 0.7500
Epoch 2 train loss  0.7165 valid loss 0.005 and accuracy 0.7500
Epoch 3 train loss  0.7118 valid loss 0.005 and accuracy 0.7500
Epoch 4 train loss  0.6992 valid loss 0.006 and accuracy 0.7500
Epoch 5 train loss  0.6729 valid loss 0.006 and accuracy 0.7500
Epoch 6 train loss  0.6385 valid loss 0.007 and accuracy 0.7500
Epoch 7 train loss  0.6119 valid loss 0.007 and accuracy 0.7500
Epoch 8 train loss  0.5872 valid loss 0.007 and accuracy 0.7500
Epoch 9 train loss  0.5690 valid loss 0.008 and accuracy 0.7500
Epoch 10 train loss  0.5537 valid loss 0.008 and accuracy 0.7500
Epoch 11 train loss  0.5439 valid loss 0.009 and accuracy 0.7500
Epoch 12 train loss  0.5383 valid loss 0.008 and accuracy 0.7500
Epoch 13 train loss  0.5255 valid loss 0.009 and accuracy 0.7500
Epoch 14 train loss  0.5218 valid loss 0.009 and accuracy 0.7500
Epoch 15 train loss  0.5197 valid l

In [14]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)
acc, points

(tensor([0.2408]), -50)

In [15]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)
acc, points

(tensor([0.2542]), 46)

In [18]:
save_dataset_to_pickle('../data/train_results_lstm.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_lstm.pickle')

In [19]:
training_results

[[0.7194906572261489, tensor(0.0054, requires_grad=True), 0.75],
 [0.7177084417228239, tensor(0.0054, requires_grad=True), 0.75],
 [0.7164642316749297, tensor(0.0054, requires_grad=True), 0.75],
 [0.7118355397718499, tensor(0.0055, requires_grad=True), 0.75],
 [0.6991902358560677, tensor(0.0059, requires_grad=True), 0.75],
 [0.672914225006678, tensor(0.0062, requires_grad=True), 0.75],
 [0.6385499821369907, tensor(0.0067, requires_grad=True), 0.75],
 [0.611919884078474, tensor(0.0070, requires_grad=True), 0.75],
 [0.5871523132525295, tensor(0.0074, requires_grad=True), 0.75],
 [0.5690496934465615, tensor(0.0076, requires_grad=True), 0.75],
 [0.5536535519433309, tensor(0.0077, requires_grad=True), 0.75],
 [0.543929350447942, tensor(0.0085, requires_grad=True), 0.75],
 [0.5382773188223322, tensor(0.0085, requires_grad=True), 0.75],
 [0.525461492409189, tensor(0.0088, requires_grad=True), 0.75],
 [0.5217669140861695, tensor(0.0086, requires_grad=True), 0.75],
 [0.5197156937007444, tensor(

In [20]:
# model_path = os.getcwd() + '/trained_models/basic_lstm'
# torch.save(model.state_dict(), model_path)