In [1]:
import os
import pickle
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle
from training import train, validate, evaluate, evaluator, evaluate_better, get_optimizer

from supervised_models import BasicLSTM

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)
vocab = vectorizer.sentence_vocab
label_vocab = vectorizer.label_vocab

In [7]:
trainset = HeadQA(instances=oversampled_training, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,drop_last=True)

In [9]:
torch.random.manual_seed(42)
model = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

In [10]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=50)



Epoch 0 train loss  0.3690 valid loss 0.024 and accuracy 0.2500
Epoch 1 train loss  0.5278 valid loss 0.023 and accuracy 0.2500
Epoch 2 train loss  0.5082 valid loss 0.019 and accuracy 0.2500
Epoch 3 train loss  0.5299 valid loss 0.020 and accuracy 0.2500
Epoch 4 train loss  0.5222 valid loss 0.013 and accuracy 0.2500
Epoch 5 train loss  0.5012 valid loss 0.021 and accuracy 0.2500
Epoch 6 train loss  0.4874 valid loss 0.018 and accuracy 0.2500
Epoch 7 train loss  0.5184 valid loss 0.013 and accuracy 0.2500
Epoch 8 train loss  0.5377 valid loss 0.014 and accuracy 0.2500
Epoch 9 train loss  0.4955 valid loss 0.020 and accuracy 0.2500
Epoch 10 train loss  0.5038 valid loss 0.015 and accuracy 0.2500
Epoch 11 train loss  0.4897 valid loss 0.016 and accuracy 0.2500
Epoch 12 train loss  0.4769 valid loss 0.020 and accuracy 0.2500
Epoch 13 train loss  0.4689 valid loss 0.015 and accuracy 0.2500
Epoch 14 train loss  0.4903 valid loss 0.015 and accuracy 0.2500
Epoch 15 train loss  0.4667 valid l

In [11]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)
acc, points

(tensor([0.2467]), -18)

In [12]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)
acc, points

(tensor([0.2633]), 146)

In [13]:
save_dataset_to_pickle('../data/train_results_lstm.pickle', training_results)
training_results = load_dataset_from_pickle('../data/train_results_lstm.pickle')

In [16]:
model_path = os.getcwd() + '/trained_models/lstm'
torch.save(model.state_dict(), model_path)

In [17]:
model = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
model.load_state_dict(torch.load(os.getcwd() + '/trained_models/lstm'))
model.eval()

BasicLSTM(
  (embeddings): Embedding(20403, 100, padding_idx=0)
  (lstm): LSTM(100, 64, batch_first=True)
  (linear): Linear(in_features=64, out_features=1, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)

In [18]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.22040337,
 -27.0,
 [tensor(0.2345),
  tensor(0.1522),
  tensor(0.2089),
  tensor(0.2554),
  tensor(0.2434),
  tensor(0.2281)],
 [-14, -90, -37, 5, -6, -20])

In [19]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.2512668,
 1.1666666666666667,
 [tensor(0.2368),
  tensor(0.2646),
  tensor(0.2149),
  tensor(0.2155),
  tensor(0.2435),
  tensor(0.2684),
  tensor(0.2566),
  tensor(0.3103),
  tensor(0.2882),
  tensor(0.2251),
  tensor(0.2489),
  tensor(0.2423)],
 [-12, 13, -32, -32, -6, 17, 6, 56, 35, -23, -1, -7])