In [None]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig
from training import train, validate, evaluate

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
# `load_dataset` comes from the `datasets` package; it is imported here rather
# than in the top import cell.
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset (presumably the Spanish
# subset -- confirm against the dataset card). The result is indexed by split
# name in the next cell.
data_es = load_dataset('head_qa', 'es' )

In [None]:
# Pull the three splits out of the loaded dataset, one name per split.
training = data_es['train']
validation = data_es['validation']
testing = data_es['test']

In [None]:
# Run every split through the same project parser, in train/validation/test order.
training_instances, validation_instances, testing_instances = (
    parse_dataset(split) for split in (training, validation, testing)
)

# Oversampled copy of the training instances (helper name is spelled
# `random_oversamplig` in utils_data). Presumably balances the label
# distribution -- confirm in utils_data.
oversampled_training = random_oversamplig(training_instances)

In [None]:
# Build the vectorizer from the oversampled training instances only; the same
# vectorizer object is later shared by the train, validation and test datasets.
# Presumably this is where the vocabularies are fitted -- confirm in utils_data.
vectorizer = Vectorizer.vectorize_training(oversampled_training)

In [None]:
# Short aliases for the two vocabularies held by the vectorizer;
# len(vocab) is what sizes the model's embedding table further down.
vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [None]:
# One HeadQA dataset per split, all sharing the same vectorizer and the same
# padding configuration (right_padding=False, max_length=30).
# NOTE(review): the train set is built from `training_instances`, not from
# `oversampled_training`, so the oversampled data only feeds the vectorizer.
# Confirm whether that is intended.
trainset, validset, testset = (
    HeadQA(instances=split, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split in (training_instances, validation_instances, testing_instances)
)

In [None]:
batch_size = 32

# Wrap each dataset in a DataLoader with identical settings.
# NOTE(review): drop_last=True is also applied to the validation and test
# loaders, so a trailing partial batch is excluded from their metrics, and no
# `shuffle` argument is passed for the training loader. Confirm both are
# intended before changing them.
train_dt, valid_dt, test_dt = (
    DataLoader(split, batch_size=batch_size, drop_last=True)
    for split in (trainset, validset, testset)
)

In [None]:
class BasicLSTM(torch.nn.Module):
    """Single-layer LSTM scorer over sequences of token ids.

    Token ids are embedded, passed through dropout and an LSTM; the final
    hidden state of the last LSTM layer is projected to ``n_classes`` scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        hidden_dim: Size of the LSTM hidden state.
        x_size: Not used by this model; kept in the signature so existing
            positional call sites keep working.
        n_classes: Output width of the final linear layer.
        embedding_dim: Size of each token embedding.
        dropout: Probability for the dropout layer applied to the embeddings.
            Defaults to 0.5, the value that used to be hard-coded.
    """

    def __init__(self, vocab_size, hidden_dim, x_size, n_classes, embedding_dim=300, dropout=0.5):
        super().__init__()
        # padding_idx=0: token id 0 is treated as padding by the embedding layer.
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # batch_first=True: the LSTM input is laid out as (batch, seq, feature).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, n_classes)
        self.dropout = nn.Dropout(dropout)
        self.n_classes = n_classes

    def forward(self, x):
        """Score a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids laid out as (batch, seq).

        Returns:
            Tensor of shape (batch, n_classes) holding the linear scores after
            a softmax taken over dimension 0.
        """
        embedded = self.dropout(self.embeddings(x))
        # Only the final hidden state is needed; per-step outputs and the cell
        # state are discarded.
        _, (ht, _) = self.lstm(embedded)
        scores = self.linear(ht[-1])
        # NOTE(review): dim=0 normalises across the batch axis, so each output
        # depends on the other rows in the same batch. Kept as-is because the
        # training/evaluation code may rely on it -- confirm this is intended.
        return F.softmax(scores, dim=0)

In [None]:
def get_optimizer(model, lr=0.01, wd=0.0):
    """Create an Adam optimizer over all parameters of ``model``.

    Args:
        model: Module whose ``parameters()`` will be optimised.
        lr: Learning rate handed to Adam.
        wd: Weight-decay coefficient handed to Adam as ``weight_decay``.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    adam_settings = {"lr": lr, "weight_decay": wd}
    return torch.optim.Adam(model.parameters(), **adam_settings)

In [None]:
# Seed PyTorch's global RNG right before the model is built so that weight
# initialisation is repeatable every time this cell is (re-)run, e.g. under
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# 64 hidden units, a single output unit and 100-dimensional embeddings;
# trainset.max_length is passed in the model's `x_size` position.
model = BasicLSTM(len(vocab), 64, trainset.max_length, 1, embedding_dim=100)
optimizer = get_optimizer(model, lr=0.001, wd=1e-5)

In [None]:
# Run the project training loop for 10 epochs. The `validate` function and the
# validation loader are handed to `train`, which presumably calls them during
# training -- see training.py. Whatever `train` returns is kept in
# `training_results`.
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=10)

In [None]:
# Evaluate on the raw validation split (not the DataLoader); `trainset.encode`
# is passed as the encoding callable. `evaluate` returns two values, unpacked
# as `acc` and `points` (presumably accuracy and an exam-style score -- confirm
# in training.py). Both names are reassigned by the test-set cell below.
acc, points = evaluate(model, validation, trainset.encode)
acc, points

In [None]:
# Same evaluation as for validation, now on the raw test split. This
# overwrites the `acc` / `points` values produced by the validation cell.
acc, points = evaluate(model, testing, trainset.encode)
acc, points

In [None]:
# Save only the learned parameters (state_dict) under ./trained_models/.
model_dir = os.path.join(os.getcwd(), 'trained_models')
# torch.save does not create missing parent directories, so make sure the
# target folder exists (no error if it is already there).
os.makedirs(model_dir, exist_ok=True)
model_path = os.path.join(model_dir, 'basic_lstm')
torch.save(model.state_dict(), model_path)