In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig
from training import train, validate, evaluate

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

# Load the 'es' configuration of the 'head_qa' dataset through the Hugging Face
# `datasets` library; the splits are pulled out of `data_es` in the next cell.
data_es = load_dataset(path="head_qa", name="es")

Reusing dataset head_qa (C:\Users\CLAUDIA\.cache\huggingface\datasets\head_qa\es\1.1.0\d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2)


In [3]:
# Pull the three named splits out of the loaded dataset, one per line.
training = data_es["train"]
validation = data_es["validation"]
testing = data_es["test"]

In [None]:
# Run parse_dataset over every split, in train / validation / test order.
training_instances, validation_instances, testing_instances = (
    parse_dataset(split) for split in (training, validation, testing)
)

# Oversampled version of the training instances (used below to fit the vectorizer).
oversampled_training = random_oversamplig(training_instances)

In [None]:
# Fit the vectorizer on the oversampled training instances and keep handles to
# the two vocabularies it exposes.
vectorizer = Vectorizer.vectorize_training(oversampled_training)

vocab, label_vocab = vectorizer.sentence_vocab, vectorizer.label_vocab

In [None]:
# Wrap each split in a HeadQA dataset that shares the same vectorizer and the
# same padding settings (right_padding=False, max_length=30).
# NOTE(review): the train set is built from `training_instances`, not from
# `oversampled_training` -- confirm that training on the non-oversampled data
# is intended.
trainset, validset, testset = (
    HeadQA(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split_instances in (training_instances, validation_instances, testing_instances)
)

In [None]:
def load_glove_from_file(glove_filepath):
    """Read a text embedding file into a word index and an embedding matrix.

    The first line of the file must be a two-field header (it is validated and
    then ignored). Every following non-blank line must look like
    ``word v1 v2 ...`` with fields separated by single spaces.

    Args:
        glove_filepath: Path of the embedding text file. It is decoded as UTF-8.

    Returns:
        A pair ``(word_to_index, embeddings)``. ``word_to_index`` maps each
        word to the row of its vector, and ``embeddings`` is a 2-D float numpy
        array with one row per data line. If a word appears more than once,
        the mapping points at its last occurrence.

    Raises:
        ValueError: If the header does not have exactly two fields, a value
            cannot be parsed as a float, the vectors differ in length, or the
            file contains no vectors.
    """
    word_to_index = {}
    embeddings = []
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) would
    # garble or reject the accented characters found in Spanish vocabularies.
    with open(glove_filepath, "r", encoding="utf-8") as fp:
        header = fp.readline().split()
        if len(header) != 2:
            raise ValueError(
                "expected a two-field header line, got %d field(s)" % len(header)
            )
        for line in fp:
            # Drop the newline and trailing spaces so the last value parses
            # cleanly, and skip blank lines (commonly found at end of file).
            line = line.rstrip("\r\n ")
            if not line:
                continue
            fields = line.split(" ")  # each line: word num1 num2 ...
            # Index by the number of rows stored so far, so skipped lines can
            # never put the mapping out of step with the matrix.
            word_to_index[fields[0]] = len(embeddings)
            embeddings.append(np.array([float(val) for val in fields[1:]]))
    return word_to_index, np.stack(embeddings)

def make_embedding_matrix(glove_filepath, words):
    """Build an embedding matrix with one row per entry of ``words``.

    Args:
        glove_filepath: Path handed to ``load_glove_from_file``.
        words: Sequence of words; row ``i`` of the result belongs to
            ``words[i]``.

    Returns:
        A numpy array of shape ``(len(words), embedding_size)``. Words found
        in the embedding file get their stored vector; every other word gets a
        vector initialised with Xavier-uniform noise.
    """
    word_to_idx, glove_embeddings = load_glove_from_file(glove_filepath)
    embedding_size = glove_embeddings.shape[1]
    final_embeddings = np.zeros((len(words), embedding_size))
    for row, word in enumerate(words):
        glove_row = word_to_idx.get(word)
        if glove_row is None:
            # The word has no pretrained embedding, so one is drawn from a
            # distribution (Xavier uniform) instead.
            random_vector = torch.ones(1, embedding_size)
            torch.nn.init.xavier_uniform_(random_vector)
            final_embeddings[row, :] = random_vector
        else:
            final_embeddings[row, :] = glove_embeddings[glove_row]
    return final_embeddings

In [None]:
class BiLSTM_model(nn.Module):
    """Single-layer bidirectional LSTM classifier with an attention step.

    Pipeline: embedding -> dropout(0.3) -> BiLSTM -> attention -> flatten ->
    linear -> softmax.

    Args:
        embedding_size: Dimension of every token embedding.
        num_embeddings: Number of rows in the embedding table.
        num_classes: Size of the output layer.
        hidden_size: Hidden size of each LSTM direction.
        pretrained_embeddings: Optional numpy array of shape
            ``(num_embeddings, embedding_size)``. When given, it is loaded
            into the embedding layer and frozen.
        padding_idx: Index of the padding token in the embedding table.
        max_length: Sequence length the attention and output layers are sized
            for; ``forward`` only works on inputs with exactly this many
            time steps.

    Raises:
        ValueError: If ``pretrained_embeddings`` does not have shape
            ``(num_embeddings, embedding_size)``.
    """

    def __init__(self, embedding_size, num_embeddings, num_classes, hidden_size=64,
                 pretrained_embeddings=None, padding_idx=0, max_length = 110):
        super().__init__()

        self.embedding_size = embedding_size
        self.num_embeddings = num_embeddings
        self.hidden_size = hidden_size
        self.max_length = max_length

        if pretrained_embeddings is None:
            self.emb = nn.Embedding(num_embeddings=self.num_embeddings,
                                    embedding_dim=self.embedding_size,
                                    padding_idx=padding_idx)
        else:
            weights = torch.from_numpy(pretrained_embeddings).float()
            if tuple(weights.shape) != (self.num_embeddings, self.embedding_size):
                raise ValueError(
                    "pretrained_embeddings has shape %s, expected %s"
                    % (tuple(weights.shape), (self.num_embeddings, self.embedding_size))
                )
            # Public constructor instead of the private `_weight=` keyword;
            # freeze=True keeps the pretrained vectors fixed during training.
            self.emb = nn.Embedding.from_pretrained(weights, freeze=True,
                                                    padding_idx=padding_idx)
        self.dropout = nn.Dropout(0.3)
        # No `dropout=` here: nn.LSTM only applies it between stacked layers,
        # so on a single-layer LSTM it does nothing except emit a UserWarning.
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True, bidirectional=True)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        self.linear = nn.Linear(self.hidden_size * 2 * self.max_length, num_classes)

    def forward(self, x):
        """Score a batch of token-index sequences.

        Args:
            x: Integer tensor of shape ``(batch, max_length)``.

        Returns:
            Tensor of shape ``(batch, num_classes)`` that has been passed
            through a softmax over the batch axis (``dim=0``).
        """
        x = self.emb(x)                      # (batch, seq, embedding_size)
        x = self.dropout(x)
        out, _ = self.lstm(x)                # (batch, seq, 2 * hidden_size)
        attn = self.attn(out)                # (batch, seq, max_length)
        # NOTE(review): the weights are normalised over dim=1, while the bmm
        # below sums over their last axis -- confirm this is the intended axis.
        attn_weights = F.softmax(torch.tanh(attn), dim=1)
        # bmm needs max_length == seq, which is why inputs must be padded or
        # truncated to exactly max_length.
        attn_applied = torch.bmm(attn_weights, out)   # (batch, seq, 2 * hidden_size)
        attn_applied = attn_applied.flatten(1)
        # NOTE(review): dim=0 normalises across the examples of the batch, so
        # each score depends on the rest of the batch. Returning the raw
        # `self.linear(attn_applied)` is the usual alternative -- confirm
        # against the loss used by the training code.
        return F.softmax(self.linear(attn_applied), dim = 0)

In [None]:
def get_optimizer(model, lr=0.01, wd=0.0):
    """Create an Adam optimizer over all parameters of ``model``.

    Args:
        model: Module whose ``parameters()`` are handed to the optimizer.
        lr: Learning rate.
        wd: Weight decay coefficient.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    adam_options = {"lr": lr, "weight_decay": wd}
    return torch.optim.Adam(model.parameters(), **adam_options)

In [None]:
# `data_path` was not defined anywhere in this notebook, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): the directory below is an assumption -- point it at the folder
# that actually contains `glove/glove-sbwc.i25.vec`.
data_path = os.path.join(os.getcwd(), "data") + "/"
embedding_file = data_path + "glove/glove-sbwc.i25.vec"

# Tokens ordered by their vocabulary index, so that row i of the embedding
# matrix belongs to the token the vectorizer encodes as i.
# NOTE(review): assumes `vocab.vocab2index` maps token -> contiguous int index.
vocabulary = sorted(vocab.vocab2index, key=vocab.vocab2index.get)
embedding_matrix = make_embedding_matrix(embedding_file, vocabulary)

In [None]:
# Size the network from the embedding matrix (rows = vocabulary entries,
# columns = embedding dimension) with a single output unit.
model = BiLSTM_model(
    embedding_size=embedding_matrix.shape[1],
    num_embeddings=embedding_matrix.shape[0],
    num_classes=1,
    pretrained_embeddings=embedding_matrix,
    max_length=trainset.max_length,
)
optimizer = get_optimizer(model, lr=0.001, wd=1e-5)

In [None]:
# `train_dt` / `valid_dt` were not defined anywhere in this notebook, so this
# cell raised NameError on a fresh kernel. Build them from the datasets above.
# NOTE(review): batch_size=32 and the shuffle settings are assumptions --
# restore the original values if they are known.
train_dt = DataLoader(trainset, batch_size=32, shuffle=True)
valid_dt = DataLoader(validset, batch_size=32, shuffle=False)

training_results = train(model, optimizer, train_dt, valid_dt, validate)

In [None]:
# Evaluate the trained model on the validation split; the two returned values
# are unpacked and displayed as the cell output.
acc, points = evaluate(
    model,
    validation,
    trainset.encode,
)
(acc, points)

In [None]:
# Evaluate the trained model on the test split. This rebinds `acc` and
# `points`, replacing the validation values from the previous cell.
acc, points = evaluate(
    model,
    testing,
    trainset.encode,
)
(acc, points)

In [None]:
# Persist the learned weights (state_dict only) to ./trained_models/bilstm.
model_path = os.path.join(os.getcwd(), 'trained_models', 'bilstm')
# torch.save does not create missing directories, so make sure the target
# folder exists before writing.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(model.state_dict(), model_path)