In [None]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words
from training import train, evaluate, validate


%matplotlib inline
%load_ext autoreload
%autoreload 2

In [None]:
from datasets import load_dataset

# Load the Spanish ('es') configuration of the 'head_qa' dataset. The English
# variant is kept commented out below for reference and is not loaded.
# data_en = load_dataset('head_qa', 'en')
data_es = load_dataset('head_qa', 'es' )
# Display the loaded object as the cell output.
data_es

In [None]:
# Pull the three splits out of the loaded dataset by their split names.
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [None]:
# Print only the first raw sample of the training split to inspect its fields.
for d in training:
    print(d)
    break

In [None]:
# One-time setup: uncomment to download the Spanish spaCy model loaded in the next cell.
#!python -m spacy download es_core_news_sm

In [None]:
import spacy
# Spanish spaCy pipeline; parse_training calls it to tokenise question and answer text.
nlp = spacy.load("es_core_news_sm")

# NOTE(review): `vocabulary` is not referenced by any other cell visible here — confirm it is still needed.
vocabulary = Vocabulary()

In [None]:
def parse_training(training):
    """Flatten multiple-choice samples into one binary instance per answer.

    Each element of ``training`` is indexed with the keys ``'qtext'``,
    ``'answers'``, ``'ra'`` and ``'category'``; each entry of ``'answers'`` is
    indexed with ``'aid'`` and ``'atext'``. Text is tokenised with the
    module-level ``nlp`` callable, keeping the ``.text`` of every token.

    Args:
        training: iterable of samples with the keys listed above.

    Returns:
        A list of dicts, one per (question, answer) pair, with the keys
        ``'question'``, ``'answer'``, ``'label'`` (1 when the answer's
        ``'aid'`` equals the sample's ``'ra'``, otherwise 0),
        ``'sample_tok'`` (question tokens + ``'SEP'`` + answer tokens) and
        ``'category'``.
    """
    def tokenize(text):
        # Run the pipeline and keep only the surface form of each token.
        return [tok.text for tok in nlp(text)]

    parsed = []
    for sample in training:
        question, options = sample['qtext'], sample['answers']
        # The question is tokenised once and shared by all of its answers.
        question_tokens = tokenize(question)
        correct_id = sample['ra']
        for option in options:
            option_id, option_text = option['aid'], option['atext']
            option_tokens = tokenize(option_text)
            parsed.append({
                'question': question,
                'answer': option_text,
                'label': 1 if correct_id == option_id else 0,
                'sample_tok': question_tokens + ['SEP'] + option_tokens,
                'category': sample['category'],
            })
    return parsed

In [None]:
# Expand the training split into one labelled instance per (question, answer) pair.
instances = parse_training(training)
# Show the first instance as the cell output.
instances[0]

In [None]:
# Build the vectorizer from the parsed instances
# (presumably this fits the token vocabulary — confirm in utils_data).
vectorizer = Vectorizer.vectorize_training(instances)

In [None]:
# Wrap the instances in the project's HeadQA dataset. max_length=30 and
# right_padding=False are passed through as-is (presumably sequences are
# truncated/left-padded to 30 tokens — confirm in utils_data).
dataset = HeadQA(instances=instances, vectorizer=vectorizer, right_padding=False, max_length=30)
batch_size = 32
# Reshuffle every epoch and drop the final incomplete batch so every batch has batch_size items.
dataloader = DataLoader(dataset, batch_size=batch_size,shuffle=True, drop_last=True)

In [None]:
# Print only the first (x, y) pair yielded by the dataset as a sanity check.
for x, y in dataset:
    print(x)
    print(y)
    break

In [None]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer followed by a probability activation.

    Args:
        x_size: number of input features per example.
        n_classes: number of output units. With one unit the model is a
            binary classifier and applies a sigmoid; with more it applies a
            softmax across the class dimension.

    The forward pass returns probabilities, not logits.
    NOTE(review): pair with a probability-based loss (e.g. ``nn.BCELoss``) —
    confirm against the loss used by ``training.train``.
    """

    def __init__(self, x_size, n_classes):
        super(LogisticRegression, self).__init__()
        self.linear = nn.Linear(x_size, n_classes)

    def forward(self, x):
        """Map a ``(batch, x_size)`` input to ``(batch, n_classes)`` probabilities."""
        logits = self.linear(x)
        if self.linear.out_features == 1:
            # Softmax over a single unit is identically 1.0 (and has zero
            # gradient), so a one-output model needs a sigmoid instead.
            return torch.sigmoid(logits)
        return F.softmax(logits, dim=1)

In [None]:
def get_optimizer(model, lr = 0.01, wd = 0.0):
    """Create an Adam optimizer over all parameters of ``model``.

    Args:
        model: module whose ``parameters()`` will be optimised.
        lr: learning rate handed to Adam.
        wd: value handed to Adam as ``weight_decay``.

    Returns:
        The configured ``torch.optim.Adam`` instance.
    """
    adam_settings = {"lr": lr, "weight_decay": wd}
    return torch.optim.Adam(model.parameters(), **adam_settings)

In [None]:
# Input width is the dataset's max_length (presumably one feature per token
# position — confirm in utils_data); a single output unit is requested.
model = LogisticRegression(dataset.max_length, 1)
# Overrides get_optimizer's defaults with a smaller learning rate and a non-zero weight decay.
optimizer = get_optimizer(model, lr = 0.0001, wd = 1e-5)

In [None]:
# NOTE(review): the same dataloader is passed for both loader arguments. If the
# second one is meant to be a validation loader, any validation metrics are
# computed on the training data — confirm against training.train's signature.
training_results = train(model, optimizer, dataloader, dataloader)