In [139]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_training
from training import train, validate_answer

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [85]:
from datasets import load_dataset

# data_en = load_dataset('head_qa', 'en')
data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\CLAUDIA\.cache\huggingface\datasets\head_qa\es\1.1.0\d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
for d in training:
    print(d)
    break;

{'name': 'Cuaderno_2013_1_B', 'year': '2013', 'category': 'biology', 'qid': 1, 'qtext': 'Los potenciales postsinápticos excitadores:', 'ra': 3, 'image': '', 'answers': [{'aid': 1, 'atext': 'Son de tipo todo o nada.'}, {'aid': 2, 'atext': 'Son hiperpolarizantes.'}, {'aid': 3, 'atext': 'Se pueden sumar.'}, {'aid': 4, 'atext': 'Se propagan a largas distancias.'}, {'aid': 5, 'atext': 'Presentan un periodo refractario.'}]}


In [None]:
training_instances = parse_training(training)

In [140]:
vocabulary = Vocabulary()
vectorizer = Vectorizer.vectorize_training(training_instances)
trainset = HeadQA(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [141]:
trainset[0]

(tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
          0.,  0.,  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
         12., 13.]),
 tensor([0.]))

In [142]:
training_instances[0]

{'question': 'Los potenciales postsinápticos excitadores:',
 'answer': 'Son de tipo todo o nada.',
 'label': 0,
 'sample_tok': ['Los',
  'potenciales',
  'postsinápticos',
  'excitadores',
  ':',
  'SEP',
  'Son',
  'de',
  'tipo',
  'todo',
  'o',
  'nada',
  '.'],
 'category': 'biology'}

In [143]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,shuffle=True, drop_last=True)

In [144]:
for x, y in trainset:
    print(x)
    print(y)
    break;

tensor([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10., 11.,
        12., 13.])
tensor([0.])


In [145]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, x_size, n_classes): 
        super(LogisticRegression, self).__init__()             
        self.linear = nn.Linear(x_size, n_classes)
        
    def forward(self, x):
        x = self.linear(x)
        x = F.softmax(x, dim=0)
        return x

In [146]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [147]:
model = LogisticRegression(dataset.max_length, 1)
optimizer = get_optimizer(model, lr = 0.0001, wd = 1e-5)

In [159]:
def encode(sample):
        qtext, answers = sample['qtext'], sample['answers']
        q = nlp(qtext)
        tok_qtext = [token.text for token in q]
        right_answer = sample['ra']
        X, Y = [], []
        for answer in answers:
            aid, atext = answer['aid'], answer['atext']
            a = nlp(atext)
            tok_atext = [token.text for token in a]
            instance_x = tok_qtext + ['SEP'] + tok_atext
            instance_y = 1 if right_answer == aid else 0
            x, y = trainset.vectorize(instance_x, instance_y)
            X.append(x)
            Y.append(y)
            print(X)
        return torch.Tensor(X), torch.Tensor(Y)

In [160]:
def train(model, optimizer, train_dl, test_dl, validate, encoder, epochs=100):
    y_trues, y_preds = [], []
    epochs_results = []
    for i in range(epochs):
        model.train()
        total = 0
        sum_loss = 0
        for x, y in train_dl:
            batch = y.shape[0]
            out = model(x.float())
            loss = F.binary_cross_entropy(out, y.float())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total += batch
            sum_loss += batch*(loss.item())
        train_loss = sum_loss/total
        test_acc, points = validate(model, test_dl, encoder)
        #y_trues.append(y_real)
        #y_preds.append(y_pred)
        epochs_results.append([train_loss, points, test_acc])
        print("Epoch %s train loss  %.4f points %.3f and accuracy %.4f" %
              (i, train_loss, points, test_acc))
    return epochs_results

In [161]:
training_results = train(model, optimizer, trainset, validation, validate_answer, encode)

[tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 9.2000e+01, 3.3730e+03, 8.0000e+00, 4.7920e+03, 3.7800e+02,
        2.3000e+01, 6.8590e+03, 0.0000e+00, 1.9000e+01, 1.5710e+03, 8.0000e+00,
        2.8400e+02, 2.0600e+02, 6.3000e+01, 9.9560e+03, 1.2260e+03, 5.0000e+00,
        6.0000e+00, 9.2000e+01, 4.7920e+03, 8.0000e+00, 0.0000e+00, 1.3000e+01])]
[tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 9.2000e+01, 3.3730e+03, 8.0000e+00, 4.7920e+03, 3.7800e+02,
        2.3000e+01, 6.8590e+03, 0.0000e+00, 1.9000e+01, 1.5710e+03, 8.0000e+00,
        2.8400e+02, 2.0600e+02, 6.3000e+01, 9.9560e+03, 1.2260e+03, 5.0000e+00,
        6.0000e+00, 9.2000e+01, 4.7920e+03, 8.0000e+00, 0.0000e+00, 1.3000e+01]), tensor([0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 9.2000e+01, 3.3730e+03, 8.0000e+00, 4.7920e+03, 3.7800e+02,
        2.3000e+01, 6.8590e+03, 0.

ValueError: only one element tensors can be converted to Python scalars

In [13]:
validation[0]

{'name': 'Cuaderno_2015_1_B',
 'year': '2015',
 'category': 'biology',
 'qid': 1,
 'qtext': 'El potencial de equilibrio para un ión permeante a través de una membrana se calcula mediante:',
 'ra': 4,
 'image': '',
 'answers': [{'aid': 1, 'atext': 'El equilibrio de Gibbs-Donnan.'},
  {'aid': 2, 'atext': 'La ecuación de Goldman-Hodgkin-Katz.'},
  {'aid': 3, 'atext': 'La ecuación de Ohm.'},
  {'aid': 4, 'atext': 'La ecuación de Nernst.'}]}