In [19]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig
from training import train, validate, evaluate

%matplotlib inline
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
from datasets import load_dataset

# data_en = load_dataset('head_qa', 'en')
data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\CLAUDIA\.cache\huggingface\datasets\head_qa\es\1.1.0\d6803d1e84273cdc4a2cf3c5102945d166555f47b299ecbc5266d582f408f8e2)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
for d in training:
    print(d)
    break;

{'name': 'Cuaderno_2013_1_B', 'year': '2013', 'category': 'biology', 'qid': 1, 'qtext': 'Los potenciales postsinápticos excitadores:', 'ra': 3, 'image': '', 'answers': [{'aid': 1, 'atext': 'Son de tipo todo o nada.'}, {'aid': 2, 'atext': 'Son hiperpolarizantes.'}, {'aid': 3, 'atext': 'Se pueden sumar.'}, {'aid': 4, 'atext': 'Se propagan a largas distancias.'}, {'aid': 5, 'atext': 'Presentan un periodo refractario.'}]}


In [5]:
training_instances = parse_dataset(training)
validation_instances = parse_dataset(validation)
testing_instances = parse_dataset(testing)

In [6]:
oversampled_training = random_oversamplig(training_instances)

In [7]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)

In [8]:
trainset = HeadQA(instances=training_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,shuffle=True, drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,shuffle=True, drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,shuffle=True, drop_last=True)

In [10]:
class LogisticRegression(torch.nn.Module):
    def __init__(self, x_size, n_classes): 
        super(LogisticRegression, self).__init__()             
        self.linear = nn.Linear(x_size, n_classes)
        
    def forward(self, x):
        x = self.linear(x)
        x = F.softmax(x, dim=0)
        return x

In [11]:
def get_optimizer(model, lr=0.01, wd=0.0):
    return torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)

In [12]:
model = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(model, lr = 0.001, wd = 1e-5)

In [13]:
training_results = train(model, optimizer, train_dt, valid_dt, validate)

Epoch 0 train loss  21.6421 valid loss 0.171 and accuracy 23.9882
Epoch 1 train loss  21.6648 valid loss 0.133 and accuracy 24.0059
Epoch 2 train loss  21.7195 valid loss 0.133 and accuracy 24.0000
Epoch 3 train loss  21.7747 valid loss 0.227 and accuracy 24.0059
Epoch 4 train loss  21.5394 valid loss 0.114 and accuracy 24.0118
Epoch 5 train loss  21.7303 valid loss 0.133 and accuracy 23.9882
Epoch 6 train loss  21.7257 valid loss 0.171 and accuracy 24.0059
Epoch 7 train loss  21.7036 valid loss 0.152 and accuracy 23.9882
Epoch 8 train loss  21.7344 valid loss 0.227 and accuracy 24.0000
Epoch 9 train loss  21.6570 valid loss 0.152 and accuracy 24.0118
Epoch 10 train loss  21.5832 valid loss 0.133 and accuracy 24.0000
Epoch 11 train loss  21.6190 valid loss 0.076 and accuracy 23.9941
Epoch 12 train loss  21.6654 valid loss 0.095 and accuracy 23.9882
Epoch 13 train loss  21.5461 valid loss 0.152 and accuracy 24.0059
Epoch 14 train loss  21.7991 valid loss 0.190 and accuracy 24.0059
Epoch

In [14]:
acc, points = evaluate(model, validation, trainset.encode)

In [15]:
acc, points

(tensor([0.2474]), -14)

In [16]:
acc, points = evaluate(model, testing, trainset.encode)

In [17]:
acc, points

(tensor([0.2666]), 182)

In [20]:
model_path = os.getcwd() + '/trained_models/logistic_regressor'
torch.save(model.state_dict(), model_path)