In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, random_oversamplig, save_dataset_to_pickle, load_dataset_from_pickle 
from training import train, validate, evaluate, evaluator

%matplotlib inline
%load_ext autoreload
%autoreload 2

In [2]:
from datasets import load_dataset

# Load the Spanish ('es') configuration of the HEAD-QA dataset; the cached
# copy is reused when one is already present locally.
data_es = load_dataset(path='head_qa', name='es')

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
# Pull the three predefined splits out of the loaded dataset dictionary.
training, validation, testing = (
    data_es[split_name] for split_name in ('train', 'validation', 'test')
)

In [4]:
# Peek at the first raw training record to see the available fields.
first_example = next(iter(training), None)
if first_example is not None:
    print(first_example)

{'answers': [{'aid': 1, 'atext': 'Son de tipo todo o nada.'}, {'aid': 2, 'atext': 'Son hiperpolarizantes.'}, {'aid': 3, 'atext': 'Se pueden sumar.'}, {'aid': 4, 'atext': 'Se propagan a largas distancias.'}, {'aid': 5, 'atext': 'Presentan un periodo refractario.'}], 'category': 'biology', 'image': '', 'name': 'Cuaderno_2013_1_B', 'qid': 1, 'qtext': 'Los potenciales postsinápticos excitadores:', 'ra': 3, 'year': '2013'}


In [5]:
# Run every raw split through parse_dataset, in train / validation / test order.
training_instances, validation_instances, testing_instances = map(
    parse_dataset, (training, validation, testing)
)

# One-off preparation steps, kept commented out for reference. Presumably these
# produced the oversampled training set and the pickles under ../data.
# oversampled_training = random_oversamplig(training_instances)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [6]:
def _load_or_build(pickle_path, build):
    """Return the dataset cached at ``pickle_path``, building it on a cache miss.

    Args:
        pickle_path: path of a file written by ``save_dataset_to_pickle``.
        build: zero-argument callable that produces the dataset when the
            file does not exist yet.

    Returns:
        The loaded dataset when the file exists; otherwise the freshly built
        dataset, which is also saved to ``pickle_path`` for later runs.
    """
    if os.path.exists(pickle_path):
        # NOTE(review): presumably this unpickles the file; only load pickles
        # you created yourself, since unpickling untrusted data can run code.
        return load_dataset_from_pickle(pickle_path)
    instances = build()
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)
    save_dataset_to_pickle(pickle_path, instances)
    return instances


# Prefer the cached pickles; on a fresh checkout (no ../data files yet) fall
# back to the instances parsed in the previous cell instead of failing.
# Each lambda runs before its target name is rebound, so it still sees the
# in-memory value from the parsing cell.
training_instances = _load_or_build('../data/training.pickle', lambda: training_instances)
validation_instances = _load_or_build('../data/validation.pickle', lambda: validation_instances)
testing_instances = _load_or_build('../data/testing.pickle', lambda: testing_instances)
oversampled_training = _load_or_build(
    '../data/oversampled_training.pickle',
    lambda: random_oversamplig(training_instances),
)

In [7]:
# Build the vectorizer from the oversampled training instances.
# NOTE(review): the HeadQA datasets created in the next cell are built from
# `training_instances`, not `oversampled_training` — confirm that fitting the
# vectorizer on one and training on the other is intentional.
vectorizer = Vectorizer.vectorize_training(oversampled_training)

In [8]:
# All three splits share the same vectorizer, padding side and maximum length.
dataset_options = dict(vectorizer=vectorizer, right_padding=False, max_length=30)

trainset = HeadQA(instances=training_instances, **dataset_options)
validset = HeadQA(instances=validation_instances, **dataset_options)
testset = HeadQA(instances=testing_instances, **dataset_options)

In [9]:
batch_size = 32

# Training batches are reshuffled on every pass; an incomplete final batch is dropped.
train_dt = DataLoader(trainset, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation loaders keep a fixed order so validation/test metrics are
# reproducible and the same examples are scored on every pass.
# NOTE(review): drop_last=True still discards up to batch_size - 1 trailing
# examples from the evaluation metrics. It is kept in case `train`/`validate`
# rely on full batches — confirm, and switch to drop_last=False if they do not.
valid_dt = DataLoader(validset, batch_size=batch_size, shuffle=False, drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size, shuffle=False, drop_last=True)

In [14]:
class LogisticRegression(torch.nn.Module):
    """Single linear layer followed by an element-wise sigmoid.

    Args:
        x_size: number of input features per example.
        n_classes: number of output units.
    """

    def __init__(self, x_size, n_classes):
        super().__init__()
        self.linear = nn.Linear(x_size, n_classes)

    def forward(self, x):
        """Apply the linear layer and squash the result with a sigmoid.

        Args:
            x: tensor whose last dimension is ``x_size``. It is cast to
                float first, so integer tensors are accepted.

        Returns:
            Tensor with last dimension ``n_classes`` and values in [0, 1].
        """
        logits = self.linear(x.float())
        # torch.sigmoid replaces the deprecated F.sigmoid alias (same result,
        # no deprecation warning on torch 1.x).
        return torch.sigmoid(logits)

In [15]:
def get_optimizer(model, lr=0.01, wd=0.0):
    """Create an Adam optimizer over all parameters of ``model``.

    Args:
        model: module whose ``parameters()`` will be optimized.
        lr: learning rate passed to Adam.
        wd: weight decay passed to Adam as ``weight_decay``.

    Returns:
        A ``torch.optim.Adam`` instance.
    """
    params = model.parameters()
    adam = torch.optim.Adam(params, lr=lr, weight_decay=wd)
    return adam

In [23]:
# The input width is the dataset's max_length (presumably one feature per
# position of the padded sequence — confirm against HeadQA); one output unit.
model = LogisticRegression(x_size=trainset.max_length, n_classes=1)
optimizer = get_optimizer(model, lr=0.01, wd=1e-5)

In [24]:
# Fit the model for 20 epochs. `validate` is handed to `train` together with
# the validation loader; the log below reports train loss, validation loss and
# accuracy once per epoch.
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=20)

Epoch 0 train loss  20.7112 valid loss 0.171 and accuracy 0.7502
Epoch 1 train loss  19.9758 valid loss 0.095 and accuracy 0.7494
Epoch 2 train loss  19.9752 valid loss 0.152 and accuracy 0.7491
Epoch 3 train loss  19.9773 valid loss 0.095 and accuracy 0.7498
Epoch 4 train loss  19.9856 valid loss 0.133 and accuracy 0.7482
Epoch 5 train loss  20.0564 valid loss 0.076 and accuracy 0.7500
Epoch 6 train loss  19.9997 valid loss 0.057 and accuracy 0.7498
Epoch 7 train loss  19.9920 valid loss 0.227 and accuracy 0.7500
Epoch 8 train loss  20.0070 valid loss 0.114 and accuracy 0.7494
Epoch 9 train loss  20.0142 valid loss 0.152 and accuracy 0.7500
Epoch 10 train loss  20.0593 valid loss 0.152 and accuracy 0.7482
Epoch 11 train loss  20.0892 valid loss 0.133 and accuracy 0.7465
Epoch 12 train loss  20.2460 valid loss 0.133 and accuracy 0.7450
Epoch 13 train loss  20.7745 valid loss 0.114 and accuracy 0.7500
Epoch 14 train loss  20.0616 valid loss 0.171 and accuracy 0.7485
Epoch 15 train loss 

In [25]:
# Score the trained model on the raw validation split. `trainset.encode` and
# `evaluator` are passed through to `evaluate`, which returns an accuracy and
# a points total.
acc, points = evaluate(model, validation, trainset.encode, evaluator)

In [26]:
# Display the accuracy and points obtained on the validation split.
acc, points

(tensor([0.2313]), -102)

In [27]:
# Score the trained model on the raw test split. This overwrites the
# `acc`/`points` values obtained on the validation split.
acc, points = evaluate(model, testing, trainset.encode, evaluator)

In [28]:
# Display the accuracy and points obtained on the test split.
acc, points

(tensor([0.2422]), -86)

In [None]:
# model_path = os.getcwd() + '/trained_models/logistic_regressor'
# torch.save(model.state_dict(), model_path)

In [22]:
# Persist the results returned by `train` so they can be inspected later.
# NOTE(review): this cell's execution count (22) is lower than the training
# cell's (24), so the file on disk may hold results from an earlier run —
# re-run this cell after training to be sure.
save_dataset_to_pickle('../data/train_results_lreg_sigmoid.pickle', training_results)

In [None]:
# Reload previously saved training results, replacing the in-memory value.
# NOTE(review): this reads 'train_results_lreg.pickle', while the cell above
# writes 'train_results_lreg_sigmoid.pickle' — confirm which file is intended.
training_results = load_dataset_from_pickle('../data/train_results_lreg.pickle')