In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, save_dataset_to_pickle, load_dataset_from_pickle 
from utils_data import random_oversamplig, mixed_oversampling, translate_instance, translate_instance_ir, similarity_instance_ir


from training import train, validate, evaluate, evaluator, evaluate_better, get_optimizer

from supervised_models import LogisticRegression

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
for d in training:
    print(d)
    break;

{'answers': [{'aid': 1, 'atext': 'Son de tipo todo o nada.'}, {'aid': 2, 'atext': 'Son hiperpolarizantes.'}, {'aid': 3, 'atext': 'Se pueden sumar.'}, {'aid': 4, 'atext': 'Se propagan a largas distancias.'}, {'aid': 5, 'atext': 'Presentan un periodo refractario.'}], 'category': 'biology', 'image': '', 'name': 'Cuaderno_2013_1_B', 'qid': 1, 'qtext': 'Los potenciales postsinápticos excitadores:', 'ra': 3, 'year': '2013'}


In [5]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)
# mixed_oversampling_training = mixed_oversampling(training_instances, translate_instance, similarity_instance)
# mixed_oversampling_training_ir = mixed_oversampling(training_instances_ir, translate_instance_ir, similarity_instance_ir)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)
# save_dataset_to_pickle('../data/mixed_oversampling_training.pickle', mixed_oversampling_training)
# save_dataset_to_pickle('../data/mixed_oversampling_training_ir.pickle', mixed_oversampling_training_ir)

In [6]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')

oversampled_training = load_dataset_from_pickle('../data/oversampled_training.pickle')

In [7]:
vectorizer = Vectorizer.vectorize_training(oversampled_training)

In [8]:
trainset = HeadQA(instances=oversampled_training, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [9]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,shuffle=True, drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,shuffle=True, drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,shuffle=True, drop_last=True)

In [10]:
torch.random.manual_seed(42)
model = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(model, lr = 0.01, wd = 1e-5)

In [11]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=20)



Epoch 0 train loss  49.8963 valid loss 0.417 and accuracy 0.3256
Epoch 1 train loss  49.5299 valid loss 0.493 and accuracy 0.3563
Epoch 2 train loss  49.7183 valid loss 0.455 and accuracy 0.3221
Epoch 3 train loss  49.7255 valid loss 0.455 and accuracy 0.2717
Epoch 4 train loss  49.8110 valid loss 0.417 and accuracy 0.2680
Epoch 5 train loss  50.0348 valid loss 0.436 and accuracy 0.2686
Epoch 6 train loss  50.0348 valid loss 0.398 and accuracy 0.2678
Epoch 7 train loss  50.0348 valid loss 0.455 and accuracy 0.2691
Epoch 8 train loss  50.0348 valid loss 0.436 and accuracy 0.2682
Epoch 9 train loss  50.0349 valid loss 0.455 and accuracy 0.2684
Epoch 10 train loss  50.0396 valid loss 0.379 and accuracy 0.2678
Epoch 11 train loss  50.0397 valid loss 0.455 and accuracy 0.2689
Epoch 12 train loss  50.0303 valid loss 0.398 and accuracy 0.2684
Epoch 13 train loss  50.0351 valid loss 0.427 and accuracy 0.2684
Epoch 14 train loss  50.0493 valid loss 0.436 and accuracy 0.2693
Epoch 15 train loss 

In [12]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)

In [13]:
acc, points

(tensor([0.2365]), -74)

In [14]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)

In [15]:
acc, points

(tensor([0.2440]), -66)

In [16]:
model_path = os.getcwd() + '/trained_models/logistic_regressor'
torch.save(model.state_dict(), model_path)

In [19]:
save_dataset_to_pickle('../data/train_results_lreg.pickle', training_results)

In [20]:
training_results = load_dataset_from_pickle('../data/train_results_lreg.pickle')

In [21]:
model = LogisticRegression(trainset.max_length, 1)
model.load_state_dict(torch.load(os.getcwd() + '/trained_models/logistic_regressor'))
model.eval()

LogisticRegression(
  (linear): Linear(in_features=30, out_features=1, bias=True)
)

In [22]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.23650944,
 -12.333333333333334,
 [tensor(0.2743),
  tensor(0.2130),
  tensor(0.2267),
  tensor(0.2511),
  tensor(0.2478),
  tensor(0.2061)],
 [22, -34, -21, 1, -2, -40])

In [23]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.24385573,
 -5.5,
 [tensor(0.2588),
  tensor(0.2287),
  tensor(0.2325),
  tensor(0.2500),
  tensor(0.2478),
  tensor(0.2900),
  tensor(0.2257),
  tensor(0.2543),
  tensor(0.2445),
  tensor(0.2424),
  tensor(0.2489),
  tensor(0.2026)],
 [8, -19, -16, 0, -2, 37, -22, 4, -5, -7, -1, -43])