In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, save_dataset_to_pickle, load_dataset_from_pickle 
from utils_data import random_oversamplig, mixed_oversampling, similarity_instance, translate_instance, translate_instance_ir, similarity_instance_ir

from training import train, validate, evaluate, evaluator, evaluate_better, get_optimizer

from supervised_models import LogisticRegression

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [2]:
from datasets import load_dataset

data_es = load_dataset('head_qa', 'es' )

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [3]:
training, validation, testing = data_es['train'], data_es['validation'], data_es['test']

In [4]:
# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)
# mixed_oversampling_training = mixed_oversampling(training_instances, translate_instance, similarity_instance)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [5]:
training_instances = load_dataset_from_pickle('../data/training.pickle')
validation_instances = load_dataset_from_pickle('../data/validation.pickle')
testing_instances = load_dataset_from_pickle('../data/testing.pickle')
mixed_training = load_dataset_from_pickle('../data/mixed_oversampling_training.pickle')

In [6]:
vectorizer = Vectorizer.vectorize_training(mixed_training)

In [7]:
trainset = HeadQA(instances=mixed_training, vectorizer=vectorizer, right_padding=False, max_length=30)
validset = HeadQA(instances=validation_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
testset = HeadQA(instances=testing_instances, vectorizer=vectorizer, right_padding=False, max_length=30)

In [8]:
batch_size = 32
train_dt = DataLoader(trainset, batch_size=batch_size,shuffle=True, drop_last=True)
valid_dt = DataLoader(validset, batch_size=batch_size,shuffle=True, drop_last=True)
test_dt = DataLoader(testset, batch_size=batch_size,shuffle=True, drop_last=True)

In [9]:
torch.random.manual_seed(42)
model = LogisticRegression(trainset.max_length, 1)
optimizer = get_optimizer(model, lr = 0.01, wd = 1e-5)

In [10]:
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=20)



Epoch 0 train loss  51.0915 valid loss 0.322 and accuracy 0.4458
Epoch 1 train loss  51.4025 valid loss 0.360 and accuracy 0.4662
Epoch 2 train loss  48.6275 valid loss 0.322 and accuracy 0.5327
Epoch 3 train loss  47.6087 valid loss 0.303 and accuracy 0.5414
Epoch 4 train loss  50.0376 valid loss 0.322 and accuracy 0.3945
Epoch 5 train loss  54.0387 valid loss 0.379 and accuracy 0.3305
Epoch 6 train loss  55.0258 valid loss 0.361 and accuracy 0.3381
Epoch 7 train loss  53.9642 valid loss 0.379 and accuracy 0.4131
Epoch 8 train loss  53.0682 valid loss 0.360 and accuracy 0.3776
Epoch 9 train loss  54.5461 valid loss 0.417 and accuracy 0.3818
Epoch 10 train loss  55.9252 valid loss 0.436 and accuracy 0.2763
Epoch 11 train loss  57.0213 valid loss 0.398 and accuracy 0.2651
Epoch 12 train loss  57.0392 valid loss 0.436 and accuracy 0.2728
Epoch 13 train loss  56.8008 valid loss 0.379 and accuracy 0.2895
Epoch 14 train loss  56.3669 valid loss 0.455 and accuracy 0.2994
Epoch 15 train loss 

In [11]:
acc, points = evaluate(model, validation, trainset.encode, evaluator)

In [12]:
acc, points

(tensor([0.2321]), -98)

In [13]:
acc, points = evaluate(model, testing, trainset.encode, evaluator)

In [14]:
acc, points

(tensor([0.2403]), -106)

In [15]:
model_path = os.getcwd() + '/trained_models_v2/logistic_regressor'
torch.save(model.state_dict(), model_path)

In [16]:
save_dataset_to_pickle('results_v2/train_results_lreg_v2.pickle', training_results)

In [17]:
training_results = load_dataset_from_pickle('results_v2/train_results_lreg_v2.pickle')

In [19]:
model = LogisticRegression(trainset.max_length, 1)
model.load_state_dict(torch.load(model_path))
model.eval()

LogisticRegression(
  (linear): Linear(in_features=30, out_features=1, bias=True)
)

In [20]:
acc, points, acc_list, points_list = evaluate_better(model, validation, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.23211032,
 -16.333333333333332,
 [tensor(0.2566),
  tensor(0.2043),
  tensor(0.2267),
  tensor(0.2511),
  tensor(0.2478),
  tensor(0.2061)],
 [6, -42, -21, 1, -2, -40])

In [21]:
acc, points, acc_list, points_list = evaluate_better(model, testing, trainset.encode, evaluator)
acc, points, acc_list, points_list

(0.24021287,
 -8.833333333333334,
 [tensor(0.2632),
  tensor(0.2242),
  tensor(0.2237),
  tensor(0.2414),
  tensor(0.2478),
  tensor(0.2727),
  tensor(0.2434),
  tensor(0.2586),
  tensor(0.2358),
  tensor(0.2424),
  tensor(0.2311),
  tensor(0.1982)],
 [12, -23, -24, -8, -2, 21, -6, 8, -13, -7, -17, -47])