In [1]:
import os
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from utils_data import Vocabulary, Vectorizer, HeadQA, clean_words, parse_dataset, save_dataset_to_pickle, load_dataset_from_pickle 
from utils_data import random_oversamplig, mixed_oversampling, similarity_instance, translate_instance, translate_instance_ir, similarity_instance_ir

from training import train, validate, evaluate, evaluator, evaluate_better, get_optimizer

from supervised_models import LogisticRegression

%matplotlib inline
%load_ext autoreload
%autoreload 2



In [37]:
from datasets import load_dataset

# Fetch the 'es' configuration of the head_qa dataset (served from the local
# Hugging Face cache when it is already present).
data_es = load_dataset(path='head_qa', name='es')

Reusing dataset head_qa (C:\Users\tec005m\.cache\huggingface\datasets\head_qa\es\1.1.0\473dc5357942a3ff52963bd73cad0d167bd1bbc1ca5ca0732ee7372b480dd735)


In [38]:
# Pull the three splits out of the loaded dataset, in train / validation / test order.
training, validation, testing = (
    data_es[split_name] for split_name in ('train', 'validation', 'test')
)

In [40]:
# One-off preprocessing, kept disabled: it parses the raw splits, derives
# oversampled_training and mixed_oversampling_training from the training
# instances, and writes the results to ../data so later runs can just load them.
# NOTE(review): mixed_oversampling_training is computed below but never saved,
# although the loading cell reads '../data/mixed_oversampling_training.pickle'
# — confirm where that file is produced.

# training_instances = parse_dataset(training)
# validation_instances = parse_dataset(validation)
# testing_instances = parse_dataset(testing)

# oversampled_training = random_oversamplig(training_instances)
# mixed_oversampling_training = mixed_oversampling(training_instances, translate_instance, similarity_instance)

# save_dataset_to_pickle('../data/training.pickle', training_instances)
# save_dataset_to_pickle('../data/validation.pickle', validation_instances)
# save_dataset_to_pickle('../data/testing.pickle', testing_instances)
# save_dataset_to_pickle('../data/oversampled_training.pickle', oversampled_training)

In [42]:
# Restore the preprocessed instances from their cached files; each path is
# loaded in order and bound to the name at the same position on the left.
# NOTE(review): presumably these are pickle files — only load ones you produced yourself.
(
    training_instances,
    validation_instances,
    testing_instances,
    oversampled_training,
    mixed_training,
) = map(
    load_dataset_from_pickle,
    (
        '../data/training.pickle',
        '../data/validation.pickle',
        '../data/testing.pickle',
        '../data/oversampled_training.pickle',
        '../data/mixed_oversampling_training.pickle',
    ),
)

In [43]:
# Inspect the first record to check the instance schema: question/answer text,
# their token lists, the label and the category.
mixed_training[0]

{'question': 'Los potenciales postsinápticos excitadores:',
 'answer': 'Son de tipo todo o nada.',
 'tok_qtext': ['Los', 'potenciales', 'postsinápticos', 'excitadores', ':'],
 'tok_atext': ['Son', 'de', 'tipo', 'todo', 'o', 'nada', '.'],
 'label': 0,
 'category': 'biology'}

In [None]:
# Build the vectorizer from the mixed-oversampling training instances.
# NOTE(review): the datasets created afterwards wrap training_instances, not
# mixed_training — confirm that fitting the vectorizer on a different set is intended.
vectorizer = Vectorizer.vectorize_training(mixed_training)

In [None]:
# Wrap each split in a HeadQA dataset; all three share one vectorizer and the
# same padding settings (right_padding=False, max_length=30).
trainset, validset, testset = (
    HeadQA(instances=split_instances, vectorizer=vectorizer, right_padding=False, max_length=30)
    for split_instances in (training_instances, validation_instances, testing_instances)
)

In [None]:
batch_size = 32

# Training batches are reshuffled on every pass; the trailing partial batch is
# dropped so each optimisation step sees exactly batch_size examples.
train_dt = DataLoader(trainset, batch_size=batch_size, shuffle=True, drop_last=True)

# Evaluation loaders must visit every example exactly once: drop_last=True would
# silently exclude up to batch_size - 1 examples from the reported metrics, and
# shuffling only makes the evaluation order non-deterministic.
valid_dt = DataLoader(validset, batch_size=batch_size, shuffle=False, drop_last=False)
test_dt = DataLoader(testset, batch_size=batch_size, shuffle=False, drop_last=False)

In [None]:
# Seed torch's global RNG so any torch-based weight initialisation and the
# shuffled training batches are repeatable when the notebook is re-run from here.
torch.manual_seed(42)

# presumably (input size, output size) — confirm against supervised_models.LogisticRegression
model = LogisticRegression(trainset.max_length, 1)
# lr / wd are forwarded to get_optimizer; wd is presumably weight decay — confirm in training.py
optimizer = get_optimizer(model, lr = 0.01, wd = 1e-5)

In [None]:
# Run training for 20 epochs. `validate` is handed over as a callable alongside
# valid_dt (presumably invoked on it by train — confirm in training.py).
# The return value is kept so it can be saved to disk in a later cell.
training_results = train(model, optimizer, train_dt, valid_dt, validate, epochs=20)

In [None]:
# Score the trained model on the validation split.
# NOTE(review): this passes the raw `validation` split taken from data_es, not
# validation_instances, together with trainset.encode — presumably evaluate
# encodes each raw example itself; confirm.
acc, points = evaluate(model, validation, trainset.encode, evaluator)

In [None]:
# Display the (acc, points) pair returned for the validation split.
acc, points

In [None]:
# Score the trained model on the test split; this rebinds acc and points,
# replacing the validation values.
# NOTE(review): as with validation, the raw `testing` split from data_es is
# passed rather than testing_instances — confirm that evaluate expects raw examples.
acc, points = evaluate(model, testing, trainset.encode, evaluator)

In [None]:
# Display the (acc, points) pair returned for the test split.
acc, points

In [None]:
# Disabled: would write the model's state_dict to
# <current working directory>/trained_models/logistic_regressor.
# model_path = os.getcwd() + '/trained_models/logistic_regressor'
# torch.save(model.state_dict(), model_path)

In [None]:
# Persist the value returned by train() so the run can be inspected later without retraining.
save_dataset_to_pickle('../data/train_results_lreg_sigmoid.pickle', training_results)

In [None]:
# Reload stored training results, replacing the in-memory training_results.
# NOTE(review): this reads 'train_results_lreg.pickle', whereas the preceding cell
# writes 'train_results_lreg_sigmoid.pickle' — confirm that loading a different
# run's file (and discarding the current results) is intended.
training_results = load_dataset_from_pickle('../data/train_results_lreg.pickle')