In [1]:
from simpletransformers.question_answering import QuestionAnsweringModel
import logging
import orjson
import numpy as np
import math

logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)



In [2]:
with open("../resources/data_final.json", "rb") as f:
    dataset = orjson.loads(f.read())

In [3]:
np.random.seed(0)
id_train = np.random.choice(len(dataset), math.ceil(len(dataset)/2))

In [4]:
id_train.shape

(48,)

In [5]:
id_test = np.asarray(list(set(range(len(dataset))).difference(set(id_train))))

In [6]:
id_test.shape

(60,)

In [7]:
# https://github.com/ThilinaRajapakse/simpletransformers#question-answering
# https://simpletransformers.ai/docs/qa-data-formats/#train-data-format

In [8]:
# Create dummy data to use for training.
train_data = dataset #[dataset[id] for id in id_train]

# Create the QuestionAnsweringModel
model = QuestionAnsweringModel('bert', 'mrm8488/bert-base-portuguese-cased-finetuned-squad-v1-pt', args={'reprocess_input_data': True, 'overwrite_output_dir': True})
model.lazy_loading = True

# The list can also be used directly
model.train_model(dataset  ) #train_data)

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
convert squad examples to features: 100%|██████████| 95/95 [00:01<00:00, 78.62it/s]
add example index and unique id: 100%|██████████| 95/95 [00:00<00:00, 145210.96it/s]
INFO:simpletransformers.question_answering.question_answering_model: Training of bert model complete. Saved to outputs/.


HBox(children=(HTML(value='Epoch'), FloatProgress(value=0.0, max=1.0), HTML(value='')))

HBox(children=(HTML(value='Running Epoch 0 of 1'), FloatProgress(value=0.0, max=18.0), HTML(value='')))





In [9]:
# Evaluate the model. (Being lazy and evaluating on the train data itself)
#result, text = model.eval_model(train_data, output_dir="../resources/bert/")
result, text = model.eval_model(dataset, output_dir="../resources/bert/")

print(result)
print(text)

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
convert squad examples to features: 100%|██████████| 95/95 [00:01<00:00, 69.50it/s]
add example index and unique id: 100%|██████████| 95/95 [00:00<00:00, 208399.00it/s]


HBox(children=(HTML(value='Running Evaluation'), FloatProgress(value=0.0, max=18.0), HTML(value='')))


{'correct': 0, 'similar': 95, 'incorrect': 0, 'eval_loss': -12.246527777777779}
{'correct_text': {}, 'similar_text': {'1243188563': {'truth': 'O primeiro PLE (2020.3) iniciará no dia 17 de agosto. Confira o Calendário Acadêmico com todas as datas ', 'predicted': '', 'question': 'Quando começam as aulas do PLE?'}, '1243188564': {'truth': 'Os emails da UFRPE estão sendo migrados para o Google. Com isso, o professor poderá usar ferramentas como o Google Meet, Google Classroom e o Google Drive com sua conta institucional. Assim, os encontros síncronos realizados como o Google Meet podem ser gravados e armazenados no Google Drive, sendo disponibilizados posteriormente em canais como o YouTube ou no Google Classroom. Também temos a plataforma RNP.', 'predicted': '', 'question': 'Nem o SIGAA nem o AVA podem transmitir as aulas online. Temos na UFRPE uma plataforma para isso?'}, '1243188565': {'truth': 'Sim. Contudo, recomenda-se que se dê preferência para as atividades assíncronas. No caso d

In [36]:
# Making predictions using the model.
to_predict = np.asarray([dataset[2]])
predict = model.predict(to_predict, n_best_size=1)

INFO:simpletransformers.question_answering.question_answering_model: Converting to features started.
convert squad examples to features: 100%|██████████| 1/1 [00:00<00:00, 45.00it/s]
add example index and unique id: 100%|██████████| 1/1 [00:00<00:00, 4568.96it/s]


HBox(children=(HTML(value='Running Prediction'), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [40]:
predict[0][0]['answer'][1]

'regulamento'