In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from transformers import AutoModel, AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
import torch
import torch.nn as nn
import pickle

In [3]:
tokenizer = AutoTokenizer.from_pretrained('KB/bert-base-swedish-cased')

In [4]:
model = torch.load('./results/model_CA_5_epochs_weighted_loss.pkl')
with open(r'./data/tokenized_data_eval.pkl', "rb") as input_file:
    test_data = pickle.load(input_file)
tokens = tokenizer.convert_ids_to_tokens(test_data[0]["input_ids"])
print(tokens)
dec = tokenizer.decode(test_data[0]["input_ids"])
print(dec)

['[CLS]', 'Laborator', '##ie', '##ingenjör', 'Laborator', '##ie', '##ingenjörer', 'kan', 'bland', 'annat', 'arbeta', 'inom', 'den', 'kemiska', 'industrin', ',', 'massa', '-', 'och', 'pappers', '##industrin', 'eller', 'läkemedels', '##industrin', ',', 'men', 'också', 'på', 'forsknings', '##institutioner', 'vid', 'högskolor', 'och', 'universitet', '.', 'Det', 'finns', 'även', 'laboratorie', '##ingenjörer', 'vid', 'sjukhus', 'och', 'myndigheter', 'som', 'Livsmedel', '##sverket', 'och', 'Statens', 'Kriminal', '##tekniska', 'Laborator', '##ium', '.', 'Arbetsuppgifter', 'På', 'laboratoriet', 'arbetar', 'laboratorie', '##ingenjören', 'med', 'tekniskt', 'avancerade', 'analys', '##apparater', 'och', 'dator', '##teknik', 'för', 'att', 'göra', 'analyser', 'och', 'kontroller', '.', 'Man', 'kan', 'undersöka', 'råvaror', 'som', 'används', 'vid', 'tillverkningen', 'på', 'en', 'industri', 'eller', 'prover', 'som', 'tagits', 'ute', 'i', 'produktionen', 'för', 'att', 'kontrollera', 'att', 'produkten', '

In [9]:
def get_predicted_answers(output, tokens):
    extracted_answers = []
    last_idx = None
    current_answer = ''
    for idx, label in enumerate(output):
        if label == 2 and len(current_answer) > 0: # add to existing answer, only if there is an existing answer..
            current_answer += tokens[idx] +' '
        if label == 1:
            current_answer = tokens[idx] +' '
        elif len(current_answer) > 0:
            extracted_answers.append(current_answer)
            current_answer = ''
    print(extracted_answers)




In [10]:
# Output class
# https://huggingface.co/docs/transformers/main_classes/output#transformers.modeling_tf_outputs.TFTokenClassifierOutput

model.eval()
test_input = []
test_labels = []
test_attn = []
token_type_ids = []
for i in range(len(test_data)):
    test_input.append(test_data[i]['input_ids'])
    test_labels.append(test_data[i]['labels'])
    test_attn.append(test_data[i]['attention_mask'])
    token_type_ids.append(test_data[i]['token_type_ids'])

print(len(test_input))
print(len(test_labels))
print(len(test_attn))
num_correct = 0
num_predicted = 0
num_pos_data = 0
for i in range(len(test_data)):
    output = model(torch.tensor([test_data[i]['input_ids']]), attention_mask=torch.tensor([test_data[i]['attention_mask']]), token_type_ids=torch.tensor([test_data[i]['token_type_ids']]), labels=torch.tensor([test_data[i]['labels']]))
    print('test idx: ', i)
    print('instance loss: ', output.loss)
    # print(output.logits)
    m = nn.Softmax(dim=2)
    max = m(output.logits)
    out = torch.argmax(max, dim=2)
    # print(max)
    # print('Output length: ', len(out[0]))
    # print('labels length: ', len(test_data[i]['labels']))
    # print('Labels: ', test_data[i]['labels'])
    tokens = tokenizer.convert_ids_to_tokens(test_data[i]["input_ids"])
    true_labels = test_data[i]['labels']
    # print(tokens)
    get_predicted_answers(out[0], tokens)
    for idx, pred_label in enumerate(out[0]):
        true_label = true_labels[idx]
        if true_label > 0:
            num_pos_data += 1
        if pred_label > 0:
            # print('label: ', pred_label)
            # print('token: ', tokens[idx])
            num_predicted += 1
            if pred_label == true_label:
                num_correct += 1

# calculate precision and recall
pr = num_correct/num_predicted
rec = num_correct/num_pos_data
print('precision: ', pr)
print('recall: ', rec)


44
44
44
test idx:  0
instance loss:  tensor(1.0268, grad_fn=<NllLossBackward>)
['Laborator ', 'Laborator ', 'inom den ', 'massa - ', 'läkemedels ##industrin ', 'forsknings ', 'vid ', '##sverket ', 'Statens Kriminal ', '##ingenjören ', 'tekniskt avancerade ', '##apparater ', 'dator ', 'att göra ', 'råvaror som ', 'prover ', 'att kontrollera ', 'kemiska och ', 'biologiska tester ', 'att undersöka ', 'att leta ', 'mikro ##organis ', '##mer som ', 'När en ', 'svaren och ', 'sammans ##t ', '##äll ##s ', 'att förbättra ', 'Laborator ', '##ingenjörer ', 'utarbeta normer ', 'hur man ', 'sköta underhållet ', 'att handled ', 'söka litteratur ', 'läsa artiklar ', 'På ', 'ingenjören ', '##anter ', '##medicinska analytiker ', '##log ##er ', 'kemist ##er ']
test idx:  1
instance loss:  tensor(0.9706, grad_fn=<NllLossBackward>)
['sjukhus ', 'kommunal äldre ', 'vårdcentral ', 'Om ', '##ologiska insatser ', 'läkemedels ##behandling ', 'stödja ##nd ', 'sjuksköterskan ', 'leder omvårdnad ', 'ansvarar ',