In [1]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Use the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [3]:
# Local checkpoint directory that both the tokenizer and the model are restored from.
model_checkpoint = 'training_results/monologg_koelectra_base_v3_discriminator_uncleaned_v11/target_tagger/monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370'

tokenizer = ElectraTokenizerFast.from_pretrained(model_checkpoint)

# Load the token-classification model first, then move its weights to the selected device.
model = ElectraForTokenClassification.from_pretrained(model_checkpoint)
model = model.to(device)

In [4]:
def preprocess_function(examples):
    """Build single-example model inputs from an already tokenized row.

    Reads ``examples["input_tokens_list"]`` and maps it to vocabulary ids with
    the module-level ``tokenizer``. The ids are paired with an all-zero
    ``token_type_ids`` list and an all-one ``attention_mask`` list of the same
    length.

    Returns:
        dict with keys ``input_ids``, ``token_type_ids`` and ``attention_mask``.
        Each value is a tensor built from a one-element outer list (so it has a
        leading dimension of size 1) and moved to the module-level ``device``.
    """
    ids = tokenizer.convert_tokens_to_ids(examples["input_tokens_list"])
    seq_len = len(ids)

    def as_batch(values):
        # The extra outer list gives the tensor its leading size-1 dimension.
        return torch.tensor([values]).to(device)

    return {
        "input_ids": as_batch(ids),
        "token_type_ids": as_batch([0] * seq_len),
        "attention_mask": as_batch([1] * seq_len),
    }

In [5]:
# JSON files holding the tagger train / dev splits.
TRAIN_DATA_PATH = './dataset/uncleaned_v11/tagger_train.json'
EVAL_DATA_PATH = './dataset/uncleaned_v11/tagger_dev.json'

# Read both splits into pandas DataFrames, train first and then eval.
train_dataset, eval_dataset = (
    pd.read_json(split_path) for split_path in (TRAIN_DATA_PATH, EVAL_DATA_PATH)
)

In [6]:
# Disabled debugging probe (kept for reference, not executed).
# It draws random rows from train_dataset until it finds one where the number of
# ids from the stored `input_tokens_list` differs from the number of ids the
# tokenizer produces for the raw `sentence_form`, then prints that row's index,
# both lengths, the sentence and its annotations.
# NOTE: the loop only exits on such a mismatch, so it never terminates if every
# row is consistent.
# while True:
#     k = random.randrange(len(train_dataset))
#     input_tokens_list = train_dataset['input_tokens_list'][k]
#     sentence_form = train_dataset['sentence_form'][k]
#     annotations = train_dataset['annotation'][k]
#     if len(tokenizer.convert_tokens_to_ids(input_tokens_list)) != len(tokenizer.encode(sentence_form)):
#         print(k)
#         print(len(tokenizer.convert_tokens_to_ids(input_tokens_list)), len(tokenizer.encode(sentence_form)))
#         print(sentence_form)
#         print(annotations)
#         break

In [71]:
# Qualitative check: run the tagger on one randomly chosen eval row and compare
# the predicted target spans with the annotated ones.
# The row is intentionally unseeded, so every execution inspects a different sample.
idx = random.randrange(len(eval_dataset))
sample = eval_dataset.iloc[idx]
# sample = train_dataset.iloc[1151]
print(sample.annotation)
print(sample.sentence_form)
print()

# Encode the raw sentence and move every tensor to the model's device.
input_triplet = tokenizer(sample.sentence_form, return_tensors='pt')
input_triplet = {k: v.to(device) for k, v in input_triplet.items()}
# input_triplet = preprocess_function(sample)

# Inference only: skip autograd bookkeeping so no graph is kept in memory.
with torch.no_grad():
    output = model(**input_triplet).logits

# A single sentence was encoded, so the batch holds exactly one sequence.
# [1:-1] drops the first and last positions (the special tokens the tokenizer
# adds around the sentence), mirroring the slicing applied to the labels.
pred = output.argmax(-1)[0].tolist()[1:-1]

print('label:')
print(sample.labels[1:-1])
print('prediction:')
print(pred)
print()
print(pred == sample.labels[1:-1])
print()

# Take the tokens from the ids that were actually fed to the model, so `toks`
# always has the same length as `pred`. The dataset's stored
# `input_tokens_list` may differ in length from the tokenizer's encoding of
# `sentence_form`, which would misalign the spans or raise an IndexError.
toks = tokenizer.convert_ids_to_tokens(input_triplet['input_ids'][0].tolist())[1:-1]
# labs = sample.labels

# Span decoding: label 1 opens a target, each immediately following label 2
# extends it, and any other label closes it.
starts = [pos for pos, lab in enumerate(pred) if lab == 1]
targets = []
for start in starts:
    target = [toks[start]]
    for tok, lab in zip(toks[start + 1:], pred[start + 1:]):
        if lab != 2:
            break
        target.append(tok)
    targets.append(tokenizer.convert_tokens_to_string(target))

print('targets in labels:')
for el in sample.annotation:
    # Each annotation's second element is a list whose first item is the target text.
    print(el[1][0])
print('targets predicted:')
print(targets)

[['패키지/구성품#디자인', ['살랑드파리 , 아쿠아컴플릿시리즈', 27, 44], 'positive']]
Target 내 화장대를 블링블링하게 만들어주는 살랑드파리 , 아쿠아컴플릿시리즈 인데요!

label:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0]
prediction:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0]

True

targets in labels:
살랑드파리 , 아쿠아컴플릿시리즈
targets predicted:
['살랑드파리, 아쿠아컴플릿시리즈']


In [74]:
# Check whether the space before the comma changes the ids the tokenizer produces
# (the annotated target has the space, the decoded prediction does not).
ids_with_space = tokenizer.encode('살랑드파리 , 아쿠아컴플릿시리즈')
ids_without_space = tokenizer.encode('살랑드파리, 아쿠아컴플릿시리즈')
ids_with_space == ids_without_space

True