In [261]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

In [262]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [263]:
# Checkpoint directory from which both the tokenizer and the token-classification
# model are loaded.
model_checkpoint = 'training_results/monologg_koelectra_base_v3_discriminator_uncleaned_v9/target_tagger.v2/monologg_koelectra_base_v3_discriminator_uncleaned_v9/checkpoint-370'

tokenizer = ElectraTokenizerFast.from_pretrained(model_checkpoint)
model = ElectraForTokenClassification.from_pretrained(model_checkpoint)
# Move the weights to the device selected earlier in the notebook.
model = model.to(device)

In [264]:
def preprocess_function(examples):
    """Build a single-example model input from pre-tokenized text.

    The tokens in ``examples["input_tokens_list"]`` are mapped to ids with the
    notebook-level ``tokenizer``. The result is a dict with three entries of the
    same length: ``input_ids``, an all-zero ``token_type_ids`` and an all-one
    ``attention_mask``. Each entry is wrapped in an outer list before being turned
    into a tensor and moved to the notebook-level ``device``.
    """
    ids = tokenizer.convert_tokens_to_ids(examples["input_tokens_list"])
    seq_len = len(ids)
    features = {
        "input_ids": ids,
        "token_type_ids": [0] * seq_len,
        "attention_mask": [1] * seq_len,
    }
    # The extra [] adds the leading dimension of size 1 around each sequence.
    return {name: torch.tensor([values]).to(device) for name, values in features.items()}

In [265]:
# Locations of the tagger train / dev splits (JSON).
TRAIN_DATA_PATH = './dataset/uncleaned_v9/tagger_train.json'
EVAL_DATA_PATH = './dataset/uncleaned_v9/tagger_dev.json'

# Read both splits into DataFrames, train first and then dev.
train_dataset, eval_dataset = map(pd.read_json, (TRAIN_DATA_PATH, EVAL_DATA_PATH))

In [266]:
# Look for a training row whose stored `input_tokens_list` maps to a different
# number of ids than a fresh `tokenizer.encode` of its `sentence_form`.
# Rows are visited in random order, each at most once, so the search always
# terminates; an unbounded `while True` + `randrange` loop would spin forever
# if no such row existed.
for k in random.sample(range(len(train_dataset)), len(train_dataset)):
    input_tokens_list = train_dataset['input_tokens_list'][k]
    sentence_form = train_dataset['sentence_form'][k]
    annotations = train_dataset['annotation'][k]
    # Compute each length once and reuse it for both the check and the report.
    n_stored = len(tokenizer.convert_tokens_to_ids(input_tokens_list))
    n_encoded = len(tokenizer.encode(sentence_form))
    if n_stored != n_encoded:
        print(k)
        print(n_stored, n_encoded)
        print(sentence_form)
        print(annotations)
        break
else:
    # Every row was checked and none disagreed.
    print('no length mismatch found')

1485
17 16
Target##호호에미 #프리지어 향이 은은하고 좋다
[['본품#일반', ['호호에미 #프리지어 향', 8, 20], 'positive']]


In [267]:
# Inspect the model's prediction for one fixed training example.
# To look at a random dev example instead, use:
#   sample = eval_dataset.iloc[random.randrange(len(eval_dataset))]
sample = train_dataset.iloc[1151]
print(sample.annotation)
print(sample.sentence_form)
print()

input_triplet = tokenizer(sample.sentence_form, return_tensors='pt')
input_triplet = {k: v.to(device) for k, v in input_triplet.items()}
# Inference only: no gradients are needed.
with torch.no_grad():
    output = model(**input_triplet).logits

# Per-token argmax for the (single) sequence in the batch, computed once.
# The first and last positions are the special tokens added by the tokenizer
# and are dropped.
pred = output.argmax(-1)[-1].tolist()[1:-1]
gold = sample.labels[1:-1]

# Separate print calls: `print(a), print(b)` builds a tuple, and when it is the
# last expression of a cell the notebook echoes a stray `(None, None)`.
print('prediction:')
print(pred)
print('label:')
print(gold)
print()
print(pred == gold)
print()

# Take the tokens from the ids that were actually fed to the model so that they
# line up one-to-one with `pred`. The stored `sample.input_tokens_list` can have
# a different length than a fresh tokenization of `sentence_form`, which would
# misalign tokens and predictions (or raise IndexError).
toks = tokenizer.convert_ids_to_tokens(input_triplet['input_ids'][0].tolist())[1:-1]

# Label 1 opens a target span and label 2 continues it.
# NOTE(review): presumably the B / I tags of the tagger's label scheme — confirm.
BEGIN_LABEL, INSIDE_LABEL = 1, 2

starts = [i for i, lab in enumerate(pred) if lab == BEGIN_LABEL]
targets = []
for start in starts:
    target = [toks[start]]
    # Extend the span while the following tokens are tagged as "inside".
    for tok, lab in zip(toks[start + 1:], pred[start + 1:]):
        if lab != INSIDE_LABEL:
            break
        target.append(tok)
    targets.append(target)

print('targets predicted:')
print(targets)

[['본품#품질', ['약산성토너', 36, 41], 'positive']]
Target#거품도 부드럽고, 자극이 없어 부담없이 쓸 수 있는 약산성토너에요.

prediction:
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 0, 0]

False

targets predicted:
[['Target']]


(None, None)

In [268]:
# Six spellings of the same phrase that differ only in where spaces appear,
# encoded side by side to compare the resulting token ids.
sent1, sent2, sent3, sent4, sent5, sent6 = (
    '약산성토너 에요',
    '약산성토너에요',
    '약산성토너',
    ' 약산성토너',
    '약산성토너 ',
    ' 약산성토너 ',
)
tuple(tokenizer.encode(s) for s in (sent1, sent2, sent3, sent4, sent5, sent6))

([2, 3103, 19160, 4348, 4330, 7616, 3],
 [2, 3103, 19160, 4348, 4330, 4073, 4150, 3],
 [2, 3103, 19160, 4348, 4330, 3],
 [2, 3103, 19160, 4348, 4330, 3],
 [2, 3103, 19160, 4348, 4330, 3],
 [2, 3103, 19160, 4348, 4330, 3])

In [269]:
# Tokens produced when the phrase is passed as two separate pieces versus as
# one joined string.
tuple(
    tokenizer.convert_ids_to_tokens(tokenizer.encode(text))
    for text in (['약산성토너', '에요'], '약산성토너에요')
)

(['[CLS]', '약', '##산성', '##토', '##너', '[SEP]', '에요', '[SEP]'],
 ['[CLS]', '약', '##산성', '##토', '##너', '##에', '##요', '[SEP]'])

In [270]:
# Decode six hard-coded id sequences back to text. They all share the same
# leading ids and the same final id and differ only in the ids in between.
tokenizer.batch_decode(
    tuple(
        [2, 3103, 19160, 4348, 4330, *tail, 3]
        for tail in ([7616], [4073, 4150], [], [], [], [])
    )
)

['[CLS] 약산성토너 에요 [SEP]',
 '[CLS] 약산성토너에요 [SEP]',
 '[CLS] 약산성토너 [SEP]',
 '[CLS] 약산성토너 [SEP]',
 '[CLS] 약산성토너 [SEP]',
 '[CLS] 약산성토너 [SEP]']

In [271]:
# Join word pieces back into text; '##성' is a continuation piece.
tokenizer.convert_tokens_to_string(
    [
        '미백',
        '기능',
        '##성',
        '워터',
    ]
)

'미백 기능성 워터'

In [272]:
# Round-trip one sentence two ways: ids -> decode, and tokens -> string.
sent = 'Target#세안 후 처음 바르는 미백 기능성 워터로.. 생기있게 맑아지는듯ㅎㅎㅎ'

encoded = tokenizer.encode(sent)
tokenized = tokenizer.tokenize(sent)

decoded = tokenizer.decode(encoded)
tokens_to_string = tokenizer.convert_tokens_to_string(tokenized)

# One line per reconstruction, decoded ids first.
print(decoded, tokens_to_string, sep='\n')

[CLS] Target # 세안 후 처음 바르는 미백 기능성 워터로.. 생기있게 맑아지는듯ㅎㅎㅎ [SEP]
Target # 세안 후 처음 바르는 미백 기능성 워터로.. 생기있게 맑아지는듯ㅎㅎㅎ


In [274]:
sent = '또 무슨 말이 있찌 뭐가있어또약산 성토너에요 이렇게 또 있어'
target = '약산 성토너'

def adjust_target(sentence_form, target):
    """Widen ``target`` to whole space-delimited words of ``sentence_form``.

    The first occurrence of ``target`` in ``sentence_form`` is located and its
    span is extended left and right up to the nearest space (or the sentence
    boundary), so the returned text never cuts a word in half.

    If ``target`` does not occur verbatim (for example because its internal
    spacing differs from the sentence), the span instead runs from the first
    occurrence of the target's first space-separated piece to the next
    occurrence of its last piece, and is then widened the same way.

    Args:
        sentence_form: Sentence to search in.
        target: Substring (possibly spanning several words) to widen.

    Returns:
        A pair ``(adjusted_target, [start, end])`` such that
        ``sentence_form[start:end] == adjusted_target``.

    Raises:
        ValueError: If the target cannot be located in ``sentence_form``.
    """
    # Anchor on where the target actually is. Matching each piece independently
    # against every word of the sentence picks up unrelated words that merely
    # contain the same piece, which can yield a wrong or even inverted range.
    start = sentence_form.find(target)
    if start != -1:
        end = start + len(target)
    else:
        pieces = target.split(' ')
        first, last = pieces[0], pieces[-1]
        start = sentence_form.index(first)  # ValueError when absent
        # Only look for the last piece after the first one.
        end = sentence_form.index(last, start + len(first)) + len(last)

    # Extend to the enclosing word boundaries.
    word_start = sentence_form.rfind(' ', 0, start) + 1  # -1 + 1 == 0 at sentence start
    next_space = sentence_form.find(' ', end)
    word_end = len(sentence_form) if next_space == -1 else next_space

    return sentence_form[word_start:word_end], [word_start, word_end]

In [275]:
# Widen the target to whole words, then slice the sentence with the returned
# [start, end] range to view the adjusted span.
target, target_rng = adjust_target(sent, target)
sent[slice(*target_rng)]

'약산 성토너'

: 

In [251]:
# NOTE(review): `rng` is not assigned in any cell visible here, and this cell's
# execution count (251) is lower than that of the cells above, so it relies on
# leftover kernel state — presumably a slice object; confirm, or define `rng`
# explicitly before this line so the notebook survives Restart & Run All.
sent[rng]

'뭐가있어또약산성토너에요'