In [45]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

from transformers.optimization import (
    AdamW, get_linear_schedule_with_warmup,
    Adafactor, AdafactorSchedule,
)

import torch
import wandb

import datasets
import evaluate

from sklearn.metrics import accuracy_score, f1_score

import numpy as np
import pandas as pd

import os
import re
import random

import demoji

In [46]:
# Use the first CUDA GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [47]:
# Directory of the saved checkpoint that both the tokenizer and the model are loaded from.
model_checkpoint = 'training_results/monologg_koelectra_base_v3_discriminator_uncleaned_v11/target_tagger/monologg_koelectra_base_v3_discriminator_uncleaned_v11/checkpoint-370'

tokenizer = ElectraTokenizerFast.from_pretrained(model_checkpoint)

# Load the token-classification weights, then move them onto the selected device.
model = ElectraForTokenClassification.from_pretrained(model_checkpoint)
model = model.to(device)

In [49]:
# Locations of the tagger train / dev splits (JSON).
TRAIN_DATA_PATH = './dataset/uncleaned_v11/tagger_train.json'
EVAL_DATA_PATH = './dataset/uncleaned_v11/tagger_dev.json'

# Read both splits with pandas, train first and then dev.
train_dataset, eval_dataset = (
    pd.read_json(path) for path in (TRAIN_DATA_PATH, EVAL_DATA_PATH)
)

In [7]:
# Pick a random dev example and show its gold annotation and raw sentence.
idx = random.randrange(len(eval_dataset))
sample = eval_dataset.iloc[idx]
# To inspect a fixed example instead:
# sample = train_dataset.iloc[1151]
print('original: ')
print(sample.annotation)
print(sample.sentence_form)
print()

# Tokenize the raw sentence and move every input tensor to the model's device.
input_triplet = tokenizer(sample.sentence_form, return_tensors='pt')
input_triplet = {k: v.to(device) for k, v in input_triplet.items()}
# input_triplet = preprocess_function(sample)

# Inference only: disable autograd so no graph/activations are kept around.
with torch.no_grad():
    output = model(**input_triplet).logits

# Per-token label ids for the (single) sequence in the batch. The [1:-1]
# slice drops the first and last positions -- presumably the special tokens
# the tokenizer adds at both ends; the same slice is applied to the gold
# labels and tokens stored in the dataset.
pred = output.argmax(-1)[-1].tolist()[1:-1]
labs = sample.labels[1:-1]
toks = sample.input_tokens_list[1:-1]
# NOTE(review): `toks` comes from the dataset column while `pred` comes from
# re-tokenizing the raw sentence here; if the dataset was tokenized from a
# cleaned sentence the two may not line up -- confirm.

print('label:')
print(labs)
print('prediction:')
print(pred)
print()
print(pred == labs)
print()

# Decode predicted spans: label 1 opens a span, and every immediately
# following label 2 extends it; any other label closes the span.
starts = [i for i, lab in enumerate(pred) if lab == 1]
targets = []
for start in starts:
    target = [toks[start]]
    for tok, lab in zip(toks[start+1:], pred[start+1:]):
        if lab != 2:
            break
        target.append(tok)
    targets.append(tokenizer.convert_tokens_to_string(target))

print('targets in labels:')
for el in sample.annotation:
    # Each annotation entry holds the target text at position [1][0].
    print(el[1][0])
print('targets predicted:')
print(targets)

original: 
[['제품 전체#일반', ['스웨터', 30, 33], 'positive']]
Target 오늘입고 옷장속으로 들어갈 내가 좋아하는 스웨터

label:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
prediction:
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]

True

targets in labels:
스웨터
targets predicted:
['스웨터']


In [8]:
# Compare the encoded ids of the same text with and without a space before the comma.
ids_with_space = tokenizer.encode('살랑드파리 , 아쿠아컴플릿시리즈')
ids_without_space = tokenizer.encode('살랑드파리, 아쿠아컴플릿시리즈')
ids_with_space == ids_without_space

True

In [78]:
# Draw a random row from the dev split and print its annotation and sentence.
idx = random.randrange(len(eval_dataset))
sample = eval_dataset.iloc[idx]
# To inspect a fixed example instead:
# sample = train_dataset.iloc[1151]
# One print call; the trailing empty string reproduces the final blank line.
print('original: ', sample.annotation, sample.sentence_form, '', sep='\n')

original: 
[['본품#품질', ['달바 미스트 세럼..!', 30, 42], 'positive']]
Target 촤르르~~피부 윤광 장난아니게 만들어주는 달바 미스트 세럼..!



In [79]:
# Normalise the raw sentence: remove '#' characters, then turn non-breaking
# spaces into ordinary spaces (applied in that order).
sentence = sample.sentence_form
for pattern, replacement in (('#', ''), ('\xa0', ' ')):
    sentence = re.sub(pattern, replacement, sentence)
print(sentence)

Target 촤르르~~피부 윤광 장난아니게 만들어주는 달바 미스트 세럼..!


In [80]:
# Word-piece tokens of the cleaned sentence, and its encoded ids with the
# first and last id stripped.
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.encode(sentence)[1:-1]

# Map the ids back to tokens and check they match the direct tokenization.
tokens_from_ids = tokenizer.convert_ids_to_tokens(ids)
print(tokens == tokens_from_ids, tokens, tokens_from_ids, sep='\n')

True
['T', '##ar', '##ge', '##t', '촤', '##르르', '~', '~', '피부', '윤', '##광', '장난', '##아', '##니', '##게', '만들', '##어', '##주', '##는', '달', '##바', '미스트', '세', '##럼', '.', '.', '!']
['T', '##ar', '##ge', '##t', '촤', '##르르', '~', '~', '피부', '윤', '##광', '장난', '##아', '##니', '##게', '만들', '##어', '##주', '##는', '달', '##바', '미스트', '세', '##럼', '.', '.', '!']


In [81]:
# Encode the cleaned sentence and move every input tensor to the model's device.
input_triplet = tokenizer(sentence, return_tensors='pt')
input_triplet = {k: v.to(device) for k, v in input_triplet.items()}

# Inference only: disable autograd so no graph/activations are kept around.
with torch.no_grad():
    output = model(**input_triplet).logits

# Highest-scoring label id per token for the (single) sequence in the batch,
# with the first and last positions dropped so it lines up with `tokens`.
pred = output.argmax(-1)[-1].tolist()[1:-1]
print(pred)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 2, 2, 0, 0, 0]


In [82]:
# Positions where the predicted label is 1, i.e. where a target span opens.
starts = [pos for pos, tag in enumerate(pred) if tag == 1]

targets = []
for start in starts:
    target = [tokens[start]]
    # Walk forward while the label stays 2 (span continuation), stopping at
    # the end of whichever of `tokens` / `pred` is shorter.
    cursor = start + 1
    while cursor < len(tokens) and cursor < len(pred) and pred[cursor] == 2:
        target.append(tokens[cursor])
        cursor += 1
    # Merge the word pieces of the span back into a plain string.
    targets.append(tokenizer.convert_tokens_to_string(target))

print('targets predicted:')
print(targets)

targets predicted:
['달바 미스트 세럼']


In [83]:
# End-to-end target extraction for the current `sample`:
# clean the sentence -> tokenize -> predict per-token labels -> decode spans.

# Remove '#' characters, then replace non-breaking spaces with ordinary spaces.
sentence = sample.sentence_form
sentence = re.sub('#', '', sentence)
sentence = re.sub('\xa0', ' ', sentence)

# Word-piece tokens, plus the encoded ids with the first and last id stripped.
tokens = tokenizer.tokenize(sentence)
ids = tokenizer.encode(sentence)[1:-1]

# Encode for the model and move every input tensor to the model's device.
input_triplet = tokenizer(sentence, return_tensors='pt')
input_triplet = {k: v.to(device) for k, v in input_triplet.items()}

# Inference only: disable autograd so no graph/activations are kept around.
with torch.no_grad():
    output = model(**input_triplet).logits

# Highest-scoring label id per token for the (single) sequence in the batch,
# with the first and last positions dropped so it lines up with `tokens`.
pred = output.argmax(-1)[-1].tolist()[1:-1]

# Decode spans: label 1 opens a span, and every immediately following
# label 2 extends it; any other label closes the span.
starts = [i for i, lab in enumerate(pred) if lab == 1]
targets = []
for start in starts:
    target = [tokens[start]]
    for tok, lab in zip(tokens[start+1:], pred[start+1:]):
        if lab != 2:
            break
        target.append(tok)
    # Merge the word pieces of the span back into a plain string.
    targets.append(tokenizer.convert_tokens_to_string(target))

print('targets predicted:')
print(targets)

targets predicted:
['달바 미스트 세럼']
