In [1]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

import re, math, random, json, os
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from collections import Counter

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, align_tokens_and_labels, get_filter, generate_token_classification_data, adjust_target
import demoji

  from .autonotebook import tqdm as notebook_tqdm


# Load Raw Data

In [2]:
# Locations of the three raw JSON-lines splits.
train_json = './dataset/nikluge-sa-2022-train.jsonl'
dev_json = './dataset/nikluge-sa-2022-dev.jsonl'
test_json = './dataset/nikluge-sa-2022-test.jsonl'

# Read each split into its own DataFrame (one JSON object per line).
train, dev, test = (
    pd.read_json(path, lines=True)
    for path in (train_json, dev_json, test_json)
)

# NOTE(review): one row per split is removed by index label; the reason is not
# recorded anywhere in this notebook — TODO document why these rows are bad.
train = train.drop(index=2319)
dev = dev.drop(index=1692)

In [3]:
# Merge train and dev into one frame with a fresh 0..n-1 index.
total = pd.concat([train, dev]).reset_index(drop=True)
# pd.concat copies only the *references* to the per-row annotation lists, so the
# in-place annotation edits performed on `total` further down would also rewrite
# `train` and `dev` (which are later saved as the "raw" CSVs). Deep-copy the
# nested lists so `total` owns its annotations.
total['annotation'] = total['annotation'].apply(deepcopy)

In [4]:
# List every annotation whose target text (annotation[1][0]) is falsy
# (None or an empty string), printing the row id and its full annotation list.
for _, record in total.iterrows():
    for entry in record.annotation:
        if entry[1][0]:
            continue
        print(record.id)
        print(record.annotation)

nikluge-sa-2022-train-00003
[['제품 전체#일반', [None, 0, 0], 'positive']]
nikluge-sa-2022-train-00012
[['제품 전체#일반', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00013
[['제품 전체#일반', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00014
[['제품 전체#일반', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00016
[['본품#품질', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00018
[['제품 전체#일반', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00019
[['본품#품질', [None, 0, 0], 'positive']]
nikluge-sa-2022-train-00020
[['제품 전체#가격', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00022
[['본품#품질', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00023
[['본품#품질', [None, 0, 0], 'positive'], ['본품#품질', ['녹음', 14, 16], 'positive']]
nikluge-sa-2022-train-00025
[['제품 전체#편의성', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00026
[['제품 전체#디자인', [None, 0, 0], 'neutral']]
nikluge-sa-2022-train-00027
[['제품 전체#디자인', [None, 0, 0], 'negative']]
nikluge-sa-2022-train-00028
[['제품 전체#디자인', [None, 0, 0], 'negative']]
nikl

# Declare constants and label mappings

In [5]:
### new
# All entity#property category labels, one entity per source line.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질',
]
# Placeholder tokens of the form &...& (presumably anonymisation markers used
# in the corpus — TODO confirm).
more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# id -> name lists and their inverse name -> id lookups; the id is the list position.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: position for position, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: position for position, name in enumerate(polarity_id_to_name)}

# Fix annotations without a target

In [6]:
### Add prefix
# Every sentence gets a leading placeholder word so that annotations without an
# explicit target have something in the text to point at.
# NOTE: this cell is not idempotent — running it twice prepends the prefix twice.
trgs_prefix = 'Target '
total['sentence_form'] = trgs_prefix + total.sentence_form
# Number of characters by which the prefix shifts existing character offsets.
trgs_prefix_len = len(trgs_prefix)

In [7]:
# Placeholder target text and the [begin, end) character span it occupies at
# the very start of a prefixed sentence.
trg = 'Target'
trg_rng = [0, len(trg)]

In [8]:
### Fix
# NOTE: not idempotent — re-running this cell shifts already-shifted offsets again.

# Drop empty annotation entries and write the result back to the frame.
# (Rebinding `row.annotation` on a row yielded by iterrows() only changes the
# temporary row object and never reaches `total`.) Doing this first also keeps
# the indexing below from failing on an empty entry.
total['annotation'] = total['annotation'].apply(
    lambda anns: [el for el in anns if el != []]
)

for anns in total['annotation']:
    for ann in anns:
        span = ann[1]  # [target text, begin offset, end offset]
        if not span[0]:
            # No explicit target: point the annotation at the placeholder word
            # that was prepended to the sentence.
            span[0] = trg
            span[1] = trg_rng[0]
            span[2] = trg_rng[1]
        else:
            # Explicit target: its offsets move right by the prefix length.
            span[1] += trgs_prefix_len
            span[2] += trgs_prefix_len

In [9]:
### Check
# Sanity check: prints the whole row once for every annotation that still has
# a falsy target text. No output means every annotation has a target.
for _, record in total.iterrows():
    for entry in record.annotation:
        if not entry[1][0]:
            print(record)

# Drop \# and \\xa0

In [10]:
# Remove '#' and turn non-breaking spaces into plain spaces.
# regex=False is passed explicitly: both patterns are literal characters, and
# the default of `regex` differs between pandas versions.
# NOTE(review): deleting '#' shortens the sentence, so character offsets stored
# in the annotations can be stale until they are recomputed — verify that the
# later adjustment step covers this.
total['sentence_form'] = (
    total.sentence_form
    .str.replace('#', '', regex=False)
    .str.replace('\xa0', ' ', regex=False)
)

In [11]:
# Apply the same clean-up to each annotation's target text: delete '#' and
# replace non-breaking spaces with plain spaces. The nested lists are edited
# in place.
for _, record in total.iterrows():
    for entry in record.annotation:
        span = entry[1]
        span[0] = span[0].replace('#', '').replace('\xa0', ' ')

In [12]:
# For each character that should be gone, store its first position per sentence
# in `checker` (-1 when absent) and print the rows where it still occurs.
for forbidden in ('#', '\xa0'):
    total['checker'] = total.sentence_form.str.find(forbidden)
    print(total[total['checker'] > -1])

Empty DataFrame
Columns: [id, sentence_form, annotation, checker]
Index: []
Empty DataFrame
Columns: [id, sentence_form, annotation, checker]
Index: []


In [13]:
### Check
# Print the row for every annotation whose target text still contains '#',
# then do the same for '\xa0'. No output means the targets are clean.
for forbidden in ('#', '\xa0'):
    for _, record in total.iterrows():
        for entry in record.annotation:
            if forbidden in entry[1][0]:
                print(record)

In [14]:
### Check
# Re-verify after the character clean-up that no annotation has a falsy target
# text; each offending annotation prints its row.
for _, record in total.iterrows():
    offending = [entry for entry in record.annotation if not entry[1][0]]
    for _entry in offending:
        print(record)

# Drop props (entity#property pairs) that cannot be split into train and dev

In [15]:
# total['checker'] = total.annotation.apply(bool)
# total = total[total.checker == True].copy()
# count_tags(total, entity_property_pair)

In [16]:
# Every entity#property label except the two listed here.
# NOTE(review): the name `filter` shadows the Python builtin; it is kept as-is
# in case other cells refer to it.
excluded_pairs = ['본품#인지도', '패키지/구성품#가격']
filter = [pair for pair in entity_property_pair if pair not in excluded_pairs]
# remove_props is a project helper; presumably it restricts the annotations to
# the labels in `filter` — TODO confirm against module.utils.
total = remove_props(total, filter)
# count_tags(total, entity_property_pair)

In [17]:
### Check
# After remove_props: print the row of any annotation with a falsy target text.
for _, record in total.iterrows():
    for entry in record.annotation:
        target_text = entry[1][0]
        if not target_text:
            print(record)

# Before annotation adjustment

In [18]:
# random.seed(120)
# for _ in range(5):
#     idx = random.randrange(len(total))
#     print(total.iloc[idx].id)
#     print(total.iloc[idx].sentence_form)
#     print(total.iloc[idx].annotation)
#     print()

In [19]:
### Check
# Before the adjustment step: report id and annotations of every row that has
# an annotation with a falsy target text (once per offending annotation).
for _, record in total.iterrows():
    for entry in record.annotation:
        if entry[1][0]:
            continue
        print(record.id)
        print(record.annotation)

# Adjust annotations

In [20]:
### adjust annotations
# Replace each annotation's target text and character range with the values
# returned by adjust_target(sentence, target). The nested annotation lists are
# edited in place.
# adjust_target is a project helper; from its use here it returns a pair
# (new_target, new_range) where new_range is indexable at [0] and [1].
# The previously stored range is not passed to the helper, so it is not read.
for _, record in total.iterrows():
    sentence_form = record.sentence_form
    for entry in record.annotation:
        span = entry[1]  # [target text, begin offset, end offset]
        new_target, new_target_rng = adjust_target(sentence_form, span[0])

        span[0] = new_target
        span[1] = new_target_rng[0]
        span[2] = new_target_rng[1]

In [21]:
# random.seed(120)
# for _ in range(5):
#     idx = random.randrange(len(total))
#     print(total.iloc[idx].id)
#     print(total.iloc[idx].sentence_form)
#     print(total.iloc[idx].annotation)
#     print()

In [22]:
# After the adjustment step: print the id of every row for each annotation
# whose target text is falsy. No output means nothing was lost.
for _, record in total.iterrows():
    for entry in record.annotation:
        if not entry[1][0]:
            print(record.id)

# Split

In [23]:
# Stratification key: the category label of each row's first annotation.
# NOTE(review): raises IndexError for a row whose annotation list is empty.
total['stratified'] = total.annotation.apply(lambda anns: anns[0][0])

# 80/20 split, stratified on that key, with a fixed seed for reproducibility.
tagger_train, tagger_dev = train_test_split(
    total,
    test_size=0.2,
    random_state=42,
    stratify=total['stratified'],
)

# Give both parts a clean 0..n-1 index.
tagger_train.reset_index(drop=True, inplace=True)
tagger_dev.reset_index(drop=True, inplace=True)

In [24]:
# count_tags(tagger_train, entity_property_pair)

In [25]:
# count_tags(tagger_dev, entity_property_pair)

# Generate token classification pairs

In [26]:
# generate_token_classification_data is a project helper that returns two
# values per frame; they are stored as the 'split_form' and 'split_label'
# columns of that same frame.
train_split_inputs, train_split_labels = generate_token_classification_data(tagger_train)
tagger_train['split_form'] = train_split_inputs
tagger_train['split_label'] = train_split_labels

dev_split_inputs, dev_split_labels = generate_token_classification_data(tagger_dev)
tagger_dev['split_form'] = dev_split_inputs
tagger_dev['split_label'] = dev_split_labels

# Tokenize and Align

In [27]:
# Tag set of the target tagger; a label's id is its position in this list.
target_tagger_labels = ['Other', 'TRG_B', 'TRG_I']
labels = target_tagger_labels
num_labels = len(labels)

# Forward and reverse lookups between label name and integer id.
id2label = dict(enumerate(labels))
label2id = {name: position for position, name in id2label.items()}

In [28]:
# Hub identifier of the pretrained checkpoint whose tokenizer is loaded here.
model_checkpoint = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

In [29]:
### new
# Category labels (same values as the declaration near the top of the notebook).
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]
sentiments = ['positive', 'negative', 'neutral']
target = ['Target']
special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Every sentence of the three raw splits, as one Series named 'sentence_form'.
all_sentences = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True)

# Distinct emoji found anywhere in the corpus (demoji.findall returns a dict
# keyed by emoji). Sorted so the list is identical from run to run.
emojis = sorted(demoji.findall(' '.join(all_sentences.to_list())).keys())
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

tokens2add = special_tokens + emojis
# tokens2add = special_tokens + emojis + entity_property_pair + sentiments + target

print(len(tokenizer))
# Train a throw-away tokenizer on the de-duplicated corpus purely to collect
# vocabulary entries that the pretrained tokenizer lacks.
# NOTE(review): vocab_size=1 cannot be satisfied literally; the recorded run
# produced 3060 entries — presumably the minimal alphabet plus special tokens.
tokenizer_train_data = all_sentences.to_frame().drop_duplicates()
tokenizer_train_data = tokenizer_train_data.sentence_form.to_list()
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
# Added tokens receive ids in the order they are passed. Iterating a set of
# strings follows hash order, which changes between interpreter runs, so the
# tokens are sorted to make the saved tokenizer reproducible.
tokenizer.add_tokens(sorted(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))
# model.resize_token_embeddings(len(tokenizer))

35000



3060
35254


In [30]:
# Tokenize each split and store the two values returned by the project helper
# align_tokens_and_labels as the 'input_tokens_list' and 'labels' columns.
# NOTE: `labels` is rebound here; it no longer refers to the tag-name list
# defined in the label-map cell.
for frame in (tagger_train, tagger_dev):
    input_tokens_list, labels = align_tokens_and_labels(frame, tokenizer)
    frame['input_tokens_list'] = input_tokens_list
    frame['labels'] = labels

In [31]:
# For each split, flag whether the token list and the label list of a row have
# the same length, then print the rows where they do not (expected: none).
for frame in (tagger_train, tagger_dev):
    token_counts = frame.input_tokens_list.apply(len)
    label_counts = frame.labels.apply(len)
    frame['checker'] = token_counts == label_counts
    print(frame[~frame['checker']])

Empty DataFrame
Columns: [id, sentence_form, annotation, checker, stratified, split_form, split_label, input_tokens_list, labels]
Index: []
Empty DataFrame
Columns: [id, sentence_form, annotation, checker, stratified, split_form, split_label, input_tokens_list, labels]
Index: []


In [32]:
def find_targets(toks, labs):
    """Print the token groups that the labels mark as targets.

    A group begins at every position whose label equals 1 and continues over
    the directly following tokens for as long as their label equals 2.

    Args:
        toks: Sequence of tokens.
        labs: Sequence of labels, paired with ``toks`` by position.

    Returns:
        None. The list of groups (each group a list of tokens) is printed.
    """
    limit = min(len(toks), len(labs))
    targets = []
    for start, label in enumerate(labs):
        if label != 1:
            continue
        group = [toks[start]]
        cursor = start + 1
        # Extend the group while the continuation label holds.
        while cursor < limit and labs[cursor] == 2:
            group.append(toks[cursor])
            cursor += 1
        targets.append(group)
    print(targets)

In [33]:
# Show one randomly chosen row from each split: its annotation, sentence,
# tokens and labels, followed by the target tokens recovered from the labels.
# NOTE: `random` is not seeded here, so a different row is shown on every run.
for position, frame in enumerate((tagger_train, tagger_dev)):
    if position:
        print()  # blank line between the two samples
    idx = random.randrange(len(frame))
    sample = frame.iloc[idx]
    print(sample.annotation)
    print(sample.sentence_form)
    print(sample.input_tokens_list)
    print(sample.labels)
    print()
    find_targets(sample.input_tokens_list, sample.labels)

[['패키지/구성품#디자인', ['사이즈로', 12, 16], 'positive']]
Target 미니미한 사이즈로 가방 속 나의 뷰티 잇템
['[CLS]', 'T', '##ar', '##ge', '##t', '미니', '##미', '##한', '사이즈', '##로', '가방', '속', '나의', '뷰티', '잇', '##템', '[SEP]']
[-100, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 0, -100]

[['사이즈', '##로']]

[['본품#품질', ['Target', 0, 6], 'positive']]
Target &name&이 사용한 before & after 보고나니 사용하지않을 수 없음 !!!!
['[CLS]', 'T', '##ar', '##ge', '##t', '&name&', '이', '사용', '##한', 'be', '##for', '##e', '&', 'af', '##ter', '보고', '##나', '##니', '사용', '##하', '##지', '##않', '##을', '수', '없', '##음', '!', '!', '!', '!', '[SEP]']
[-100, 1, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -100]

[['T', '##ar', '##ge', '##t']]


# Save Tagger Data

In [34]:
# Version tag of this dataset build; every output is written beneath
# ./dataset/<DATA_V>.
DATA_V = 'uncleaned_v12'
save_path = './dataset/{}'.format(DATA_V)
print(save_path)

./dataset/uncleaned_v12


In [35]:
# Persist the extended tokenizer in a 'tokenizer' sub-directory of the output
# folder; the call's return value (the written file paths) is the cell output.
tokenizer_dir = os.path.join(save_path, 'tokenizer')
tokenizer.save_pretrained(tokenizer_dir)

('./dataset/uncleaned_v12/tokenizer/tokenizer_config.json',
 './dataset/uncleaned_v12/tokenizer/special_tokens_map.json',
 './dataset/uncleaned_v12/tokenizer/vocab.txt',
 './dataset/uncleaned_v12/tokenizer/added_tokens.json',
 './dataset/uncleaned_v12/tokenizer/tokenizer.json')

In [36]:
# Create the output directory from Python rather than via `!mkdir -p`:
# shelling out forks the kernel process (which triggers the huggingface
# tokenizers "process just got forked" warning) and only works where a POSIX
# `mkdir` exists.
os.makedirs(save_path, exist_ok=True)

# The three source splits as CSV.
train.to_csv(f'{save_path}/raw_train.csv', index=False, encoding='utf-8')
dev.to_csv(f'{save_path}/raw_dev.csv', index=False, encoding='utf-8')
test.to_csv(f'{save_path}/raw_test.csv', index=False, encoding='utf-8')

# The processed tagger splits as JSON; force_ascii=False keeps the Korean text
# readable instead of \u-escaped.
tagger_train.to_json(f'{save_path}/tagger_train.json', force_ascii=False)
tagger_dev.to_json(f'{save_path}/tagger_dev.json', force_ascii=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
