In [1]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

# from torch.nn import CrossEntropyLoss
# loss = CrossEntropyLoss()
# loss.ignore_index

# import nlpaug.augmenter.char as nac
# import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
# import nlpaug.flow as nafc
# from nlpaug.util import Action

# from googletrans import Translator
# import translators as ts

import re, math, random, json, os
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from collections import Counter

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, align_tokens_and_labels, get_filter, generate_token_classification_data, adjust_target
import demoji

# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

  from .autonotebook import tqdm as notebook_tqdm


# Declare Stuff to use

In [2]:
# Label inventory and name<->id lookups used by the preprocessing cells.

# All entity#property categories, grouped by entity.
entity_property_pair = [
    # main product
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도',
    '본품#일반', '본품#편의성', '본품#품질',
    # brand
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    # whole product
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도',
    '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    # package / components
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인',
    '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질',
]

# Anonymisation-style placeholder tokens.
# NOTE(review): not referenced in the visible cells; presumably meant to be
# added to the tokenizer vocabulary - confirm.
more_tokens = [
    '&name&', '&affiliation&', '&social-security-num&', '&tel-num&',
    '&card-num&', '&bank-account&', '&num&', '&online-account&',
]

# Binary label names; the list position is the class id ('True' -> 0).
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

# Polarity label names; the list position is the class id.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

In [3]:
# Dataset version tag; it names the sub-directory of ./dataset that the
# cells below read from and write to.
DATA_V = 'uncleaned_v11'
save_path = f'./dataset/{DATA_V}'
print(save_path)

./dataset/uncleaned_v11


# ACD and ASC Preprocess

In [4]:
# Read the tagger train/dev splits from the dataset directory (train first).
train, dev = map(
    pd.read_json,
    (f'{save_path}/tagger_train.json', f'{save_path}/tagger_dev.json'),
)

In [5]:
# Keep only the three columns that `reformat` reads from each split.
train, dev = (
    frame[['id', 'sentence_form', 'annotation']] for frame in (train, dev)
)

In [6]:
def reformat(df):
    """Expand annotated sentences into per-target category rows.

    Each row of ``df`` must expose ``id``, ``sentence_form`` and ``annotation``.
    ``annotation`` is a list of ``[entity_property, [target, ...], sentiment]``
    items; only index 0 of the middle element (the target text) is read.

    For every distinct target of a sentence, one ``ep`` row is emitted per
    entry of the module-level ``entity_property_pair``: labelled
    ``tf_name_to_id['True']`` when the target carries that category, otherwise
    ``tf_name_to_id['False']``. Each 'True' row also yields one ``p`` row with
    the sentiment of the first matching annotation (later duplicates of the
    same target/category are ignored).

    Rows whose sentence is missing (``pd.isna``) are skipped before their
    annotations are read, so they contribute nothing to either list.

    Args:
        df: DataFrame with ``id``, ``sentence_form`` and ``annotation`` columns.
            Targets must be strings because they are joined with ``'#'``.

    Returns:
        tuple: ``(ep, p)`` where
            ``ep`` is a list of ``[id, sentence, 'target#entity#property', label]``
            and ``p`` is a list of
            ``[id, sentence, target, entity_property, sentiment]``.
    """
    ep = []
    p = []
    for _, row in df.iterrows():
        utterance = row.sentence_form
        # The missing-sentence check does not depend on target or category,
        # so do it once per row instead of inside the innermost loops.
        if pd.isna(utterance):
            continue
        sent_id = row.id

        # Group [entity_property, sentiment] by target text, keeping the
        # order in which targets first appear in the annotation list.
        by_target = {}
        for annotation in row.annotation:
            by_target.setdefault(annotation[1][0], []).append(
                [annotation[0], annotation[2]]
            )

        for target, annotations in by_target.items():
            for pair in entity_property_pair:
                # First annotation of this target with the category, if any.
                match = next((ann for ann in annotations if ann[0] == pair), None)
                acd_pair = '#'.join([target, pair])
                if match is None:
                    ep.append([sent_id, utterance, acd_pair, tf_name_to_id['False']])
                else:
                    ep.append([sent_id, utterance, acd_pair, tf_name_to_id['True']])
                    p.append([sent_id, utterance, target, pair, match[1]])
    return ep, p

In [7]:
def reformat_p_binary(df):
    """Turn polarity rows into one binary row per candidate sentiment.

    For each row of ``df`` (attributes ``id``, ``form``, ``target``, ``pair``,
    ``sentiment``) one output row is produced for every name in the
    module-level ``polarity_id_to_name``. The candidate whose name equals the
    row's sentiment is labelled ``tf_name_to_id['True']``; all others are
    labelled ``tf_name_to_id['False']``.

    Args:
        df: DataFrame with the columns listed above.

    Returns:
        list: rows of ``[id, form, 'target#pair#sentiment', label]`` in input
        order, with candidates in ``polarity_id_to_name`` order.
    """
    p_binary = []
    for _, row in df.iterrows():
        form = row.form
        for sentiment in polarity_id_to_name:
            # Only the label differs between the gold and non-gold candidates.
            is_gold = sentiment == row.sentiment
            label = tf_name_to_id['True' if is_gold else 'False']
            asc_pair = '#'.join([row.target, row.pair, sentiment])
            p_binary.append([row.id, form, asc_pair, label])
    return p_binary

In [8]:
# Expand both splits into category rows (ep_*) and polarity rows (p_*).
ep_train, p_train = reformat(train)
ep_dev, p_dev = reformat(dev)

# Wrap the raw row lists in DataFrames, one column layout per kind.
ep_train, ep_dev = (
    pd.DataFrame(rows, columns=['id', 'form', 'pair', 'labels'])
    for rows in (ep_train, ep_dev)
)
p_train, p_dev = (
    pd.DataFrame(rows, columns=['id', 'form', 'target', 'pair', 'sentiment'])
    for rows in (p_train, p_dev)
)

# Row counts, displayed as the cell output.
len(ep_train), len(ep_dev), len(p_train), len(p_dev)

(124700, 30775, 5073, 1253)

In [9]:
# One binary row per (polarity row, candidate sentiment) for each split.
p_binary_train, p_binary_dev = (
    pd.DataFrame(reformat_p_binary(frame), columns=['id', 'form', 'pair', 'labels'])
    for frame in (p_train, p_dev)
)

# Row counts, displayed as the cell output.
len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev)

(124700, 30775, 15219, 3759)

### Counting

In [10]:
def _report_sizes():
    """Print the current row counts of the category and polarity frames.

    Reads the module-level frames at call time, so it reflects whatever they
    are bound to when it is invoked.
    """
    print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
    print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))


# Sizes before de-duplication.
_report_sizes()

# Drop exact duplicate rows from every frame.
ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
p_binary_train = p_binary_train.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()

print('\nafter drop_duplicates\n')
# Sizes after de-duplication. Plain print statements are used (rather than a
# `print(...), print(...)` tuple) so the cell has no stray `(None, None)` output.
_report_sizes()

binary_multi: 124700 30775 5073 1253
binary_binary: 124700 30775 15219 3759

after drop_duplicates

binary_multi: 124700 30775 5073 1253
binary_binary: 124700 30775 15219 3759


(None, None)

### Validate Here

In [11]:
# Spot-check: show every training sentence that has exactly five annotations.
for _, row in train.iterrows():
    if len(row.annotation) != 5:
        continue
    # id, blank line, sentence, blank line ...
    print(row.id, row.sentence_form, sep='\n\n', end='\n\n')
    # ... then one annotation per line.
    print(*row.annotation, sep='\n')

nikluge-sa-2022-train-00174

Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족..

['브랜드#가격', ['토드비', 21, 24], 'positive']
['제품 전체#품질', ['토드비', 21, 24], 'positive']
['제품 전체#디자인', ['스타일도', 33, 37], 'positive']
['본품#일반', ['색상도', 38, 41], 'positive']
['제품 전체#품질', ['재질도', 46, 49], 'positive']


In [12]:
# Print every binary polarity row generated for one sample sentence.
df = p_binary_train
sample_rows = df.loc[df['id'] == 'nikluge-sa-2022-train-00174']
for _, row in sample_rows.iterrows():
    print(row.id, '\n', row.form, '\n', row.pair, '\n', row.labels, '\n')

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#브랜드#가격#positive 
 0 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#브랜드#가격#negative 
 1 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#브랜드#가격#neutral 
 1 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#제품 전체#품질#positive 
 0 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#제품 전체#품질#negative 
 1 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 토드비#제품 전체#품질#neutral 
 1 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 스타일도#제품 전체#디자인#positive 
 0 

nikluge-sa-2022-train-00174 
 Target 가성비 좋고 실용성 좋은 토드비 착용하는데.. 스타일도 색상도 예쁘고 재질도 부드러워서.. 만족만족.. 
 스타일도#제품 전체#디자인#negative 
 1 

nikl

### Save

In [13]:
# Output location for the CSV files written in the next cell.
# NOTE(review): this repeats the DATA_V / save_path assignment made earlier in
# the notebook with the same values; keep the two in sync (or drop this copy)
# so data loaded from one version is not saved under another.
DATA_V = 'uncleaned_v11'
save_path = f'./dataset/{DATA_V}'
print(save_path)

./dataset/uncleaned_v11


In [14]:
# Create the output directory from Python instead of `!mkdir -p`: same
# "create parents, ignore if present" semantics, but portable and without
# interpolating a variable into a shell command.
os.makedirs(save_path, exist_ok=True)

# train.to_csv(f'{save_path}/raw_train.csv', index=False)
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False)
# test.to_csv(f'{save_path}/raw_test.csv', index=False)

# Category (entity#property) classification rows.
ep_train.to_csv(f'{save_path}/ce_train.csv', index=False)
ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False)

# Binary polarity classification rows.
p_binary_train.to_csv(f'{save_path}/pc_binary_train.csv', index=False)
p_binary_dev.to_csv(f'{save_path}/pc_binary_dev.csv', index=False)

### Additional Length Test If Needed

In [None]:
# ep_train, ep_dev, p_binary_train, p_binary_dev

In [None]:
# Load the tokenizer saved inside the dataset directory. The path is derived
# from `save_path` so it follows DATA_V instead of hard-coding the version.
model_checkpoint = f'{save_path}/tokenizer'
tokenizer = ElectraTokenizerFast.from_pretrained(model_checkpoint)

In [None]:
# Token count of every (form, pair) input across the four frames.
# Each frame is tokenized in one batched call; no padding is requested, so
# every sequence keeps its own length, as with per-row tokenization.
len_counter = []
for frame in (ep_train, ep_dev, p_binary_train, p_binary_dev):
    if frame.empty:
        # Nothing to measure; avoids passing an empty batch to the tokenizer.
        continue
    encoded = tokenizer(
        frame['form'].tolist(),
        frame['pair'].tolist(),
        truncation=True,
    )
    len_counter.extend(len(ids) for ids in encoded.input_ids)

In [None]:
# Largest token count collected in len_counter (displayed as the cell output).
max(len_counter)