In [1]:
import json
import math
import os
import random
import re
from collections import Counter
from copy import deepcopy

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
)

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, get_filter
from module.augmentation import back_trans, random_replace, random_insert, random_swap, random_split

# import demoji
# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

Using state  server backend.


In [2]:
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored in `fname` and return the resulting object.

    Args:
        fname: path of the JSON file to read.
        encoding: text encoding used to open the file.

    Returns:
        Whatever `json.load` produces for the file content.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)


# Save a JSON object to the given file name.
def jsondump(j, fname):
    """Write `j` to `fname` as JSON.

    The file is opened with the "UTF8" encoding and non-ASCII characters are
    written as-is (`ensure_ascii=False`) instead of being escaped.

    Args:
        j: JSON-serialisable object.
        fname: destination path; an existing file is overwritten.
    """
    with open(fname, "w", encoding="UTF8") as sink:
        json.dump(j, sink, ensure_ascii=False)

# Read a JSON Lines file into a list.
def jsonlload(fname, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed with `json.loads`. Blank or whitespace-only
    lines (for example a trailing empty line at the end of the file) are
    skipped instead of raising `json.JSONDecodeError`.

    Args:
        fname: path of the .jsonl file to read.
        encoding: text encoding used to open the file.

    Returns:
        list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the handle lazily rather than materialising every line with readlines().
        for line in f:
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

# Load Raw Data

In [3]:
# Locations of the three raw JSON Lines splits.
train_json = './dataset/nikluge-sa-2022-train.jsonl'
dev_json = './dataset/nikluge-sa-2022-dev.jsonl'
test_json = './dataset/nikluge-sa-2022-test.jsonl'

# Parse each split and wrap its records in a DataFrame in a single step.
train = pd.DataFrame(jsonlload(train_json))
dev = pd.DataFrame(jsonlload(dev_json))
test = pd.DataFrame(jsonlload(test_json))

# Alternative loader, kept for reference:
# train = pd.read_json(train_json, lines=True)
# dev = pd.read_json(dev_json, lines=True)
# test = pd.read_json(test_json, lines=True)

# Remove one hand-picked row from each of train and dev by index label.
# NOTE(review): the reason these two rows are excluded is not visible here — TODO document it.
train = train.drop(index=2319)
dev = dev.drop(index=1692)

# Declare constants and label mappings

In [4]:
# Every entity#property pair the classifier may be asked about, grouped one entity per row.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질',
]

# Placeholder tokens of the form &...&.
more_tokens = [
    '&name&', '&affiliation&', '&social-security-num&', '&tel-num&',
    '&card-num&', '&bank-account&', '&num&', '&online-account&',
]

# Label vocabularies: the list position is the class id, the dict is the reverse lookup.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: class_id for class_id, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: class_id for class_id, name in enumerate(polarity_id_to_name)}

In [5]:
# Pool train and dev into one frame with a fresh 0..n-1 index so they can be re-split.
total = pd.concat([train, dev], ignore_index=True)

In [6]:
# For every sentence, collect the distinct entity#property pairs found in its
# annotations (the first element of each annotation).
eps = [
    list(set([annotation[0] for annotation in annotations]))
    for annotations in total.annotation
]
total['eps'] = eps

In [7]:
# Pairs treated as "minor": sentences carrying any of them are pulled out before
# the stratified split and later added to the training side only.
minor = [
    '브랜드#가격',
    '패키지/구성품#다양성',
    '본품#가격',
    '본품#인지도',
    '패키지/구성품#가격',
]

In [8]:
def check_if_in_minor(eps):
    """Tell whether any pair in `eps` belongs to the module-level `minor` list.

    Args:
        eps: iterable of entity#property strings.

    Returns:
        True if at least one element of `eps` is in `minor`, otherwise False
        (also False for an empty `eps`).
    """
    return any(pair in minor for pair in eps)

# Index labels of the sentences that mention at least one minor pair.
indices_in_minor = total[total.eps.apply(check_if_in_minor)].index

In [9]:
# `indices_in_minor` holds index *labels* (it was read from `.index`), so select
# with the label-based `.loc`. The positional `.iloc` only returns the same rows
# while the index happens to be a fresh 0..n-1 range.
rows_in_minor = total.loc[indices_in_minor].copy()

In [10]:
# Remove the minor-pair rows from `total` so the stratified split never sees them.
# NOTE: this cell is not re-runnable on its own; once the labels are gone a
# second `drop` of the same labels raises KeyError.
total = total.drop(indices_in_minor)

In [11]:
# Stratification label: the entity#property pair of each sentence's first annotation.
total['stratified'] = total.annotation.apply(lambda x: x[0][0])

# 80/20 split stratified on that label. Only the frame itself is split: passing the
# label column as a second array just produced two extra outputs that were thrown
# away into `_` (which also clobbers IPython's last-output variable).
train, dev = train_test_split(
    total, test_size=0.2, random_state=42, stratify=total['stratified']
)
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [12]:
# Put the minor-pair rows back on the training side only, then recompute the
# stratification label for every row (the re-added rows do not have one yet).
train = (
    pd.concat([train, rows_in_minor])
    .reset_index(drop=True)
    .assign(stratified=lambda frame: frame.annotation.apply(lambda x: x[0][0]))
)

In [13]:
# count_tags(total, entity_property_pair)
# count_tags(train, entity_property_pair)
# count_tags(dev, entity_property_pair)

In [14]:
# Candidate rows for augmentation: sentences with more than four space-separated
# tokens that carry exactly one annotation.
checker_one = train.sentence_form.map(lambda sentence: len(sentence.split(' ')))
condition_one = checker_one.gt(4)
checker_two = train.annotation.map(len)
condition_two = checker_two.eq(1)

rows2aug = train.loc[condition_one & condition_two]
# rows2aug = train[condition_two]

count_tags(rows2aug, entity_property_pair)

tags found:  3037
tag set of df:  23
tag set of offered:  25
difference:  {'제품 전체#다양성', '브랜드#디자인'}
본품#품질		1245
제품 전체#일반		745
제품 전체#품질		252
본품#일반		189
제품 전체#디자인	108
본품#편의성		96
제품 전체#편의성	93
제품 전체#인지도	73
브랜드#일반		49
패키지/구성품#디자인	41
패키지/구성품#편의성	33
제품 전체#가격		29
패키지/구성품#일반	22
본품#다양성		17
본품#디자인		10
브랜드#품질		9
브랜드#인지도		9
패키지/구성품#품질	9
브랜드#가격		2
패키지/구성품#다양성	2
본품#가격		2
본품#인지도		1
패키지/구성품#가격	1


In [15]:
# '본품#품질' is left out of this list on purpose; it is collected separately as `major`.
# NOTE(review): the original note here read "본품#품질 1907", presumably its row count — TODO confirm.

# Pairs whose samples are gathered for oversampling.
ep2aug = [
    '제품 전체#일반', '제품 전체#품질', '본품#일반', '제품 전체#디자인',
    '본품#편의성', '제품 전체#편의성', '제품 전체#인지도', '브랜드#일반',
    '패키지/구성품#디자인', '패키지/구성품#편의성', '제품 전체#가격', '패키지/구성품#일반',
    '본품#다양성', '본품#디자인', '브랜드#품질', '브랜드#인지도',
    '패키지/구성품#품질', '브랜드#가격', '패키지/구성품#다양성', '본품#가격',
    '본품#인지도', '패키지/구성품#가격',
]

In [16]:
# Map each pair to augment onto its candidate rows, stored as
# [id, sentence_form, annotation] lists.
dfs = {
    ep: rows2aug.loc[
        rows2aug.stratified == ep, ['id', 'sentence_form', 'annotation']
    ].values.tolist()
    for ep in ep2aug
}

# Augmentation

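Available techniques: Back Translation / Random Insertion / Random Replacement / Random Swap / Random Deletion / Random Split. The `augmenters` list below uses only Random Replacement, Random Insertion, and Random Split.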

In [17]:
# Augmentation functions (imported from module.augmentation); one is picked at
# random for every augmentation round.
augmenters = [random_replace, random_insert, random_split]
# Size each pair is grown to; pairs already at or above it are left untouched.
target_num = 300

In [18]:
# Oversample every pair in `dfs` that has fewer than `target_num` samples by
# repeatedly augmenting a randomly chosen sentence with a randomly chosen augmenter.

# Consecutive rounds that may yield no new unique sentence before a pair is
# abandoned; without this bound the loop could spin forever.
max_stalled_rounds = 1000

for ep in tqdm(dfs.keys()):
    # `samples` is the same list object as dfs[ep]: appended sentences join the
    # sampling pool, so an augmented sentence can itself be augmented again.
    samples = dfs[ep]
    if len(samples) >= target_num:
        continue
    if not samples:
        # random.randrange(0) would raise ValueError; there is nothing to augment.
        print(f'skip {ep}: no source sentences to augment')
        continue

    stalled_rounds = 0
    while len(samples) < target_num and stalled_rounds < max_stalled_rounds:
        # `sample_id` rather than `id`, so the builtin is not shadowed notebook-wide.
        sample_id, sentence_form, annotation = samples[random.randrange(len(samples))]

        augmenter = augmenters[random.randrange(len(augmenters))]
        added = False
        for aug_result in augmenter(3, sentence_form):
            augged = [sample_id, aug_result, annotation]
            if augged not in samples:
                samples.append(augged)
                added = True
        stalled_rounds = 0 if added else stalled_rounds + 1

    if len(samples) < target_num:
        print(f'gave up on {ep}: only {len(samples)} unique samples after '
              f'{max_stalled_rounds} rounds without a new sentence')

    # One round may add several sentences; trim any overshoot.
    dfs[ep] = samples[:target_num]

100%|██████████| 22/22 [01:48<00:00,  4.92s/it]


In [32]:
# Print the number of samples held for each pair, in `dfs` insertion order.
for samples in tqdm(dfs.values()):
    print(len(samples))

100%|██████████| 22/22 [00:00<00:00, 43669.99it/s]

745
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300
300





In [20]:
# Rows of the majority pair as [id, sentence_form, annotation] lists; `tolist()`
# already returns a fresh list, so no extra copy is needed.
# NOTE(review): these rows come from all of `train`, whereas the other pairs come
# from the filtered `rows2aug` — confirm that asymmetry is intended.
major = train.loc[
    train.stratified == '본품#품질', ['id', 'sentence_form', 'annotation']
].values.tolist()

In [21]:
# Append the samples of every pair in `dfs` to the majority rows.
minors = list(dfs.values())

# The loop variable must not be called `minor`: that name is the module-level
# list of minor pairs read by `check_if_in_minor`, and rebinding it here would
# silently change that function's behaviour on any later call.
# NOTE: `major` is extended in place, so re-running this cell appends the rows again.
for pair_samples in minors:
    major.extend(pair_samples)

In [22]:
# Build the augmented training frame from the combined [id, sentence_form, annotation] rows.
augmented_acd_train = pd.DataFrame(major, columns=['id', 'sentence_form', 'annotation'])

In [23]:
# Display the frame for a visual sanity check of the augmented sentences.
augmented_acd_train

Unnamed: 0,id,sentence_form,annotation
0,nikluge-sa-2022-dev-00144,백탁현상없고 발림성 좋고 밀리지 않아요,"[[본품#품질, [None, 0, 0], positive]]"
1,nikluge-sa-2022-dev-00665,그래서 그날을 위해서 준비한 #순면커버 100% 안전한 생리대 겟!,"[[본품#품질, [순면커버, 17, 21], positive]]"
2,nikluge-sa-2022-train-00281,피쉬콜라겐 씨실트 캐비어추출물이 목피부를 탄력있고 밀도높게 관리해줘요~,"[[본품#품질, [피쉬콜라겐 씨실트 캐비어추출물, 0, 16], positive]]"
3,nikluge-sa-2022-train-02332,초미립자가 피부 표면에 뿌려주면 은은한 광채가 곧바로 이쁨뿜뿜 ㅋㅋ,"[[본품#품질, [None, 0, 0], positive]]"
4,nikluge-sa-2022-train-02214,자연 그대로의 상쾌함을 느낄 수 있는 라울루 치약 💦,"[[본품#품질, [라울루 치약, 21, 27], positive]]"
...,...,...,...
8757,nikluge-sa-2022-dev-00056,근데 그 그 리고 그 그 쓸데 없이 비싼 부가 액세서리.,"[[패키지/구성품#가격, [부가 액세서리, 12, 19], negative]]"
8758,nikluge-sa-2022-dev-00056,근데 그 그 리고 그 쓸데 하나 없이 비싼 부가 액세서리.,"[[패키지/구성품#가격, [부가 액세서리, 12, 19], negative]]"
8759,nikluge-sa-2022-dev-00056,그 리 그 리고 또 그 쓸 데없이 비싼 부가 액 세서리.,"[[패키지/구성품#가격, [부가 액세서리, 12, 19], negative]]"
8760,nikluge-sa-2022-dev-00056,그 뿐 리고 사실 쓸 데없이 많이 비싼 부가 액 세서리.,"[[패키지/구성품#가격, [부가 액세서리, 12, 19], negative]]"


In [24]:
# Scratch arithmetic.
# NOTE(review): presumably majority-pair row count x number of observed tags — TODO confirm or remove.
1907 * 23

43861

In [25]:
def reformat(df):
    """Expand each sentence into one row per entity#property pair.

    For every row of `df` and every pair in the module-level
    `entity_property_pair` list, the first annotation whose pair matches is
    looked up. A match yields a 'True' row in the first list and a polarity
    row in the second list; no match yields a 'False' row in the first list
    only. Rows whose `sentence_form` is missing (`pd.isna`) produce nothing.

    Args:
        df: DataFrame with `id`, `sentence_form` and `annotation` columns, where
            each annotation is indexable with the pair at position 0 and the
            sentiment name at position 2.

    Returns:
        (ep, p): two lists of `[id, sentence, pair, label_id]` rows; `ep` uses
        `tf_name_to_id` labels and `p` uses `polarity_name_to_id` labels.
    """
    ep_rows = []
    polarity_rows = []
    for _, record in df.iterrows():
        sentence = record.sentence_form
        sample_id = record.id

        # A missing sentence contributes no rows at all.
        if pd.isna(sentence):
            continue

        for pair in entity_property_pair:
            # First annotation carrying this pair, if any.
            hit = next((ann for ann in record.annotation if ann[0] == pair), None)
            if hit is None:
                ep_rows.append([sample_id, sentence, pair, tf_name_to_id['False']])
                continue
            ep_rows.append([sample_id, sentence, hit[0], tf_name_to_id['True']])
            polarity_rows.append([sample_id, sentence, hit[0], polarity_name_to_id[hit[2]]])
    return ep_rows, polarity_rows

In [26]:
# Row counts of the augmented training frame and the held-out dev frame.
len(augmented_acd_train), len(dev)

(8762, 1156)

In [27]:
# Expand both splits into per-pair rows and wrap every row list in a DataFrame
# sharing the same four-column layout.
ep_train, p_train = (
    pd.DataFrame(rows, columns=['id', 'form', 'pair', 'labels'])
    for rows in reformat(augmented_acd_train)
)
ep_dev, p_dev = (
    pd.DataFrame(rows, columns=['id', 'form', 'pair', 'labels'])
    for rows in reformat(dev)
)

len(ep_train), len(ep_dev), len(p_train), len(p_dev)

(219050, 28900, 8842, 1223)

### Counting

In [28]:
# Report row counts before and after removing exact duplicate rows.
# A single print per report: the previous `print(...), print(...)` form made the
# cell end in a tuple expression, which Jupyter echoed as `(None, None)`.
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
print('\nafter drop_duplicates\n')
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))

binary_multi: 219050 28900 8842 1223

after drop_duplicates

binary_multi: 219050 28900 8842 1223


(None, None)

### Validate Here

In [29]:
# dfs = [ep_train, ep_dev, p_train, p_dev]
# for df in dfs:
#     for idx, row in df.iterrows():
#         print(row.id, '\n',
#             row.form, '\n',
#             row.pair, '\n',
#             row.labels,  '\n',)
#         if idx == 999:
#             break

### Save

In [30]:
# Version tag of the exported dataset; it names the output directory under ./dataset.
DATA_V = 'uncleaned_v15'
save_path = f'./dataset/{DATA_V}'
print(save_path)

./dataset/uncleaned_v15


In [33]:
# Create the output directory from Python instead of `!mkdir -p {save_path}`.
# The shell escape forks the kernel process (which is what triggered the
# huggingface/tokenizers fork warning) and does not work on every platform.
os.makedirs(save_path, exist_ok=True)

# train.to_csv(f'{save_path}/raw_train.csv', index=False, encoding='utf-8-sig')
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False, encoding='utf-8-sig')
# test.to_csv(f'{save_path}/raw_test.csv', index=False, encoding='utf-8-sig')

# Export the per-pair True/False frames; 'utf-8-sig' writes a BOM at the start of the file.
ep_train.to_csv(f'{save_path}/ce_train_balanced.csv', index=False, encoding='utf-8-sig')
ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False, encoding='utf-8-sig')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
