In [94]:
import os
import re, math, random, json
from collections import Counter
from copy import deepcopy

import demoji
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM, 
)

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split

# import nlpaug.augmenter.char as nac
# import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
# import nlpaug.flow as nafc
# from nlpaug.util import Action

# from googletrans import Translator
# import translators as ts

# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

# Load Raw Data

In [95]:
# Locations of the three JSON-lines splits.
train_json, dev_json, test_json = (
    f'./dataset/nikluge-sa-2022-{split}.jsonl' for split in ('train', 'dev', 'test')
)

# Each file holds one JSON record per line, hence lines=True.
train, dev, test = (
    pd.read_json(path, lines=True) for path in (train_json, dev_json, test_json)
)

# Discard one hand-picked row per split (index labels 2319 in train, 1692 in dev).
train = train.drop(index=2319)
dev = dev.drop(index=1692)

In [96]:
train

Unnamed: 0,id,sentence_form,annotation
0,nikluge-sa-2022-train-00001,둘쨋날은 미친듯이 밟아봤더니 기어가 헛돌면서 틱틱 소리가 나서 경악.,"[[본품#품질, [기어, 16, 18], negative]]"
1,nikluge-sa-2022-train-00002,"이거 뭐 삐꾸를 준 거 아냐 불안하고, 거금 투자한 게 왜 이래.. 싶어서 정이 확...","[[본품#품질, [기어 텐션, 67, 72], negative]]"
2,nikluge-sa-2022-train-00003,간사하게도 그 이후에는 라이딩이 아주 즐거워져서 만족스럽게 탔다.,"[[제품 전체#일반, [None, 0, 0], positive]]"
3,nikluge-sa-2022-train-00004,샥이 없는 모델이라 일반 도로에서 타면 노면의 진동 때문에 손목이 덜덜덜 떨리고 이...,"[[제품 전체#일반, [샥이 없는 모델, 0, 8], neutral]]"
4,nikluge-sa-2022-train-00005,안장도 딱딱해서 엉덩이가 아팠는데 무시하고 타고 있다.,"[[본품#일반, [안장, 0, 2], negative]]"
...,...,...,...
2996,nikluge-sa-2022-train-02997,(슬픔),"[[제품 전체#가격, [None, 0, 0], negative]]"
2997,nikluge-sa-2022-train-02998,보드랍고 괜찮다!,"[[제품 전체#품질, [None, 0, 0], positive]]"
2998,nikluge-sa-2022-train-02999,#일본 유니클로 질이 우리나라보다 좋은 것 같으면 기분 탓인가.......,"[[브랜드#일반, [유니클로, 4, 8], neutral]]"
2999,nikluge-sa-2022-train-03000,마지막으로 귀여워서 집어온 모자.,"[[제품 전체#디자인, [모자, 15, 17], positive]]"


# Declare constants and label mappings

In [97]:
### new
# Properties annotated for each entity; the flat "entity#property" labels are
# generated from this table in the same order as before.
_properties_by_entity = {
    '본품': ['가격', '다양성', '디자인', '인지도', '일반', '편의성', '품질'],
    '브랜드': ['가격', '디자인', '인지도', '일반', '품질'],
    '제품 전체': ['가격', '다양성', '디자인', '인지도', '일반', '편의성', '품질'],
    '패키지/구성품': ['가격', '다양성', '디자인', '일반', '편의성', '품질'],
}
entity_property_pair = [
    f'{entity}#{prop}'
    for entity, props in _properties_by_entity.items()
    for prop in props
]

# Anonymisation placeholders of the form &...& that appear in the corpus text.
more_tokens = [
    f'&{placeholder}&'
    for placeholder in (
        'name', 'affiliation', 'social-security-num', 'tel-num',
        'card-num', 'bank-account', 'num', 'online-account',
    )
]

# Binary label names and their integer ids (position in the list).
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: label_id for label_id, name in enumerate(tf_id_to_name)}

# Sentiment polarity names and their integer ids (position in the list).
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = dict(zip(polarity_id_to_name, range(len(polarity_id_to_name))))

# Remove annotations without a target

In [98]:
# Keep only the annotations that name an explicit target (annotation[1][0]).
# The column is rebuilt with apply() instead of assigning to the Series that
# iterrows() yields: pandas does not guarantee that such a row is a view, so
# writes to it may never reach the frame.
train['annotation'] = train.annotation.apply(
    lambda annotations: [ann for ann in annotations if ann[1][0] is not None]
)

# Drop rows without a single annotation

In [99]:
# Sanity check: print a row once for every annotation in it whose target is None.
for _, record in train.iterrows():
    targetless = [ann for ann in record.annotation if ann[1][0] is None]
    for _ in targetless:
        print(record)

In [100]:
# Flag rows whose annotation list is non-empty, then keep only those rows.
train = train.assign(checker=train.annotation.apply(bool))
train = train.loc[train.checker].copy()

# Generate token classification pairs

In [103]:
def make_token_classification_pair(original_input, annotations):
    """Split a sentence into consecutive segments labelled target (1) / non-target (0).

    Args:
        original_input: the sentence as a string.
        annotations: list of ``[category, [target_text, start, end], polarity]``
            entries; ``start``/``end`` are character offsets into
            ``original_input``. Entries whose ``target_text`` is ``None`` are
            ignored.

    Returns:
        ``(split_input, split_label)``: two parallel lists. Joining
        ``split_input`` reproduces ``original_input``; ``split_label[i]`` is 1
        when segment ``i`` is an annotated target and 0 otherwise.
    """
    # A set removes spans that are annotated more than once (e.g. the same
    # target under two categories); sorting puts them in reading order.
    spans = sorted(
        {(ann[1][1], ann[1][2]) for ann in annotations if ann[1][0] is not None}
    )

    split_input = []
    split_label = []
    pointer = 0  # first character not yet emitted
    for start, end in spans:
        # Clip spans that overlap an already emitted target and skip the ones
        # that are empty or fully covered.
        start = max(start, pointer)
        if end <= start:
            continue
        # Emit the gap before the target only when there is one; comparing
        # against `pointer` (not 0) avoids empty segments between adjacent spans.
        if start > pointer:
            split_input.append(original_input[pointer:start])
            split_label.append(0)
        split_input.append(original_input[start:end])
        split_label.append(1)
        pointer = end
    # Trailing non-target text; also covers sentences without any target.
    if pointer < len(original_input):
        split_input.append(original_input[pointer:])
        split_label.append(0)

    return split_input, split_label

In [102]:
### test a sample
# Repurpose the `checker` column: it now holds the number of annotations per row.
train['checker'] = train.annotation.apply(len)
# Third row (position 2) among the sentences carrying more than one annotation.
_sample_row = train[train.checker > 1].iloc[2]
original_input, annotations = _sample_row.sentence_form, _sample_row.annotation
original_input, annotations

('디자인과 조작감에서 차이가 날 뿐인데 LEXMA 블루투스 마우스는 디자인과 조작감 둘 다 합격점.',
 [['본품#품질', ['조작감', 5, 8], 'positive'],
  ['제품 전체#디자인', ['LEXMA 블루투스 마우스', 21, 35], 'positive']])

In [106]:
# Build the segment/label lists for every training sentence.
# The loop variables are intentionally not named `original_input` /
# `annotations`: those names hold the inspection sample that other cells
# read, and reusing them here would overwrite it with the last training row.
split_inputs, split_labels = [], []
for sentence, sentence_annotations in zip(train.sentence_form, train.annotation):
    segments, segment_labels = make_token_classification_pair(sentence, sentence_annotations)
    split_inputs.append(segments)
    split_labels.append(segment_labels)

In [111]:
# Print the first 99 (segments, labels) pairs, each followed by a blank line.
count = 0
for count, (x, y) in enumerate(zip(split_inputs, split_labels), start=1):
    print(x)
    print(y)
    print()
    if count == 99:
        break

['둘쨋날은 미친듯이 밟아봤더니 ', '기어', '가 헛돌면서 틱틱 소리가 나서 경악.']
[0, 1, 0]

['이거 뭐 삐꾸를 준 거 아냐 불안하고, 거금 투자한 게 왜 이래.. 싶어서 정이 확 떨어졌는데 산 곳 가져가서 확인하니 ', '기어 텐션', ' 문제라고 고장 아니래.']
[0, 1, 0]

['샥이 없는 모델', '이라 일반 도로에서 타면 노면의 진동 때문에 손목이 덜덜덜 떨리고 이가 부딪칠 지경인데 이마저도 며칠 타면서 익숙해지니 신경쓰이지 않게 됐다.']
[1, 0]

['안장', '도 딱딱해서 엉덩이가 아팠는데 무시하고 타고 있다.']
[1, 0]

['지금 내 실력과 저질 체력으로는 이 정도 ', '자전거', '도 되게 훌륭한 거라는..']
[0, 1, 0]

['내장 기어 3단', '은 썩 좋은 물건이라 기어 변환도 부드럽고 겉에서는 기어가 보이지 않기 때문에 깔끔하다.']
[1, 0]

['한번 교환했는데 새로 온 ', 'UD20', '은 불량화소가 있고 ㅜ ㅜ ㅜ']
[0, 1, 0]

['전에 작동 안되었던 ', '자막 검색 후 등록 기능', '이 똑같이 작동 안 된다!!!']
[0, 1, 0]

['왜 ', '[등록]키', '를 만들어놓고 제대로 단어장에 등록이 되지 않는 거냐!!']
[0, 1, 0]

['다른 ', '부가 기능', '은 참 훌륭한데..']
[0, 1, 0]

['동영상 재생하면서 자막 중 모르는 내용 있으면 터치해서 바로 검색하는 기능', ' 때문에 산 건데 이게 에러다..']
[1, 0]

['아까 한 번 잠깐 되더니 지금 또 ', '등록 버튼', '이 먹통이다.']
[0, 1, 0]

['전자사전 기능', '은 훌륭한데 이런 게 고장이 없어야지.. ㅜ ㅜ ㅜ ㅜ ㅜ']
[1, 0]

['만화책 보는 건 오케이, ', '녹음', '도 잘 되고..']
[0, 1, 0]

['스펙상으로는 116g인데 집에서 저울로 재어보니 ', '118g', ', 실제로 들어보니 돌덩이 같다.']
[0, 1, 0]

['동영상 

In [104]:
make_token_classification_pair(original_input, annotations)

(['디자인과 ', '조작감', '에서 차이가 날 뿐인데 ', 'LEXMA 블루투스 마우스', '는 디자인과 조작감 둘 다 합격점.'],
 [0, 1, 0, 1, 0])

In [90]:
# [start, end] offsets of every annotation with an explicit target, in sorted order.
targets = sorted(ann[1][1:] for ann in annotations if ann[1][0] is not None)
print(targets)

[[5, 8], [21, 35]]


In [91]:
# Step-by-step version of make_token_classification_pair for the sample sentence.
input_len = len(original_input)
split_input = []
label = []
pointer = 0  # first character not yet emitted
for start, end in targets:
    # Clip spans overlapping an emitted target; skip empty/covered ones.
    start = max(start, pointer)
    if end <= start:
        continue
    # Compare with `pointer` rather than 0 so that adjacent or repeated spans
    # do not produce empty non-target segments.
    if start > pointer:
        split_input.append(original_input[pointer:start])
        label.append(0)
    split_input.append(original_input[start:end])
    label.append(1)
    pointer = end
# `pointer` (unlike the loop variable `end`) is defined even when `targets` is empty.
if pointer < input_len:
    split_input.append(original_input[pointer:])
    label.append(0)

In [92]:
original_input, split_input, label

('디자인과 조작감에서 차이가 날 뿐인데 LEXMA 블루투스 마우스는 디자인과 조작감 둘 다 합격점.',
 ['디자인과 ', '조작감', '에서 차이가 날 뿐인데 ', 'LEXMA 블루투스 마우스', '는 디자인과 조작감 둘 다 합격점.'],
 [0, 1, 0, 1, 0])

# Count

In [93]:
# Tag statistics are computed on the training split only. An earlier
# `pd.concat([train, dev])` was overwritten by this line before being used.
df = pd.concat([train])

tags = []        # category label of every annotation
ner_inputs = []  # per annotation: the sentence cut around its target span
for _, row in df.iterrows():
    form = row.sentence_form
    for annotation in row.annotation:
        tags.append(annotation[0])
        target, start, end = annotation[1][0], annotation[1][1], annotation[1][2]
        if target is not None and end > 0:
            # [text before target], target, [text after target]
            ner_input = []
            if start != 0:
                ner_input.append(form[:start])
            ner_input.append(form[start:end])
            if len(form) != end:
                ner_input.append(form[end:])
        else:
            # No usable span: keep the sentence as a single piece.
            ner_input = [form]
        ner_inputs.append(ner_input)
count = len(tags)

print('tags found: ', count)
print('tag set of df: ', len(set(tags)))
print('tag set of offered: ', len(set(entity_property_pair)))
print('difference: ', set(entity_property_pair)-set(tags))

# Frequency of each category label, most frequent first.
tag_counter = Counter(tags)
tag_counter.most_common()

tags found:  1811
tag set of df:  21
tag set of offered:  25
difference:  {'패키지/구성품#가격', '브랜드#디자인', '제품 전체#다양성', '본품#가격'}


[('본품#품질', 561),
 ('제품 전체#일반', 434),
 ('본품#일반', 239),
 ('제품 전체#품질', 136),
 ('제품 전체#디자인', 85),
 ('본품#편의성', 48),
 ('제품 전체#편의성', 47),
 ('브랜드#일반', 47),
 ('패키지/구성품#디자인', 44),
 ('제품 전체#인지도', 37),
 ('패키지/구성품#편의성', 33),
 ('패키지/구성품#일반', 23),
 ('본품#다양성', 17),
 ('제품 전체#가격', 15),
 ('본품#디자인', 14),
 ('브랜드#품질', 11),
 ('패키지/구성품#품질', 10),
 ('브랜드#인지도', 7),
 ('브랜드#가격', 1),
 ('패키지/구성품#다양성', 1),
 ('본품#인지도', 1)]

# Filter entity_property_pair and Drop rows accordingly

In [None]:
# Switch for the category filter: remove_props() is applied to train and dev
# only when this is True.
FILTER_MODE = False

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성',
#           '제품 전체#편의성',
#           '제품 전체#인지도',
#           '패키지/구성품#디자인',
#           '브랜드#일반',
#           '제품 전체#가격']  # 2716

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성',
#           '제품 전체#편의성',
#           '제품 전체#인지도',
#           '패키지/구성품#디자인',
#           '브랜드#일반'] # 2676

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성',
#           '제품 전체#편의성',
#           '제품 전체#인지도',
#           '패키지/구성품#디자인']  # 2627

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성',
#           '제품 전체#편의성',
#           '제품 전체#인지도']  # 2575

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성',
#           '제품 전체#편의성'] # 2509

# filter = ['본품#품질',
#           '제품 전체#일반',
#           '본품#일반',
#           '제품 전체#품질',
#           '제품 전체#디자인',
#           '본품#편의성'] # 2421

# Categories to keep. NOTE: this name shadows the built-in `filter`; it is
# kept because remove_props() reads it as its default allow-list.
filter = ['본품#품질',
          '제품 전체#일반',
          '본품#일반',
          '제품 전체#품질',
          '제품 전체#디자인'] # 2339


def remove_props(df, allowed=None):
    """Keep only the rows whose annotations all belong to allowed categories.

    A row is kept when its ``annotation`` list is non-empty and the category
    (``annotation[0]``) of every entry is in the allow-list.

    Args:
        df: frame with an ``annotation`` column of annotation lists.
        allowed: iterable of category labels to keep; defaults to the
            module-level ``filter`` list.

    Returns:
        A new frame with the surviving rows and a ``check`` column set to
        True. ``df`` itself is not modified: the previous implementation
        assigned to rows yielded by ``iterrows()``, which only writes through
        for some dtype layouts and therefore silently kept every row on
        mixed-dtype frames.
    """
    allowed_pairs = set(filter if allowed is None else allowed)

    def _all_allowed(annotations):
        return bool(annotations) and all(ann[0] in allowed_pairs for ann in annotations)

    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = df.annotation.apply(_all_allowed).astype(bool)
    kept = df[mask].copy()
    kept['check'] = True
    return kept


In [None]:
# Apply the category filter to both splits when it is switched on.
if FILTER_MODE:
    train, dev = remove_props(train), remove_props(dev)
len(train), len(dev)

# Preprocess

## Cleansing

### Before

In [None]:
# for el in train.sample(n=5).sentence_form:
#     print(el)

In [None]:
# train.sentence_form = train.sentence_form.apply(preprocess)
# dev.sentence_form = dev.sentence_form.apply(preprocess)
# test.sentence_form = test.sentence_form.apply(preprocess)
# total = pd.concat([train, dev])

### Test

In [None]:
# case = total.sentence_form.str.contains('r[^A-Za-z0-9가-힣\s]+', case=False, flags=0, na=None, regex=True)
# for e in total[case].sentence_form:
#     print(e)

### After

In [None]:
# for i, row in total[['id', 'sentence_form']].sample(n=5).iterrows():
#     print(row.id, '\t', row.sentence_form)

In [None]:
# total['check'] = total.sentence_form.str.find('OO')
# for row in total[total.check > -1].sentence_form:
#     print(row)
#     break

In [None]:
# total

## Reformat

In [None]:
len(entity_property_pair)

In [None]:
decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split

In [None]:
def reformat(df):
    """Turn annotated sentences into category-detection and polarity rows.

    For every row with a non-missing ``sentence_form`` and every label in
    ``entity_property_pair``:

    * one ``[id, form, pair, label]`` entry goes to the first list, with the
      'True' id when some annotation of the row has that category and the
      'False' id otherwise;
    * when the category is present, ``[id, sentence, category, sentiment]`` of
      the first matching annotation goes to the second list.

    Returns:
        ``(ep, p)``: the two lists described above.
    """
    ep, p = [], []
    for _, row in df.iterrows():
        utterance = row.sentence_form
        if pd.isna(utterance):
            # Rows without text contribute nothing.
            continue
        sample_id = row.id

        # The raw sentence is used as-is; decorate_form(utterance) is the
        # alternative the notebook keeps disabled.
        form = utterance

        for pair in entity_property_pair:
            # First annotation of this row carrying the current category, if any.
            matched = next((ann for ann in row.annotation if ann[0] == pair), None)
            if matched is None:
                ep.append([sample_id, form, pair, tf_name_to_id['False']])
                continue
            # decorate_acd_pair / decorate_acd_pair_split are likewise disabled:
            # the plain category string is used as the pair text.
            entity_property, sentiment = matched[0], matched[2]
            ep.append([sample_id, form, entity_property, tf_name_to_id['True']])
            p.append([sample_id, utterance, entity_property, sentiment])
    return ep, p

In [None]:
def reformat_p_binary(df):
    """Expand each polarity row into one binary row per candidate sentiment.

    ``df`` needs the columns ``id``, ``form``, ``pair`` and ``sentiment``. For
    every row and every name in ``polarity_id_to_name`` the result gets
    ``[id, form, "<pair>#<candidate>", label]``, where ``label`` is the 'True'
    id when the candidate equals the row's sentiment and the 'False' id
    otherwise.

    Returns:
        The list of generated rows.
    """
    p_binary = []
    for _, row in df.iterrows():
        # The raw sentence is used; decorate_form(row.form) stays disabled.
        form = row.form
        for sentiment in polarity_id_to_name:
            verdict = 'True' if sentiment == row.sentiment else 'False'
            # Plain "<pair>#<sentiment>" text; the decorate_asc_pair* variants
            # stay disabled.
            asc_pair = '#'.join([row.pair, sentiment])
            p_binary.append([row.id, form, asc_pair, tf_name_to_id[verdict]])
    return p_binary

In [None]:
len(train), len(dev)

In [None]:
# Same 25 category labels as declared earlier in the notebook, written as one
# group per entity.
entity_property_pair = (
    ['본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질']
    + ['브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질']
    + ['제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질']
    + ['패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질']
)

In [None]:
# Column layouts of the category-detection (ep) and polarity (p) tables.
_ep_columns = ['id', 'form', 'pair', 'labels']
_p_columns = ['id', 'form', 'pair', 'sentiment']

# reformat() returns (ep_rows, p_rows); wrap each list in a DataFrame.
ep_train, p_train = (
    pd.DataFrame(rows, columns=columns)
    for rows, columns in zip(reformat(train), (_ep_columns, _p_columns))
)
ep_dev, p_dev = (
    pd.DataFrame(rows, columns=columns)
    for rows, columns in zip(reformat(dev), (_ep_columns, _p_columns))
)

len(ep_train), len(ep_dev), len(p_train), len(p_dev)

In [None]:
# Binary polarity tables: one row per (sentence, category, candidate sentiment).
_p_binary_columns = ['id', 'form', 'pair', 'labels']
p_binary_train = pd.DataFrame(reformat_p_binary(p_train), columns=_p_binary_columns)
p_binary_dev = pd.DataFrame(reformat_p_binary(p_dev), columns=_p_binary_columns)

len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev)

In [None]:
# ep_train.sort_values(['id', 'labels'], inplace=True)
# ep_dev.sort_values(['id', 'labels'], inplace=True)
# p_binary_train.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])
# p_binary_dev.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])

### Counting

In [None]:
def _report_sizes():
    """Print the current row counts of the ep/p and ep/p_binary tables."""
    print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
    print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))


# Counts before and after removing exact duplicate rows. Plain print calls are
# used so the cell no longer ends in a `(None, None)` output.
_report_sizes()
ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
p_binary_train = p_binary_train.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()
print('\nafter drop_duplicates\n')
_report_sizes()

### Validate Here

In [None]:
# Print the first 50 rows of ep_train for a visual check.
# head(50) limits by position: after drop_duplicates the index labels are no
# longer contiguous, so stopping on the label 49 could run through the whole
# frame if that label had been dropped.
df = ep_train
for _, row in df.head(50).iterrows():
    print(row.id, '\n',
          row.form, '\n',
          row.pair, '\n',
          row.labels,  '\n',)

### Save

In [None]:
# Version tag of the exported dataset and the directory derived from it.
DATA_V = 'uncleaned_v6'
save_path = './dataset/{}'.format(DATA_V)
print(save_path)

In [None]:
# Create the export directory from Python (no shell escape needed), then write
# every table as <save_path>/<name>.csv without the index column.
os.makedirs(save_path, exist_ok=True)

_exports = {
    'raw_train': train,
    'raw_dev': dev,
    'raw_test': test,
    'ce_train': ep_train,
    'ce_dev': ep_dev,
    'pc_binary_train': p_binary_train,
    'pc_binary_dev': p_binary_dev,
}
for _name, _frame in _exports.items():
    _frame.to_csv(f'{save_path}/{_name}.csv', index=False)

### Additional Length Test If Needed

In [None]:
# ep_train, ep_dev, p_binary_train, p_binary_dev

In [None]:
# Hugging Face checkpoint whose tokenizer is extended below.
model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

# Reload the raw splits exported under ./dataset/{DATA_V}.
# NOTE(review): this rebinds train/dev/test to the CSV versions, in which list
# columns such as `annotation` presumably come back as strings — confirm before
# reusing them for annotation processing.
train_path = f'./dataset/{DATA_V}/raw_train.csv'
dev_path = f'./dataset/{DATA_V}/raw_dev.csv'
test_path = f'./dataset/{DATA_V}/raw_test.csv'
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

### new
# Third copy of the category labels declared earlier in the notebook.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]
# Same placeholder strings as `more_tokens`.
special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
# All sentences of the three splits, joined into one text and scanned with
# demoji; the distinct keys of the result are kept (presumably the emoji
# characters themselves — confirm against the demoji documentation).
emojis = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))
# Not referenced again in the visible part of the notebook.
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

tokens2add = special_tokens + emojis

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))  # size before adding tokens
# De-duplicated sentences of all splits serve as the corpus for a fresh tokenizer.
tokenizer_train_data = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame().drop_duplicates()
tokenizer_train_data = tokenizer_train_data.sentence_form.to_list()
# NOTE(review): vocab_size=1 looks like a way to obtain only the minimal
# (character-level) vocabulary of the corpus — confirm the intended effect.
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
# Add whatever the new vocabulary and tokens2add contain that the pretrained vocabulary lacks.
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
tokenizer.add_tokens(list(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))  # size after adding tokens
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# Token count of every (form, pair) input across the four tables, as produced
# by the extended tokenizer with truncation enabled.
len_counter = [
    len(tokenizer(row["form"], row["pair"], truncation=True).input_ids)
    for frame in (ep_train, ep_dev, p_binary_train, p_binary_dev)
    for _, row in frame.iterrows()
]

In [None]:
max(len_counter)

### done here.

## Save Files

In [None]:
# save_path = './dataset/cleaned_v1'

# train.to_csv(f'{save_path}/raw_train.csv', index=False)
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False)
# test.to_csv(f'{save_path}/raw_test.csv', index=False)

# ep_train.to_csv(f'{save_path}/ce_train.csv', index=False)
# ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False)
# p_train.to_csv(f'{save_path}/pc_train.csv', index=False)
# p_dev.to_csv(f'{save_path}/pc_dev.csv', index=False)
# p_binary_train.to_csv(f'{save_path}/pc_binary_train.csv', index=False)
# p_binary_dev.to_csv(f'{save_path}/pc_binary_dev.csv', index=False)

# ASC Augmentation

In [None]:
# Masked-LM checkpoint used by the augmentation helpers below (they read
# `model_checkpoint`). NOTE(review): absolute Google Drive path — the cell only
# runs where that drive is mounted.
model_checkpoint = '/content/drive/MyDrive/aspect_based_sentiment_analysis/base_model/klue_roberta_base/v2/klue_roberta_base_mlm/checkpoint-19860'

# Rebinds `tokenizer` (previously the extended KR-ELECTRA tokenizer) and loads the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
# Special tokens of the currently loaded tokenizer.
sTokens = tokenizer.all_special_tokens

def delTokens(sent):
    """Return `sent` without the space-separated pieces that equal a special token."""
    return ' '.join(piece for piece in sent.split(' ') if piece not in sTokens)

In [None]:
# One sub-frame of p_train per sentiment class.
positive, negative, neutral = (
    p_train[p_train.sentiment == polarity]
    for polarity in ('positive', 'negative', 'neutral')
)

In [None]:
len(positive), len(negative), len(neutral)

In [None]:
(58 * 3) * 4 * 3, (95 * 3) * 4 * 2 # bt ri rr

Back Translation / Random Insertion / Random Replacement / Random Swap / Random Split

In [None]:
def backTrans(text):
    """Back-translate `text` from Korean through English and through Japanese.

    Returns a two-element list: [via English, via Japanese]. Relies on the
    module alias `ts`, whose import is commented out at the top of the notebook.
    """
    round_trips = []
    for pivot in ('en', 'ja'):
        pivot_text = ts.papago(text, sleep_seconds=5, from_language='ko', to_language=pivot)
        round_trips.append(
            ts.papago(pivot_text, sleep_seconds=5, from_language=pivot, to_language='ko')
        )
    return round_trips

def randomInsert(num, sample, device):
    """Generate up to `num` distinct variants of `sample` by contextual word insertion.

    Uses nlpaug's ContextualWordEmbsAug on `model_checkpoint` (the `naw` import
    is commented out at the top of the notebook), strips special tokens from
    each result and removes duplicates.
    """
    augmenter = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint,
        action="insert",
        model_type='bert',
        top_k=5,
        aug_p=0.3,
        aug_min=1,
        aug_max=1,
        device=device,
    )
    candidates = augmenter.augment(sample, n=num, num_thread=12)
    return list({delTokens(candidate) for candidate in candidates})

def randomReplace(num, sample, device):
    """Generate up to `num` distinct variants of `sample` by contextual word replacement.

    Args:
        num: number of augmented candidates requested from nlpaug.
        sample: the sentence to augment.
        device: device string passed to the augmenter (e.g. 'cuda').

    Returns:
        List of distinct augmented sentences with special tokens removed.
    """
    # action="substitute" replaces existing words. The previous value,
    # "insert", made this function an exact copy of randomInsert.
    aug = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint, action="substitute", model_type='bert', top_k=5, aug_p=0.3, aug_min=1, aug_max=1, device=device)

    aug_result = aug.augment(sample, n=num, num_thread=12)
    aug_result = list(map(delTokens, aug_result))
    aug_result = list(set(aug_result))
    return aug_result

def randomSwap(num, sample):
    """Generate up to `num` distinct variants of `sample` with nlpaug's random word swap."""
    swapper = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1, aug_p=0.3)
    return list(set(swapper.augment(sample, n=num, num_thread=2)))

def randomSplit(num, sample):
    """Generate up to `num` distinct variants of `sample` with nlpaug's SplitAug."""
    splitter = naw.SplitAug(aug_min=1, aug_max=1, aug_p=0.3, min_char=3)
    return list(set(splitter.augment(sample, n=num, num_thread=2)))

In [None]:
(58 * 3) * 5 * 4, (95 * 3) * 4 * 3 # bt ri rr

In [None]:
def backtransRoutine(data2augment, output_path):
    """Extend `data2augment` with back-translated copies and save it as CSV.

    Args:
        data2augment: list of ``[id, sentence, entity_property, sentiment]``
            rows; it is extended in place.
        output_path: destination of the resulting CSV file.

    Returns:
        DataFrame of the extended rows with columns
        ``id, sentence_form, entity_property, sentiment``.
    """
    print('back translation started.')
    temp = []
    for row in data2augment:
        augs = backTrans(row[1])
        for aug in augs:
            # Skip empty results and translations identical to the source.
            if aug != '' and aug != row[1]:
                new = [row[0], aug, row[2], row[3]]
                # Also check `temp` so two identical translations of the same
                # sentence are added only once.
                if new not in data2augment and new not in temp:
                    temp.append(new)
    data2augment.extend(temp)
    # Print the message itself (it used to be wrapped in len(), which printed
    # the length of the string instead).
    print(f'back translation finished.\ncurrent count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
import os

def _extend_with_augmented(data2augment, augment):
    """Append every new, non-empty variant `augment(text)` yields for the current rows."""
    fresh = []
    for row in data2augment:
        for aug in augment(row[1]):
            if aug != '' and aug != row[1]:
                new = [row[0], aug, row[2], row[3]]
                if new not in data2augment and new not in fresh:
                    fresh.append(new)
    data2augment.extend(fresh)


def edaRoutine(data2augment, ri, rr, output_path, target_count=None, seed_count=None, max_stalled=1000):
    """Grow `data2augment` by insertion, replacement and swap/split, then save it.

    Args:
        data2augment: list of ``[id, sentence, entity_property, sentiment]``
            rows; it is extended in place.
        ri: number of candidates requested per sentence from randomInsert.
        rr: number of candidates requested per sentence from randomReplace.
        output_path: destination of the resulting CSV file.
        target_count: the swap/split stage runs until the list has this many
            rows; defaults to ``len(positive)``.
        seed_count: the swap/split stage samples its source sentences from the
            first `seed_count` rows; defaults to the number of rows passed in.
            (The previous code used ``len(negative)`` whichever class was being
            augmented, which picks from the wrong pool and can index past the
            end of the list.)
        max_stalled: stop the swap/split stage after this many consecutive
            draws that add nothing, instead of looping forever.

    Returns:
        DataFrame of the extended rows with columns
        ``id, sentence_form, entity_property, sentiment``.
    """
    if target_count is None:
        target_count = len(positive)
    if seed_count is None:
        seed_count = len(data2augment)
    seed_count = min(seed_count, len(data2augment))

    print(f'current count: {len(data2augment)}')
    print('random insertion started.')
    _extend_with_augmented(data2augment, lambda text: randomInsert(ri, text, 'cuda'))
    print(f'random insertion finished.\ncurrent count: {len(data2augment)}')

    print('random replacement started.')
    _extend_with_augmented(data2augment, lambda text: randomReplace(rr, text, 'cuda'))
    print(f'random replacement finished.\ncurrent count: {len(data2augment)}')

    print('random swap and split started.')
    stalled = 0
    while len(data2augment) < target_count and seed_count > 0:
        k = random.randrange(seed_count)
        sample_id, text, entity, sentiment = data2augment[k]

        # Choose swap or split with equal probability.
        augment = randomSwap if random.randint(0, 1) == 0 else randomSplit
        added = 0
        for aug in augment(1, text):
            if aug != '' and aug != text:
                new = [sample_id, aug, entity, sentiment]
                if new not in data2augment:
                    data2augment.append(new)
                    added += 1

        if added:
            stalled = 0
            if len(data2augment)%25 == 0:
                print(f'random swap and split in progress.\ncurrent count: {len(data2augment)}')
        else:
            stalled += 1
            if stalled >= max_stalled:
                print(f'random swap and split stopped: no new sample in {max_stalled} consecutive draws.')
                break

    print(f'whole augmentation routine finished.\ntotal count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
### negative
output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'

# --- back translation -------------------------------------------------------
# The generating calls are disabled; the CSV at the same path is read instead.
# data2augment = negative.values.tolist()
output_file = 'negative_bt.csv'
output_path = os.path.join(output_folder, output_file)
# negative_bt = backtransRoutine(data2augment, output_path)
negative_bt = pd.read_csv(output_path).values.tolist()

# --- RI / RR ------------------------------------------------------------------
ri, rr = 4, 3  # times - 1
output_file = 'negative_aug.csv'
output_path = os.path.join(output_folder, output_file)
# negative_aug = edaRoutine(negative_bt, ri, rr, output_path)
negative_aug = pd.read_csv(output_path)

In [None]:
# The bare name below is not the last expression of the cell, so it displays nothing.
negative_aug
# Remove exact duplicate rows from the augmented negative samples.
negative_aug = negative_aug.drop_duplicates()

In [None]:
# negative_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# negative_aug.sort_values('id').head(50).sentence_form

In [None]:
### neutral
output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'

# --- back translation -------------------------------------------------------
# The generating calls are disabled; the CSV at the same path is read instead.
# data2augment = neutral.values.tolist()
output_file = 'neutral_bt.csv'
output_path = os.path.join(output_folder, output_file)
# neutral_bt = backtransRoutine(data2augment, output_path)
neutral_bt = pd.read_csv(output_path).values.tolist()

# --- RI / RR ------------------------------------------------------------------
ri, rr = 3, 2  # times - 1
output_file = 'neutral_aug.csv'
output_path = os.path.join(output_folder, output_file)
# neutral_aug = edaRoutine(neutral_bt, ri, rr, output_path)
neutral_aug = pd.read_csv(output_path)

In [None]:
# The bare name below is not the last expression of the cell, so it displays nothing.
neutral_aug
# Remove exact duplicate rows from the augmented neutral samples.
neutral_aug = neutral_aug.drop_duplicates()

In [None]:
# neutral_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# neutral_aug.sort_values('id').head(50).sentence_form

In [None]:
# `positive` is a slice of p_train (columns id/form/pair/sentiment), whereas
# the augmentation routines write id/sentence_form/entity_property/sentiment.
# Align the column names first so concat stacks the rows instead of producing
# NaN-padded `form`/`sentence_form` and `pair`/`entity_property` columns.
p_train_aug = pd.concat([
    positive.rename(columns={'form': 'sentence_form', 'pair': 'entity_property'}),
    negative_aug,
    neutral_aug,
])

In [None]:
def reformat_p_binary(df, polarities=None, label_ids=None):
    """Expand each polarity row into one binary row per candidate sentiment.

    Both column layouts used in this notebook are accepted: the sentence is
    read from ``sentence_form`` or ``form`` and the category from
    ``entity_property`` or ``pair`` (first one that is present and not NaN).
    The previous version only knew the first layout and therefore failed on
    ``p_dev`` and on frames concatenated from both layouts.

    Args:
        df: frame with ``id``, ``sentiment`` and the sentence/category columns.
        polarities: candidate sentiment names; defaults to ``polarity_id_to_name``.
        label_ids: mapping with the keys 'True' and 'False'; defaults to
            ``tf_name_to_id``.

    Returns:
        List of ``[id, sentence, "<category>#<candidate>", label]`` rows, where
        ``label`` is ``label_ids['True']`` when the candidate equals the row's
        sentiment and ``label_ids['False']`` otherwise.

    Raises:
        KeyError: if a row has no usable sentence or category column.
    """
    polarities = polarity_id_to_name if polarities is None else polarities
    label_ids = tf_name_to_id if label_ids is None else label_ids

    def _first_present(row, *names):
        for name in names:
            if name in row.index and not pd.isna(row[name]):
                return row[name]
        raise KeyError(f'none of the columns {names} holds a value for row {row.name!r}')

    p_binary = []
    for _, row in df.iterrows():
        sentence = _first_present(row, 'sentence_form', 'form')
        category = _first_present(row, 'entity_property', 'pair')
        for sentiment in polarities:
            verdict = 'True' if sentiment == row.sentiment else 'False'
            p_binary.append([row.id, sentence, '#'.join([category, sentiment]), label_ids[verdict]])
    return p_binary

In [None]:
# Binary polarity table built from the augmented training samples.
p_binary_train_aug = pd.DataFrame(
    reformat_p_binary(p_train_aug),
    columns=['id', 'sentence_form', 'entity_property', 'labels'],
)

In [None]:
p_binary_train_aug

In [None]:
# Binary polarity table for the dev split, with the same column names as the augmented train table.
p_binary_dev = pd.DataFrame(
    reformat_p_binary(p_dev),
    columns=['id', 'sentence_form', 'entity_property', 'labels'],
)

In [None]:
p_binary_dev

# Counting

In [None]:
len(ep_train), len(ep_dev), len(p_train), len(p_dev)

In [None]:
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

In [None]:
# Remove exact duplicate rows from the four tables that get exported.
ep_train, p_binary_train_aug, ep_dev, p_binary_dev = (
    frame.drop_duplicates()
    for frame in (ep_train, p_binary_train_aug, ep_dev, p_binary_dev)
)
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

# Export

In [None]:
%cd /content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11

# train.to_csv('raw_train.csv', index=False)
# dev.to_csv('raw_dev.csv', index=False)
# test.to_csv('raw_test.csv', index=False)

ep_train.to_csv('ce_train.csv', index=False)
p_binary_train_aug.to_csv('pc_binary_train_aug.csv', index=False)
ep_dev.to_csv('ce_dev.csv', index=False)
p_binary_dev.to_csv('pc_binary_dev.csv', index=False)

In [None]:
# emojis = pd.concat([ep_train.sentence_form, p_train.sentence_form, ep_dev.sentence_form, p_dev.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
# emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))

In [None]:
# Read the exported file back (path relative to the current working directory)
# and show the rows of one sentence id.
df = pd.read_csv('ce_train.csv')
df[df.id == 'nikluge-sa-2022-train-00065']

In [None]:
# Read the exported dev category table back and display it.
df = pd.read_csv('ce_dev.csv')
df

In [None]:
# Read the exported augmented binary polarity table back and display it.
df = pd.read_csv('pc_binary_train_aug.csv')
df

In [None]:
# Read the exported dev binary polarity table back and display it.
df = pd.read_csv('pc_binary_dev.csv')
df