In [148]:
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM, 
)

# import nlpaug.augmenter.char as nac
# import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
# import nlpaug.flow as nafc
# from nlpaug.util import Action

# from googletrans import Translator
# import translators as ts

import re, math, random, json
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from collections import Counter

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, align_tokens_and_labels, get_filter, generate_token_classification_data
import demoji

# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

# Load Raw Data

In [149]:
# Locations of the three raw JSON Lines splits.
train_json, dev_json, test_json = (
    './dataset/nikluge-sa-2022-train.jsonl',
    './dataset/nikluge-sa-2022-dev.jsonl',
    './dataset/nikluge-sa-2022-test.jsonl',
)

# One DataFrame per split; every line of a file is one JSON record.
train, dev, test = (
    pd.read_json(path, lines=True) for path in (train_json, dev_json, test_json)
)

# Discard one row from train and one from dev, selected by index label.
train = train.drop(index=2319)
dev = dev.drop(index=1692)

# Declare Stuff to use

In [150]:
### new
# The 25 "entity#property" categories offered by the task.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질',
]

# Extra literal tokens of the form &...& (presumably masking placeholders that
# occur in the corpus -- TODO confirm).
more_tokens = [
    '&name&', '&affiliation&', '&social-security-num&', '&tel-num&',
    '&card-num&', '&bank-account&', '&num&', '&online-account&',
]

# Label vocabularies: list position is the id, the dict is the reverse lookup.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: position for position, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: position for position, name in enumerate(polarity_id_to_name)}

# Replace missing annotation targets with placeholder tokens

In [151]:
# Pool the train and dev splits into a single frame.
total = pd.concat((train, dev))

### Add prefix
# Every sentence gets three placeholder target tokens prepended.
trgs_prefix = 'Target_1 Target_2 Target_3 '
total = total.assign(sentence_form=trgs_prefix + total['sentence_form'])
total

Unnamed: 0,id,sentence_form,annotation
0,nikluge-sa-2022-train-00001,Target_1 Target_2 Target_3 둘쨋날은 미친듯이 밟아봤더니 기어가...,"[[본품#품질, [기어, 16, 18], negative]]"
1,nikluge-sa-2022-train-00002,Target_1 Target_2 Target_3 이거 뭐 삐꾸를 준 거 아냐 불안하...,"[[본품#품질, [기어 텐션, 67, 72], negative]]"
2,nikluge-sa-2022-train-00003,Target_1 Target_2 Target_3 간사하게도 그 이후에는 라이딩이 아...,"[[제품 전체#일반, [None, 0, 0], positive]]"
3,nikluge-sa-2022-train-00004,Target_1 Target_2 Target_3 샥이 없는 모델이라 일반 도로에서 ...,"[[제품 전체#일반, [샥이 없는 모델, 0, 8], neutral]]"
4,nikluge-sa-2022-train-00005,Target_1 Target_2 Target_3 안장도 딱딱해서 엉덩이가 아팠는데 ...,"[[본품#일반, [안장, 0, 2], negative]]"
...,...,...,...
2789,nikluge-sa-2022-dev-02790,Target_1 Target_2 Target_3 썸머세트라고 해서 세럼 + 선크림 ...,"[[패키지/구성품#일반, [썸머세트, 0, 4], positive]]"
2790,nikluge-sa-2022-dev-02791,Target_1 Target_2 Target_3 비싸다.,"[[제품 전체#가격, [None, 0, 0], negative]]"
2791,nikluge-sa-2022-dev-02792,Target_1 Target_2 Target_3 비싸지만 건강이 더 비싸니까.,"[[제품 전체#가격, [None, 0, 0], negative]]"
2792,nikluge-sa-2022-dev-02793,Target_1 Target_2 Target_3 대형으로 샀더니 잘 맞음.,"[[제품 전체#일반, [대형, 0, 2], positive]]"


In [152]:
# Placeholder tokens and the [start, end) character span of each one inside trgs_prefix.
trgs = ['Target_1', 'Target_2', 'Target_3']
trg_rngs = [[0, 8], [9, 17], [18, 26]]
# Length of the whole prefix, trailing space included.
trgs_len = len(trgs_prefix)

print(trgs_prefix)
print(trgs_len)
# Sanity check: slicing the prefix with each span prints the matching placeholder.
for span_start, span_end in trg_rngs:
    print(trgs_prefix[span_start:span_end])

Target_1 Target_2 Target_3 
27
Target_1
Target_2
Target_3


In [153]:
### Give target-less annotations a placeholder target; shift the spans of real targets
# The nested annotation lists are mutated in place, so the change is visible
# through `total` even though iterrows() yields row copies.
# NOTE: this cell is not idempotent -- running it twice shifts the real spans
# by trgs_len a second time.
for _, row in total.iterrows():
    # Index of the next unused placeholder for this sentence. A row with more
    # target-less annotations than there are placeholders raises IndexError.
    trg_idx = 0
    for annotation in row.annotation:
        target = annotation[1]  # [text, begin, end]
        if target[0] is None:
            # No explicit target: point the annotation at the next placeholder
            # token inside the prefix.
            target[0] = trgs[trg_idx]
            target[1], target[2] = trg_rngs[trg_idx]
            trg_idx += 1
        else:
            # Explicit target: its offsets move right by the prefix length.
            target[1] += trgs_len
            target[2] += trgs_len
    
# ### Remove
# for idx, row in total.iterrows():
#     for idx in range(len(row.annotation)):
#         if row.annotation[idx][1][0] == None:
#             row.annotation[idx] = []
#     row.annotation = [el for el in row.annotation if el != []]

In [154]:
### Check
# Print a row once for each of its annotations that still has no target text;
# no output means every annotation has a target.
for _, row in total.iterrows():
    for annotation in row.annotation:
        if annotation[1][0] is None:
            print(row)

# Drop rows without a single annotation

In [156]:
# Flag rows whose annotation list is non-empty and keep only those rows.
has_annotation = total['annotation'].apply(bool)
total['checker'] = has_annotation
total = total.loc[has_annotation].copy()

In [157]:
count_tags(total, entity_property_pair)

tags found:  6332
tag set of df:  23
tag set of offered:  25
difference:  {'브랜드#디자인', '제품 전체#다양성'}
본품#품질	2380
제품 전체#일반	1622
제품 전체#품질	493
본품#일반	491
제품 전체#디자인	286
본품#편의성	191
제품 전체#편의성	180
제품 전체#인지도	141
패키지/구성품#디자인	117
브랜드#일반	103
제품 전체#가격	92
패키지/구성품#편의성	65
패키지/구성품#일반	50
본품#다양성	31
본품#디자인	21
브랜드#품질	19
패키지/구성품#품질	19
브랜드#인지도	17
브랜드#가격	7
패키지/구성품#다양성	3
본품#가격	2
본품#인지도	1
패키지/구성품#가격	1


In [158]:
# Filter list handed to remove_props: every offered category except the two
# that occur exactly once in the pooled data.
# Named `kept_pairs` so the builtin `filter` stays usable in later cells.
kept_pairs = [
    pair for pair in entity_property_pair
    if pair not in ('본품#인지도', '패키지/구성품#가격')
]
total = remove_props(total, kept_pairs)
count_tags(total, entity_property_pair)

tags found:  6330
tag set of df:  21
tag set of offered:  25
difference:  {'패키지/구성품#가격', '본품#인지도', '브랜드#디자인', '제품 전체#다양성'}
본품#품질	2380
제품 전체#일반	1622
제품 전체#품질	493
본품#일반	491
제품 전체#디자인	286
본품#편의성	191
제품 전체#편의성	180
제품 전체#인지도	141
패키지/구성품#디자인	117
브랜드#일반	103
제품 전체#가격	92
패키지/구성품#편의성	65
패키지/구성품#일반	50
본품#다양성	31
본품#디자인	21
브랜드#품질	19
패키지/구성품#품질	19
브랜드#인지도	17
브랜드#가격	7
패키지/구성품#다양성	3
본품#가격	2


# Split

In [159]:
total['stratified'] = total.annotation.apply(lambda x: x[0][0])

In [160]:
# 80/20 split of the pooled data with a fixed seed, stratified on the
# 'stratified' column; both parts get a fresh 0..n-1 index.
train, dev = (
    part.reset_index(drop=True)
    for part in train_test_split(
        total, test_size=0.2, random_state=42, stratify=total['stratified']
    )
)

In [161]:
count_tags(train, entity_property_pair)

tags found:  5076
tag set of df:  21
tag set of offered:  25
difference:  {'패키지/구성품#가격', '본품#인지도', '브랜드#디자인', '제품 전체#다양성'}
본품#품질	1907
제품 전체#일반	1295
본품#일반	397
제품 전체#품질	396
제품 전체#디자인	230
본품#편의성	155
제품 전체#편의성	142
제품 전체#인지도	113
패키지/구성품#디자인	94
브랜드#일반	82
제품 전체#가격	74
패키지/구성품#편의성	52
패키지/구성품#일반	41
본품#다양성	26
본품#디자인	17
브랜드#품질	16
패키지/구성품#품질	15
브랜드#인지도	14
브랜드#가격	6
본품#가격	2
패키지/구성품#다양성	2


In [162]:
count_tags(dev, entity_property_pair)

tags found:  1254
tag set of df:  20
tag set of offered:  25
difference:  {'패키지/구성품#가격', '브랜드#디자인', '본품#가격', '본품#인지도', '제품 전체#다양성'}
본품#품질	473
제품 전체#일반	327
제품 전체#품질	97
본품#일반	94
제품 전체#디자인	56
제품 전체#편의성	38
본품#편의성	36
제품 전체#인지도	28
패키지/구성품#디자인	23
브랜드#일반	21
제품 전체#가격	18
패키지/구성품#편의성	13
패키지/구성품#일반	9
본품#다양성	5
패키지/구성품#품질	4
본품#디자인	4
브랜드#품질	3
브랜드#인지도	3
패키지/구성품#다양성	1
브랜드#가격	1


# Generate token classification pairs

In [163]:
# ### test a sample
# train['checker'] = train.annotation.apply(len)
# original_input = train[train.checker > 1].iloc[2].sentence_form
# annotations = train[train.checker > 1].iloc[2].annotation
# original_input, annotations

In [164]:
train_split_inputs, train_split_labels = generate_token_classification_data(train)
dev_split_inputs, dev_split_labels = generate_token_classification_data(dev)

In [165]:
train['split_form'], train['split_label'] = train_split_inputs, train_split_labels
dev['split_form'], dev['split_label'] = dev_split_inputs, dev_split_labels

In [166]:
from torch.nn import CrossEntropyLoss
loss = CrossEntropyLoss()

In [167]:
loss.ignore_index

-100

In [168]:
from transformers import (
    AutoConfig, ElectraTokenizer, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

In [169]:
# Tag set of the target tagger (the names suggest a begin/inside/other scheme).
target_tagger_labels = ['Other', 'TRG_B', 'TRG_I']
labels = target_tagger_labels
num_labels = len(labels)

# id -> name follows list order; name -> id is its inverse.
id2label = dict(enumerate(labels))
label2id = {name: index for index, name in id2label.items()}

In [170]:
model_checkpoint = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(model_checkpoint)
model = ElectraForTokenClassification.from_pretrained(
    model_checkpoint, label2id=label2id, id2label=id2label, num_labels=num_labels
)
model.config.label2id, model.config.id2label, model.num_labels

Some weights of the model checkpoint at monologg/koelectra-base-v3-discriminator were not used when initializing ElectraForTokenClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForTokenClassification were not initialized from the model checkpoint at monologg/koelectra-base-v3-discriminator and are newly initialized: ['classifier

({'Other': 0, 'TRG_B': 1, 'TRG_I': 2}, {0: 'Other', 1: 'TRG_B', 2: 'TRG_I'}, 3)

In [171]:
# Tokenize each split and attach the tokens and their aligned labels as columns.
# NOTE: `labels` is rebound here, replacing the list of tag names defined earlier.
for frame in (train, dev):
    input_tokens_list, labels = align_tokens_and_labels(frame, tokenizer)
    frame['input_tokens_list'] = input_tokens_list
    frame['labels'] = labels

In [172]:
# For each split, mark rows where tokens and labels have the same length and
# print the rows where they do not (an empty frame means all rows agree).
for frame in (train, dev):
    frame['checker'] = frame['input_tokens_list'].apply(len) == frame['labels'].apply(len)
    print(frame[~frame['checker']])

Empty DataFrame
Columns: [id, sentence_form, annotation, checker, stratified, split_form, split_label, input_tokens_list, labels]
Index: []
Empty DataFrame
Columns: [id, sentence_form, annotation, checker, stratified, split_form, split_label, input_tokens_list, labels]
Index: []


In [182]:
# Show the annotation, aligned labels and tokens of one randomly chosen training row.
idx = random.randrange(len(train))
sample = train.iloc[idx]
for field in ('annotation', 'labels', 'input_tokens_list'):
    print(sample[field])

[['본품#일반', ['대용량', 31, 34], 'positive'], ['제품 전체#가격', ['Target_1', 0, 8], 'positive']]
[-100, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -100]
['[CLS]', 'T', '##ar', '##ge', '##t', '_', '1', 'T', '##ar', '##ge', '##t', '_', '2', 'T', '##ar', '##ge', '##t', '_', '3', '게다가', '대용량', '이라', '가성', '##비', '[UNK]', '!', '👍', '[SEP]']


In [183]:
# [['본품#일반', ['대용량', 31, 34], 'positive'], ['제품 전체#가격', ['Target_1', 0, 8], 'positive']]
tokens = ['[CLS]', 'T', '##ar', '##ge', '##t', '_', '1', 'T', '##ar', '##ge', '##t', '_', '2', 'T', '##ar', '##ge', '##t', '_', '3', '게다가', '대용량', '이라', '가성', '##비', '[UNK]', '!', '👍', '[SEP]']
labels = [-100, 1, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, -100]

In [199]:
tokens[20], labels[20]

('대용량', 1)

# CHECK POINT

In [None]:
# count = 0
# for x, y in zip(split_inputs, split_labels):
#     print(x)
#     print(y)
#     print()
#     count += 1
#     if count == 99:
#         break

# Count

In [None]:
count_tags(train, entity_property_pair)

# Filter entity_property_pair and Drop rows accordingly

In [None]:
FILTER_MODE = False

In [None]:
# Optionally pass both splits through remove_props with the list from get_filter().
# The local is called `prop_filter` so the builtin `filter` is not shadowed.
if FILTER_MODE:
    prop_filter = get_filter()
    train = remove_props(train, prop_filter)
    dev = remove_props(dev, prop_filter)
len(train), len(dev)

# Preprocess

## Cleansing

### Before

In [None]:
# for el in train.sample(n=5).sentence_form:
#     print(el)

In [None]:
# train.sentence_form = train.sentence_form.apply(preprocess)
# dev.sentence_form = dev.sentence_form.apply(preprocess)
# test.sentence_form = test.sentence_form.apply(preprocess)
# total = pd.concat([train, dev])

### Test

In [None]:
# case = total.sentence_form.str.contains('r[^A-Za-z0-9가-힣\s]+', case=False, flags=0, na=None, regex=True)
# for e in total[case].sentence_form:
#     print(e)

### After

In [None]:
# for i, row in total[['id', 'sentence_form']].sample(n=5).iterrows():
#     print(row.id, '\t', row.sentence_form)

In [None]:
# total['check'] = total.sentence_form.str.find('OO')
# for row in total[total.check > -1].sentence_form:
#     print(row)
#     break

In [None]:
# total

## Reformat

In [None]:
len(entity_property_pair)

In [None]:
decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split

In [None]:
def reformat(df):
    """Build category-detection pairs and sentiment rows from an annotated frame.

    For every row of ``df`` and every category in the global
    ``entity_property_pair``, one ``[id, sentence, category, label]`` entry is
    added to the first result. ``label`` is ``tf_name_to_id['True']`` when one
    of the row's annotations carries that category and
    ``tf_name_to_id['False']`` otherwise. For a matched category, the first
    matching annotation also adds ``[id, sentence, category, sentiment]`` to
    the second result. Rows whose ``sentence_form`` is missing add nothing.

    Args:
        df: frame with ``id``, ``sentence_form`` and ``annotation`` columns;
            each annotation is indexed as ``[category, target, sentiment]``.

    Returns:
        tuple: ``(ep, p)`` -- the two lists described above.
    """
    ep, p = [], []
    for _, row in df.iterrows():
        sentence = row.sentence_form
        if pd.isna(sentence):
            continue
        row_id = row.id

        for pair in entity_property_pair:
            # Only the first annotation with this category is used.
            hit = next((ann for ann in row.annotation if ann[0] == pair), None)
            if hit is None:
                ep.append([row_id, sentence, pair, tf_name_to_id['False']])
            else:
                ep.append([row_id, sentence, hit[0], tf_name_to_id['True']])
                p.append([row_id, sentence, hit[0], hit[2]])
    return ep, p

In [None]:
def reformat_p_binary(df):
    """Expand every sentiment row into one binary row per polarity.

    Each row of ``df`` (columns ``id``, ``form``, ``pair``, ``sentiment``)
    yields one ``[id, form, 'pair#polarity', label]`` entry for every polarity
    in the global ``polarity_id_to_name``. ``label`` is
    ``tf_name_to_id['True']`` for the row's own polarity and
    ``tf_name_to_id['False']`` for the others.

    Returns:
        list: the expanded rows, in input order.
    """
    p_binary = []
    for _, row in df.iterrows():
        form = row.form
        for sentiment in polarity_id_to_name:
            truth = 'True' if sentiment == row.sentiment else 'False'
            asc_pair = '#'.join([row.pair, sentiment])
            p_binary.append([row.id, form, asc_pair, tf_name_to_id[truth]])
    return p_binary

In [None]:
len(train), len(dev)

In [None]:
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'

]

In [None]:
# Column layouts of the category-detection (ep) and sentiment (p) frames.
ep_columns = ['id', 'form', 'pair', 'labels']
p_columns = ['id', 'form', 'pair', 'sentiment']

ep_train_rows, p_train_rows = reformat(train)
ep_dev_rows, p_dev_rows = reformat(dev)

ep_train = pd.DataFrame(ep_train_rows, columns=ep_columns)
ep_dev = pd.DataFrame(ep_dev_rows, columns=ep_columns)

p_train = pd.DataFrame(p_train_rows, columns=p_columns)
p_dev = pd.DataFrame(p_dev_rows, columns=p_columns)

len(ep_train), len(ep_dev), len(p_train), len(p_dev)

In [None]:
# Binary polarity frames: one row per (sentence, category, polarity) candidate.
pc_columns = ['id', 'form', 'pair', 'labels']

p_binary_train = pd.DataFrame(reformat_p_binary(p_train), columns=pc_columns)
p_binary_dev = pd.DataFrame(reformat_p_binary(p_dev), columns=pc_columns)

len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev)

In [None]:
# ep_train.sort_values(['id', 'labels'], inplace=True)
# ep_dev.sort_values(['id', 'labels'], inplace=True)
# p_binary_train.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])
# p_binary_dev.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])

### Counting

In [None]:
# Sizes before de-duplication. One print call per line: the former
# `print(...), print(...)` form built a tuple and made the cell emit a stray
# `(None, None)` output.
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))

# Drop fully identical rows from every derived frame.
ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
p_binary_train = p_binary_train.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()

print('\nafter drop_duplicates\n')
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))

### Validate Here

In [None]:
# Preview the first 50 rows of ep_train.
# head(50) limits by position: after drop_duplicates the index labels are no
# longer guaranteed to be contiguous, so stopping on `idx == 49` could run
# through the whole frame.
df = ep_train
for _, row in df.head(50).iterrows():
    print(row.id, '\n',
          row.form, '\n',
          row.pair, '\n',
          row.labels,  '\n',)

### Save

In [None]:
DATA_V = 'uncleaned_v6'
save_path = f'./dataset/{DATA_V}'
print(save_path)

In [None]:
import os

# Create the output directory (and any missing parents) in pure Python instead
# of shelling out to `mkdir -p`, so the cell also runs outside IPython and on
# platforms without that command.
os.makedirs(save_path, exist_ok=True)

# Raw splits as they stand at this point in the notebook.
train.to_csv(f'{save_path}/raw_train.csv', index=False)
dev.to_csv(f'{save_path}/raw_dev.csv', index=False)
test.to_csv(f'{save_path}/raw_test.csv', index=False)

# Category-detection pairs.
ep_train.to_csv(f'{save_path}/ce_train.csv', index=False)
ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False)

# Binary polarity pairs.
p_binary_train.to_csv(f'{save_path}/pc_binary_train.csv', index=False)
p_binary_dev.to_csv(f'{save_path}/pc_binary_dev.csv', index=False)

### Additional Length Test If Needed

In [None]:
# ep_train, ep_dev, p_binary_train, p_binary_dev

In [None]:
# Checkpoint whose tokenizer is loaded further down in this cell.
model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

# Re-read the raw splits from the DATA_V export directory.
# NOTE(review): this rebinds train/dev/test to the CSV round-trip of the
# frames; list-valued columns such as `annotation` presumably come back as
# plain strings -- confirm no later cell needs the parsed lists.
train_path = f'./dataset/{DATA_V}/raw_train.csv'
dev_path = f'./dataset/{DATA_V}/raw_dev.csv'
test_path = f'./dataset/{DATA_V}/raw_test.csv'
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

### new
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]
special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
# Collect the distinct emojis demoji finds in the sentences of all three splits
# (the sentences are joined into one space-separated string first).
emojis = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))
# NOTE(review): ep_labels is not referenced again in the visible part of the notebook.
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

# Tokens that are added to the vocabulary explicitly.
tokens2add = special_tokens + emojis

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))  # tokenizer size before any tokens are added
# Train a second tokenizer on the de-duplicated sentences; only its vocabulary
# is used below.
tokenizer_train_data = pd.concat([train.sentence_form, dev.sentence_form, test.sentence_form], ignore_index=True, verify_integrity=True).to_frame().drop_duplicates()
tokenizer_train_data = tokenizer_train_data.sentence_form.to_list()
# NOTE(review): vocab_size=1 presumably requests the smallest vocabulary the
# trainer will produce -- confirm against the tokenizers documentation.
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
# Add whatever the new vocabulary and tokens2add contain that the pretrained
# vocabulary lacks.
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
tokenizer.add_tokens(list(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))  # tokenizer size after the additions
# model.resize_token_embeddings(len(tokenizer))

In [None]:
# Token count of every (form, pair) input across the four pair datasets,
# tokenized with truncation enabled.
len_counter = [
    len(tokenizer(row["form"], row["pair"], truncation=True).input_ids)
    for pair_frame in [ep_train, ep_dev, p_binary_train, p_binary_dev]
    for _, row in pair_frame.iterrows()
]

In [None]:
max(len_counter)

### done here.

## Save Files

In [None]:
# save_path = './dataset/cleaned_v1'

# train.to_csv(f'{save_path}/raw_train.csv', index=False)
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False)
# test.to_csv(f'{save_path}/raw_test.csv', index=False)

# ep_train.to_csv(f'{save_path}/ce_train.csv', index=False)
# ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False)
# p_train.to_csv(f'{save_path}/pc_train.csv', index=False)
# p_dev.to_csv(f'{save_path}/pc_dev.csv', index=False)
# p_binary_train.to_csv(f'{save_path}/pc_binary_train.csv', index=False)
# p_binary_dev.to_csv(f'{save_path}/pc_binary_dev.csv', index=False)

# ASC Augmentation

In [None]:
model_checkpoint = '/content/drive/MyDrive/aspect_based_sentiment_analysis/base_model/klue_roberta_base/v2/klue_roberta_base_mlm/checkpoint-19860'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
sTokens = tokenizer.all_special_tokens

def delTokens(sent):
    """Return ``sent`` without the pieces that are special tokens.

    ``sent`` is split on single spaces; every piece found in the global
    ``sTokens`` is dropped and the remaining pieces are re-joined with single
    spaces.
    """
    return ' '.join(piece for piece in sent.split(' ') if piece not in sTokens)

In [None]:
positive, negative, neutral = p_train[p_train.sentiment == 'positive'], p_train[p_train.sentiment == 'negative'], p_train[p_train.sentiment == 'neutral']

In [None]:
len(positive), len(negative), len(neutral)

In [None]:
(58 * 3) * 4 * 3, (95 * 3) * 4 * 2 # bt ri rr

Back Translation / Random Insertion / Random Replacement / Random Swap / Random Split

In [None]:
def backTrans(text):
    """Back-translate ``text`` through English and through Japanese.

    Each round trip makes two ``ts.papago`` calls (Korean -> pivot -> Korean),
    both with ``sleep_seconds=5``.

    NOTE: relies on the name ``ts`` being bound; the ``import translators as ts``
    line at the top of the notebook is commented out.

    Returns:
        list: ``[via_english, via_japanese]``.
    """
    round_trips = []
    for pivot in ('en', 'ja'):
        outbound = ts.papago(text, sleep_seconds=5, from_language='ko', to_language=pivot)
        round_trips.append(
            ts.papago(outbound, sleep_seconds=5, from_language=pivot, to_language='ko')
        )
    return round_trips

def randomInsert(num, sample, device):
    """Generate up to ``num`` distinct variants of ``sample`` by contextual word insertion.

    Uses ``naw.ContextualWordEmbsAug`` with the global ``model_checkpoint`` and
    exactly one augmented word per variant (``aug_min=aug_max=1``). Special
    tokens are stripped with ``delTokens`` and duplicates are removed, so fewer
    than ``num`` strings may come back, in no particular order.

    NOTE: relies on the name ``naw`` being bound; the nlpaug imports at the top
    of the notebook are commented out.
    """
    augmenter = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint,
        action="insert",
        model_type='bert',
        top_k=5,
        aug_p=0.3,
        aug_min=1,
        aug_max=1,
        device=device,
    )
    candidates = augmenter.augment(sample, n=num, num_thread=12)
    return list(set(delTokens(candidate) for candidate in candidates))

def randomReplace(num, sample, device):
    """Generate up to ``num`` distinct variants of ``sample`` by contextual word replacement.

    Uses ``naw.ContextualWordEmbsAug`` with the global ``model_checkpoint`` and
    exactly one augmented word per variant (``aug_min=aug_max=1``). Special
    tokens are stripped with ``delTokens`` and duplicates are removed, so fewer
    than ``num`` strings may come back, in no particular order.

    NOTE: relies on the name ``naw`` being bound; the nlpaug imports at the top
    of the notebook are commented out.
    """
    aug = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint,
        # "substitute" replaces an existing word. This was "insert", which made
        # the function an exact duplicate of randomInsert.
        action="substitute",
        model_type='bert', top_k=5, aug_p=0.3, aug_min=1, aug_max=1, device=device)

    aug_result = aug.augment(sample, n=num, num_thread=12)
    aug_result = list(map(delTokens, aug_result))
    aug_result = list(set(aug_result))
    return aug_result

def randomSwap(num, sample):
    """Return the distinct results of ``num`` runs of ``naw.RandomWordAug`` with action 'swap' on ``sample``.

    Each run augments exactly one word (``aug_min=aug_max=1``); duplicates are
    removed, so the result may hold fewer than ``num`` strings, in no
    particular order.
    """
    swapper = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1, aug_p=0.3)
    return list(set(swapper.augment(sample, n=num, num_thread=2)))

def randomSplit(num, sample):
    """Return the distinct results of ``num`` runs of ``naw.SplitAug`` on ``sample``.

    Each run augments exactly one word (``aug_min=aug_max=1``, ``min_char=3``);
    duplicates are removed, so the result may hold fewer than ``num`` strings,
    in no particular order.
    """
    splitter = naw.SplitAug(aug_min=1, aug_max=1, aug_p=0.3, min_char=3)
    return list(set(splitter.augment(sample, n=num, num_thread=2)))

In [None]:
(58 * 3) * 5 * 4, (95 * 3) * 4 * 3 # bt ri rr

In [None]:
def backtransRoutine(data2augment, output_path):
    """Extend ``data2augment`` with back-translated variants and save the result.

    Args:
        data2augment: list of ``[id, sentence, entity_property, sentiment]``
            rows. It is extended IN PLACE with the new rows.
        output_path: CSV file the augmented data is written to.

    Returns:
        pandas.DataFrame: all rows (original plus new) with columns
        ``id``, ``sentence_form``, ``entity_property``, ``sentiment``.
    """
    print('back translation started.')
    temp = []
    for row in data2augment:
        for aug in backTrans(row[1]):
            # Skip empty results and translations that reproduce the source sentence.
            if aug != '' and aug != row[1]:
                new = [row[0], aug, row[2], row[3]]
                # Checked against the existing rows only; two sources that
                # yield the same new row are both kept.
                if new not in data2augment:
                    temp.append(new)
    data2augment.extend(temp)
    # Print the message itself; it was wrapped in len(), which printed the
    # length of the message string instead.
    print(f'back translation finished.\ncurrent count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
import os

def _novel_rows(row, augs, existing):
    """Turn the usable strings in ``augs`` into ``[id, aug, entity, sentiment]`` rows.

    A string is usable when it is non-empty, differs from the source sentence
    ``row[1]``, and the resulting row is not already in ``existing``.
    """
    novel = []
    for aug in augs:
        if aug != '' and aug != row[1]:
            candidate = [row[0], aug, row[2], row[3]]
            if candidate not in existing:
                novel.append(candidate)
    return novel


def _augment_every_row(data2augment, augment_one):
    """Apply ``augment_one(sentence)`` to each current row and append the novel results in place."""
    additions = []
    for row in data2augment:
        additions.extend(_novel_rows(row, augment_one(row[1]), data2augment))
    # Extend only after the pass, so rows created in this pass are not themselves augmented.
    data2augment.extend(additions)


def edaRoutine(data2augment, ri, rr, output_path):
    """Grow ``data2augment`` with insertion, replacement, swap and split variants, then save it.

    Phases, in order: random insertion over every row, random replacement over
    every row (including the rows the first phase added), then single random
    swap/split augmentations until the list is at least as long as the global
    ``positive`` frame.

    Args:
        data2augment: list of ``[id, sentence, entity_property, sentiment]``
            rows. It is extended IN PLACE.
        ri: ``num`` passed to ``randomInsert`` for each row.
        rr: ``num`` passed to ``randomReplace`` for each row.
        output_path: CSV file the augmented data is written to.

    Returns:
        pandas.DataFrame: all rows with columns ``id``, ``sentence_form``,
        ``entity_property``, ``sentiment``.
    """
    print(f'current count: {len(data2augment)}')
    print('random insertion started.')
    _augment_every_row(data2augment, lambda text: randomInsert(ri, text, 'cuda'))
    print(f'random insertion finished.\ncurrent count: {len(data2augment)}')

    print('random replacement started.')
    _augment_every_row(data2augment, lambda text: randomReplace(rr, text, 'cuda'))
    print(f'random replacement finished.\ncurrent count: {len(data2augment)}')

    print('random swap and split started.')
    # NOTE(review): the target size and the sampling range come from the
    # globals `positive` and `negative`, whatever class is being augmented.
    # Sampling raises IndexError if len(negative) exceeds len(data2augment),
    # and the loop never ends if no novel row can be produced -- confirm this
    # is intended.
    while len(data2augment) < len(positive):
        source = data2augment[random.randrange(len(negative))]

        # Coin flip between the two augmenters; one augmentation per iteration.
        if random.randint(0, 1) == 0:
            augs = randomSwap(1, source[1])
        else:
            augs = randomSplit(1, source[1])
        data2augment.extend(_novel_rows(source, augs, data2augment))

        if len(data2augment) % 25 == 0:
            print(f'random swap and split in progress.\ncurrent count: {len(data2augment)}')

    print(f'whole augmentation routine finished.\ntotal count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
### negative
# # back translation

# data2augment = negative.values.tolist()

# NOTE(review): hard-coded absolute Drive path; this cell only runs where that
# folder is mounted.
output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'negative_bt.csv'
output_path = os.path.join(output_folder, output_file)

# The back-translation call is disabled; its saved CSV is loaded instead and
# converted back to a list of rows.
# negative_bt = backtransRoutine(data2augment, output_path)
negative_bt = pd.read_csv(output_path)
negative_bt = negative_bt.values.tolist()
# RI / RR

# Settings for the (currently disabled) edaRoutine call below.
ri = 4 # times - 1
rr = 3 # times - 1

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'negative_aug.csv'
output_path = os.path.join(output_folder, output_file)

# The augmentation call is disabled as well; the saved result is loaded from disk.
# negative_aug = edaRoutine(negative_bt, ri, rr, output_path)
negative_aug = pd.read_csv(output_path)

In [None]:
negative_aug
negative_aug = negative_aug.drop_duplicates()

In [None]:
# negative_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# negative_aug.sort_values('id').head(50).sentence_form

In [None]:
### neutral
# back translation

# data2augment = neutral.values.tolist()

# NOTE(review): hard-coded absolute Drive path; this cell only runs where that
# folder is mounted.
output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'neutral_bt.csv'
output_path = os.path.join(output_folder, output_file)

# The back-translation call is disabled; its saved CSV is loaded instead and
# converted back to a list of rows.
# neutral_bt = backtransRoutine(data2augment, output_path)
neutral_bt = pd.read_csv(output_path)
neutral_bt = neutral_bt.values.tolist()

# RI / RR

# Settings for the (currently disabled) edaRoutine call below.
ri = 3 # times - 1
rr = 2 # times - 1

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'neutral_aug.csv'
output_path = os.path.join(output_folder, output_file)

# The augmentation call is disabled as well; the saved result is loaded from disk.
# neutral_aug = edaRoutine(neutral_bt, ri, rr, output_path)
neutral_aug = pd.read_csv(output_path)

In [None]:
neutral_aug
neutral_aug = neutral_aug.drop_duplicates()

In [None]:
# neutral_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# neutral_aug.sort_values('id').head(50).sentence_form

In [None]:
# `positive` is a slice of p_train and still has its 'form' / 'pair' columns,
# while the augmentation routines write 'sentence_form' / 'entity_property'.
# Align the names first; otherwise the concat produces NaN-filled columns and
# reformat_p_binary fails on the positive rows.
p_train_aug = pd.concat(
    [
        positive.rename(columns={'form': 'sentence_form', 'pair': 'entity_property'}),
        negative_aug,
        neutral_aug,
    ]
)

In [None]:
def reformat_p_binary(df):
    """Expand every sentiment row into one binary row per polarity.

    NOTE: this redefines the earlier ``reformat_p_binary``. This version reads
    the ``sentence_form`` / ``entity_property`` columns instead of
    ``form`` / ``pair``.

    Each row of ``df`` yields one ``[id, sentence_form, 'entity_property#polarity', label]``
    entry for every polarity in the global ``polarity_id_to_name``. ``label``
    is ``tf_name_to_id['True']`` for the row's own polarity and
    ``tf_name_to_id['False']`` for the others.

    Returns:
        list: the expanded rows, in input order.
    """
    p_binary = []
    for _, row in df.iterrows():
        for sentiment in polarity_id_to_name:
            truth = 'True' if sentiment == row.sentiment else 'False'
            tagged_pair = '#'.join([row.entity_property, sentiment])
            p_binary.append([row.id, row.sentence_form, tagged_pair, tf_name_to_id[truth]])
    return p_binary

In [None]:
p_binary_train_aug = reformat_p_binary(p_train_aug)
p_binary_train_aug = pd.DataFrame(p_binary_train_aug, columns=['id', 'sentence_form', 'entity_property', 'labels'])

In [None]:
p_binary_train_aug

In [None]:
# p_dev has 'form' / 'pair' columns, but the reformat_p_binary now in scope
# reads 'sentence_form' / 'entity_property'. Rename on the fly so the call does
# not fail with AttributeError; rename ignores names that are absent.
p_binary_dev = reformat_p_binary(
    p_dev.rename(columns={'form': 'sentence_form', 'pair': 'entity_property'})
)
p_binary_dev = pd.DataFrame(p_binary_dev, columns=['id', 'sentence_form', 'entity_property', 'labels'])

In [None]:
p_binary_dev

# Counting

In [None]:
len(ep_train), len(ep_dev), len(p_train), len(p_dev)

In [None]:
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

In [None]:
ep_train = ep_train.drop_duplicates()
p_binary_train_aug = p_binary_train_aug.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

# Export

In [None]:
%cd /content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11

# train.to_csv('raw_train.csv', index=False)
# dev.to_csv('raw_dev.csv', index=False)
# test.to_csv('raw_test.csv', index=False)

ep_train.to_csv('ce_train.csv', index=False)
p_binary_train_aug.to_csv('pc_binary_train_aug.csv', index=False)
ep_dev.to_csv('ce_dev.csv', index=False)
p_binary_dev.to_csv('pc_binary_dev.csv', index=False)

In [None]:
# emojis = pd.concat([ep_train.sentence_form, p_train.sentence_form, ep_dev.sentence_form, p_dev.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
# emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))

In [None]:
df = pd.read_csv('ce_train.csv')
df[df.id == 'nikluge-sa-2022-train-00065']

In [None]:
df = pd.read_csv('ce_dev.csv')
df

In [None]:
df = pd.read_csv('pc_binary_train_aug.csv')
df

In [None]:
df = pd.read_csv('pc_binary_dev.csv')
df