In [1]:
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM, 
)

# import nlpaug.augmenter.char as nac
# import nlpaug.augmenter.word as naw
# import nlpaug.augmenter.sentence as nas
# import nlpaug.flow as nafc
# from nlpaug.util import Action

# from googletrans import Translator
# import translators as ts

import json
import math
import os
import random
import re
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from collections import Counter

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, get_filter

import demoji

# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

  from .autonotebook import tqdm as notebook_tqdm


# Load Raw Data

In [2]:
# Locations of the three JSON-lines splits.
train_json = './dataset/nikluge-sa-2022-train.jsonl'
dev_json = './dataset/nikluge-sa-2022-dev.jsonl'
test_json = './dataset/nikluge-sa-2022-test.jsonl'

# Each file holds one JSON record per line, hence lines=True.
train, dev, test = (
    pd.read_json(split_path, lines=True)
    for split_path in (train_json, dev_json, test_json)
)

# Discard one row from each labelled split, addressed by index label.
# NOTE(review): the reason these two rows are excluded is not recorded here - confirm.
train = train.drop(index=2319)
dev = dev.drop(index=1692)

# Declare Constants and Label Mappings

In [3]:
### new
# Every entity#property category used as a classification target.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질',
]

# Placeholder strings of the form &...& collected for later use as extra tokens.
more_tokens = [
    '&name&',
    '&affiliation&',
    '&social-security-num&',
    '&tel-num&',
    '&card-num&',
    '&bank-account&',
    '&num&',
    '&online-account&',
]

# Binary label names; a name's list position is its integer id.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: label_id for label_id, name in enumerate(tf_id_to_name)}

# Sentiment label names; a name's list position is its integer id.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: label_id for label_id, name in enumerate(polarity_id_to_name)}

# Count

In [4]:
# Pool both labelled splits so they can be re-split together.
total = pd.concat((train, dev))
# Stratification key: element 0 of each row's first annotation.
total['stratified'] = total['annotation'].apply(lambda annotations: annotations[0][0])
# count_tags(total, entity_property_pair)

In [5]:
# Categories singled out here; every row carrying one of them is copied into `appendables`.
rare_pairs = ('본품#인지도', '패키지/구성품#가격')

# One entry per matching annotation, so an id repeats when a row matches more than once.
indexes = [
    row_id
    for row_id, annotations in zip(total['id'], total['annotation'])
    for annotation in annotations
    if annotation[0] in rare_pairs
]

# isin() hashes the id list once instead of scanning it again for every row.
total['checker'] = total['id'].isin(indexes)
appendables = total[total['checker']]

In [6]:
appendables.to_json('temp.json')

In [7]:
# Every category except the two whose rows were copied into `appendables`.
# NOTE: the name `filter` shadows the Python builtin from this cell onwards.
filter = [x for x in entity_property_pair if x not in ['본품#인지도', '패키지/구성품#가격']]
# NOTE(review): remove_props comes from module.utils; presumably it removes whatever is
# not listed in `filter` - confirm whether it drops whole rows or single annotations.
total = remove_props(total, filter)
# count_tags(total, entity_property_pair)

In [8]:
# Stratified 80/20 split of the pooled frame; the two label outputs are discarded.
train, dev, _, _ = train_test_split(
    total,
    total['stratified'],
    test_size=0.2,
    random_state=42,
    stratify=total['stratified'],
)
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

# Reload the rows written to temp.json and add them to the training split only.
appendables = pd.read_json('temp.json')
train = pd.concat([train, appendables]).reset_index(drop=True)

In [9]:
count_tags(train, entity_property_pair)

tags found:  5078
tag set of df:  23
tag set of offered:  25
difference:  {'제품 전체#다양성', '브랜드#디자인'}
본품#품질	1907
제품 전체#일반	1295
본품#일반	397
제품 전체#품질	396
제품 전체#디자인	230
본품#편의성	155
제품 전체#편의성	142
제품 전체#인지도	113
패키지/구성품#디자인	94
브랜드#일반	82
제품 전체#가격	74
패키지/구성품#편의성	52
패키지/구성품#일반	41
본품#다양성	26
본품#디자인	17
브랜드#품질	16
패키지/구성품#품질	15
브랜드#인지도	14
브랜드#가격	6
본품#가격	2
패키지/구성품#다양성	2
본품#인지도	1
패키지/구성품#가격	1


In [10]:
count_tags(dev, entity_property_pair)

tags found:  1254
tag set of df:  20
tag set of offered:  25
difference:  {'본품#인지도', '패키지/구성품#가격', '브랜드#디자인', '제품 전체#다양성', '본품#가격'}
본품#품질	473
제품 전체#일반	327
제품 전체#품질	97
본품#일반	94
제품 전체#디자인	56
제품 전체#편의성	38
본품#편의성	36
제품 전체#인지도	28
패키지/구성품#디자인	23
브랜드#일반	21
제품 전체#가격	18
패키지/구성품#편의성	13
패키지/구성품#일반	9
본품#다양성	5
패키지/구성품#품질	4
본품#디자인	4
브랜드#품질	3
브랜드#인지도	3
패키지/구성품#다양성	1
브랜드#가격	1


# Filter entity_property_pair and Drop rows accordingly

In [11]:
FILTER_MODE = False

In [12]:
# Optional extra pass over both splits, using the filter returned by get_filter().
# NOTE(review): get_filter and remove_props come from module.utils - confirm what they remove.
if FILTER_MODE == True:
    filter = get_filter()
    train = remove_props(train, filter)
    dev = remove_props(dev, filter)
# Cell result: row counts of both splits.
len(train), len(dev)

(4634, 1159)

# Preprocess

## Cleansing

### Before

In [13]:
# for el in train.sample(n=5).sentence_form:
#     print(el)

In [14]:
# train.sentence_form = train.sentence_form.apply(preprocess)
# dev.sentence_form = dev.sentence_form.apply(preprocess)
# test.sentence_form = test.sentence_form.apply(preprocess)
# total = pd.concat([train, dev])

### Test

In [15]:
# case = total.sentence_form.str.contains('r[^A-Za-z0-9가-힣\s]+', case=False, flags=0, na=None, regex=True)
# for e in total[case].sentence_form:
#     print(e)

### After

In [16]:
# for i, row in total[['id', 'sentence_form']].sample(n=5).iterrows():
#     print(row.id, '\t', row.sentence_form)

In [17]:
# total['check'] = total.sentence_form.str.find('OO')
# for row in total[total.check > -1].sentence_form:
#     print(row)
#     break

In [18]:
# total

## Augmentation

## Reformat

In [19]:
len(entity_property_pair)

25

In [20]:
decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split

(<function module.preprocess.decorate_form(form)>,
 <function module.preprocess.decorate_acd_pair(entity)>,
 <function module.preprocess.decorate_asc_pair(entity, sentiment)>,
 <function module.preprocess.decorate_acd_pair_split(entity)>,
 <function module.preprocess.decorate_asc_pair_split(entity, sentiment)>)

In [21]:
def reformat(df):
    """Expand annotated sentences into per-category classification rows.

    For each row of ``df`` whose ``sentence_form`` is not missing, one entry is
    produced for every category in the module-level ``entity_property_pair``.
    A category counts as present when some annotation of the row has it at
    index 0; the first such annotation supplies the sentiment (index 2).

    Args:
        df: DataFrame with ``id``, ``sentence_form`` and ``annotation`` columns.

    Returns:
        A tuple ``(ep, p)``:
        ``ep`` holds ``[id, decorated form, decorated pair, label]`` for every
        category, with the label taken from ``tf_name_to_id`` ('True' when the
        category is present, 'False' otherwise).
        ``p`` holds ``[id, raw sentence, category, sentiment]`` for present
        categories only.
    """
    ep = []
    p = []
    for _, row in df.iterrows():
        utterance = row.sentence_form
        # Skip rows without text before anything consumes the value; the guard
        # used to run only after decorate_form had already been given the NaN.
        if pd.isna(utterance):
            continue

        row_id = row.id
        form = decorate_form(utterance)

        for pair in entity_property_pair:
            # First annotation naming this category, or None when absent.
            match = next(
                (annotation for annotation in row.annotation if annotation[0] == pair),
                None,
            )
            label_name = 'False' if match is None else 'True'
            ep.append([row_id, form, decorate_acd_pair(pair), tf_name_to_id[label_name]])
            if match is not None:
                p.append([row_id, utterance, pair, match[2]])
    return ep, p

In [22]:
def reformat_p_binary(df):
    """Turn sentiment rows into one binary example per polarity.

    Args:
        df: DataFrame with ``id``, ``form``, ``pair`` and ``sentiment`` columns.

    Returns:
        A list of ``[id, decorated form, decorated pair+polarity, label]``
        with one entry per polarity in ``polarity_id_to_name``. The label is
        ``tf_name_to_id['True']`` for the polarity equal to the row's
        sentiment and ``tf_name_to_id['False']`` for the others.
    """
    p_binary = []
    for _, row in df.iterrows():
        row_id, raw_form, pair, gold = row.id, row.form, row.pair, row.sentiment
        form = decorate_form(raw_form)

        for sentiment in polarity_id_to_name:
            label_name = 'True' if sentiment == gold else 'False'
            asc_pair = decorate_asc_pair(pair, sentiment)
            p_binary.append([row_id, form, asc_pair, tf_name_to_id[label_name]])
    return p_binary

In [23]:
len(train), len(dev)

(4634, 1159)

In [24]:
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'

]

In [25]:
# Column layouts of the two kinds of row lists returned by reformat().
ep_columns = ['id', 'form', 'pair', 'labels']
p_columns = ['id', 'form', 'pair', 'sentiment']

ep_train_rows, p_train_rows = reformat(train)
ep_dev_rows, p_dev_rows = reformat(dev)

ep_train = pd.DataFrame(ep_train_rows, columns=ep_columns)
ep_dev = pd.DataFrame(ep_dev_rows, columns=ep_columns)

p_train = pd.DataFrame(p_train_rows, columns=p_columns)
p_dev = pd.DataFrame(p_dev_rows, columns=p_columns)

len(ep_train), len(ep_dev), len(p_train), len(p_dev)

(115850, 28975, 4967, 1231)

In [26]:
# Column layout of the row lists returned by reformat_p_binary().
binary_columns = ['id', 'form', 'pair', 'labels']

p_binary_train = pd.DataFrame(reformat_p_binary(p_train), columns=binary_columns)
p_binary_dev = pd.DataFrame(reformat_p_binary(p_dev), columns=binary_columns)

len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev)

(115850, 28975, 14901, 3693)

In [27]:
# ep_train.sort_values(['id', 'labels'], inplace=True)
# ep_dev.sort_values(['id', 'labels'], inplace=True)
# p_binary_train.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])
# p_binary_dev.sort_values(['id', 'labels'], inplace=True, ascending=[True, True])

### Counting

In [28]:
# Sizes before de-duplication. Each line is a single print call: the former
# `print(...), print(...)` form built a tuple, which the notebook then
# displayed as a spurious `(None, None)` cell result.
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))

ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
p_binary_train = p_binary_train.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()

print('\nafter drop_duplicates\n')
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
print('binary_binary:', len(ep_train), len(ep_dev), len(p_binary_train), len(p_binary_dev))

binary_multi: 115850 28975 4967 1231
binary_binary: 115850 28975 14901 3693

after drop_duplicates

binary_multi: 115850 28975 4967 1231
binary_binary: 115850 28975 14901 3693


(None, None)

### Validate Here

In [29]:
# Print the leading rows of every output frame for a visual check.
dfs = [ep_train, ep_dev, p_binary_train, p_binary_dev]
for df in dfs:
    # head(1000) limits by position. The earlier `idx == 999` stop compared
    # index labels, which no longer line up with positions once rows have been
    # dropped, so the cut-off could be missed entirely.
    for idx, row in df.head(1000).iterrows():
        print(row.id, '\n',
            row.form, '\n',
            row.pair, '\n',
            row.labels,  '\n',)

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#가격 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#다양성 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#디자인 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#인지도 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#일반 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#편의성 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#본품#품질 
 0 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#브랜드#가격 
 1 

nikluge-sa-2022-dev-01381 
 Target#자연에서 추출한 영양가득 #프로폴리스 와 #로얄제리추출물 이 피부를 매끄럽고 탄력있게 해줘요~ 
 Target#브랜드#디자인 
 1 

nikluge-sa-2

### Save

In [30]:
DATA_V = 'uncleaned_v8'
save_path = f'./dataset/{DATA_V}'
print(save_path)

./dataset/uncleaned_v8


In [31]:
# Create the output directory from Python so the cell does not depend on a
# shell that provides `mkdir -p`.
os.makedirs(save_path, exist_ok=True)

# File name -> frame, written in this order.
outputs = {
    'raw_train.csv': train,
    'raw_dev.csv': dev,
    'raw_test.csv': test,
    'ce_train.csv': ep_train,
    'ce_dev.csv': ep_dev,
    'pc_binary_train.csv': p_binary_train,
    'pc_binary_dev.csv': p_binary_dev,
}
for file_name, frame in outputs.items():
    frame.to_csv(f'{save_path}/{file_name}', index=False)

### Additional Length Test If Needed

In [166]:
# ep_train, ep_dev, p_binary_train, p_binary_dev

In [71]:
model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'

# Reload the raw splits that the save cell wrote for this data version.
train_path = f'./dataset/{DATA_V}/raw_train.csv'
dev_path = f'./dataset/{DATA_V}/raw_dev.csv'
test_path = f'./dataset/{DATA_V}/raw_test.csv'
train = pd.read_csv(train_path)
dev = pd.read_csv(dev_path)
test = pd.read_csv(test_path)

### new
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]
special_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Sentences of all three splits, built once and shared by the emoji scan and
# the tokenizer training. Missing texts are dropped so that ' '.join and the
# trainer only ever see strings.
all_sentences = pd.concat(
    [train.sentence_form, dev.sentence_form, test.sentence_form],
    ignore_index=True,
).dropna()

# demoji.findall returns a mapping keyed by the emoji found in the text.
emojis = sorted(demoji.findall(' '.join(all_sentences.to_list())).keys())
ep_labels = pd.Series(entity_property_pair, name='sentence_form', copy=True)

tokens2add = special_tokens + emojis

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
print(len(tokenizer))
tokenizer_train_data = all_sentences.drop_duplicates().to_list()
# NOTE(review): vocab_size=1 is far below any usable vocabulary; presumably it
# forces the smallest vocabulary the trainer will produce - confirm the intent.
new_tokenizer = tokenizer.train_new_from_iterator(tokenizer_train_data, vocab_size=1)
new_tokens = set(list(new_tokenizer.vocab.keys()) + tokens2add) - set(tokenizer.vocab.keys())
# Add in sorted order: iterating a set of strings can differ between
# interpreter runs, which would give the added tokens different ids each time.
tokenizer.add_tokens(sorted(new_tokens))
print(len(new_tokenizer))
print(len(tokenizer))
# model.resize_token_embeddings(len(tokenizer))

30000



3060
30117


In [72]:
# Token count of every (form, pair) input across the four output frames.
# Truncation is left off on purpose: with truncation=True an over-long input
# would be clipped, so this check could never report a length above the clip
# limit.
len_counter = [
    len(tokenizer(form, pair).input_ids)
    for frame in (ep_train, ep_dev, p_binary_train, p_binary_dev)
    for form, pair in zip(frame['form'], frame['pair'])
]

In [73]:
max(len_counter)

204

### done here.

## Save Files

In [None]:
# save_path = './dataset/cleaned_v1'

# train.to_csv(f'{save_path}/raw_train.csv', index=False)
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False)
# test.to_csv(f'{save_path}/raw_test.csv', index=False)

# ep_train.to_csv(f'{save_path}/ce_train.csv', index=False)
# ep_dev.to_csv(f'{save_path}/ce_dev.csv', index=False)
# p_train.to_csv(f'{save_path}/pc_train.csv', index=False)
# p_dev.to_csv(f'{save_path}/pc_dev.csv', index=False)
# p_binary_train.to_csv(f'{save_path}/pc_binary_train.csv', index=False)
# p_binary_dev.to_csv(f'{save_path}/pc_binary_dev.csv', index=False)

# ASC Augmentation

In [None]:
model_checkpoint = '/content/drive/MyDrive/aspect_based_sentiment_analysis/base_model/klue_roberta_base/v2/klue_roberta_base_mlm/checkpoint-19860'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
sTokens = tokenizer.all_special_tokens

def delTokens(sent):
    """Return ``sent`` without the space-separated words listed in ``sTokens``.

    The sentence is split on single spaces, every piece that equals one of the
    tokenizer's special tokens is discarded, and the rest is re-joined with
    single spaces.
    """
    return ' '.join(word for word in sent.split(' ') if word not in sTokens)

In [None]:
positive, negative, neutral = p_train[p_train.sentiment == 'positive'], p_train[p_train.sentiment == 'negative'], p_train[p_train.sentiment == 'neutral']

In [None]:
len(positive), len(negative), len(neutral)

In [None]:
(58 * 3) * 4 * 3, (95 * 3) * 4 * 2 # bt ri rr

Back Translation / Random Insertion / Random Replacement / Random Swap / Random Split

In [None]:
def backTrans(text):
    """Back-translate ``text`` through English and through Japanese.

    Each round trip goes Korean -> pivot -> Korean using ``ts.papago``.

    NOTE: ``ts`` must be in scope; the ``import translators as ts`` line in
    the first cell is commented out.

    Returns:
        A two-element list: the English round trip, then the Japanese one.
    """
    round_trips = []
    for pivot in ('en', 'ja'):
        forward = ts.papago(text, sleep_seconds=5, from_language='ko', to_language=pivot)
        backward = ts.papago(forward, sleep_seconds=5, from_language=pivot, to_language='ko')
        round_trips.append(backward)
    return round_trips

def randomInsert(num, sample, device):
    """Generate up to ``num`` distinct variants of ``sample`` by word insertion.

    Uses an nlpaug contextual-embedding augmenter loaded from
    ``model_checkpoint``. Special tokens are removed from each variant with
    ``delTokens`` and duplicate variants are collapsed.

    Args:
        num: Number of variants requested from the augmenter.
        sample: Input sentence.
        device: Device string passed to the augmenter.

    Returns:
        A list of distinct augmented sentences, in no particular order.
    """
    augmenter = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint,
        action="insert",
        model_type='bert',
        top_k=5,
        aug_p=0.3,
        aug_min=1,
        aug_max=1,
        device=device,
    )
    candidates = augmenter.augment(sample, n=num, num_thread=12)
    return list({delTokens(candidate) for candidate in candidates})

def randomReplace(num, sample, device):
    """Generate up to ``num`` distinct variants of ``sample`` by word replacement.

    Uses an nlpaug contextual-embedding augmenter loaded from
    ``model_checkpoint``. Special tokens are removed from each variant with
    ``delTokens`` and duplicate variants are collapsed.

    Args:
        num: Number of variants requested from the augmenter.
        sample: Input sentence.
        device: Device string passed to the augmenter.

    Returns:
        A list of distinct augmented sentences, in no particular order.
    """
    # "substitute" is the replacement action. This function used to pass
    # "insert", which made it an exact duplicate of randomInsert.
    aug = naw.ContextualWordEmbsAug(
        model_path=model_checkpoint, action="substitute", model_type='bert', top_k=5, aug_p=0.3, aug_min=1, aug_max=1, device=device)

    aug_result = aug.augment(sample, n=num, num_thread=12)
    aug_result = list(map(delTokens, aug_result))
    aug_result = list(set(aug_result))
    return aug_result

def randomSwap(num, sample):
    """Generate up to ``num`` distinct variants of ``sample`` with nlpaug's word-swap action.

    Returns:
        A list of distinct augmented sentences, in no particular order.
    """
    augmenter = naw.RandomWordAug(action='swap', aug_min=1, aug_max=1, aug_p=0.3)
    candidates = augmenter.augment(sample, n=num, num_thread=2)
    return list(set(candidates))

def randomSplit(num, sample):
    """Generate up to ``num`` distinct variants of ``sample`` with nlpaug's SplitAug.

    Returns:
        A list of distinct augmented sentences, in no particular order.
    """
    augmenter = naw.SplitAug(aug_min=1, aug_max=1, aug_p=0.3, min_char=3)
    candidates = augmenter.augment(sample, n=num, num_thread=2)
    return list(set(candidates))

In [None]:
(58 * 3) * 5 * 4, (95 * 3) * 4 * 3 # bt ri rr

In [None]:
def backtransRoutine(data2augment, output_path):
    """Extend ``data2augment`` with back-translated copies and save the result.

    Every row is ``[id, sentence, entity_property, sentiment]``. Each sentence
    is passed to ``backTrans``; a result is kept when it is non-empty, differs
    from the source sentence and the resulting row is not already in
    ``data2augment``.

    Args:
        data2augment: List of rows; extended in place.
        output_path: CSV path the combined rows are written to.

    Returns:
        DataFrame of all rows with columns ``id``, ``sentence_form``,
        ``entity_property`` and ``sentiment``.
    """
    print('back translation started.')
    temp = []
    for row in data2augment:
        for aug in backTrans(row[1]):
            if aug != '' and aug != row[1]:
                new = [row[0], aug, row[2], row[3]]
                if new not in data2augment:
                    temp.append(new)
    data2augment.extend(temp)
    # Print the message itself; it used to be wrapped in len(), so only the
    # character count of the message was shown.
    print(f'back translation finished.\ncurrent count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
import os

def _collect_new_rows(rows, augment_text, existing):
    """Build augmented copies of ``rows`` that are not yet in ``existing``.

    ``augment_text`` maps a sentence to a list of candidate sentences. A
    candidate is kept when it is non-empty, differs from its source sentence
    and the resulting row is not already in ``existing``.
    """
    fresh = []
    for row in rows:
        for aug in augment_text(row[1]):
            if aug != '' and aug != row[1]:
                new = [row[0], aug, row[2], row[3]]
                if new not in existing:
                    fresh.append(new)
    return fresh


def edaRoutine(data2augment, ri, rr, output_path, target_count=None, seed_count=None, device='cuda'):
    """Grow ``data2augment`` by insertion, replacement, then swap/split, and save it.

    Every row is ``[id, sentence, entity_property, sentiment]``. The stages run
    in order: random insertion over all rows, random replacement over all rows
    (including the ones just added), then single swap-or-split augmentations
    of randomly chosen rows until ``target_count`` rows exist.

    NOTE: the last stage keeps looping for as long as the list is shorter than
    ``target_count``, including when no augmentation yields a new row.

    Args:
        data2augment: List of rows; extended in place.
        ri: Number of variants requested per sentence from ``randomInsert``.
        rr: Number of variants requested per sentence from ``randomReplace``.
        output_path: CSV path the combined rows are written to.
        target_count: Row count at which the swap/split stage stops. Defaults
            to ``len(positive)`` (module-level frame), as before.
        seed_count: The swap/split stage samples its source row from the first
            ``seed_count`` rows. Defaults to ``len(negative)`` (module-level
            frame), as before; pass the matching size when augmenting another
            class. Capped at the current list length so sampling cannot index
            past the end.
        device: Device string for the contextual augmenters.

    Returns:
        DataFrame of all rows with columns ``id``, ``sentence_form``,
        ``entity_property`` and ``sentiment``.
    """
    print(f'current count: {len(data2augment)}')

    print('random insertion started.')
    data2augment.extend(
        _collect_new_rows(data2augment, lambda text: randomInsert(ri, text, device), data2augment)
    )
    print(f'random insertion finished.\ncurrent count: {len(data2augment)}')

    print('random replacement started.')
    data2augment.extend(
        _collect_new_rows(data2augment, lambda text: randomReplace(rr, text, device), data2augment)
    )
    print(f'random replacement finished.\ncurrent count: {len(data2augment)}')

    print('random swap and split started.')
    if target_count is None:
        target_count = len(positive)
    if seed_count is None:
        seed_count = len(negative)
    seed_count = min(seed_count, len(data2augment))

    while len(data2augment) < target_count:
        source = data2augment[random.randrange(seed_count)]
        # Coin flip between the two cheap augmenters.
        augmenter = randomSwap if random.randint(0, 1) == 0 else randomSplit
        data2augment.extend(
            _collect_new_rows([source], lambda text: augmenter(1, text), data2augment)
        )
        if len(data2augment) % 25 == 0:
            print(f'random swap and split in progress.\ncurrent count: {len(data2augment)}')

    print(f'whole augmentation routine finished.\ntotal count: {len(data2augment)}')

    data_aug = pd.DataFrame(data2augment, columns=['id', 'sentence_form', 'entity_property', 'sentiment'])
    data_aug.to_csv(f'{output_path}', index=False)

    return data_aug

In [None]:
### negative
# # back translation

# data2augment = negative.values.tolist()

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'negative_bt.csv'
output_path = os.path.join(output_folder, output_file)

# negative_bt = backtransRoutine(data2augment, output_path)
negative_bt = pd.read_csv(output_path)
negative_bt = negative_bt.values.tolist()
# RI / RR

ri = 4 # times - 1
rr = 3 # times - 1

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'negative_aug.csv'
output_path = os.path.join(output_folder, output_file)

# negative_aug = edaRoutine(negative_bt, ri, rr, output_path)
negative_aug = pd.read_csv(output_path)

In [None]:
negative_aug
negative_aug = negative_aug.drop_duplicates()

In [None]:
# negative_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# negative_aug.sort_values('id').head(50).sentence_form

In [None]:
### neutral
# back translation

# data2augment = neutral.values.tolist()

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'neutral_bt.csv'
output_path = os.path.join(output_folder, output_file)

# neutral_bt = backtransRoutine(data2augment, output_path)
neutral_bt = pd.read_csv(output_path)
neutral_bt = neutral_bt.values.tolist()

# RI / RR

ri = 3 # times - 1
rr = 2 # times - 1

output_folder = '/content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11'
output_file = 'neutral_aug.csv'
output_path = os.path.join(output_folder, output_file)

# neutral_aug = edaRoutine(neutral_bt, ri, rr, output_path)
neutral_aug = pd.read_csv(output_path)

In [None]:
neutral_aug
neutral_aug = neutral_aug.drop_duplicates()

In [None]:
# neutral_aug.sample(n=15, frac=None, replace=False, weights=None, random_state=None, axis=None, ignore_index=False)

In [None]:
# neutral_aug.sort_values('id').head(50).sentence_form

In [None]:
# `positive` is a slice of p_train, whose columns are id/form/pair/sentiment,
# while the augmented frames use id/sentence_form/entity_property/sentiment.
# Align the names first; otherwise concat keeps both sets of columns and fills
# the mismatched halves with NaN.
p_train_aug = pd.concat([
    positive.rename(columns={'form': 'sentence_form', 'pair': 'entity_property'}),
    negative_aug,
    neutral_aug,
])

In [None]:
def reformat_p_binary(df):
    """Turn sentiment rows into one binary example per polarity.

    NOTE: this re-defines ``reformat_p_binary`` from the Reformat section and
    reads different column names.

    Args:
        df: DataFrame with ``id``, ``sentence_form``, ``entity_property`` and
            ``sentiment`` columns.

    Returns:
        A list of ``[id, sentence, 'entity_property#polarity', label]`` with
        one entry per polarity in ``polarity_id_to_name``. The label is
        ``tf_name_to_id['True']`` for the polarity equal to the row's
        sentiment and ``tf_name_to_id['False']`` for the others.
    """
    p_binary = []
    for _, row in df.iterrows():
        row_id, text, entity, gold = row.id, row.sentence_form, row.entity_property, row.sentiment
        for sentiment in polarity_id_to_name:
            label_name = 'True' if sentiment == gold else 'False'
            p_binary.append([row_id, text, '#'.join([entity, sentiment]), tf_name_to_id[label_name]])
    return p_binary

In [None]:
p_binary_train_aug = reformat_p_binary(p_train_aug)
p_binary_train_aug = pd.DataFrame(p_binary_train_aug, columns=['id', 'sentence_form', 'entity_property', 'labels'])

In [None]:
p_binary_train_aug

In [None]:
# p_dev carries the form/pair column names, but the reformat_p_binary defined
# in this section reads sentence_form/entity_property; rename before the call.
p_binary_dev = reformat_p_binary(
    p_dev.rename(columns={'form': 'sentence_form', 'pair': 'entity_property'})
)
p_binary_dev = pd.DataFrame(p_binary_dev, columns=['id', 'sentence_form', 'entity_property', 'labels'])

In [None]:
p_binary_dev

# Counting

In [None]:
len(ep_train), len(ep_dev), len(p_train), len(p_dev)

In [None]:
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

In [None]:
ep_train = ep_train.drop_duplicates()
p_binary_train_aug = p_binary_train_aug.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_binary_dev = p_binary_dev.drop_duplicates()
len(ep_train), len(ep_dev), len(p_binary_train_aug), len(p_binary_dev)

# Export

In [None]:
%cd /content/drive/MyDrive/aspect_based_sentiment_analysis/data/v11

# train.to_csv('raw_train.csv', index=False)
# dev.to_csv('raw_dev.csv', index=False)
# test.to_csv('raw_test.csv', index=False)

ep_train.to_csv('ce_train.csv', index=False)
p_binary_train_aug.to_csv('pc_binary_train_aug.csv', index=False)
ep_dev.to_csv('ce_dev.csv', index=False)
p_binary_dev.to_csv('pc_binary_dev.csv', index=False)

In [None]:
# emojis = pd.concat([ep_train.sentence_form, p_train.sentence_form, ep_dev.sentence_form, p_dev.sentence_form], ignore_index=True, verify_integrity=True).to_frame()
# emojis = list(set(demoji.findall(' '.join(emojis.sentence_form.to_list())).keys()))

In [None]:
df = pd.read_csv('ce_train.csv')
df[df.id == 'nikluge-sa-2022-train-00065']

In [None]:
df = pd.read_csv('ce_dev.csv')
df

In [None]:
df = pd.read_csv('pc_binary_train_aug.csv')
df

In [None]:
df = pd.read_csv('pc_binary_dev.csv')
df