In [1]:
import os
import re, math, random, json
from collections import Counter
from copy import deepcopy

import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import (
    AutoTokenizer, AutoModelForMaskedLM,
)

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, get_filter
from module.augmentation import back_trans, random_replace, random_insert, random_swap, random_split

# import demoji
# from cleantext import clean
# from pykospacing import Spacing
# from hanspell import spell_checker

Using state  server backend.


In [2]:
def jsonload(fname, encoding="utf-8"):
    """Read the JSON document stored at ``fname`` and return the parsed object.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        Whatever ``json.load`` produces for the file's content.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)


def jsondump(j, fname):
    """Serialize the JSON-compatible object ``j`` into the file ``fname``.

    The file is (over)written as UTF-8 text and non-ASCII characters are
    stored verbatim rather than as ``\\uXXXX`` escapes.

    Args:
        j: Object to serialize.
        fname: Destination path.
    """
    serialized = json.dumps(j, ensure_ascii=False)
    with open(fname, mode="w", encoding="UTF8") as sink:
        sink.write(serialized)

def jsonlload(fname, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (for example a trailing empty line at the end of
    the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        list: The parsed records, in file order.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the handle lazily rather than materializing every line first.
        for line in f:
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

# Load Raw Data

In [3]:
train_json = './dataset/nikluge-sa-2022-train.jsonl'
dev_json = './dataset/nikluge-sa-2022-dev.jsonl'
test_json = './dataset/nikluge-sa-2022-test.jsonl'

# Parse each JSONL split and wrap its records in a DataFrame in one step.
train = pd.DataFrame(jsonlload(train_json))
dev = pd.DataFrame(jsonlload(dev_json))
test = pd.DataFrame(jsonlload(test_json))

# train = pd.read_json(train_json, lines=True)
# dev = pd.read_json(dev_json, lines=True)
# test = pd.read_json(test_json, lines=True)

# Remove one row from each split by index label.
# NOTE(review): the reason rows 2319 / 1692 are excluded is not recorded here — confirm.
train = train.drop(index=2319)
dev = dev.drop(index=1692)

# Declare Stuff to use

In [4]:
### new
# Every "entity#property" category label handled by this notebook,
# grouped by entity (one group per paragraph).
entity_property_pair = [
    '본품#가격',
    '본품#다양성',
    '본품#디자인',
    '본품#인지도',
    '본품#일반',
    '본품#편의성',
    '본품#품질',

    '브랜드#가격',
    '브랜드#디자인',
    '브랜드#인지도',
    '브랜드#일반',
    '브랜드#품질',

    '제품 전체#가격',
    '제품 전체#다양성',
    '제품 전체#디자인',
    '제품 전체#인지도',
    '제품 전체#일반',
    '제품 전체#편의성',
    '제품 전체#품질',

    '패키지/구성품#가격',
    '패키지/구성품#다양성',
    '패키지/구성품#디자인',
    '패키지/구성품#일반',
    '패키지/구성품#편의성',
    '패키지/구성품#품질',
]

# Placeholder tokens of the form &...& (not referenced again in this notebook section).
more_tokens = [
    '&name&',
    '&affiliation&',
    '&social-security-num&',
    '&tel-num&',
    '&card-num&',
    '&bank-account&',
    '&num&',
    '&online-account&',
]

# Label vocabularies: position in the list is the integer id.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

In [5]:
# Pool the train and dev splits into one frame with a fresh 0..n-1 index.
total = pd.concat([train, dev], ignore_index=True)

In [6]:
# For every row, collect the distinct values found at position 2 of its
# annotations (the polarity label) and store them as a new column.
sentiments = [
    list(set([annotation[2] for annotation in annotations]))
    for annotations in total.annotation
]
total['sentiments'] = sentiments

In [7]:
# For every row, collect the distinct values found at position 0 of its
# annotations (the entity#property label) and store them as a new column.
eps = [
    list(set([annotation[0] for annotation in annotations]))
    for annotations in total.annotation
]
total['eps'] = eps

In [8]:
# Categories whose rows are kept out of the train/dev split and added to the
# training set afterwards.
minor = [
    '브랜드#가격',
    '패키지/구성품#다양성',
    '본품#가격',
    '본품#인지도',
    '패키지/구성품#가격',
]

In [9]:
def check_if_in_minor(eps):
    """Return True when at least one label in ``eps`` appears in the global ``minor`` list.

    Args:
        eps: Iterable of entity#property labels.

    Returns:
        bool: ``True`` on the first match, ``False`` when nothing matches
        (including when ``eps`` is empty).
    """
    return any(label in minor for label in eps)

# Index labels of the rows that carry at least one minor category.
indices_in_minor = total.loc[total.eps.apply(check_if_in_minor).eq(True)].index

In [10]:
# Keep a copy of the minor-category rows so they can be added back to the
# training set after the split. `indices_in_minor` holds index *labels*, so
# select with .loc; .iloc only returned the right rows while labels happened
# to equal positions.
rows_in_minor = total.loc[indices_in_minor].copy()

In [11]:
# Exclude the minor-category rows from the frame that gets split.
# Re-running this cell on its own raises KeyError because the labels are already gone.
total = total.drop(index=indices_in_minor)

In [12]:
# Stratification key: the polarity (position 2) of each row's FIRST annotation.
total['stratified'] = total.annotation.apply(lambda x: x[0][2])

# 80/20 split stratified on that key. Only the frame itself is split: passing
# the label column as a second array produced two outputs that were thrown
# away into `_`, which also clobbers IPython's last-output variable. The
# selected rows are unchanged because the split indices depend only on the
# sample count, `stratify` and `random_state`.
train, dev = train_test_split(
    total, test_size=0.2, random_state=42, stratify=total['stratified']
)

# Re-number both splits from 0 without mutating in place.
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)

In [13]:
# Put the held-out minor-category rows back into the training split only.
# Re-running this cell on its own appends them a second time.
train = pd.concat([train, rows_in_minor], ignore_index=True)
# The appended rows have no 'stratified' value yet, so recompute the column
# (polarity of the first annotation) for the whole frame.
train['stratified'] = train.annotation.map(lambda annotations: annotations[0][2])

In [14]:
# Flatten the per-row polarity lists and tally them, most frequent first.
to_count = [label for labels in train.sentiments for label in labels]
sentiment_counter = Counter(to_count).most_common()

for label, count in sentiment_counter:
    print(f'{label}\t{count}')

positive	4460
neutral	117
negative	69


In [15]:
# Augmentation candidates: sentences with more than 4 space-separated tokens
# that carry exactly one annotation.
checker_one = train.sentence_form.map(lambda text: len(text.split(' ')))
condition_one = checker_one.gt(4)
checker_two = train.annotation.map(len)
condition_two = checker_two.eq(1)

rows2aug = train.loc[condition_one & condition_two]

# count_tags(rows2aug, entity_property_pair)

In [16]:
# Polarity classes whose sample pools are built and then grown by augmentation.
sentiment2aug = ['negative', 'neutral']

In [17]:
# Per-polarity pools of [id, sentence_form, annotation] triples (plain lists),
# taken from the augmentation candidates.
dfs = {
    label: rows2aug.loc[
        rows2aug.stratified == label, ['id', 'sentence_form', 'annotation']
    ].values.tolist()
    for label in sentiment2aug
}

In [18]:
# Sanity check: show which polarity pools were built.
dfs.keys()

dict_keys(['negative', 'neutral'])

# Augmentation

Back Translation / Random Insertion / Random Replacement / Random Swap / Random Deletion / Random Split

In [19]:
# Augmentation functions to choose from at random; each one is invoked in the
# augmentation loop as fn(gen_num, sentence_form).
augmenters = [random_replace, random_insert]
# Size every augmented polarity pool is grown to; 4460 matches the 'positive'
# count printed by the sentiment tally earlier in the notebook.
target_num = 4460
# First argument handed to every augmenter call — presumably the number of
# variants to generate per call; confirm in module.augmentation.
gen_num = 3

In [20]:
# Grow every polarity pool in `dfs` to exactly `target_num` samples by
# repeatedly augmenting a randomly drawn sample (originals and earlier
# augmentations alike) with a randomly drawn augmenter.

# Seed the sample/augmenter draws so a re-run picks the same sequence.
# NOTE(review): randomness inside the augmenters themselves may need its own seed.
random.seed(42)

for sentiment in tqdm(dfs.keys()):
    samples = dfs[sentiment]
    if len(samples) >= target_num:
        continue
    if not samples:
        # random.randrange(0) would fail with an unhelpful "empty range" message.
        raise ValueError(f'no samples available to augment for sentiment {sentiment!r}')

    # Hashable fingerprints of everything already in the pool, so the duplicate
    # check is O(1) instead of scanning the growing list on every candidate.
    # Assumes the sentences produced by the augmenters are strings — TODO confirm.
    seen = {(s[0], s[1], repr(s[2])) for s in samples}

    # Keep generating until the pool exceeds the target, then trim to it.
    while len(samples) <= target_num:
        # `sample_id` rather than `id`, to avoid shadowing the builtin.
        sample_id, sentence_form, annotation = samples[random.randrange(len(samples))]

        augmenter = augmenters[random.randrange(len(augmenters))]
        for aug_result in augmenter(gen_num, sentence_form):
            key = (sample_id, aug_result, repr(annotation))
            if key not in seen:
                seen.add(key)
                # The augmented sentence keeps the id and annotation of its source.
                samples.append([sample_id, aug_result, annotation])

    dfs[sentiment] = samples[:target_num]

100%|██████████| 2/2 [03:07<00:00, 93.71s/it]


In [21]:
# Majority pool: [id, sentence_form, annotation] triples of every training row
# whose first annotation is positive. tolist() already returns a fresh list.
major = train.loc[
    train.stratified == 'positive', ['id', 'sentence_form', 'annotation']
].values.tolist()

In [22]:
# Append every augmented polarity pool to the positive samples.
# The loop variable is deliberately not named `minor`: that name is the
# module-level category list read by check_if_in_minor(), and rebinding it
# here would silently break that function on any later call.
# NOTE: `major` is extended in place, so re-running this cell without
# rebuilding `major` first appends the pools a second time.
minors = list(dfs.values())

for minority_samples in minors:
    major.extend(minority_samples)

In [23]:
# Turn the combined list of [id, sentence_form, annotation] triples back into a DataFrame.
augmented_asc_train = pd.DataFrame(major, columns=['id', 'sentence_form', 'annotation'])

In [24]:
# Row count the balanced set would have if all three polarity pools held 4460 samples.
# NOTE(review): the actual frame can be smaller, because the positive pool is
# selected by first-annotation polarity and is not grown to 4460 — confirm.
4460 * 3

13380

In [25]:
def reformat(df):
    """Expand each sentence into per-category classification rows.

    Rows whose ``sentence_form`` is missing are skipped. For every other row
    and every label in the global ``entity_property_pair``:

    * a category row ``[id, sentence, label, tf_id]`` is emitted, where
      ``tf_id`` is ``tf_name_to_id['True']`` when some annotation of the row
      carries that label and ``tf_name_to_id['False']`` otherwise;
    * when the label is present, a polarity row
      ``[id, sentence, label, polarity_id]`` is emitted as well, using the
      polarity of the first annotation with that label.

    Args:
        df: DataFrame with ``id``, ``sentence_form`` and ``annotation``
            columns; each annotation is indexable with the label at
            position 0 and the polarity name at position 2.

    Returns:
        tuple[list, list]: ``(category_rows, polarity_rows)``.
    """
    category_rows = []
    polarity_rows = []
    for _, record in df.iterrows():
        text = record.sentence_form
        sample_id = record.id

        # Nothing can be derived from a missing sentence.
        if pd.isna(text):
            continue

        # Polarity of the first annotation seen for each label.
        first_polarity = {}
        for annotation in record.annotation:
            first_polarity.setdefault(annotation[0], annotation[2])

        for pair in entity_property_pair:
            if pair in first_polarity:
                category_rows.append([sample_id, text, pair, tf_name_to_id['True']])
                polarity_rows.append(
                    [sample_id, text, pair, polarity_name_to_id[first_polarity[pair]]]
                )
            else:
                category_rows.append([sample_id, text, pair, tf_name_to_id['False']])
    return category_rows, polarity_rows

In [26]:
# Row counts of the balanced training frame and the dev split.
len(augmented_asc_train), len(dev)

(13375, 1156)

In [27]:
# Expand both splits into category rows (ep_*) and polarity rows (p_*).
ep_train, p_train = reformat(augmented_asc_train)
ep_dev, p_dev = reformat(dev)

# All four row lists share the same layout, so wrap them in one pass.
pair_columns = ['id', 'form', 'pair', 'labels']
ep_train, ep_dev, p_train, p_dev = (
    pd.DataFrame(rows, columns=pair_columns)
    for rows in (ep_train, ep_dev, p_train, p_dev)
)

len(ep_train), len(ep_dev), len(p_train), len(p_dev)

(334375, 28900, 13710, 1218)

### Counting

In [28]:
# Report row counts, drop exact duplicate rows, and report again.
# Each report is a single print call: the former `print(...), print(...)`
# built a tuple, which the notebook echoed as a stray `(None, None)` output.
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))
ep_train = ep_train.drop_duplicates()
ep_dev = ep_dev.drop_duplicates()
p_train = p_train.drop_duplicates()
p_dev = p_dev.drop_duplicates()
print('\nafter drop_duplicates\n')
print('binary_multi:', len(ep_train), len(ep_dev), len(p_train), len(p_dev))

binary_multi: 334375 28900 13710 1218

after drop_duplicates

binary_multi: 334375 28900 13710 1218


(None, None)

### Validate Here

In [29]:
# dfs = [ep_train, ep_dev, p_train, p_dev]
# for df in dfs:
#     for idx, row in df.iterrows():
#         print(row.id, '\n',
#             row.form, '\n',
#             row.pair, '\n',
#             row.labels,  '\n',)
#         if idx == 999:
#             break

### Save

In [30]:
# Version tag of the exported dataset; it names the output directory.
DATA_V = 'uncleaned_v16'
save_path = './dataset/' + DATA_V
print(save_path)

./dataset/uncleaned_v16


In [31]:
# Create the output directory (and any missing parents) from Python rather
# than through `!mkdir -p`: it works on every platform and does not fork the
# kernel process, which is what triggered the tokenizers parallelism warning.
os.makedirs(save_path, exist_ok=True)

# train.to_csv(f'{save_path}/raw_train.csv', index=False, encoding='utf-8-sig')
# dev.to_csv(f'{save_path}/raw_dev.csv', index=False, encoding='utf-8-sig')
# test.to_csv(f'{save_path}/raw_test.csv', index=False, encoding='utf-8-sig')

# Export the polarity-classification rows; utf-8-sig writes a BOM at the start of each file.
p_train.to_csv(f'{save_path}/pc_train_balanced.csv', index=False, encoding='utf-8-sig')
p_dev.to_csv(f'{save_path}/pc_dev.csv', index=False, encoding='utf-8-sig')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
