In [1]:
from transformers import (
    AutoConfig, ElectraTokenizerFast, ElectraForTokenClassification, 
    DataCollatorForTokenClassification,
    TrainingArguments, Trainer,
)

import re, math, random, json, os
from copy import deepcopy
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
from collections import Counter

from module.preprocess import decorate_form, decorate_acd_pair, decorate_asc_pair, decorate_acd_pair_split, decorate_asc_pair_split
from module.utils import count_tags, make_token_classification_pair, remove_props, align_tokens_and_labels, get_filter, generate_token_classification_data, adjust_target
import demoji

  from .autonotebook import tqdm as notebook_tqdm


# Declare constants and label mappings

In [2]:
# Every "entity#property" category, grouped by entity. reformat() iterates
# this list to enumerate the candidate pairs for each target.
entity_property_pair = [
    f'{entity}#{prop}'
    for entity, props in (
        ('본품', ('가격', '다양성', '디자인', '인지도', '일반', '편의성', '품질')),
        ('브랜드', ('가격', '디자인', '인지도', '일반', '품질')),
        ('제품 전체', ('가격', '다양성', '디자인', '인지도', '일반', '편의성', '품질')),
        ('패키지/구성품', ('가격', '다양성', '디자인', '일반', '편의성', '품질')),
    )
    for prop in props
]

# NOTE(review): presumably placeholder tokens for masked personal data that
# get added to the tokenizer; not referenced by the cells below - confirm.
more_tokens = [
    '&name&',
    '&affiliation&',
    '&social-security-num&',
    '&tel-num&',
    '&card-num&',
    '&bank-account&',
    '&num&',
    '&online-account&',
]

# Binary label vocabulary. Note the order: 'True' maps to 0, 'False' to 1.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

# Sentiment vocabulary and its name -> id lookup.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

In [3]:
# Dataset version tag; it names the directory under ./dataset that the
# cells below read from and write to.
DATA_V = 'uncleaned_v12'
save_path = './dataset/{}'.format(DATA_V)
print(save_path)

./dataset/uncleaned_v12


# ACD and ASC Preprocess

In [4]:
# Read the tagger train/dev splits stored as JSON under save_path.
train, dev = (
    pd.read_json(path)
    for path in (f'{save_path}/tagger_train.json', f'{save_path}/tagger_dev.json')
)

In [5]:
# Keep only the three columns that reformat() reads from each row.
train, dev = (
    frame[['id', 'sentence_form', 'annotation']]
    for frame in (train, dev)
)

In [6]:
def reformat(df, pairs=None, polarities=None, label_ids=None):
    """Expand annotated sentences into binary (sentence, candidate) samples.

    For every distinct target found in a row's annotations, one sample is
    emitted per (entity#property, polarity) combination. A sample is labelled
    'True' when that exact combination is annotated for the target and
    'False' otherwise.

    Args:
        df: DataFrame with the columns ``id``, ``sentence_form`` and
            ``annotation``. Each annotation is indexed as
            ``[entity#property, [target, ...], polarity]``.
        pairs: Candidate entity#property strings. Defaults to the
            module-level ``entity_property_pair``.
        polarities: Candidate polarity strings. Defaults to the module-level
            ``polarity_id_to_name``.
        label_ids: Mapping holding the ids for the keys ``'True'`` and
            ``'False'``. Defaults to the module-level ``tf_name_to_id``.

    Returns:
        A list of ``[id, form, 'target#entity#property#polarity', label_id]``
        lists, ordered by row, then by first appearance of the target, then
        by ``pairs`` order, then by ``polarities`` order.
    """
    # Resolve the defaults at call time so the notebook globals are still
    # honoured when the caller passes nothing.
    pairs = entity_property_pair if pairs is None else pairs
    polarities = polarity_id_to_name if polarities is None else polarities
    label_ids = tf_name_to_id if label_ids is None else label_ids
    true_id, false_id = label_ids['True'], label_ids['False']

    tabsa = []
    for _, row in df.iterrows():
        form = row['sentence_form']
        # A row without a sentence contributes no samples. Checking once per
        # row replaces the old guard buried inside the innermost loops.
        if pd.isna(form):
            continue
        sample_id = row['id']

        # target -> set of (entity#property, polarity) annotated for it.
        # dict preserves insertion order, so targets keep first-seen order.
        opinions_by_target = {}
        for annotation in row['annotation']:
            target = annotation[1][0]
            opinions_by_target.setdefault(target, set()).add(
                (annotation[0], annotation[2])
            )

        for target, opinions in opinions_by_target.items():
            for pair in pairs:
                for polarity in polarities:
                    # Set membership replaces a linear scan per candidate.
                    label = true_id if (pair, polarity) in opinions else false_id
                    tabsa_pair = '#'.join([target, pair, polarity])
                    tabsa.append([sample_id, form, tabsa_pair, label])
    return tabsa

In [7]:
# Expand each split with reformat() and wrap the resulting rows in a
# DataFrame with named columns.
tabsa_train, tabsa_dev = (
    pd.DataFrame(reformat(split), columns=['id', 'form', 'pair', 'labels'])
    for split in (train, dev)
)

# Show the number of generated samples per split.
len(tabsa_train), len(tabsa_dev)

(374100, 92325)

### Counting

In [8]:
# Drop exact duplicate rows and report the split sizes before and after.
# A single print() per line emits the same text as before without the cell
# echoing a stray `(None, None)` tuple as its result.
print('before:', len(tabsa_train), len(tabsa_dev))

tabsa_train = tabsa_train.drop_duplicates()
tabsa_dev = tabsa_dev.drop_duplicates()

print('after:', len(tabsa_train), len(tabsa_dev))

before: 374100 92325
after: 374100 92325


(None, None)

### Validate Here

In [9]:
# for idx, row in train.iterrows():
#     if len(row.annotation) == 5:
#         print(row.id)
#         print()
#         print(row.sentence_form)
#         print()
#         for annotation in row.annotation:
#             print(annotation)

In [10]:
# df = tabsa_train
# for idx, row in df.iterrows():
#     if row.id == 'nikluge-sa-2022-train-00174':
#         print(row.id, '\n',
#             row.form, '\n',
#             row.pair, '\n',
#             row.labels, '\n',)

### Save

In [11]:
# Output directory for the generated CSV files (same value as the one
# declared near the top of the notebook).
save_path = './dataset/{}'.format(DATA_V)
print(save_path)

./dataset/uncleaned_v12


In [12]:
# Create the output directory from Python instead of shelling out to
# `mkdir -p`: this works on any OS and is safe for paths containing spaces.
os.makedirs(save_path, exist_ok=True)

# Write both splits as UTF-8 CSV without the DataFrame index column.
tabsa_train.to_csv(f'{save_path}/tabsa_train.csv', index=False, encoding='utf-8')
tabsa_dev.to_csv(f'{save_path}/tabsa_dev.csv', index=False, encoding='utf-8')

### Additional Length Test If Needed

In [None]:
# ep_train, ep_dev, p_binary_train, p_binary_dev

In [None]:
# Load the saved tokenizer used for the length check below.
# NOTE(review): this path points at uncleaned_v11 while DATA_V is
# 'uncleaned_v12' - confirm the older tokenizer is the intended one.
model_checkpoint = 'dataset/uncleaned_v11/tokenizer'
tokenizer = ElectraTokenizerFast.from_pretrained(
    pretrained_model_name_or_path=model_checkpoint,
)

In [None]:
# Collect the tokenized length of every (form, pair) input in both splits.
# Iterating the two columns directly avoids building a Series per row, which
# iterrows() does; truncation=True caps each length at the tokenizer's limit.
len_counter = []
for df in (tabsa_train, tabsa_dev):
    for form, pair in zip(df['form'], df['pair']):
        len_counter.append(len(tokenizer(form, pair, truncation=True).input_ids))

In [None]:
# Longest tokenized (form, pair) input seen across both splits; the values
# were computed with truncation=True, so they are capped by the tokenizer.
max(len_counter)