In [None]:
import json
import os

import pandas as pd

# The project star imports stay ahead of the transformers import (as before)
# so that any overlapping names are still resolved in the same order.
from module.load_json import *
from module.utils import *
from module.maps import *

from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, 
    TrainingArguments, Trainer,
)

In [None]:
# Relative paths to the two NIKL ABSA 2021 v1.0 JSON files that are loaded
# below as the train and dev sources (resolved against the working directory).
train_path = 'dataset/NIKL_ABSA_2021_v1.0/EXSA2112203180.json'
dev_path = 'dataset/NIKL_ABSA_2021_v1.0/EXSA2122203180.json'

In [None]:
# Read each JSON file and turn its 'document' entry into a DataFrame.
# jsonload is not defined in this notebook; presumably it comes from the
# star import of module.load_json -- TODO confirm.
train = pd.DataFrame(jsonload(train_path)['document'])
dev = pd.DataFrame(jsonload(dev_path)['document'])

# Stack train on top of dev with a fresh 0..n-1 index.
total = pd.concat([train, dev], ignore_index=True)

In [None]:
# Lists the distinct domains; as a non-final expression its value is not
# rendered, it is kept here only as an inspection aid.
total.domain.unique()
categories = ['제품 기타', '전자기기', '화장품/세정제']

# Boolean mask: True where the row's domain is one of the selected categories.
checker = total['domain'].isin(categories)
total = total[checker]

# Drop every row that has a missing value in any column, then keep only the
# two columns used by the following cells and renumber the rows.
total = total.dropna()
total = total[['sentence', 'opinions']].reset_index(drop=True)

In [None]:
def collect_sentences(sentences, max_sentences=5):
    """Join the leading sentence texts of one document into a single string.

    Args:
        sentences: Iterable of mappings, each with a 'sentence_form' key
            holding the sentence text.
        max_sentences: Maximum number of leading sentences to keep.
            Defaults to 5, the cap that used to be hard-coded.

    Returns:
        The first ``max_sentences`` 'sentence_form' values joined with a
        single space; an empty string when ``sentences`` is empty.

    Raises:
        KeyError: If one of the kept entries has no 'sentence_form' key.
    """
    kept_forms = []
    for sentence in sentences:
        # Stop as soon as the cap is reached instead of appending and
        # re-slicing the list on every iteration.
        if len(kept_forms) >= max_sentences:
            break
        kept_forms.append(sentence['sentence_form'])
    return ' '.join(kept_forms)

In [None]:
def collect_opinions(opinions):
    """Convert opinion records into [category, [], polarity] triples.

    Args:
        opinions: Iterable of mappings with 'category' and
            'opinion polarity' keys.

    Returns:
        A list with one ``[category, [], polarity]`` entry per input record,
        in input order, leaving out records whose polarity is 'conflict'.
        The middle element is always a new empty list.
    """
    kept = []
    for entry in opinions:
        aspect, sentiment = entry['category'], entry['opinion polarity']
        if sentiment == 'conflict':
            # Conflicting polarities are excluded from the output.
            continue
        kept.append([aspect, [], sentiment])
    return kept

In [None]:
# Collapse each row's sentence records into one string and its opinion
# records into [category, [], polarity] triples.
total['sentence'] = total['sentence'].apply(collect_sentences)
total['opinions'] = total['opinions'].apply(collect_opinions)

# Keep only rows that still have at least one opinion.
total = total[total['opinions'].apply(len) > 0]

# Move the current row index into a column, then name the three resulting
# columns id / sentence_form / annotation.
total = total.reset_index().set_axis(
    ['id', 'sentence_form', 'annotation'], axis='columns'
)

In [None]:
# Checkpoint identifier passed to from_pretrained; the tokenizer is loaded
# for this same checkpoint.
model_checkpoint = 'snunlp/KR-ELECTRA-discriminator'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Inspection only: show the joined sentence text of the first row.
total.sentence_form.iloc[0]

In [None]:
# Inspection only: number of token ids produced when the first row's text is
# encoded as a pair with the sample label '본품#품질' (with truncation enabled).
# Only row 0 is looked at, so encode that single text instead of tokenizing
# the whole column and discarding everything but the first result.
test = len(tokenizer.encode(total.sentence_form.iloc[0], '본품#품질', truncation=True))
test

In [None]:
# Scratch check of half-length slicing: with 9 items, len(test)//2 is 4, so
# this shows the first four elements. Note that it rebinds the name `test`.
test = [1,2,3,4,5,6,7,8,9]
test[:len(test)//2]

In [None]:
# Inspection only: show the first row's joined sentence text; the commented
# line below is the matching lookup for that row's annotation.
total.sentence_form.iloc[0]
# total.annotation.iloc[0]

In [None]:
# count_tags and entity_property_pair are not defined in this notebook;
# presumably they come from the star imports (module.utils / module.maps)
# and this tallies annotation tags over `total` -- TODO confirm.
count_tags(total, entity_property_pair)

In [None]:
# Version tag for this export; it names the output directory under ./dataset.
DATA_V = 'uncleaned_v23'
save_path = f'./dataset/{DATA_V}'
# Echo the resolved directory so the target is visible before writing.
print(save_path)

In [None]:
# Create the output directory (and any missing parents) from Python instead
# of shelling out to `mkdir -p`, so the cell does not depend on the host
# shell; exist_ok=True keeps re-runs from failing when it already exists.
os.makedirs(save_path, exist_ok=True)

# force_ascii=False writes non-ASCII text as-is rather than as \u escapes.
total.to_json(f'{save_path}/absa2021.json', force_ascii=False)