# Modules and Global Variables

In [1]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
    ElectraForTokenClassification, ElectraForSequenceClassification, ElectraTokenizerFast
)

import torch, copy, json, re, os
from cleantext import clean
from tqdm import tqdm
from module.preprocess import preprocess
from module.preprocess import decorate_form, decorate_acd_pair, decorate_acd_pair_split, decorate_asc_pair, decorate_asc_pair_split
from module.score import evaluation_f1

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def jsonload(fname, encoding="utf-8"):
    """Parse the JSON document stored at ``fname`` and return the result.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The object produced by ``json.load`` for the file's content.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

def jsondump(j, fname, encoding="utf-8"):
    """Serialize ``j`` as JSON into the file ``fname``.

    Non-ASCII characters are written as-is (``ensure_ascii=False``) rather
    than as ``\\uXXXX`` escapes, so Korean text stays readable in the file.

    Args:
        j: JSON-serializable object to write.
        fname: Destination path; an existing file is overwritten.
        encoding: Text encoding of the output file. Defaults to UTF-8, which
            matches the default used by ``jsonload``/``jsonlload``.
    """
    with open(fname, "w", encoding=encoding) as f:
        json.dump(j, f, ensure_ascii=False)

def jsonlload(fname, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is parsed as one JSON document. Blank or
    whitespace-only lines (e.g. a trailing empty line at the end of the file)
    are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list with one parsed object per non-blank line, in file order.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the file object directly so lines are streamed rather than
        # materialized all at once by readlines().
        for line in f:
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

In [3]:
# Report the PyTorch build and the CUDA devices visible to this kernel.
NGPU = torch.cuda.device_count()
for label, value in (
    ('torch.__version__', torch.__version__),
    ('torch.cuda.is_available()', torch.cuda.is_available()),
    ('NGPU', NGPU),
):
    print(f'{label}: {value}')

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


# Paths and Modes

In [4]:
# --- Run switches -----------------------------------------------------------
# EVAL_MODE: run on EVAL_DATA_PATH and score the predictions instead of
#            writing a result file for TEST_DATA_PATH.
# CATEGORY_FILTER: restrict the candidate categories to a reduced list.
# SENTIMENT_FILTER: keep only predictions whose polarity is 'positive'.
EVAL_MODE = False
CATEGORY_FILTER = False
SENTIMENT_FILTER = False

# Every artifact of this run shares the same run name; derive the paths from it.
_RUN_NAME = 'monologg_koelectra_base_v3_discriminator_uncleaned_v11'
_RUN_DIR = f'training_results/{_RUN_NAME}'

RESULT_SAVE_NAME = f'{_RUN_NAME}_tagger_acd_asc.json'

# --- Fine-tuned checkpoints (target tagger, category detection, sentiment) --
TAGGER_CHECKPOINT = f'{_RUN_DIR}/target_tagger/{_RUN_NAME}/checkpoint-370'
ACD_CHECKPOINT = f'{_RUN_DIR}/acd/{_RUN_NAME}/checkpoint-9750'
ASC_CHECKPOINT = f'{_RUN_DIR}/asc/{_RUN_NAME}/checkpoint-1190'

# --- Datasets ---------------------------------------------------------------
TEST_DATA_PATH = 'dataset/nikluge-sa-2022-test.jsonl'
EVAL_DATA_PATH = 'dataset/nikluge-sa-2022-dev.jsonl'

In [5]:
# In evaluation mode the labelled dev split is used in place of the test split.
if EVAL_MODE:
    TEST_DATA_PATH = EVAL_DATA_PATH
print('>>>>> >>>>> >>>>> ', TEST_DATA_PATH, ' <<<<< <<<<< <<<<<', '\n', sep='')

test_data = jsonlload(TEST_DATA_PATH)

if EVAL_MODE:
    # Drop element 1 of every gold annotation so each one becomes a
    # two-element list (presumably the target span is removed, leaving
    # [category, polarity] — confirm against the dataset schema).
    for row in test_data:
        for annotation in row['annotation']:
            annotation.pop(1)

    # Keep the gold labels for scoring, then blank them in the inference input
    # so predictions cannot leak from the reference annotations.
    true_data = copy.deepcopy(test_data)
    for row in test_data:
        row['annotation'] = []

    # Preview the first five gold rows.
    for row in true_data[:5]:
        print(row)
    print()

# Preview the first five rows that will be fed to the models.
for row in test_data[:5]:
    print(row)

>>>>> >>>>> >>>>> dataset/nikluge-sa-2022-test.jsonl <<<<< <<<<< <<<<<

{'id': 'nikluge-sa-2022-test-00001', 'sentence_form': '하나 사려고 알아보는 중인데 맘에드는거 발견', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00002', 'sentence_form': '동양인 피부톤과 잘 어울리고 우아한 분위기를 풍긴다네?', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00003', 'sentence_form': '근데 이건 마르살라보다 더 지나친 색 같은데..', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00004', 'sentence_form': '나스 색조가 다 그렇지만서도 어데이셔스 라인은 진짜 색 기막히게 뽑는것 같다', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00005', 'sentence_form': '색상만 보면 이걸 어떻게 발라.. 싶겠지만 의외로 너무너무 괜찮다', 'annotation': []}


# Inference Configs

In [6]:
# Categories used when CATEGORY_FILTER is on, in the order they appeared in
# the earlier experiments. Those experiments tried successive prefixes of this
# list; the number noted beside each prefix length was (its meaning is not
# recorded in this notebook — presumably a count or score, confirm before
# relying on it):
#   top 11 -> 2716, top 10 -> 2676, top 9 -> 2627, top 8 -> 2575,
#   top 7  -> 2509, top 6  -> 2421, top 5 -> 2339
_RANKED_CATEGORIES = [
    '본품#품질',
    '제품 전체#일반',
    '본품#일반',
    '제품 전체#품질',
    '제품 전체#디자인',
    '본품#편의성',
    '제품 전체#편의성',
    '제품 전체#인지도',
    '패키지/구성품#디자인',
    '브랜드#일반',
    '제품 전체#가격',
]
# How many leading entries of _RANKED_CATEGORIES the filter keeps.
CATEGORY_FILTER_TOP_K = 6

if CATEGORY_FILTER:
    entity_property_pair = _RANKED_CATEGORIES[:CATEGORY_FILTER_TOP_K]
else:
    # Full category inventory ("entity#property"), tried one by one per target.
    entity_property_pair = [
        '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
        '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
        '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
        '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
    ]

# Placeholder tokens (presumably anonymization markers in the corpus — confirm).
more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Label vocabularies: list index == class id.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

target_tagger_labels = ['Other', 'TRG_B', 'TRG_I']
tag2id = {tag: idx for idx, tag in enumerate(target_tagger_labels)}
id2tag = dict(enumerate(target_tagger_labels))

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

len(entity_property_pair)

25

# Load Model and Tokenizer

In [7]:
def _load_model_and_tokenizer(model_cls, checkpoint):
    """Load ``model_cls`` and its fast ELECTRA tokenizer from ``checkpoint``."""
    model = model_cls.from_pretrained(checkpoint)
    tokenizer = ElectraTokenizerFast.from_pretrained(checkpoint)
    return model, tokenizer


# Token-level tagger that marks target spans.
tagger_model, tagger_tokenizer = _load_model_and_tokenizer(ElectraForTokenClassification, TAGGER_CHECKPOINT)

# Sequence classifier for category detection (ACD).
acd_model, acd_tokenizer = _load_model_and_tokenizer(ElectraForSequenceClassification, ACD_CHECKPOINT)

# Sequence classifier for sentiment classification (ASC).
asc_model, asc_tokenizer = _load_model_and_tokenizer(ElectraForSequenceClassification, ASC_CHECKPOINT)

In [8]:
# Sanity check: display the decorate_* helpers imported from module.preprocess
# so their signatures are visible in the cell output.
decorate_form, decorate_acd_pair, decorate_acd_pair_split, decorate_asc_pair, decorate_asc_pair_split

(<function module.preprocess.decorate_form(form)>,
 <function module.preprocess.decorate_acd_pair(entity)>,
 <function module.preprocess.decorate_acd_pair_split(entity)>,
 <function module.preprocess.decorate_asc_pair(entity, sentiment)>,
 <function module.preprocess.decorate_asc_pair_split(entity, sentiment)>)

# Inference Logic

In [9]:
def _extract_targets(tokenizer, tokens, labels):
    """Decode every tagged span in ``labels`` back into a string.

    A span opens at each position labelled 1 and extends over the immediately
    following positions labelled 2 (ids of 'TRG_B' and 'TRG_I' in
    ``target_tagger_labels``). Spans are returned in order of their start.
    """
    trg_begin, trg_inside = 1, 2
    targets = []
    for start, label in enumerate(labels):
        if label != trg_begin:
            continue
        end = start + 1
        while end < len(labels) and end < len(tokens) and labels[end] == trg_inside:
            end += 1
        targets.append(tokenizer.convert_tokens_to_string(tokens[start:end]))
    return targets


@torch.no_grad()
def predict_from_korean_form(tagger_tokenizer, acd_tokenizer, asc_tokenizer, tagger_model, acd_model, asc_model, data):
    """Annotate every sentence in ``data`` with predicted [category, polarity] pairs.

    Pipeline per sentence:
      1. The tagger model labels tokens; each tagged span becomes a target string.
      2. For every (target, category) combination the ACD model decides
         'True'/'False' on the text pair (sentence, "target#category").
      3. For every accepted combination the ASC model is run once per polarity
         on (sentence, "target#category#polarity"); the polarity whose logit at
         index 0 is largest wins.

    The whole function runs under ``torch.no_grad()`` so no autograd graph is
    built for any of the three models.

    Reads the module-level names ``device``, ``entity_property_pair``,
    ``tf_id_to_name``, ``polarity_id_to_name`` and ``SENTIMENT_FILTER``.

    Args:
        tagger_tokenizer, acd_tokenizer, asc_tokenizer: Tokenizers matching the
            respective models.
        tagger_model: Token classification model used to find targets.
        acd_model: Sequence classification model for category detection.
        asc_model: Sequence classification model for sentiment classification.
        data: Iterable of dicts with a 'sentence_form' string. Each dict's
            'annotation' entry is overwritten in place.

    Returns:
        The same ``data`` object, with 'annotation' filled in for each sentence.
    """
    # Same order as before: move each model to the device, then switch to eval mode.
    for model in (tagger_model, acd_model, asc_model):
        model.to(device)
        model.eval()

    sentiments = ['positive', 'negative', 'neutral']

    for sentence in tqdm(data):
        sentence['annotation'] = []

        # Validate BEFORE building the prompt: concatenating a non-str value
        # would raise TypeError and this guard would never be reached.
        raw_form = sentence['sentence_form']
        if not isinstance(raw_form, str):
            print("form type is wrong: ", raw_form)
            continue

        # form = sentence['sentence_form']
        # The sentence is prefixed with the literal word 'Target '; '#' is
        # removed (it is the separator used in the pair strings below) and
        # non-breaking spaces are normalized to plain spaces.
        form = ('Target ' + raw_form).replace('#', '').replace('\xa0', ' ')

        tokens = tagger_tokenizer.tokenize(form)
        tagger_inputs = tagger_tokenizer(form, return_tensors='pt')
        tagger_inputs = {k: v.to(device) for k, v in tagger_inputs.items()}
        tagger_logits = tagger_model(**tagger_inputs).logits
        # Batch of one. The first and last positions are dropped so the labels
        # line up with `tokens` (presumably the special tokens the tokenizer
        # adds around the sequence — confirm).
        token_labels = tagger_logits.argmax(-1)[-1].tolist()[1:-1]

        # NOTE(review): when several targets yield the same category/polarity,
        # the identical pair is appended more than once. Confirm that the
        # scorer / submission format tolerates duplicates.
        for target in _extract_targets(tagger_tokenizer, tokens, token_labels):
            for pair in entity_property_pair:
                acd_inputs = acd_tokenizer(form, '#'.join([target, pair]), truncation=True, return_tensors="pt")
                acd_inputs = {k: v.to(device) for k, v in acd_inputs.items()}
                acd_logits = acd_model(**acd_inputs)['logits']
                if tf_id_to_name[int(acd_logits.argmax(-1)[0])] != 'True':
                    continue

                # One ASC pass per candidate polarity; collect the logit at
                # index 0 of each (presumably the 'True' class — confirm).
                true_scores = []
                for sentiment in sentiments:
                    asc_inputs = asc_tokenizer(form, '#'.join([target, pair, sentiment]), truncation=True, return_tensors="pt")
                    asc_inputs = {k: v.to(device) for k, v in asc_inputs.items()}
                    true_scores.append(asc_model(**asc_inputs)['logits'][0][0])
                pc_result = polarity_id_to_name[int(torch.stack(true_scores).argmax(-1))]

                # With the sentiment filter on, only positive predictions are kept.
                if SENTIMENT_FILTER and pc_result != 'positive':
                    continue

                # This one category is emitted with a space after the slash
                # (presumably the spelling the reference data expects — confirm).
                category = pair
                if pair == '패키지/구성품#가격':
                    print(f'{pair} found.')
                    category = '패키지/ 구성품#가격'
                    print(f'corrected as {category}')

                sentence['annotation'].append([category, pc_result])

    return data

In [10]:
# Run the pipeline on a deep copy so `test_data` itself is left unmodified.
pred_data = predict_from_korean_form(tagger_tokenizer, acd_tokenizer, asc_tokenizer, tagger_model, acd_model, asc_model, copy.deepcopy(test_data))
if not EVAL_MODE:
    save_path = './'
    file_name = RESULT_SAVE_NAME
    result_path = os.path.join(save_path, file_name)

    # Write the result file, then read it back so the rows inspected below are
    # exactly what was written to disk.
    jsondump(pred_data, result_path)
    pred_data = jsonload(result_path)

len(test_data), len(pred_data)

100%|██████████| 2127/2127 [19:42<00:00,  1.80it/s]


(2127, 2127)

In [11]:
# Show the first five predicted rows.
for row in pred_data[:5]:
    print(row)

{'id': 'nikluge-sa-2022-test-00001', 'sentence_form': '하나 사려고 알아보는 중인데 맘에드는거 발견', 'annotation': [['제품 전체#일반', 'positive']]}
{'id': 'nikluge-sa-2022-test-00002', 'sentence_form': '동양인 피부톤과 잘 어울리고 우아한 분위기를 풍긴다네?', 'annotation': [['본품#품질', 'positive']]}
{'id': 'nikluge-sa-2022-test-00003', 'sentence_form': '근데 이건 마르살라보다 더 지나친 색 같은데..', 'annotation': [['제품 전체#일반', 'neutral']]}
{'id': 'nikluge-sa-2022-test-00004', 'sentence_form': '나스 색조가 다 그렇지만서도 어데이셔스 라인은 진짜 색 기막히게 뽑는것 같다', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00005', 'sentence_form': '색상만 보면 이걸 어떻게 발라.. 싶겠지만 의외로 너무너무 괜찮다', 'annotation': [['본품#일반', 'positive']]}


# Scoring

In [None]:
# Scoring only makes sense in evaluation mode, where `true_data` holds the
# gold annotations of the dev split.
if EVAL_MODE:
    # Record every checkpoint that produced `pred_data`, including the tagger.
    print('TAGGER_CHECKPOINT: ', TAGGER_CHECKPOINT)
    print('ACD_CHECKPOINT: ', ACD_CHECKPOINT)
    print('ASC_CHECKPOINT: ', ASC_CHECKPOINT)
    print('INFERENCE DATA: ', TEST_DATA_PATH)

    print('EVAL_MODE :', EVAL_MODE)
    print('CATEGORY_FILTER: ', CATEGORY_FILTER)
    if CATEGORY_FILTER:
        print('CATEGORY_FILTER LENGTH: ', len(entity_property_pair))
        print('FILTER: ', entity_property_pair)
    print('SENTIMENT_FILTER: ', SENTIMENT_FILTER)

    result = evaluation_f1(true_data, pred_data)
    # Print the first two entries of the returned mapping.
    result_items = list(result.items())
    print(result_items[0])
    print(result_items[1])