# Modules and Global Variables

In [1]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
)

import torch, copy, os
from module.score import evaluation_f1
from module.load_json import *
from module.maps import *
from module.inference import *

In [3]:
# Report the PyTorch build and what CUDA hardware this kernel can see.
cuda_available = torch.cuda.is_available()
NGPU = torch.cuda.device_count()

print(
    f'torch.__version__: {torch.__version__}',
    f'torch.cuda.is_available(): {cuda_available}',
    f'NGPU: {NGPU}',
    sep='\n',
)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda') if cuda_available else torch.device('cpu')

torch.__version__: 1.7.1
torch.cuda.is_available(): True
NGPU: 4


# Paths and Modes

In [4]:
# When True, the data-loading cell swaps in EVAL_DATA_PATH for TEST_DATA_PATH and
# the evaluation cell runs; when False, the inference cell saves predictions to disk.
EVAL_MODE = False

# Single source of truth for the run identifier: it names the result file and
# appears twice in each checkpoint path, so change the version here only.
RUN_NAME = 'klue_roberta_base_mlm_fine_tuned_uncleaned_v13'
RUN_DIR = f'training_results/{RUN_NAME}'

RESULT_SAVE_NAME = f'{RUN_NAME}.json'

# The two checkpoints are loaded independently; their step counts differ.
ACD_CHECKPOINT = f'{RUN_DIR}/acd/{RUN_NAME}/checkpoint-23440'
ASC_CHECKPOINT = f'{RUN_DIR}/asc/{RUN_NAME}/checkpoint-1000'

TEST_DATA_PATH = 'dataset/nikluge-sa-2022-test.jsonl'
EVAL_DATA_PATH = 'dataset/nikluge-sa-2022-dev.jsonl'

In [5]:
# In evaluation mode, run on the dev split instead of the test split.
# NOTE: this rebinds TEST_DATA_PATH in place, so after switching EVAL_MODE back
# to False the configuration cell must be re-run to restore the test path.
if EVAL_MODE:
    TEST_DATA_PATH = EVAL_DATA_PATH

# Echo the file that is actually being used so the saved output records it.
print(f'>>>>> >>>>> >>>>> {TEST_DATA_PATH} <<<<< <<<<< <<<<<\n')

test_data = jsonlload(TEST_DATA_PATH)

>>>>> >>>>> >>>>> dataset/nikluge-sa-2022-test.jsonl <<<<< <<<<< <<<<<

{'id': 'nikluge-sa-2022-test-00001', 'sentence_form': '하나 사려고 알아보는 중인데 맘에드는거 발견', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00002', 'sentence_form': '동양인 피부톤과 잘 어울리고 우아한 분위기를 풍긴다네?', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00003', 'sentence_form': '근데 이건 마르살라보다 더 지나친 색 같은데..', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00004', 'sentence_form': '나스 색조가 다 그렇지만서도 어데이셔스 라인은 진짜 색 기막히게 뽑는것 같다', 'annotation': []}
{'id': 'nikluge-sa-2022-test-00005', 'sentence_form': '색상만 보면 이걸 어떻게 발라.. 싶겠지만 의외로 너무너무 괜찮다', 'annotation': []}


# Load Model and Tokenizer

In [7]:
def _load_model_and_tokenizer(checkpoint):
    """Load the sequence-classification model and its tokenizer from one checkpoint path.

    Returns:
        A ``(model, tokenizer)`` tuple, both created with ``from_pretrained(checkpoint)``.
    """
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    return model, tokenizer


# Each checkpoint supplies its own model and its own tokenizer.
acd_model, acd_tokenizer = _load_model_and_tokenizer(ACD_CHECKPOINT)
asc_model, asc_tokenizer = _load_model_and_tokenizer(ASC_CHECKPOINT)

# Inference

In [9]:
# Run inference on a deep copy so that `test_data` itself is left untouched.
pred_data = inference_m(acd_tokenizer, asc_tokenizer, acd_model, asc_model, copy.deepcopy(test_data))

# Outside evaluation mode the predictions are the deliverable: write them to
# disk, then read the file back so `pred_data` is exactly what was saved.
if not EVAL_MODE:
    save_path = './'
    file_name = RESULT_SAVE_NAME
    result_path = os.path.join(save_path, file_name)

    jsondump(pred_data, result_path)
    pred_data = jsonload(result_path)

# Displayed as the cell output: input and prediction counts, for a quick visual check.
len(test_data), len(pred_data)

 79%|███████▉  | 1686/2127 [11:54<03:09,  2.33it/s]

패키지/구성품#가격 found.
corrected as 패키지/ 구성품#가격


 81%|████████  | 1722/2127 [12:09<03:00,  2.25it/s]

패키지/구성품#가격 found.
corrected as 패키지/ 구성품#가격


100%|██████████| 2127/2127 [15:03<00:00,  2.35it/s]


(2127, 2127)

# Evaluation

In [10]:
# Scoring only runs in evaluation mode, where the data-loading cell has pointed
# TEST_DATA_PATH at EVAL_DATA_PATH (presumably the split that carries reference
# annotations -- confirm against the dataset).
if EVAL_MODE:
    print('ACD_CHECKPOINT: ', ACD_CHECKPOINT)
    print('ASC_CHECKPOINT: ', ASC_CHECKPOINT)
    print('INFERENCE DATA: ', TEST_DATA_PATH)

    print('EVAL_MODE :', EVAL_MODE)

    # `test_data` is the loaded input; inference ran on a deep copy of it, so it
    # serves as the reference here. (The previous name `true_data` is not
    # defined anywhere in the notebook and raised NameError on a fresh kernel.)
    result = evaluation_f1(test_data, pred_data)

    # Show the first two entries of the returned mapping.
    scores = list(result.items())
    print(scores[0])
    print(scores[1])