# Modules and Global Variables

In [1]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
)

import torch, copy, json, re, os
from cleantext import clean

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment report: PyTorch version, CUDA availability, and the number of
# CUDA devices visible to this kernel (kept in NGPU).
NGPU = torch.cuda.device_count()
print(
    f'torch.__version__: {torch.__version__}',
    f'torch.cuda.is_available(): {torch.cuda.is_available()}',
    f'NGPU: {NGPU}',
    sep='\n',
)

torch.__version__: 1.12.1
torch.cuda.is_available(): True
NGPU: 4


In [3]:
# Candidate 'entity#property' labels. predict_from_korean_form pairs each of
# them with the sentence and asks the ACD model whether it applies.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]

# Placeholder tokens of the form &...&; not referenced by the other visible cells.
more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Class-id <-> class-name tables for the binary (True/False) decision.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

# Class-id <-> class-name tables for the polarity decision.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

len(entity_property_pair)

25

In [4]:
# Checkpoint directories handed to `from_pretrained` for the acd_* and asc_*
# model/tokenizer pairs (relative to the notebook's working directory).
ACD_CHECKPOINT = 'acd/snunlp_kr_electra_discriminator_cleaned_v1/checkpoint-8790'
ASC_CHECKPOINT = 'asc/snunlp_kr_electra_discriminator_cleaned_v1/checkpoint-2250'

# JSON Lines file read with `jsonlload` to obtain the records to predict on.
TEST_DATA_PATH = '../../dataset/nikluge-sa-2022-test.jsonl'

# Load Models and Tokenizers

In [5]:
# Load a tokenizer and a sequence-classification model from each checkpoint
# directory; the four loads are independent of one another.
acd_tokenizer = AutoTokenizer.from_pretrained(ACD_CHECKPOINT)
asc_tokenizer = AutoTokenizer.from_pretrained(ASC_CHECKPOINT)

acd_model = AutoModelForSequenceClassification.from_pretrained(ACD_CHECKPOINT)
asc_model = AutoModelForSequenceClassification.from_pretrained(ASC_CHECKPOINT)

In [6]:
def jsonload(fname, encoding="utf-8"):
    """Parse the file at ``fname`` as a single JSON document and return it.

    Args:
        fname: Path of the JSON file to read.
        encoding: Text encoding used to open the file.

    Returns:
        The decoded Python object (dict, list, str, number, bool or None).
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

def jsondump(j, fname):
    """Write ``j`` to ``fname`` as one JSON document.

    The file is opened for writing (replacing any existing content) with
    UTF-8 encoding, and non-ASCII characters are written as-is rather than
    as ``\\uXXXX`` escapes.

    Args:
        j: JSON-serializable object to store.
        fname: Destination path.
    """
    with open(fname, "w", encoding="UTF8") as out_file:
        json.dump(j, out_file, ensure_ascii=False)

def jsonlload(fname, encoding="utf-8"):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example an extra empty line at the end of the
    file) are skipped instead of raising a decode error. The file is iterated
    line by line rather than being read into memory with ``readlines()``.

    Args:
        fname: Path of the ``.jsonl`` file to read.
        encoding: Text encoding used to open the file.

    Returns:
        A list with one decoded object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(fname, encoding=encoding) as f:
        return [json.loads(line) for line in f if line.strip()]

In [7]:
def preprocess(text):
    """Normalize a raw sentence before it is tokenized.

    Steps, in order:
      1. Every ``&...&`` match is replaced by the placeholder ``<숨김>``.
      2. Every character outside ``A-Za-z0-9가-힣`` is replaced by a space
         (this also turns the placeholder's angle brackets into spaces).
      3. The result goes through ``cleantext.clean`` with the options below.
      4. The cleaned text is upper-cased and stripped of surrounding whitespace.

    Args:
        text: Raw sentence string.

    Returns:
        The normalized, upper-cased string.
    """
    # NOTE(review): `.+` is greedy, so when one line contains several `&...&`
    # tokens, everything from the first `&` to the last `&` collapses into a
    # single placeholder. Presumably the checkpoints were trained with this
    # exact preprocessing, so it is kept unchanged here -- confirm before
    # altering the pattern.
    masked = re.sub(r'&.+&', '<숨김>', text)
    alnum_only = re.sub(r'[^A-Za-z0-9가-힣]', ' ', masked)

    clean_options = dict(
        fix_unicode=False,
        to_ascii=False,
        lower=True,  # case is overridden by the final .upper() below
        normalize_whitespace=True,
        no_line_breaks=False,
        strip_lines=True,
        keep_two_line_breaks=False,
        no_urls=False,
        no_emails=False,
        no_phone_numbers=False,
        no_numbers=True,
        no_digits=True,
        no_currency_symbols=False,
        no_punct=True,
        no_emoji=True,
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="0",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        replace_with_punct="",
        lang="en",
    )
    cleaned = clean(alnum_only, **clean_options)

    return cleaned.upper().strip()

In [8]:
def predict_from_korean_form(acd_tokenizer, asc_tokenizer, acd_model, asc_model, data):
    """Annotate every record in ``data`` with predicted [category, polarity] pairs.

    For each record, the text under ``'sentence_form'`` is preprocessed and
    paired with every label in ``entity_property_pair``. ``acd_model`` scores
    the (sentence, label) pair; when its argmax maps to ``'True'`` in
    ``tf_id_to_name``, ``asc_model`` is run once per polarity with
    ``"<label>#<polarity>"`` as the second segment, and the polarity whose
    index-0 logit is largest is kept.

    Args:
        acd_tokenizer: Tokenizer used to encode (sentence, label) pairs.
        asc_tokenizer: Tokenizer used to encode (sentence, label#polarity) pairs.
        acd_model: Sequence classifier whose output exposes ``['logits']``.
        asc_model: Sequence classifier whose output exposes ``['logits']``.
        data: List of dicts, each holding a ``'sentence_form'`` entry.

    Returns:
        The same ``data`` list, modified in place: each record's
        ``'annotation'`` is replaced by a list of ``[label, polarity]`` pairs
        (empty when nothing is predicted or the record is skipped).
    """
    acd_model.to(device)
    acd_model.eval()
    asc_model.to(device)
    asc_model.eval()

    # Inference only: no gradients are needed anywhere below.
    with torch.no_grad():
        for sentence in data:
            raw_form = sentence['sentence_form']
            sentence['annotation'] = []

            # Validate BEFORE preprocessing: preprocess() applies re.sub, which
            # raises on non-str input, so a check placed after it can never
            # skip a malformed record.
            if not isinstance(raw_form, str):
                print("form type is wrong: ", raw_form)
                continue
            form = preprocess(raw_form)

            for pair in entity_property_pair:
                acd_encoded = acd_tokenizer(form, pair, truncation=True, return_tensors="pt")
                acd_encoded = {k: v.to(device) for k, v in acd_encoded.items()}
                acd_logits = acd_model(**acd_encoded)['logits']

                if tf_id_to_name[acd_logits.argmax(-1)[0].item()] != 'True':
                    continue

                # One ASC forward pass per polarity. Index 0 is 'True' under
                # tf_id_to_name -- assumes the ASC head uses that label order.
                true_logits = []
                for polarity in polarity_id_to_name:
                    asc_encoded = asc_tokenizer(
                        form, '#'.join([pair, polarity]), truncation=True, return_tensors="pt"
                    )
                    asc_encoded = {k: v.to(device) for k, v in asc_encoded.items()}
                    true_logits.append(asc_model(**asc_encoded)['logits'][0][0])

                pc_result = polarity_id_to_name[torch.stack(true_logits).argmax(-1).item()]

                # NOTE(review): this one label is emitted with a space after the
                # slash; presumably that is the spelling expected downstream --
                # confirm. A separate name is used so the loop variable is not
                # rebound.
                label = pair
                if pair == '패키지/구성품#가격':
                    print(f'{pair} found.')
                    label = '패키지/ 구성품#가격'
                    print(f'corrected as {label}')

                sentence['annotation'].append([label, pc_result])

    return data


In [9]:
# Predict on a deep copy so `test_data` keeps the records exactly as loaded
# (predict_from_korean_form writes the 'annotation' field in place).
test_data = jsonlload(TEST_DATA_PATH)
pred_data = predict_from_korean_form(
    acd_tokenizer, asc_tokenizer, acd_model, asc_model, copy.deepcopy(test_data)
)

save_path = './'
file_name = 'cleaned_v1_b2b.json'
output_path = os.path.join(save_path, file_name)

# Write the predictions, then read them back so `pred_data` reflects what is on disk.
jsondump(pred_data, output_path)
pred_data = jsonload(output_path)

패키지/구성품#가격 found.
corrected as 패키지/ 구성품#가격


In [10]:
# Sanity check: the reloaded predictions should contain one entry per test record.
len(test_data), len(pred_data)

(2127, 2127)

In [16]:
# Show the predicted [label, polarity] pairs, one record per output line.
for record in pred_data:
    print(record['annotation'])

[['제품 전체#일반', 'positive']]
[]
[['본품#일반', 'negative']]
[]
[['본품#일반', 'positive'], ['제품 전체#일반', 'positive']]
[]
[['본품#품질', 'positive']]
[['본품#일반', 'neutral']]
[['본품#품질', 'positive']]
[['본품#일반', 'positive']]
[['제품 전체#일반', 'positive']]
[['제품 전체#일반', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['패키지/구성품#편의성', 'positive']]
[['본품#일반', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['제품 전체#가격', 'positive']]
[['제품 전체#일반', 'positive']]
[]
[['본품#품질', 'positive'], ['패키지/구성품#디자인', 'positive']]
[]
[['본품#품질', 'positive']]
[]
[]
[]
[['본품#일반', 'positive'], ['본품#품질', 'positive']]
[['본품#일반', 'positive']]
[['본품#품질', 'positive']]
[]
[]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['제품 전체#일반', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['본품#일반', 'neutral'], ['본품#품질', 'neutral']]
[['제품 전체#일반', 'neutral']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['본품#품질', 'positive']]
[['제품 

# Inference Test

In [None]:
# form = '패키지에 보니 허브한방추출물과 옷나무 껍질 추출물이 들어갔다고 해서 한방향이 날줄알았는데 제품제형은 투명하고 향은 상큼한 향이랄까요?'
# form = '최근 북한 미사일 발사 등 도발 및 한미·한미일 대응'
# pair = '패키지/구성품#일반'

In [None]:
# # tokenized_data = acd_tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
# acd_encoded = acd_tokenizer(form, pair, truncation=True, return_tensors="pt")
# acd_encoded = {k:v.to(device) for k,v in acd_encoded.items()}

# with torch.no_grad():
#     acd_outputs = acd_model(**acd_encoded)

# ce_predictions = acd_outputs['logits'].argmax(-1)
# ce_result = tf_id_to_name[ce_predictions[0]]
# print(acd_outputs['logits'])
# print(ce_predictions)
# print(ce_result)

In [None]:
# asc_encoded = asc_tokenizer(form, pair, truncation=True, return_tensors="pt")
# asc_encoded = {k:v.to(device) for k,v in asc_encoded.items()}

# with torch.no_grad():
#     asc_outputs = asc_model(**asc_encoded)

# pc_predictions = asc_outputs['logits'].argmax(-1)
# pc_result = polarity_id_to_name[pc_predictions[0]]
# print(asc_outputs['logits'])
# print(pc_predictions)
# print(pc_result)