# Modules and Global Variables

In [4]:
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, 
)

import torch, copy, json, re, os

In [5]:
### new
# The entity#property aspect categories that are checked against every sentence.
entity_property_pair = [
    '본품#가격', '본품#다양성', '본품#디자인', '본품#인지도', '본품#일반', '본품#편의성', '본품#품질',
    '브랜드#가격', '브랜드#디자인', '브랜드#인지도', '브랜드#일반', '브랜드#품질',
    '제품 전체#가격', '제품 전체#다양성', '제품 전체#디자인', '제품 전체#인지도', '제품 전체#일반', '제품 전체#편의성', '제품 전체#품질',
    '패키지/구성품#가격', '패키지/구성품#다양성', '패키지/구성품#디자인', '패키지/구성품#일반', '패키지/구성품#편의성', '패키지/구성품#품질'
]

# Placeholder tokens; presumably the masking markers used in the corpus.
# Not referenced by any other cell visible in this notebook - TODO confirm still needed.
more_tokens = ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']

# Class-id <-> label tables for the binary (True/False) classifier output.
tf_id_to_name = ['True', 'False']
tf_name_to_id = {name: idx for idx, name in enumerate(tf_id_to_name)}

# Class-id <-> label tables for sentiment polarity.
polarity_id_to_name = ['positive', 'negative', 'neutral']
polarity_name_to_id = {name: idx for idx, name in enumerate(polarity_id_to_name)}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

NGPU = torch.cuda.device_count()
# No torch.nn.DataParallel wrapping here: no model object exists yet when this
# cell runs (the models are loaded further down as acd_model / asc_model), so
# wrapping a `model` name raised NameError on any multi-GPU machine. Inference
# below also feeds one sentence at a time, so there is no batch to split.

len(entity_property_pair)

25

In [6]:
# Project root on the mounted drive; every other location is derived from it.
ROOT_PATH = '/content/drive/MyDrive/aspect_based_sentiment_analysis'

# Fine-tuned checkpoint for the aspect-category-detection (ACD) stage.
ACD_CHECKPOINT = (
    f'{ROOT_PATH}/aspect_category_detection'
    '/snunlp_kr_electra_discriminator'
    '/snunlp_kr_electra_discriminator_v1/checkpoint-56250'
)

# Fine-tuned checkpoint for the binary aspect-sentiment-classification (ASC) stage.
ASC_CHECKPOINT = (
    f'{ROOT_PATH}/aspect_sentiment_classification_binary'
    '/snunlp_kr_electra_discriminator/v2'
    '/snunlp_kr_electra_discriminator_v2/checkpoint-7194'
)

# Test split (JSON Lines) to run inference on.
TEST_DATA_PATH = f'{ROOT_PATH}/data/NIKL_ABSA_2022_COMPETITION/nikluge-sa-2022-test.jsonl'

# Load Models and Tokenizers, Define Helpers

In [7]:
# Aspect-category-detection (ACD) stage: tokenizer and sequence classifier are
# both restored from the same checkpoint directory.
acd_tokenizer = AutoTokenizer.from_pretrained(ACD_CHECKPOINT)
acd_model = AutoModelForSequenceClassification.from_pretrained(ACD_CHECKPOINT)

# Aspect-sentiment-classification (ASC) stage, restored the same way.
asc_tokenizer = AutoTokenizer.from_pretrained(ASC_CHECKPOINT)
asc_model = AutoModelForSequenceClassification.from_pretrained(ASC_CHECKPOINT)

In [8]:
def jsonload(fname, encoding="utf-8"):
    """Parse the file at `fname` as one JSON document and return the result.

    Args:
        fname: path of the JSON file to read.
        encoding: text encoding used to open the file (default "utf-8").

    Returns:
        The decoded JSON value.
    """
    with open(fname, encoding=encoding) as handle:
        return json.load(handle)

def jsondump(j, fname):
    """Write `j` to `fname` as a single JSON document, replacing any existing file.

    The file is opened as UTF-8 and non-ASCII characters are written verbatim
    (ensure_ascii=False) rather than as \\uXXXX escapes.
    """
    with open(fname, mode="w", encoding="UTF8") as sink:
        json.dump(j, sink, ensure_ascii=False)

def jsonlload(fname, encoding="utf-8"):
    """Load a JSON Lines file: one JSON document per line.

    Args:
        fname: path of the .jsonl file to read.
        encoding: text encoding used to open the file (default "utf-8").

    Returns:
        A list with one decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    json_list = []
    with open(fname, encoding=encoding) as f:
        # Iterate the handle lazily instead of materialising every line first.
        for line in f:
            # Blank or whitespace-only lines (e.g. a trailing empty line) carry
            # no record; feeding them to json.loads would raise.
            if not line.strip():
                continue
            json_list.append(json.loads(line))
    return json_list

In [9]:
def preprocess(sent):
    """Normalise a raw sentence before tokenisation.

    Steps, in order: trim leading/trailing whitespace, collapse every run of
    whitespace into a single space, then delete all '#' characters. Because
    '#' is removed last, a '#' that stood between two spaces leaves two
    adjacent spaces behind.
    """
    collapsed = re.sub(r'\s+', ' ', sent.strip())
    return collapsed.replace('#', '')

In [10]:
def predict_from_korean_form(acd_tokenizer, asc_tokenizer, acd_model, asc_model, data):
    """Annotate every sentence in `data` with predicted [category, polarity] pairs.

    Two-stage pipeline, run one sentence and one category at a time:

    1. ACD: `acd_model` classifies (sentence, category) for each entry of the
       module-level `entity_property_pair`; the argmax class is mapped through
       `tf_id_to_name` and only 'True' categories move on.
    2. ASC: for a detected category, `asc_model` scores one hypothesis
       "<category>#<polarity>" per entry of `polarity_id_to_name`. The
       polarity whose hypothesis has the largest logit at index 0 wins
       (index 0 is presumably the "True" class of the binary ASC model,
       mirroring `tf_id_to_name` - confirm against the training setup).

    Args:
        acd_tokenizer, asc_tokenizer: callables invoked as
            tokenizer(sentence, text_pair, truncation=True, return_tensors="pt")
            and returning a mapping of tensors.
        acd_model, asc_model: models that are moved to the module-level
            `device`, switched to eval mode, called with the tokenizer output
            as keyword arguments, and whose output exposes 'logits'.
        data: list of dicts, each with a 'sentence_form' entry. Mutated in place.

    Returns:
        The same `data` list. Every dict gains an 'annotation' list of
        [category, polarity] pairs; it is empty when no category is detected
        or when 'sentence_form' is not a str.
    """
    acd_model.to(device)
    acd_model.eval()
    asc_model.to(device)
    asc_model.eval()

    for sentence in data:
        sentence['annotation'] = []

        # Validate BEFORE preprocessing: preprocess() calls str methods, so a
        # non-str value would raise there and this guard would never be reached.
        raw_form = sentence['sentence_form']
        if not isinstance(raw_form, str):
            print("form type is arong: ", raw_form)
            continue
        form = preprocess(raw_form)

        for pair in entity_property_pair:
            acd_encoded = acd_tokenizer(form, pair, truncation=True, return_tensors="pt")
            acd_encoded = {k: v.to(device) for k, v in acd_encoded.items()}

            with torch.no_grad():
                acd_logits = acd_model(**acd_encoded)['logits']

            ce_result = tf_id_to_name[int(acd_logits.argmax(-1)[0])]
            if ce_result != 'True':
                continue

            # One hypothesis per polarity, in polarity_id_to_name order, so the
            # argmax position below maps straight back to a polarity name.
            true_logits = []
            for polarity in polarity_id_to_name:
                hypothesis = '#'.join([pair, polarity])
                asc_encoded = asc_tokenizer(form, hypothesis, truncation=True, return_tensors="pt")
                asc_encoded = {k: v.to(device) for k, v in asc_encoded.items()}
                with torch.no_grad():
                    true_logits.append(asc_model(**asc_encoded)['logits'][0][0])

            pc_result = polarity_id_to_name[int(torch.stack(true_logits).argmax(-1))]

            # The emitted label for this one category contains a space after
            # the slash. Use a separate name so the loop variable stays intact.
            label = pair
            if pair == '패키지/구성품#가격':
                print(f'{pair} found.')
                label = '패키지/ 구성품#가격'
                print(f'corrected as {label}')

            sentence['annotation'].append([label, pc_result])

    return data


In [11]:
# Resolve the output location first and make sure it exists: a missing
# directory would otherwise only surface as a failed write after the whole
# inference pass has already been paid for.
save_path = '/content/drive/MyDrive/aspect_based_sentiment_analysis/submission'
file_name = 'test.json'
os.makedirs(save_path, exist_ok=True)

test_data = jsonlload(TEST_DATA_PATH)
# predict_from_korean_form mutates its input, so hand it a deep copy and keep
# test_data as loaded.
pred_data = predict_from_korean_form(acd_tokenizer, asc_tokenizer, acd_model, asc_model, copy.deepcopy(test_data))

jsondump(pred_data, os.path.join(save_path, file_name))
# Read the file back so the cells below inspect exactly what was written.
pred_data = jsonload(os.path.join(save_path, file_name))

패키지/구성품#가격 found.
corrected as 패키지/ 구성품#가격


In [12]:

# Same submission location as the inference cell; the joined path is this
# cell's displayed value.
file_name = 'test.json'
save_path = '/content/drive/MyDrive/aspect_based_sentiment_analysis/submission'
os.path.join(save_path, file_name)

'/content/drive/MyDrive/aspect_based_sentiment_analysis/submission/test.json'

In [13]:
# Every test sentence must come back with exactly one prediction record; fail
# loudly instead of relying on someone eyeballing the two numbers.
assert len(test_data) == len(pred_data), "prediction count does not match test set size"
len(test_data), len(pred_data)

(2127, 2127)

In [18]:
# Eyeball the first 50 prediction records (id, sentence_form, annotation).
pred_data[:50]

[{'id': 'nikluge-sa-2022-test-00001',
  'sentence_form': '하나 사려고 알아보는 중인데 맘에드는거 발견',
  'annotation': [['제품 전체#일반', 'positive']]},
 {'id': 'nikluge-sa-2022-test-00002',
  'sentence_form': '동양인 피부톤과 잘 어울리고 우아한 분위기를 풍긴다네?',
  'annotation': [['본품#품질', 'positive']]},
 {'id': 'nikluge-sa-2022-test-00003',
  'sentence_form': '근데 이건 마르살라보다 더 지나친 색 같은데..',
  'annotation': []},
 {'id': 'nikluge-sa-2022-test-00004',
  'sentence_form': '나스 색조가 다 그렇지만서도 어데이셔스 라인은 진짜 색 기막히게 뽑는것 같다',
  'annotation': []},
 {'id': 'nikluge-sa-2022-test-00005',
  'sentence_form': '색상만 보면 이걸 어떻게 발라.. 싶겠지만 의외로 너무너무 괜찮다',
  'annotation': [['본품#일반', 'positive']]},
 {'id': 'nikluge-sa-2022-test-00006',
  'sentence_form': '생각보다 드라큘라 백작같지 않던데?',
  'annotation': []},
 {'id': 'nikluge-sa-2022-test-00007',
  'sentence_form': '보통 밝은 조명에서 보면 그리 부담스럽지 않다',
  'annotation': [['본품#품질', 'positive']]},
 {'id': 'nikluge-sa-2022-test-00008',
  'sentence_form': '바르면 정말 괜찮은데.. 그냥 컬러만 보면 부담스러운건 사실 ㅋㅋㅋ',
  'annotation': [['본품#일반', 'neutral']]}

# Inference Test

In [15]:
# form = '패키지에 보니 허브한방추출물과 옷나무 껍질 추출물이 들어갔다고 해서 한방향이 날줄알았는데 제품제형은 투명하고 향은 상큼한 향이랄까요?'
# form = '최근 북한 미사일 발사 등 도발 및 한미·한미일 대응'
# pair = '패키지/구성품#일반'

In [16]:
# # tokenized_data = acd_tokenizer(form, pair, padding='max_length', max_length=256, truncation=True)
# acd_encoded = acd_tokenizer(form, pair, truncation=True, return_tensors="pt")
# acd_encoded = {k:v.to(device) for k,v in acd_encoded.items()}

# with torch.no_grad():
#     acd_outputs = acd_model(**acd_encoded)

# ce_predictions = acd_outputs['logits'].argmax(-1)
# ce_result = tf_id_to_name[ce_predictions[0]]
# print(acd_outputs['logits'])
# print(ce_predictions)
# print(ce_result)

In [17]:
# asc_encoded = asc_tokenizer(form, pair, truncation=True, return_tensors="pt")
# asc_encoded = {k:v.to(device) for k,v in asc_encoded.items()}

# with torch.no_grad():
#     asc_outputs = asc_model(**asc_encoded)

# pc_predictions = asc_outputs['logits'].argmax(-1)
# pc_result = polarity_id_to_name[pc_predictions[0]]
# print(asc_outputs['logits'])
# print(pc_predictions)
# print(pc_result)