# 면접 답변 의도 분석


In [1]:
# Compatibility shim for code that still refers to `np.bool` and otherwise fails with
#   AttributeError: module 'numpy' has no attribute 'bool'.
# https://stackoverflow.com/questions/74893742/how-to-solve-attributeerror-module-numpy-has-no-attribute-bool
# NOTE(review): presumably one of the libraries imported in the next cell needs this — confirm which.
import numpy as np

# Install the alias only when it is really missing, so a NumPy build that
# already provides `np.bool` is left untouched instead of being overwritten.
if not hasattr(np, "bool"):
    np.bool = np.bool_

In [2]:
import os

import gluonnlp as nlp
import pandas as pd
import torch
from kobert import BERTDataset
from kobert_tokenizer import KoBERTTokenizer
from tqdm.notebook import tqdm



In [3]:
# Hub identifier of the checkpoint the tokenizer is loaded from.
PRETRAINED = "skt/kobert-base-v1"

# Loading emits a tokenizer-class mismatch warning (kept in the recorded output below).
tokenizer = KoBERTTokenizer.from_pretrained(PRETRAINED)

# gluonnlp vocabulary built from the tokenizer's vocab file, with "[PAD]" as the padding token.
# `predict` hands this vocabulary to BERTDataset.
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token="[PAD]")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'XLNetTokenizer'. 
The class this function is called from is 'KoBERTTokenizer'.


In [4]:
# Restrict this process to GPU "4", with devices enumerated in PCI bus order.
# NOTE(review): the index is specific to the machine this was run on — adjust as needed.
os.environ.update(
    {
        "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
        "CUDA_VISIBLE_DEVICES": "4",
    }
)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda')

In [5]:
# Validation split used for evaluation: tab-separated, UTF-8.
# The evaluation loop reads its `document` (answer text) and `label` columns.
dataset_test = pd.read_csv(
    "./data/intent_answers_Validation.tsv",
    encoding="utf8",
    sep="\t",
)

# Preview the first three rows.
dataset_test.head(3)

Unnamed: 0,id,document,label
0,0,어 그 리더십을 사용해서 또 어 관련자들을 또 독려하고 그 목표를 향해서 그 목표를...,11
1,1,그래서 저는 다양한 사람들과 문제에 대해서 함께 고민하고 새로운 아이디어 또 해결할...,11
2,2,저는 제가 팀장 성향이라고 생각합니다.,3


In [6]:
# Load the fine-tuned classifier (the recorded output below shows a BERTClassifier).
#
# map_location: place the tensors on the GPU when one is available and on the CPU
#   otherwise (the same rule used to choose `device`), so a checkpoint saved on a
#   GPU still loads on a CPU-only host or when the saved device index is not visible.
# weights_only=False: the file holds a whole pickled module rather than a state
#   dict; newer PyTorch releases default to weights_only=True, which rejects it.
# SECURITY: torch.load unpickles the file and can execute arbitrary code —
#   only load checkpoints from a trusted source.
model = torch.load(
    "./models/KoBERT/KoBERT.pt",
    map_location="cuda" if torch.cuda.is_available() else "cpu",
    weights_only=False,
)
model

BERTClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [7]:
# Short alias for the tokenizer; `predict` passes `tok` to BERTDataset.
tok = tokenizer
tok

KoBERTTokenizer(name_or_path='skt/kobert-base-v1', vocab_size=8002, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=True, special=True),
}

In [8]:
# Setting parameters
max_len = 128  # forwarded to BERTDataset in `predict`
batch_size = 64  # DataLoader batch size used in `predict`

In [9]:
def predict(sentence):
    """Classify a single sentence with the loaded model.

    The sentence is wrapped in a one-row dataset (with a placeholder label
    "0"), run through `BERTDataset` and the module-level `model`, and the index
    of the largest logit is returned.

    Args:
        sentence: Text to classify.

    Returns:
        The arg-max class index as returned by ``np.argmax`` (a NumPy integer),
        or ``0`` if the data loader yields no batch.
    """
    # The label "0" is a dummy value; it is never used for the prediction.
    dataset = [[sentence, "0"]]
    # NOTE(review): positional args are presumably (sent_idx=0, label_idx=1, ...,
    # pad=True, pair=False) — confirm against kobert.BERTDataset.
    test = BERTDataset(dataset, 0, 1, tok, vocab, max_len, True, False)
    # num_workers=0: the dataset holds exactly one sample, so starting worker
    # processes on every call only adds overhead.
    test_dataloader = torch.utils.data.DataLoader(
        test, batch_size=batch_size, num_workers=0
    )

    model.eval()

    answer = 0

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, _label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)
            # valid_length is handed to the model as-is (not moved to `device`).
            out = model(token_ids, valid_length, segment_ids)
            for logits in out:
                answer = np.argmax(logits.detach().cpu().numpy())
    return answer

In [10]:
# Lookup table with `category`, `expression` and `label` columns (tab-separated, UTF-8).
# The evaluation loop uses it to turn a class id into a readable name.
intent_labels = pd.read_csv(
    "./data/intent_labels.tsv",
    encoding="utf8",
    sep="\t",
)
intent_labels

Unnamed: 0,category,expression,label
0,attitude,m_critical,0
1,attitude,m_direct,1
2,attitude,m_holistic,2
3,background,c_person,3
4,background,c_private,4
5,background,c_value,5
6,etc,c_sincere_co,6
7,etc,c_sincere_job,7
8,personality,c_adp,8
9,personality,c_chl,9


In [11]:
# Evaluate the classifier on the validation set, one sentence per `predict` call.
# `acc` is the running COUNT of correct predictions (not a ratio).
acc = 0
n_total = len(dataset_test)

for idx, row in tqdm(dataset_test.iterrows(), total=n_total):
    sentence = row["document"]
    p = predict(sentence)
    if idx < 3:
        # Print the first few examples with readable label names.
        # NOTE(review): `.iloc` assumes the row position in `intent_labels`
        # equals the label id — confirm against the TSV.
        print(f"{sentence = }")
        print(f"predict: {intent_labels.iloc[p][['category','expression']].to_dict()}")
        print(f"actual: {intent_labels.iloc[row['label']][['category','expression']].to_dict()}")
        print()
    acc += int(p == row["label"])

# Accuracy in percent; undefined (NaN) for an empty dataset instead of raising ZeroDivisionError.
accuracy = 100 * (acc / n_total) if n_total else float("nan")
print(f"{acc} / {n_total} = {accuracy}")

  0%|          | 0/468 [00:00<?, ?it/s]

sentence = '어 그 리더십을 사용해서 또 어 관련자들을 또 독려하고 그 목표를 향해서 그 목표를 달성하도록 하는 그런 리더십이 있다고 생각하기 때문에 혼자 뿐만 아니라 다른 사람들의 또 이야기를 듣고 함께 해결하는 편이라고 생각합니다.'
predict: {'category': 'background', 'expression': 'c_person'}
actual: {'category': 'personality', 'expression': 'c_cop'}

sentence = '그래서 저는 다양한 사람들과 문제에 대해서 함께 고민하고 새로운 아이디어 또 해결할 수 있는 문제점들을 같이 찾아내고 해결하는 편입니다.'
predict: {'category': 'personality', 'expression': 'c_adp'}
actual: {'category': 'personality', 'expression': 'c_cop'}

sentence = '저는 제가 팀장 성향이라고 생각합니다.'
predict: {'category': 'background', 'expression': 'c_person'}
actual: {'category': 'background', 'expression': 'c_person'}

353 / 468 = 75.42735042735043
