## Infer

In [1]:
# Shell escape: print the version of the CUDA compiler (nvcc) visible to this kernel.
!nvcc -V

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2019 NVIDIA Corporation
Built on Sun_Jul_28_19:07:16_PDT_2019
Cuda compilation tools, release 10.1, V10.1.243


In [2]:
import sys, subprocess
# Install the notebook's dependencies with pip, using the running Python
# interpreter (sys.executable). The call's return code is the cell's output.
pip_packages = [
    'gluonnlp', 'torch', 'sentencepiece', 'tqdm',
    'onnxruntime', 'transformers',
    'git+https://git@github.com/SKTBrain/KoBERT.git@master',
]
subprocess.call([sys.executable, '-m', 'pip', 'install', *pip_packages])

0

## KoBERT 라이브러리 등 로딩

`from model import ...` 에서 에러가 발생하면 커널(Kernel)을 재시작한 뒤 다시 실행하세요.

In [3]:
import io, os
import random
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet.gluon import nn, rnn
from mxnet import nd, gluon, autograd
import gluonnlp as nlp
import time
import itertools
import random
import sys

sys.path.append(os.path.join(os.getcwd(), 'src'))

from model import get_mxnet_kobert_model
from kobert.utils import get_tokenizer
from bert import BERTDatasetTransform, BERTDataset, BERTClassifier

import warnings
warnings.filterwarnings('ignore')

## 사용할 GPU 할당

In [4]:
# Choose the compute device: the first GPU when MXNet reports at least one,
# otherwise the CPU. `ctx` is read by the cells below.
num_gpus = mx.context.num_gpus()
print("Number of GPUS:", num_gpus)
ctx = mx.gpu(0) if num_gpus > 0 else mx.cpu(0)
print("GPU is assigned" if num_gpus > 0 else "CPU is assigned")

Number of GPUS: 4
GPU is assigned


## KoBERT 모델 및 vocab 로딩

In [5]:
# Load the KoBERT base network (decoder and classifier heads disabled)
# together with its vocabulary, placing the model on `ctx`.
bert_base, vocab = get_mxnet_kobert_model(
    use_decoder=False,
    use_classifier=False,
    ctx=ctx,
)

using cached model
using cached model


## Sentence Classification Classifier 생성

In [6]:
# Class labels as strings ('0' = not similar, '1' = similar) and the path of
# the saved classifier parameters that get_bert_classifier() loads.
all_labels = ['0', '1']
param_path = 'model_save/net_nsmc.params'

def get_bert_classifier(bert_base, classes, ctx, model_params_path):
    """Build a GluonNLP ``BERTClassifier`` on ``bert_base`` and load saved weights.

    Args:
        bert_base: BERT encoder that the classification head is attached to.
        classes: collection of class labels; only its length is used, as the
            number of output classes.
        ctx: MXNet context on which the classifier layer is initialized.
        model_params_path: path of the parameter file passed to
            ``load_parameters``.

    Returns:
        The hybridized classifier with parameters loaded from
        ``model_params_path``.
    """
    n_classes = len(classes)
    print("num of classes: ", n_classes)

    net = nlp.model.BERTClassifier(bert_base, num_classes=n_classes, dropout=0.5)

    # The encoder is supplied by the caller; only the newly created
    # classifier layer needs to be initialized here.
    net.classifier.initialize(init=mx.init.Normal(0.02), ctx=ctx)
    net.hybridize(static_alloc=True)
    net.load_parameters(model_params_path)

    return net

# Build the two-class sentence-pair classifier on top of KoBERT and load the
# saved parameters from `param_path`.
ko_sent_sims_classifier = get_bert_classifier(
    bert_base, all_labels, ctx, param_path)




num of classes:  2


## 토크나이저 생성

In [7]:
# Tokenizer resource returned by kobert.utils.get_tokenizer().
tokenizer = get_tokenizer() # KoBERT tokenizer
# GluonNLP BERTSPTokenizer built from that resource and the KoBERT vocab;
# lower=False is passed explicitly.
kobert_tokenizer = nlp.data.BERTSPTokenizer(tokenizer, vocab, lower=False)

using cached model


## Data 준비

In [8]:
# Read fields 1-3 of the tab-separated test file (sentence 1, sentence 2,
# label), discarding the first row (presumably a header line).
dataset_test = nlp.data.TSVDataset(
    'preproc/L-test/test.tab',
    field_indices=[1, 2, 3],
    num_discard_samples=1,
)
dataset_test[0]

['하루한끼 깻잎', '밀양 깻잎', '1']

In [9]:
from bert2 import data
# Alias for the KoBERT vocab; not referenced by any of the cells shown below.
vocabulary = vocab
def transform_bert_input_type(dataset, idx, bert_tokenizer, max_seq_length=100, pair=True):
    '''
    Convert raw (sentence1, sentence2, label) samples into BERT model inputs.

    Each transformed sample contains, in order:
        token_id
        segment_id
        valid_length
        label

    Args:
        dataset: dataset object exposing ``transform(fn)``.
        idx: unused; kept so existing positional calls keep working.
        bert_tokenizer: tokenizer passed to ``BERTDatasetTransform``.
        max_seq_length: maximum sequence length passed to the transform
            (padding is enabled).
        pair: True to treat each sample as a sentence pair, False for
            single-sentence classification. The argument is honored; it was
            previously overwritten with True inside the function.

    Returns:
        The result of ``dataset.transform(...)`` with the BERT transform applied.
    '''
    # The labels for the two classes [(0 = not similar) or  (1 = similar)]
    all_labels = ["0", "1"]

    # for regression task, set class_labels=None
    # for inference without label available, set has_label=False
    transform = data.transform.BERTDatasetTransform(bert_tokenizer, max_seq_length,
                                                    class_labels=all_labels,
                                                    has_label=True,
                                                    pad=True,
                                                    pair=pair)

    return dataset.transform(transform)
    

# Transform the whole test set into BERT inputs (the second argument, idx, is
# not used by the function) and preview the first transformed sample.
data_test = transform_bert_input_type(dataset_test, 0, kobert_tokenizer)
data_test[0]

(array([   2, 4937, 7828, 5649,  517,    0, 7146,    3, 2181, 6853,  517,
           0, 7146,    3,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
           1], dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0

## 샘플 추론

In [23]:
def eval_sample(sent_classifier, dataset_test, data_test, num_test=3, verbose=False):
    """Classify the first ``num_test`` samples and print per-sample details.

    Misclassified samples are always printed. Correctly classified samples
    are printed only when ``verbose`` is true. Nothing is returned.

    Args:
        sent_classifier: model called with (token ids, segment ids, valid
            length); its output is indexed with ``[0]`` to get the logits of
            the single sample in the batch.
        dataset_test: raw samples, indexed in parallel with ``data_test``;
            used for display only.
        data_test: transformed samples. Element 0 holds token ids, 1 segment
            ids, 2 the valid length, and the last element the label.
        num_test: number of leading samples to evaluate; capped at
            ``len(data_test)`` so an oversized value no longer raises
            IndexError.
        verbose: also print details for correct predictions.

    Note:
        Input arrays are moved to the notebook-level ``ctx`` device.
    """

    def print_detail(raw_input, bert_input, logit, prob, actual_label, pred_label):
        print("######### 문장1 , 문장2, 유사도 [0: No, 1: Yes] ########")
        print(raw_input)
        print("### Token ID ###")
        print(bert_input)
        print("### Logit: 마지막 2개의 뉴런의 값  [0:유사 안함, 1: 유사] ###", logit)
        print('\n')
        print("### Probability: 마지막 2개의 확률 값 [0:유사 안함, 1: 유사]  ###", prob)
        print('\n')
        print(f"Actual Value, Pred Value: {actual_label} , {pred_label}")
        print('\n\n')

    # Never index past the end of the data set.
    num_test = min(num_test, len(data_test))

    for i in range(num_test):
        # Fetch the sample once instead of re-indexing data_test for every field.
        sample = data_test[i]

        # Wrap each field in a list to form a batch of size one.
        input_ids = mx.nd.array([sample[0]]).as_in_context(ctx)
        token_type_ids = mx.nd.array([sample[1]]).as_in_context(ctx)
        valid_length = mx.nd.array([sample[2]]).as_in_context(ctx)

        logit = sent_classifier(input_ids, token_type_ids, valid_length)
        prob = mx.nd.softmax(logit[0])
        pred_label = int(prob.asnumpy().argmax())  # index of the most probable class

        actual_label = int(sample[-1])  # Actual Value
        if actual_label != pred_label:
            print(f"{i} : @@@ INCORRECT Prediction @@@ ")
            print_detail(dataset_test[i], sample[0], logit, prob, actual_label, pred_label)
        elif verbose:
            print(f"{i} : %%% CORRECT Prediction %%%%")
            print_detail(dataset_test[i], sample[0], logit, prob, actual_label, pred_label)

            
            

# Check the first 20 test samples; only misclassified ones are printed.
# Pass verbose=True to print the details of correct predictions as well.
eval_sample(ko_sent_sims_classifier, dataset_test, data_test, num_test=20, verbose=False)        


1 : @@@ INCORRECT Prediction @@@ 
######### 문장1 , 문장2, 유사도 [0: No, 1: Yes] ########
['산지뚝심) 금산 추부 GAP 깻잎', '밀양 깻잎', '1']
### Token ID ###
[   2 2640 7318 5985 6745  517   40 1235 6516 4541 6398  650  266  517
    0 7146    3 2181 6853  517    0 7146    3    1    1    1    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1    1    1    1    1    1    1    1    1    1    1    1    1
    1    1]
### Logit: 마지막 2개의 뉴런의 값  [0:유사 안함, 1: 유사] ### 
[[ 1.8476654 -2.5164464]]
<NDArray 1x2 @gpu(0)>


### Probability: 마지막 2개의 확률 값 [0:유사 안함, 1: 유사]  ### 
[0.98743397 0.01256604]
<NDArray 2 @gpu(0)>


Actual Value, Pred Value: 1 , 0



3 : @@@ INCORRECT Prediction @@@ 
######### 문장1 , 문장2, 유사도 [0: No, 1: Yes] ########
['산지뚝심) 금산 추부 GAP 깻잎', '밀양 깻잎', '1']


## 토큰 분석
문장이 어떻게 토큰화되었는지 확인합니다.

In [20]:
        
def analyze_tokens(vocab, dataset_test, data_test, sample_id, max_tokens=33):
    """Print one raw sample followed by the tokens its token ids map to.

    Args:
        vocab: object whose ``idx_to_token`` maps a token id to its token string.
        dataset_test: raw samples; ``dataset_test[sample_id]`` is printed as is.
        data_test: transformed samples; element 0 of a sample holds its token ids.
        sample_id: index of the sample to inspect.
        max_tokens: maximum number of leading tokens to print. The default of
            33 matches the previously hard-coded limit; ``None`` prints all.

    Returns:
        None. The raw sample and the token list are written to stdout.
    """
    idx2token = vocab.idx_to_token
    print(dataset_test[sample_id])

    token_ids = data_test[sample_id][0]
    tokens = [idx2token[idx] for idx in itertools.islice(token_ids, max_tokens)]
    print(tokens)


In [24]:
# Show how test samples 1 and 8 were split into tokens.
analyze_tokens(vocab, dataset_test, data_test, sample_id=1)
analyze_tokens(vocab, dataset_test, data_test, sample_id=8)

['산지뚝심) 금산 추부 GAP 깻잎', '밀양 깻잎', '1']
['[CLS]', '▁산', '지', '뚝', '심', '▁', ')', '▁금', '산', '▁추', '부', '▁G', 'AP', '▁', '[UNK]', '잎', '[SEP]', '▁밀', '양', '▁', '[UNK]', '잎', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']
["Hav'eat 경상도 친환경 깻잎", '산지뚝심) 금산 추부 GAP 깻잎', '1']
['[CLS]', '▁H', 'a', 'v', "▁'", '▁', 'e', 'at', '▁경', '상', '도', '▁친환경', '▁', '[UNK]', '잎', '[SEP]', '▁산', '지', '뚝', '심', '▁', ')', '▁금', '산', '▁추', '부', '▁G', 'AP', '▁', '[UNK]', '잎', '[SEP]', '[PAD]']


## 종합 테스트

In [25]:
def evaluate_model(model, dataset_test, data_test, labels, ctx):
    """Classify every sample in ``data_test`` and print a classification report.

    Args:
        model: classifier called with (token ids, segment ids, valid length);
            its output is indexed with ``[0]`` to get the logits of the single
            sample in the batch. This argument is the model that gets
            evaluated; the notebook-level classifier was previously used
            regardless of what was passed.
        dataset_test: raw samples; unused, kept for signature compatibility.
        data_test: transformed samples. Element 0 holds token ids, 1 segment
            ids, 2 the valid length, and the last element the label.
        labels: integer class ids passed to ``classification_report`` so the
            report rows stay aligned with ``target_names`` even when a class
            is absent from the data.
        ctx: MXNet context the input arrays are moved to.

    Returns:
        None. The report is printed to stdout.
    """
    # Imported locally: scikit-learn is not among the notebook's top-level imports.
    from sklearn.metrics import classification_report

    y_true = []
    y_pred = []

    for i in range(len(data_test)):
        # Fetch the sample once instead of re-indexing data_test for every field.
        sample = data_test[i]

        # Wrap each field in a list to form a batch of size one.
        input_ids = mx.nd.array([sample[0]]).as_in_context(ctx)
        token_type_ids = mx.nd.array([sample[1]]).as_in_context(ctx)
        valid_length = mx.nd.array([sample[2]]).as_in_context(ctx)

        logit = model(input_ids, token_type_ids, valid_length)
        prob = mx.nd.softmax(logit[0])

        y_true.append(int(sample[-1]))
        y_pred.append(int(prob.asnumpy().argmax()))  # index of the most probable class

    print(classification_report(y_true, y_pred, labels=labels,
                                target_names=['0:Non-Similar', '1:Similar']))


# Integer class ids for the report, then evaluate the classifier on the
# whole test set.
labels=[0,1]
evaluate_model(ko_sent_sims_classifier, dataset_test, data_test, labels, ctx)


               precision    recall  f1-score   support

0:Non-Similar       0.62      1.00      0.77        10
    1:Similar       1.00      0.40      0.57        10

     accuracy                           0.70        20
    macro avg       0.81      0.70      0.67        20
 weighted avg       0.81      0.70      0.67        20

