# 버트 임베딩으로 유사도 구하기

In [1]:
import subprocess
import sys

# Install the notebook's dependencies with the pip that belongs to the running
# interpreter. KoBERT is installed straight from its GitHub master branch.
# The call is the last expression of the cell, so its return code is displayed.
subprocess.call(
    [
        sys.executable, '-m', 'pip', 'install',
        'gluonnlp', 'torch', 'sentencepiece', 'tqdm',
        'onnxruntime', 'transformers',
        'git+https://git@github.com/SKTBrain/KoBERT.git@master',
    ]
)

0

## KoBERT 라이브러리 등 로딩

만약 `import model`에서 에러가 발생하면 커널(Kernel)을 재시작하세요.

In [2]:
import io, os
import random
import pandas as pd
import numpy as np
import mxnet as mx
from mxnet.gluon import nn, rnn
from mxnet import nd, gluon, autograd
import gluonnlp as nlp
import time
import itertools
import random
import sys

# Add ./src (relative to the current working directory) to the import path.
# NOTE(review): presumably the local `model` and `bert` modules imported below
# live in that directory — confirm against the repository layout.
sys.path.append(os.path.join(os.getcwd(), 'src'))

from model import get_mxnet_kobert_model
from kobert.utils import get_tokenizer
from bert import BERTDatasetTransform, BERTDataset, BERTClassifier

import warnings
# Silence every warning for the rest of the session to keep cell output short.
warnings.filterwarnings('ignore')

## 사용할 GPU 할당

In [3]:
# Pick the compute context: the first GPU when MXNet reports at least one,
# otherwise the CPU. `ctx` is read later by the model loader and eval_sample.
num_gpus = mx.context.num_gpus()
print("Number of GPUS:", num_gpus)

if num_gpus > 0:
    ctx = mx.gpu(0)
    print("GPU is assigned")
else:
    ctx = mx.cpu(0)
    print("CPU is assigned")

Number of GPUS: 4
GPU is assigned


## KoBERT 모델 및 vocab 로딩

In [4]:
# Load the KoBERT model and its vocabulary onto the selected context.
# The decoder and classifier options are both switched off.
# NOTE(review): presumably this leaves only the encoder/pooler outputs that
# eval_sample unpacks as (token embeddings, sentence embedding) — confirm
# against get_mxnet_kobert_model.
kr_bert_base, vocab = get_mxnet_kobert_model(
    use_decoder=False,
    use_classifier=False,
    ctx=ctx,
)

using cached model
using cached model


## 버트 토크나이저 생성

In [5]:
# Obtain the KoBERT tokenizer resource, then wrap it together with the model
# vocabulary in a GluonNLP SentencePiece tokenizer. lower=False is passed
# explicitly to the wrapper.
tokenizer = get_tokenizer()
kobert_tokenizer = nlp.data.BERTSPTokenizer(
    tokenizer,
    vocab,
    lower=False,
)

using cached model


In [6]:
# Sanity check: tokenize one product name and display the resulting sub-word pieces.
kobert_tokenizer("오뚜기 잔라면 5개 패키지")

['▁오', '뚜', '기', '▁잔', '라면', '▁5', '개', '▁패키지']

## 버트 입력 트랜스포머 생성
- 자연어 --> 버트 입력 형태로 변경하는 변환기

In [7]:
from bert2 import data

# NOTE(review): `pair` is assigned here but is not passed to the transform
# below, which is built with a literal pair=False. Confirm whether this
# variable is still needed.
pair = True
max_seq_length = 64

# Build the converter that turns raw text into BERT inputs. It is configured
# for single-sentence, unlabeled, unpadded input.
transform = data.transform.BERTDatasetTransform(
    kobert_tokenizer,
    max_seq_length,
    has_label=False,
    pad=False,
    pair=False,
)

# Display the transform output for one sample; the input is a one-element list.
transform(["오뚜기 잔라면 5개 패키지"])

(array([   2, 3417, 5984, 5561, 3941, 6009,  611, 5357, 4820,    3],
       dtype=int32),
 array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32),
 array(10, dtype=int32))

## 샘플 추론

In [8]:
def eval_sample(sent, kobert_model, bert_input_transform, bert_tokenizer, verbose=False):
    '''
    Run one input through the BERT model and return its embeddings.

    Args:
        sent: Input passed unchanged to `bert_input_transform`; `sent[0]` is
            tokenized for the verbose printout.
        kobert_model: Model whose `forward(input_ids, token_type_ids,
            valid_length)` returns a 2-tuple.
        bert_input_transform: Callable producing an indexable result whose
            first three items are used as input ids, token type ids and
            valid length.
        bert_tokenizer: Tokenizer, used only for the verbose printout.
        verbose: When True, print the sentence, its tokens and the
            transformed sample.

    Returns:
        A tuple `(tokens_em, cls_em)` of NumPy arrays: the two model outputs,
        in the order the model returns them.
    '''
    sample = bert_input_transform(sent)

    # Wrap each field in a batch of one and move it to the module-level `ctx`.
    input_ids, token_type_ids, valid_length = (
        mx.nd.array([field]).as_in_context(ctx)
        for field in (sample[0], sample[1], sample[2])
    )

    tokens_em, cls_em = kobert_model.forward(input_ids, token_type_ids, valid_length)

    if verbose:
        print("Orginal Sentence: ", sent)
        print("Tokens: ", bert_tokenizer(sent[0]))
        print("Bert Input Transformation: ", sample)

    return tokens_em.asnumpy(), cls_em.asnumpy()

from scipy import spatial

def eval_cosine_dist(vec1, vec2, sent1, sent2, verbose=False):
    '''
    Return the cosine distance between two embedding arrays.

    If both inputs have more than two dimensions they are treated as
    token-level embeddings and averaged over axis 1 before comparison.
    Otherwise they are compared directly as sentence-level vectors.

    Both operands are flattened to 1-D before the SciPy call, because
    `scipy.spatial.distance.cosine` is defined for 1-D vectors only and
    recent SciPy releases raise ValueError for inputs shaped like (1, dim).

    NOTE(review): the flattening assumes a single-sample batch, i.e. shapes
    (1, tokens, dim) or (1, dim). A larger batch would be concatenated into
    one long vector — confirm this is never called with batch size > 1.

    Args:
        vec1: First embedding array.
        vec2: Second embedding array.
        sent1: Source of `vec1`; used only in the verbose printout.
        sent2: Source of `vec2`; used only in the verbose printout.
        verbose: When True, print the vector type, both sentences and the
            distance.

    Returns:
        The cosine distance computed by SciPy.
    '''
    # Plain boolean `and` rather than bitwise `&`: both sides are Python bools.
    if np.ndim(vec1) > 2 and np.ndim(vec2) > 2:
        # Collapse the token axis so each input becomes one averaged vector.
        vec1 = np.mean(vec1, axis=1)
        vec2 = np.mean(vec2, axis=1)
        vector_type = "token vectors"
    else:
        vector_type = "sentence vector"

    # Drop the leading batch axis; SciPy requires strictly 1-D inputs.
    dist = spatial.distance.cosine(np.ravel(vec1), np.ravel(vec2))

    if verbose:
        print("vector type: ", vector_type)
        print(f"sent1, sent2: {sent1}, {sent2}")
        print('class dist: {0}'.format(dist))
        print('\n')

    return dist




In [9]:

# Four product names: sent1 is the reference, and the others differ from it in
# wording, brand or quantity.
sent1 = ['오뚜기 매운 진라면 5개 입']
sent2 = ['오뚜기 진라면 5개 패키지']
sent3 = ['농심 신라면 5개 입']
sent4 = ['오뚜기 매운 진라면 10개 입']

# Embed the reference sentence verbosely, then the other three quietly.
tokens_em1, cls_em1 = eval_sample(
    sent1, kr_bert_base, transform, kobert_tokenizer, verbose=True
)
(tokens_em2, cls_em2), (tokens_em3, cls_em3), (tokens_em4, cls_em4) = (
    eval_sample(candidate, kr_bert_base, transform, kobert_tokenizer)
    for candidate in (sent2, sent3, sent4)
)

# For each candidate, compare against the reference twice: first with the
# sentence-level output, then with the averaged token-level output.
eval_cosine_dist(cls_em1, cls_em2, sent1, sent2, verbose=True)
eval_cosine_dist(tokens_em1, tokens_em2, sent1, sent2, verbose=True)

eval_cosine_dist(cls_em1, cls_em3, sent1, sent3, verbose=True)
eval_cosine_dist(tokens_em1, tokens_em3, sent1, sent3, verbose=True)

eval_cosine_dist(cls_em1, cls_em4, sent1, sent4, verbose=True)
# Kept as the final bare expression so the notebook displays its return value.
eval_cosine_dist(tokens_em1, tokens_em4, sent1, sent4, verbose=True)


Orginal Sentence:  ['오뚜기 매운 진라면 5개 입']
Tokens:  ['▁오', '뚜', '기', '▁매', '운', '▁진', '라면', '▁5', '개', '▁입']
Bert Input Transformation:  (array([   2, 3417, 5984, 5561, 1986, 7010, 4360, 6009,  611, 5357, 3836,
          3], dtype=int32), array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32), array(12, dtype=int32))
vector type:  sentence vector
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['오뚜기 진라면 5개 패키지']
class dist: 0.3838374614715576


vector type:  token vectors
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['오뚜기 진라면 5개 패키지']
class dist: 0.08472758531570435


vector type:  sentence vector
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['농심 신라면 5개 입']
class dist: 0.20076775550842285


vector type:  token vectors
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['농심 신라면 5개 입']
class dist: 0.12713992595672607


vector type:  sentence vector
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['오뚜기 매운 진라면 10개 입']
class dist: 0.06314051151275635


vector type:  token vectors
sent1, sent2: ['오뚜기 매운 진라면 5개 입'], ['오뚜기 매운 진라면 10개 입']
class dist: 0.011301159858703613

0.011301159858703613