In [1]:
import os
import urllib.request
import zipfile
import tarfile
import glob
import io

### 사전 학습된 BERT 모델 다운로드

In [2]:
data_dir = "./data/"
if not os.path.exists(data_dir):
    os.mkdir(data_dir)

In [3]:
# Vocabulary 다운로드
vocab_dir = "./vocab/"
if not os.path.exists(vocab_dir):
    os.mkdir(vocab_dir)

save_path="./vocab/bert-base-uncased-vocab.txt"
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
urllib.request.urlretrieve(url, save_path)

('./vocab/bert-base-uncased-vocab.txt',
 <http.client.HTTPMessage at 0x7f0c022da910>)

In [4]:
# Pre-trained BERT의 가중치 다운로드
weights_dir = "./weights/"
if not os.path.exists(weights_dir):
    os.mkdir(weights_dir)

save_path = "./weights/bert-base-uncased.tar.gz"
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz"
urllib.request.urlretrieve(url, save_path)

# 압축 해제
archive_file = "./weights/bert-base-uncased.tar.gz"  # Uncased : 소문자 형식
tar = tarfile.open(archive_file, 'r:gz')
tar.extractall('./weights/')  
tar.close()  

### Data

IMDb : http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [5]:
target_dir_path="./data/"

if not os.path.exists(target_dir_path):
    os.mkdir(target_dir_path)
    
url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
save_path = "./data/aclImdb_v1.tar.gz"
urllib.request.urlretrieve(url, save_path)

tar = tarfile.open('./data/aclImdb_v1.tar.gz')
tar.extractall('./data/') 
tar.close()  

In [6]:
target_dir_path="./data/aclImdb/"

if os.path.exists(target_dir_path):
    # Train
    f = open('./data/IMDb_train.tsv','w')

    path = './data/aclImdb/train/pos/'
    for fname in glob.glob(os.path.join(path,'*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()
            text = text.replace('\t', " ")
            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)

    path = './data/aclImdb/train/neg/'
    for fname in glob.glob(os.path.join(path,'*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()
            text = text.replace('\t', " ")
            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)

    f.close()
    
    # Test
    f = open('./data/IMDb_test.tsv','w')

    path = './data/aclImdb/test/pos/'
    for fname in glob.glob(os.path.join(path,'*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()
            text = text.replace('\t', " ")
            text = text+'\t'+'1'+'\t'+'\n'
            f.write(text)

    path = './data/aclImdb/test/neg/'

    for fname in glob.glob(os.path.join(path,'*.txt')):
        with io.open(fname, 'r', encoding="utf-8") as ff:
            text = ff.readline()
            text = text.replace('\t', " ")        
            text = text+'\t'+'0'+'\t'+'\n'
            f.write(text)

    f.close()

## BERT 구현

In [7]:
import math
import numpy as np

import torch
from torch import nn


### 0. BERT_Base 네트워크의 Config 불러오기

In [8]:
import json

config_file = "./weights/bert_config.json"

# json 형식으로 읽기
json_file = open(config_file, 'r')
config = json.load(json_file)

config

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [9]:
config['hidden_size']

768

In [10]:
! pip install attrdict

Collecting attrdict
  Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Installing collected packages: attrdict
Successfully installed attrdict-2.0.1


In [11]:
# dictionary 변수를 object 변수로
from attrdict import AttrDict 

config = AttrDict(config)
config

AttrDict({'attention_probs_dropout_prob': 0.1, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 512, 'num_attention_heads': 12, 'num_hidden_layers': 12, 'type_vocab_size': 2, 'vocab_size': 30522})

In [12]:
config.hidden_size

768

### 1. LayerNormalization 

Tensorflow 버전

Tensor의 마지막 채널 (768차원으로 표현되는 개별 단어의 vector representation)에 대한 layer 정규화

divide-by-zero를 방지하는 eps 추가

![image](https://user-images.githubusercontent.com/44194558/151758377-334c467c-20ec-4472-b66b-b14b6b6d40a6.png)

![image](https://user-images.githubusercontent.com/44194558/151758398-bdc09ad0-1840-4da9-906f-c7225b97f2e4.png)

In [13]:
class BertLayerNorm(nn.Module):
    
    def __init__(self, hidden_size, eps=1e-12):
        super(BertLayerNorm, self).__init__()
        self.gamma = nn.Parameter(torch.ones(hidden_size))  # weights
        self.beta = nn.Parameter(torch.zeros(hidden_size))  # bias
        self.variance_epsilon = eps

    def forward(self, x):
        u = x.mean(-1, keepdim=True)  # 개별 단어 벡터마다 평균 계산
        s = (x - u).pow(2).mean(-1, keepdim=True)  # 분산 계산 (broad casting)
        x = (x - u) / torch.sqrt(s + self.variance_epsilon)

        return self.gamma * x + self.beta

### 2. Embeddings

![image](https://user-images.githubusercontent.com/44194558/151758604-d80865e4-7fab-42e5-a83d-838d75c959d3.png)

참고 : https://github.com/gymoon10/Paper-Review/blob/main/NLP/BERT.md

In [14]:
class BertEmbeddings(nn.Module):  

    def __init__(self, config):
        super(BertEmbeddings, self).__init__()

        # Token Embedding (단어 ID - > 단어 벡터)
        self.word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=0)  # vocab_size=30522 / hidden_size=768
        
        # Positional Embedding
        self.position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size)  # max_position_embeddings=512 (한 문장은 512개의 단어로 구성)

        # Segment(sentence) Embedding
        self.token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size)  # type_vocab_size=2
        
        # LayerNormalization
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        
        # Dropout
        self.dropout = nn.Dropout(config.hidden_dropout_prob)


    def forward(self, input_ids, token_type_ids=None):
        '''
        입력
        input_ids:  [batch_size, seq_len] 문장의 단어 ID 나열, seq_len=512
        token_type_ids: [batch_size, seq_len] 각 단어가 1번째 문장인지, 2번째 문장인지를 나타내는 id
        
        출력
        embeddings : [batch_size, seq_len, hidden_size], hidden_size=768
        '''
        # Token Embedding
        words_embeddings = self.word_embeddings(input_ids)  

        # Segment Embedding
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)  # 문장의 모든 단어를 첫 번째 문장으로(0)
        token_type_embeddings = self.token_type_embeddings(token_type_ids)

        # Positional Embedding
        seq_length = input_ids.size(1)  # seq_length=512
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)  # torch.Size([seq_length])
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # [batch, seq_length]로 차원 확장 (각 구성 요소는 동일, 일종의 행 복사)
        position_embeddings = self.position_embeddings(position_ids)
        
        # Output Embedding
        embeddings = words_embeddings + position_embeddings + token_type_embeddings

        embeddings = self.LayerNorm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings  # [batch_size, seq_len, hidden_size] 

#### Process of Embedding

In [15]:
import random

input_ids = []
for _ in range(64):
    res = []
    for _ in range(512):
        res.append(random.randrange(1, 10000))
    input_ids.append(res)

token_type_ids = []
for _ in range(64):
    res = []
    for _ in range(512):
        res.append(random.randrange(0, 2))
    token_type_ids.append(res)

input_ids = torch.LongTensor(input_ids)
print('input_ids :', input_ids.shape)

token_type_ids = torch.LongTensor(token_type_ids)
print('token_type_ids :', token_type_ids.shape)

input_ids : torch.Size([64, 512])
token_type_ids : torch.Size([64, 512])


In [16]:
# Word Embedding
word_embeddings = nn.Embedding(
            config.vocab_size, config.hidden_size, padding_idx=0)

word_embeddings = word_embeddings(input_ids)
print('word embeddings :', word_embeddings.shape)

# Segment Embedding
token_type_embeddings = nn.Embedding(
            config.type_vocab_size, config.hidden_size)

token_type_embeddings = token_type_embeddings(token_type_ids)
print('token_type_embeddings :', token_type_embeddings.shape)

# Positional Embedding
seq_length = input_ids.size(1)  # 512
position_ids = torch.arange(
            seq_length, dtype=torch.long, device=input_ids.device)  # torch.Size([512])
position_ids = position_ids.unsqueeze(0).expand_as(input_ids)  # (64, 512)

position_embeddings = nn.Embedding(
            config.max_position_embeddings, config.hidden_size)

position_embeddings = position_embeddings(position_ids)
print('position_embeddings :', position_embeddings.shape)

embeddings = word_embeddings + position_embeddings + token_type_embeddings

word embeddings : torch.Size([64, 512, 768])
token_type_embeddings : torch.Size([64, 512, 768])
position_embeddings : torch.Size([64, 512, 768])


In [17]:
position_ids

tensor([[  0,   1,   2,  ..., 509, 510, 511],
        [  0,   1,   2,  ..., 509, 510, 511],
        [  0,   1,   2,  ..., 509, 510, 511],
        ...,
        [  0,   1,   2,  ..., 509, 510, 511],
        [  0,   1,   2,  ..., 509, 510, 511],
        [  0,   1,   2,  ..., 509, 510, 511]])

### 3. BERT Layer

Transformer에 해당

- BertAttention : self-attention 계산
- BertIntermediate : self-attention의 출력 처리
- BertOutput : BertAttention + BertIntermediate

<br/>

**입력** : [batch_size, seq_len, hidden_size] - [64, 512, 768]

**출력** : [batch_size, seq_len, hidden_size] - [64, 512, 768]

<br/>

**구성**

1. BertAttention

   - BertSelfAttention
   - BertSelfOutput

2. BertIntermediate

3. BertOutput

#### 3.1 BertSelfOutput

BertSelfAttention의 출력을 처리하는 F.C layer



In [18]:
class BertSelfOutput(nn.Module):

    def __init__(self, config):
        super(BertSelfOutput, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        '''
        hidden_states: BertSelfAttention의 출력 텐서
        input_tensor: Embeddings 모듈 또는 앞단의 BertLayer의 출력
        '''
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)

        return hidden_states

#### 3.2 BertSelfAttention & BertAttention

**입력** : [seq_len, hidden_size] = [512, 768]  (Embedding 모듈의 출력 or 앞 단의BertLayer 출력)

In [19]:
class BertSelfAttention(nn.Module):

    def __init__(self, config):
        super(BertSelfAttention, self).__init__()

        self.num_attention_heads = config.num_attention_heads  # 12

        self.attention_head_size = int(
            config.hidden_size / config.num_attention_heads)  # 768/12=64
        self.all_head_size = self.num_attention_heads * self.attention_head_size  # =hidden_size=768

        self.query = nn.Linear(config.hidden_size, self.all_head_size)  # nn.Linear(768, 768)
        self.key = nn.Linear(config.hidden_size, self.all_head_size)
        self.value = nn.Linear(config.hidden_size, self.all_head_size)

        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)

    def transpose_for_scores(self, x):  
        '''
        Multi-Head-Attention 용으로 텐서의 형태 변환
        [batch_size, seq_len, hidden] -> [batch_size, 12, seq_len, hidden / 12]  /  [64, 512, 768] -> [64, 12, 512, 64]
        ''' 
        # 일종의 리스트 합 연산
        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)  # torch.size([64, 512]) + torch.size([12, 64])
        x = x.view(*new_x_shape)  # torch.Size([64, 512, 12, 64])

        return x.permute(0, 2, 1, 3)  # torch.Size([64, 12, 512, 64]) - [batch, heads, seq_len, hidden / heads]

    def forward(self, hidden_states, attention_mask, attention_show_fig=False):
        '''
        hidden_states: Embeddings 모듈 or 앞 단의 BertLayer 출력 - [64, 512, 768]
        attention_mask: Transformer의 마스크와 같은 기능의 마스킹
        attention_show_flg: Self-Attention의 가중치를 반환할지의 플래그
        '''
        # Q K V (Multi-Head-Attention 전부를 한꺼번에 변환)
        mixed_query_layer = self.query(hidden_states)
        mixed_key_layer = self.key(hidden_states)
        mixed_value_layer = self.value(hidden_states)

        # Multi-Head-Attention용으로 텐서 변환
        query_layer = self.transpose_for_scores(mixed_query_layer)
        key_layer = self.transpose_for_scores(mixed_key_layer)
        value_layer = self.transpose_for_scores(mixed_value_layer)
        
        # Attention score
        attention_scores = torch.matmul(
            query_layer, key_layer.transpose(-1, -2))  # [64, 12, 512, 64] x [64, 12, 64, 512] = [64, 12, 512, 512]
        attention_scores = attention_scores / math.sqrt(self.attention_head_size)  # 한 문장을 구성하는 512개 단어들 간의 정규화된 유사도

        # Masking (masking된 부분은 -inf, 나머지는 0)
        attention_scores = attention_scores + attention_mask
        
        # Attention dist (정규화)
        attention_probs = nn.Softmax(dim=-1)(attention_scores)
        attention_probs = self.dropout(attention_probs)

        # Attention map
        context_layer = torch.matmul(attention_probs, value_layer)  # [64, 12, 512, 512] x [64, 12, 512, 64] = [64, 12, 512, 64]
        
        # Multi-Head-Attention의 텐서 형식을 원래대로
        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # [64, 512, 12, 64]
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)  # [64, 512, 768]
        context_layer = context_layer.view(*new_context_layer_shape)  # [64, 512, 768]

        if attention_show_fig == True:
            return context_layer, attention_probs

        elif attention_show_fig == False:
            return context_layer

In [20]:
class BertAttention(nn.Module):
    '''BertLayer 모듈의 Self-Attention 부분'''
    def __init__(self, config):
        super(BertAttention, self).__init__()
        self.selfattn = BertSelfAttention(config)
        self.output = BertSelfOutput(config)

    def forward(self, input_tensor, attention_mask, attention_show_fig=False):
        '''
        input_tensor: Embeddings 모듈 or 앞 단의 BertLayer 출력
        attention_mask: Transformer의 마스크와 같은 기능의 마스킹
        attention_show_flg: Self-Attention의 가중치를 반환할지의 플래그
        '''
        if attention_show_fig == True:
            self_output, attention_probs = self.selfattn(input_tensor, attention_mask, attention_show_fig)
            attention_output = self.output(self_output, input_tensor)

            return attention_output, attention_probs

        elif attention_show_fig == False:
            self_output = self.selfattn(input_tensor, attention_mask, attention_show_fig)
            attention_output = self.output(self_output, input_tensor)
            
            return attention_output

#### 3.2.1 Process of BertSelfAttention

In [21]:
# Input
hidden_states = embeddings
print('hidden_states :', hidden_states.shape)
print()

num_attention_heads = config.num_attention_heads  # 12
attention_head_size = int(
            config.hidden_size / config.num_attention_heads)  # 768/12=64
all_head_size = num_attention_heads * attention_head_size  # =hidden_size=768

# Query Key Value
query = nn.Linear(config.hidden_size, all_head_size)  # nn.Linear(768, 768)
key = nn.Linear(config.hidden_size, all_head_size)
value = nn.Linear(config.hidden_size, all_head_size)

mixed_query_layer = query(hidden_states)
mixed_key_layer = key(hidden_states)
mixed_value_layer = value(hidden_states)
print('mixed query key value :', mixed_query_layer.shape)

# MHA용으로 텐서 변환 (transpose_for_scores)
def transpose_for_scores(x):
    new_x_shape = x.size()[:-1] + (num_attention_heads, attention_head_size)
    x = x.view(*new_x_shape)
    return x.permute(0, 2, 1, 3)

query_layer = transpose_for_scores(mixed_query_layer)
key_layer = transpose_for_scores(mixed_key_layer)
value_layer = transpose_for_scores(mixed_value_layer)
print('query key value for multi-heads :', query_layer.shape)

hidden_states : torch.Size([64, 512, 768])

mixed query key value : torch.Size([64, 512, 768])
query key value for multi-heads : torch.Size([64, 12, 512, 64])


In [22]:
# Attention scores
attention_scores = torch.matmul(
    query_layer, key_layer.transpose(-1, -2))  # [64, 12, 512, 64] x [64, 12, 64, 512]
attention_scores = attention_scores / \
    math.sqrt(attention_head_size)
print('attention_scores :', attention_scores.shape)

# Attention dist (정규화)
attention_probs = nn.Softmax(dim=-1)(attention_scores)
print('attention_probs :', attention_probs.shape)

# Attention map
context_layer = torch.matmul(attention_probs, value_layer)  # [64, 12, 512, 512] x [64, 12, 512, 64]
print('context_layer :', context_layer.shape)

attention_scores : torch.Size([64, 12, 512, 512])
attention_probs : torch.Size([64, 12, 512, 512])
context_layer : torch.Size([64, 12, 512, 64])


In [23]:
# Multi-Head-Attention의 텐서 형식을 원래대로
context_layer = context_layer.permute(0, 2, 1, 3).contiguous()  # [64, 512, 12, 64]
new_context_layer_shape = context_layer.size()[:-2] + (all_head_size, )  # [64, 512, 768]
context_layer = context_layer.view(*new_context_layer_shape)
print('context_layer :', context_layer.shape)  # [64, 512, 768]

context_layer : torch.Size([64, 512, 768])


#### 3.2.2 Process of BertAttention

In [24]:
selfattn = BertSelfAttention(config)
selfattn

BertSelfAttention(
  (query): Linear(in_features=768, out_features=768, bias=True)
  (key): Linear(in_features=768, out_features=768, bias=True)
  (value): Linear(in_features=768, out_features=768, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [25]:
output = BertSelfOutput(config)
output

BertSelfOutput(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (LayerNorm): BertLayerNorm()
  (dropout): Dropout(p=0.1, inplace=False)
)

In [26]:
input_tensor = context_layer

attention_mask = torch.randn([64, 12, 512, 512])

print('input_tensor :', input_tensor.shape)
print('attention_mask :', attention_mask.shape)

input_tensor : torch.Size([64, 512, 768])
attention_mask : torch.Size([64, 12, 512, 512])


In [27]:
self_output, attention_probs = selfattn(input_tensor, attention_mask, attention_show_fig=True)
print('self_output :', self_output.shape)
print('attention_probs :', attention_probs.shape)

attention_output = output(self_output, input_tensor)
print('attention_output :', attention_output.shape)

self_output : torch.Size([64, 512, 768])
attention_probs : torch.Size([64, 12, 512, 512])
attention_output : torch.Size([64, 512, 768])


### 3.3 BertIntermediate

BERT TransformerBlock 모듈의 Feed-Forward

In [28]:
def gelu(x):
    '''Gaussian Error Linear Unit
    '''
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))

In [29]:
class BertIntermediate(nn.Module):

    def __init__(self, config):
        super(BertIntermediate, self).__init__()
        
        # F.C layer
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)  # nn.Linear(768, 3072)
        # GeLU
        self.intermediate_act_fn = gelu

    def forward(self, hidden_states):
        '''
        hidden_states:  BertAttention 출력 - [64, 512, 768]
        '''
        hidden_states = self.dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)  # GELU에 의한 활성화
        
        return hidden_states  # [64, 512, 3072]

### 3.4 BertOutput

BERT TransformerBlock 모듈의 Feed-Forward

In [30]:
class BertOutput(nn.Module):

    def __init__(self, config):
        super(BertOutput, self).__init__()

        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)  # nn.Linear(3072, 768)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)  # 0.1

    def forward(self, hidden_states, input_tensor):
        '''
        hidden_states:  BertIntermediate 출력 - [64, 512, 3072]
        input_tensor: BertAttention의 출력 - [64, 512, 768]
        '''
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.LayerNorm(hidden_states + input_tensor)

        return hidden_states  # [64, 512, 768]

### 3.5 BertLayer

In [31]:
class BertLayer(nn.Module):

    def __init__(self, config):
        super(BertLayer, self).__init__()

        # Self-Attention
        self.attention = BertAttention(config)

        # Self-Attention의 출력을 처리하는 F.C layer
        self.intermediate = BertIntermediate(config)

        # Self-Attention의 출력과 BertLayer에 원래의 입력을 더하는 layer
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask, attention_show_fig=False):
        '''
        hidden_states: Embedder 모듈의 출력 텐서 [batch_size, seq_len, hidden_size]
        attention_mask: masking
        attention_show_flg: Self-Attention의 가중치를 반환할지의 플래그
        '''
        if attention_show_fig == True:
            attention_output, attention_probs = self.attention(
                hidden_states, attention_mask, attention_show_fig)
            intermediate_output = self.intermediate(attention_output)
            layer_output = self.output(intermediate_output, attention_output)
            
            return layer_output, attention_probs

        elif attention_show_fig == False:
            attention_output = self.attention(
                hidden_states, attention_mask, attention_show_fig)
            intermediate_output = self.intermediate(attention_output)
            layer_output = self.output(intermediate_output, attention_output)

            return layer_output  # [batch_size, seq_length, hidden_size]

In [32]:
BertLayer(config)

BertLayer(
  (attention): BertAttention(
    (selfattn): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

### 4. BertEncoder

BertLayer 모듈의 반복

In [33]:
nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

ModuleList(
  (0): BertLayer(
    (attention): BertAttention(
      (selfattn): BertSelfAttention(
        (query): Linear(in_features=768, out_features=768, bias=True)
        (key): Linear(in_features=768, out_features=768, bias=True)
        (value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (output): BertSelfOutput(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (LayerNorm): BertLayerNorm()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (intermediate): BertIntermediate(
      (dense): Linear(in_features=768, out_features=3072, bias=True)
    )
    (output): BertOutput(
      (dense): Linear(in_features=3072, out_features=768, bias=True)
      (LayerNorm): BertLayerNorm()
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (1): BertLayer(
    (attention): BertAttention(
      (selfattn): BertSelfAttention(
        (query): Linear(in_features=768, out_fe

In [34]:
class BertEncoder(nn.Module):
    def __init__(self, config):
        super(BertEncoder, self).__init__()

        # 12개의 BertLayer 모듈 생성
        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

    def forward(self, hidden_states, attention_mask, output_all_encoded_layers=True, attention_show_fig=False):
        '''
        hidden_states: Embeddings 모듈 출력
        attention_mask: masking
        output_all_encoded_layers: 모든 TransformerBlock 모듈의 출력을 반환할 지, 최종 층의 출력만 반환할 지
        attention_show_flg: Self-Attention의 가중치를 반환할지의 플래그
        '''
        # 반환 값 리스트
        all_encoder_layers = []

        for layer_module in self.layer:
            if attention_show_fig == True:
                hidden_states, attention_probs = layer_module(
                    hidden_states, attention_mask, attention_show_fig)
                
            elif attention_show_fig == False:
                hidden_states = layer_module(
                    hidden_states, attention_mask, attention_show_fig)
                
            if output_all_encoded_layers:  # 중간 layer 출력들도 저장
                all_encoder_layers.append(hidden_states)

        if not output_all_encoded_layers:  # 마지막 layer의 출력만 저장
            all_encoder_layers.append(hidden_states)

        if attention_show_fig == True:
            return all_encoder_layers, attention_probs

        elif attention_show_fig == False:
            return all_encoder_layers

### 5. BertPooler

In [35]:
class BertPooler(nn.Module):
    '''입력 문장의 첫번째 단어 [cls] 토큰의 feature 반환, 유지'''
    def __init__(self, config):
        super(BertPooler, self).__init__()

        self.dense = nn.Linear(config.hidden_size, config.hidden_size)  # nn.Linear(768, 768)
        self.activation = nn.Tanh()

    def forward(self, hidden_states):
        # [CLS] 토큰의 feature 획득
        first_token_tensor = hidden_states[:, 0]
      
        pooled_output = self.dense(first_token_tensor)
        pooled_output = self.activation(pooled_output)

        return pooled_output

#### 5.1 동작 확인

In [42]:
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
print("입력 단어 ID열의 텐서 크기: ", input_ids.shape, '- [batch, seq_length]')  

# 마스크
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
print("입력 마스크의 텐서 크기: ", attention_mask.shape, '- [batch, seq_length]')

extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # 해당 위치에 새로운 차원이 추가됨
extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
print("확장된 마스크의 텐서 크기: ", extended_attention_mask.shape, '- [batch, 1, 1, seq_length]')

# 문장의 ID. 두 미니 배치 각각에 대해, 0은 첫번째 문장을, 1은 2번째 문장을 나타냄
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])
print("입력 문장 ID의 텐서 크기: ", token_type_ids.shape, '- [batch, seq_length]')

입력 단어 ID열의 텐서 크기:  torch.Size([2, 5]) - [batch, seq_length]
입력 마스크의 텐서 크기:  torch.Size([2, 5]) - [batch, seq_length]
확장된 마스크의 텐서 크기:  torch.Size([2, 1, 1, 5]) - [batch, 1, 1, seq_length]
입력 문장 ID의 텐서 크기:  torch.Size([2, 5]) - [batch, seq_length]


In [37]:
extended_attention_mask

tensor([[[[    -0.,     -0.,     -0.,     -0.,     -0.]]],


        [[[    -0.,     -0.,     -0., -10000., -10000.]]]])

In [38]:
embeddings = BertEmbeddings(config)
encoder = BertEncoder(config)
pooler = BertPooler(config)

In [51]:
out1 = embeddings(input_ids, token_type_ids)
print("BertEmbeddings의 출력 텐서 크기: ", out1.shape, '- [batch, seq_length, hidden]')
print()

out2 = encoder(out1, extended_attention_mask)  # default : Encoder를 구성하는 12개 BertLayer의 모든 hidden states 반환
print("BertEncoder 최종 layer의 출력 텐서 크기: ", len(out2), '- # of Attention Heads')
print()

out3 = pooler(out2[-1])  # Encoder를 구성하는 마지막 BertLayer의 출력
print("BertPooler의 입력 텐서 크기: ", out2[-1].shape)
print("[CLS] 토큰에 해당하는 텐서 크기: ", out2[-1][:, 0].shape)
print("BertPooler의 출력 텐서 크기: ", out3.shape, '- [batch, hidden]')

BertEmbeddings의 출력 텐서 크기:  torch.Size([2, 5, 768]) - [batch, seq_length, hidden]

BertEncoder 최종 layer의 출력 텐서 크기:  12 - # of Attention Heads

BertPooler의 입력 텐서 크기:  torch.Size([2, 5, 768])
[CLS] 토큰에 해당하는 텐서 크기:  torch.Size([2, 768])
BertPooler의 출력 텐서 크기:  torch.Size([2, 768]) - [batch, hidden]


### 6. BERT 모델

구현한 모듈을 전부 연결

In [52]:
class BertModel(nn.Module):

    def __init__(self, config):
        super(BertModel, self).__init__()
        
        # 위에서 구현한 모듈 작성
        self.embeddings = BertEmbeddings(config)  # 입력 문장 처리
        self.encoder = BertEncoder(config)  # BertLayer 12개로 구성
        self.pooler = BertPooler(config)  # [CLS]토큰 처리
    
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, output_all_encoded_layers=True, attention_show_fig=False):
        '''
        input_ids:  [batch_size, seq_length] 문장의 단어 ID 나열 - [64, 512]
        token_type_ids:  [batch_size, seq_length] 각 단어가 1번째 문장인지, 2번째 문장인지를 나타내는 id - [64, 512]
        attention_mask: masking - [64, 512]
        output_all_encoded_layers: BertEncoder를 구성하는 모든 BertLayer의 출력을 반환할 지, 최종 층의 출력만 반환할 지
        attention_show_flg: Self-Attention의 가중치를 반환할지의 플래그
        '''
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = torch.zeros_like(input_ids)

        # mask 변형 : [batch, seq_length] -> [batch, 1, 1, seq_length] (Multi-Head-Attention용으로 사용 가능하도록)
        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
        extended_attention_mask = extended_attention_mask.to(dtype=torch.float32)
        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0

        # Forward
        # 1. BertEmbeddings
        embedding_output = self.embeddings(input_ids, token_type_ids)
        
        # 2. BertEncoder (12개의 BertLayer로 구성)
        if attention_show_fig == True:  # 12개 layer의 모든 hidden_state 출력 반환
            encoded_layers, attention_probs = self.encoder(embedding_output,
                                                           extended_attention_mask,
                                                           output_all_encoded_layers, attention_show_fig)
        elif attention_show_fig == False:  # 마지막 12번 째 layer의 hidden_state 출력만을 반환
            encoded_layers = self.encoder(embedding_output,
                                          extended_attention_mask,
                                          output_all_encoded_layers, attention_show_fig)
            
        # 3. BertPooler
        pooled_output = self.pooler(encoded_layers[-1])  # Encoder의 마지막 BertLayer출력 이용

        if not output_all_encoded_layers:
            encoded_layers = encoded_layers[-1]

        if attention_show_fig == True:
            return encoded_layers, pooled_output, attention_probs

        elif attention_show_fig == False:
            return encoded_layers, pooled_output

#### 6.1 동작 확인

In [56]:
input_ids = torch.LongTensor([[31, 51, 12, 23, 99], [15, 5, 1, 0, 0]])
attention_mask = torch.LongTensor([[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]])
token_type_ids = torch.LongTensor([[0, 0, 1, 1, 1], [0, 1, 1, 1, 1]])

# BERT model
net = BertModel(config)
net

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (selfattn): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense):

In [62]:
# forward
encoded_layers, pooled_output, attention_probs = net(
    input_ids, token_type_ids, attention_mask, output_all_encoded_layers=False, attention_show_fig=True)  # 최종 BertLayer의 출력만

print("encoded_layers의 텐서 크기: ", encoded_layers.shape, '- [batch, seq_length, hidden]')
print("pooled_output의 텐서 크기: ", pooled_output.shape, '- [batch, hidden]')
print("attention_probs의 텐서 크기: ", attention_probs.shape, '- [batch, # of heads, seq_length, seq_length]')  # 각 head 별로 문장은 구성하는 5개 단어들간의 self-attention dist

encoded_layers의 텐서 크기:  torch.Size([2, 5, 768]) - [batch, seq_length, hidden]
pooled_output의 텐서 크기:  torch.Size([2, 768]) - [batch, hidden]
attention_probs의 텐서 크기:  torch.Size([2, 12, 5, 5]) - [batch, # of heads, seq_length, seq_length]


### 7. Application


In [63]:
# 학습된 모델 load
weights_path = "./weights/pytorch_model.bin"
loaded_state_dict = torch.load(weights_path)

for state in loaded_state_dict.keys():
    print(state)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.gamma
bert.embeddings.LayerNorm.beta
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.gamma
bert.encoder.layer.0.attention.output.LayerNorm.beta
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.gamma
bert.encoder.layer.0.output.LayerNorm.beta
bert.encoder.layer.1.attention.self.query.weight
bert.encode

In [66]:
net = BertModel(config)
net.eval()  # inference

param_names = [] 
for name, param in net.named_parameters():
    print(name)  # 사전 학습된 모델의 state_dict와 이름이 다른 것을 확인
    param_names.append(name)

embeddings.word_embeddings.weight
embeddings.position_embeddings.weight
embeddings.token_type_embeddings.weight
embeddings.LayerNorm.gamma
embeddings.LayerNorm.beta
encoder.layer.0.attention.selfattn.query.weight
encoder.layer.0.attention.selfattn.query.bias
encoder.layer.0.attention.selfattn.key.weight
encoder.layer.0.attention.selfattn.key.bias
encoder.layer.0.attention.selfattn.value.weight
encoder.layer.0.attention.selfattn.value.bias
encoder.layer.0.attention.output.dense.weight
encoder.layer.0.attention.output.dense.bias
encoder.layer.0.attention.output.LayerNorm.gamma
encoder.layer.0.attention.output.LayerNorm.beta
encoder.layer.0.intermediate.dense.weight
encoder.layer.0.intermediate.dense.bias
encoder.layer.0.output.dense.weight
encoder.layer.0.output.dense.bias
encoder.layer.0.output.LayerNorm.gamma
encoder.layer.0.output.LayerNorm.beta
encoder.layer.1.attention.selfattn.query.weight
encoder.layer.1.attention.selfattn.query.bias
encoder.layer.1.attention.selfattn.key.weight
e

In [68]:
# 새로운 state_dict 구성
# loaded_state_dict와 param_names는 파라미터들의 이름은 다르지만, 동일한 순서로 대응됨
new_state_dict = net.state_dict().copy()
for index, (key_name, value) in enumerate(loaded_state_dict.items()):
    name = param_names[index]  # 현재 네트워크의 파라미터명을 취득
    new_state_dict[name] = value  # 파라미터 값 삽입
    print(str(key_name)+" → "+str(name)) 

    if index+1 >= len(param_names):
        break

bert.embeddings.word_embeddings.weight → embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight → embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight → embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.gamma → embeddings.LayerNorm.gamma
bert.embeddings.LayerNorm.beta → embeddings.LayerNorm.beta
bert.encoder.layer.0.attention.self.query.weight → encoder.layer.0.attention.selfattn.query.weight
bert.encoder.layer.0.attention.self.query.bias → encoder.layer.0.attention.selfattn.query.bias
bert.encoder.layer.0.attention.self.key.weight → encoder.layer.0.attention.selfattn.key.weight
bert.encoder.layer.0.attention.self.key.bias → encoder.layer.0.attention.selfattn.key.bias
bert.encoder.layer.0.attention.self.value.weight → encoder.layer.0.attention.selfattn.value.weight
bert.encoder.layer.0.attention.self.value.bias → encoder.layer.0.attention.selfattn.value.bias
bert.encoder.layer.0.attention.output.dense.weight → encoder.

In [69]:
# 새로운 state_dict를 BERT 모델에 제공
net.load_state_dict(new_state_dict)

<All keys matched successfully>

#### 7.1 BERT용 Tokenizer

In [70]:
import collections

def load_vocab(vocab_file):
    """text 형식의 vocab 파일의 내용을 사전에 저장"""
    
    vocab = collections.OrderedDict()  # (단어, id) 순서의 사전 변수 (ordered_dict)
    ids_to_tokens = collections.OrderedDict()  # (id, 단어) 순서의 사전 변수
    
    index = 0
    with open(vocab_file, "r", encoding="utf-8") as reader:
        while True:
            token = reader.readline()
            if not token:
                break
            token = token.strip()

            # 저장
            vocab[token] = index
            ids_to_tokens[index] = token
            index += 1

    return vocab, ids_to_tokens

In [71]:
vocab_file = "./vocab/bert-base-uncased-vocab.txt"
vocab, ids_to_tokens = load_vocab(vocab_file)

In [None]:
#vocab

In [None]:
#ids_to_tokens

#### 7.2 단어 Bank의 문맥에 따른 의미 변화를 벡터 표현으로 계산

In [72]:
! git clone https://github.com/gymoon10/utils.git

Cloning into 'utils'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 47 (delta 8), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (47/47), done.


In [73]:
from utils.tokenizer import BasicTokenizer, WordpieceTokenizer

# BasicTokenizer, WordpieceTokenizer는, 참고 문헌[2] 그대로입니다
# https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/tokenization.py
# sub-word로 단어 분할을 실시하는 클래스들입니다.
class BertTokenizer(object):
    '''BERT용의 문장 단어 분할 클래스를 구현'''

    def __init__(self, vocab_file, do_lower_case=True):
        '''
        vocab_file: vocabulary에의 경로
        do_lower_case: 전처리에서 단어를 소문자로 바꾸는지 여부
        '''

        # vocabulary의 로드
        self.vocab, self.ids_to_tokens = load_vocab(vocab_file)

        # 분할 처리 함수를 "utils" 폴더에서 imoprt, sub-word로 단어 분할을 실시
        never_split = ("[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]")
        # (주석)위 단어는 도중에 분할하지 않는다. 이를 통해 하나의 단어로 간주함

        self.basic_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
                                              never_split=never_split)
        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab)

    def tokenize(self, text):
        '''문장의 단어를 분할하는 함수'''
        split_tokens = []  # 분할 후 단어들
        for token in self.basic_tokenizer.tokenize(text):
            for sub_token in self.wordpiece_tokenizer.tokenize(token):
                split_tokens.append(sub_token)
        return split_tokens

    def convert_tokens_to_ids(self, tokens):
        """분할된 단어 목록을 ID로 변환하는 함수"""
        ids = []
        for token in tokens:
            ids.append(self.vocab[token])

        return ids

    def convert_ids_to_tokens(self, ids):
        """ID를 단어로 변환하는 함수"""
        tokens = []
        for i in ids:
            tokens.append(self.ids_to_tokens[i])
        return tokens

In [74]:
# seq_length=8
text_1 = "[CLS] I accessed the bank account. [SEP]"
text_2 = "[CLS] He transferred the deposit money into the bank account. [SEP]"
text_3 = "[CLS] We play soccer at the bank of the river. [SEP]"

# Load Tokenizer
tokenizer = BertTokenizer(
    vocab_file="./vocab/bert-base-uncased-vocab.txt", do_lower_case=True)

# Tokenization
tokenized_text_1 = tokenizer.tokenize(text_1)
tokenized_text_2 = tokenizer.tokenize(text_2)
tokenized_text_3 = tokenizer.tokenize(text_3)

# 확인
print(tokenized_text_1)

['[CLS]', 'i', 'accessed', 'the', 'bank', 'account', '.', '[SEP]']


In [79]:
indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
indexed_tokens_3 = tokenizer.convert_tokens_to_ids(tokenized_text_3)

# 각 문장에서 bank의 위치
bank_posi_1 = np.where(np.array(tokenized_text_1) == "bank")[0][0]  # 4
bank_posi_2 = np.where(np.array(tokenized_text_2) == "bank")[0][0]  # 8
bank_posi_3 = np.where(np.array(tokenized_text_3) == "bank")[0][0]  # 6

# list -> tensor
tokens_tensor_1 = torch.tensor([indexed_tokens_1])
tokens_tensor_2 = torch.tensor([indexed_tokens_2])
tokens_tensor_3 = torch.tensor([indexed_tokens_3])

# bank의 단어 id
bank_word_id = tokenizer.convert_tokens_to_ids(["bank"])[0]

# 확인
print(tokens_tensor_1)
print(tokens_tensor_2)

tensor([[  101,  1045, 11570,  1996,  2924,  4070,  1012,   102]])
tensor([[  101,  2002,  4015,  1996, 12816,  2769,  2046,  1996,  2924,  4070,
          1012,   102]])


In [76]:
# 문장을 BERT로 처리
with torch.no_grad():  # inference
    encoded_layers_1, _ = net(tokens_tensor_1, output_all_encoded_layers=True)
    encoded_layers_2, _ = net(tokens_tensor_2, output_all_encoded_layers=True)
    encoded_layers_3, _ = net(tokens_tensor_3, output_all_encoded_layers=True)

In [91]:
print(len(encoded_layers_1))
print(encoded_layers_1[0].shape)  # [batch, seq_length, hidden]

12
torch.Size([1, 8, 768])


In [77]:
bank_vector_0 = net.embeddings.word_embeddings.weight[bank_word_id]

# 문장1의 BertEncoder의 첫 번째 BertLayer의 출력
bank_vector_1_1 = encoded_layers_1[0][0, bank_posi_1]

# 문장1의 BertEncoder의 최종 12 번째 BertLayer의 출력
bank_vector_1_12 = encoded_layers_1[11][0, bank_posi_1]

# 문장2, 3도 마찬가지로 적용
bank_vector_2_1 = encoded_layers_2[0][0, bank_posi_2]
bank_vector_2_12 = encoded_layers_2[11][0, bank_posi_2]
bank_vector_3_1 = encoded_layers_3[0][0, bank_posi_3]
bank_vector_3_12 = encoded_layers_3[11][0, bank_posi_3]

In [85]:
print(bank_vector_1_12.shape)

torch.Size([768])


In [78]:
import torch.nn.functional as F

print("bank의 초기 벡터와 문장1의 1단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_0, bank_vector_1_1, dim=0))
print("bank의 초기 벡터와 문장1의 12단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_0, bank_vector_1_12, dim=0))

print("문장1의 1층 bank와 문장2의 1단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_1_1, bank_vector_2_1, dim=0))
print("문장1의 1층 bank와 문장3의 1단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_1_1, bank_vector_3_1, dim=0))

print("문장1의 12층 bank와 문장2의 12단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_1_12, bank_vector_2_12, dim=0))
print("문장1의 12층 bank와 문장3의 12단 bank의 유사도: ",
      F.cosine_similarity(bank_vector_1_12, bank_vector_3_12, dim=0))

bank의 초기 벡터와 문장1의 1단 bank의 유사도:  tensor(0.6814, grad_fn=<DivBackward0>)
bank의 초기 벡터와 문장1의 12단 bank의 유사도:  tensor(0.2276, grad_fn=<DivBackward0>)
문장1의 1층 bank와 문장2의 1단 bank의 유사도:  tensor(0.8968)
문장1의 1층 bank와 문장3의 1단 bank의 유사도:  tensor(0.7584)
문장1의 12층 bank와 문장2의 12단 bank의 유사도:  tensor(0.8796)
문장1의 12층 bank와 문장3의 12단 bank의 유사도:  tensor(0.4814)
