### 0. Setting

BERT.ipynb와 동일

In [8]:
import os
import urllib.request
import zipfile
import tarfile
import glob
import io

In [4]:
# Working directories. os.makedirs(..., exist_ok=True) creates the directory
# only when it is missing, without a separate check-then-create step.
data_dir = "./data/"
os.makedirs(data_dir, exist_ok=True)

# vocab
vocab_dir = "./vocab/"
os.makedirs(vocab_dir, exist_ok=True)

# Download the bert-base-uncased vocabulary file.
save_path = "./vocab/bert-base-uncased-vocab.txt"
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
urllib.request.urlretrieve(url, save_path)


# weights
weights_dir = "./weights/"
os.makedirs(weights_dir, exist_ok=True)

# Download the bert-base-uncased weight archive.
save_path = "./weights/bert-base-uncased.tar.gz"
url = "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz"
urllib.request.urlretrieve(url, save_path)

# Unpack the archive into ./weights/. The context manager closes the tar
# handle even if extraction raises.
# NOTE(review): extractall() trusts the member paths stored in the archive;
# only use it on archives from a trusted source.
archive_file = "./weights/bert-base-uncased.tar.gz"
with tarfile.open(archive_file, 'r:gz') as tar:
    tar.extractall('./weights/')

In [5]:
import json

# Config: path of the JSON configuration file under ./weights/
config_file = "./weights/bert_config.json"

# Parse the JSON into a plain dict; the with-block closes the file handle
# as soon as parsing finishes (or fails).
with open(config_file, 'r') as json_file:
    config = json.load(json_file)

# Bare expression so the notebook displays the parsed configuration.
config

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [6]:
class BertLayerNorm(nn.Module):
    """Layer normalization over the last dimension with a learnable scale and shift.

    Args:
        hidden_size: size of the last (feature) dimension.
        eps: constant added to the variance before taking the square root.
    """

    def __init__(self, hidden_size, eps=1e-12):
        super().__init__()
        # Per-feature scale starts at 1 and shift starts at 0.
        self.gamma = nn.Parameter(torch.ones(hidden_size))
        self.beta = nn.Parameter(torch.zeros(hidden_size))
        self.variance_epsilon = eps

    def forward(self, x):
        """Normalize ``x`` along its last dimension, then apply gamma and beta."""
        mean = x.mean(-1, keepdim=True)
        centered = x - mean
        # Mean of squared deviations along the last dimension.
        variance = centered.pow(2).mean(-1, keepdim=True)
        normalized = centered / torch.sqrt(variance + self.variance_epsilon)

        return self.gamma * normalized + self.beta

In [7]:
def gelu(x):
    """Return the erf-based GELU of tensor ``x``: x * 0.5 * (1 + erf(x / sqrt(2)))."""
    half_x = x * 0.5
    erf_term = torch.erf(x / math.sqrt(2.0))
    return half_x * (1.0 + erf_term)

In [1]:
import math
import numpy as np

import torch
from torch import nn

### 1. 사전 학습 과제 - Masked Language Model                  

In [9]:
class MaskedWordPredictions(nn.Module):
    """Masked Language Model head: scores every vocabulary entry at each position.

    Args:
        config: object exposing ``hidden_size`` and ``vocab_size`` attributes
            (also passed on to BertPredictionHeadTransform).
    """

    def __init__(self, config):
        super().__init__()

        # Module that transforms the BERT output before decoding.
        self.transform = BertPredictionHeadTransform(config)

        # Fully connected layer projecting each position from hidden_size to
        # vocab_size. It is built without a bias; the bias is kept as a
        # separate zero-initialised parameter and added in forward().
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))

    def forward(self, hidden_states):
        """Return vocabulary scores for each position.

        hidden_states: output of the BERT model, [batch_size, seq_len, hidden_size]
        returns: [batch_size, seq_len, vocab_size]
        """
        transformed = self.transform(hidden_states)

        # Classify each position over the vocabulary.
        vocab_scores = self.decoder(transformed) + self.bias

        return vocab_scores

class BertPredictionHeadTransform(nn.Module):
    """Transform the BERT output inside MaskedWordPredictions (input and output sizes match).

    Args:
        config: object exposing a ``hidden_size`` attribute.
    """

    def __init__(self, config):
        super().__init__()

        hidden = config.hidden_size
        # Fully connected layer, hidden_size -> hidden_size.
        self.dense = nn.Linear(hidden, hidden)
        # GELU activation.
        self.transform_act_fn = gelu
        # Layer normalization.
        self.LayerNorm = BertLayerNorm(hidden, eps=1e-12)

    def forward(self, hidden_states):
        """Apply dense -> GELU -> LayerNorm and return the result."""
        return self.LayerNorm(self.transform_act_fn(self.dense(hidden_states)))

### 2. 사전 학습 과제 - Next Sentence Prediction

NSP 모듈은 BERT.ipynb의 5. BertPooler의 출력, 즉 입력 문장의 첫 번째 토큰인 [CLS] 토큰의 벡터 표현([batch_size, hidden_size])을 입력으로 받는다.

In [10]:
class SeqRelationship(nn.Module):
    """Next Sentence Prediction head: a single linear layer over the pooled output.

    Args:
        config: object exposing a ``hidden_size`` attribute.
        out_features: number of output classes.
    """

    def __init__(self, config, out_features):
        super().__init__()
        # hidden_size -> out_features; scores whether the second sentence
        # follows the first or not.
        self.seq_relationship = nn.Linear(
            in_features=config.hidden_size,
            out_features=out_features,
        )

    def forward(self, pooled_output):
        """Return the class scores for ``pooled_output``."""
        scores = self.seq_relationship(pooled_output)
        return scores

### 3. BertPreTrainingHeads

BERT의 사전 학습 과제 1, 2를 수행하는 어댑터 모듈

In [11]:
class BertPreTrainingHeads(nn.Module):
    """Adapter bundling the two pre-training heads (MLM and NSP).

    Args:
        config: configuration object forwarded to both heads.
    """

    def __init__(self, config):
        super().__init__()

        # Pre-training task 1 (MLM): which vocabulary entry each position is.
        self.predictions = MaskedWordPredictions(config)

        # Pre-training task 2 (NSP): two classes, next sentence or not.
        self.seq_relationship = SeqRelationship(config, out_features=2)

    def forward(self, sequence_output, pooled_output):
        """Run both heads.

        sequence_output: [batch_size, seq_len, hidden_size]
        pooled_output: [batch_size, hidden_size], the feature of the [CLS] token
        returns: (prediction_scores [batch_size, seq_len, vocab_size],
                  seq_relationship_score [batch_size, 2])
        """
        # MLM scores first, then the NSP score computed from the [CLS] feature.
        return (
            self.predictions(sequence_output),
            self.seq_relationship(pooled_output),
        )

### 4. BERT 모델에 연결

BERT 모델에 사전 학습 과제용 어댑터 모듈 BertPreTrainingHeads 연결

In [12]:
class BertForMaskedLM(nn.Module):
    """BERT model with the pre-training heads (BertPreTrainingHeads) attached.

    Args:
        config: configuration object forwarded to BertPreTrainingHeads.
        net_bert: the BERT model instance to wrap.
    """

    def __init__(self, config, net_bert):
        super().__init__()

        # Wrapped BERT model.
        self.bert = net_bert

        # Adapter module for the pre-training tasks.
        self.cls = BertPreTrainingHeads(config)

    def forward(self, input_ids, token_type_ids=None, attention_mask=None):
        """Run BERT, then both pre-training heads.

        input_ids: [batch_size, seq_length] word IDs of the input
        token_type_ids: [batch_size, seq_length] ID marking whether each word
            belongs to the first or the second sentence
        attention_mask: masking
        returns: (prediction_scores, seq_relationship_score)
        """
        # Per the original author's note, this returns the hidden state of the
        # last (12th) layer and the hidden state of the [CLS] token.
        sequence_output, pooled_output = self.bert(
            input_ids,
            token_type_ids,
            attention_mask,
            output_all_encoded_layers=False,
            attention_show_flg=False,
        )

        # Run the pre-training tasks on the BERT outputs.
        mlm_scores, nsp_scores = self.cls(sequence_output, pooled_output)

        return mlm_scores, nsp_scores

### 5. 학습된 BERT 모델 load

In [13]:
! git clone https://github.com/gymoon10/utils.git

Cloning into 'utils'...
remote: Enumerating objects: 47, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (45/45), done.[K
remote: Total 47 (delta 8), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (47/47), done.


In [15]:
! pip install attrdict

Collecting attrdict
  Downloading attrdict-2.0.1-py2.py3-none-any.whl (9.9 kB)
Installing collected packages: attrdict
Successfully installed attrdict-2.0.1


In [25]:
from utils.bert import get_config, BertModel, BertTokenizer

In [22]:
# Reload the configuration through utils.bert.get_config. The displayed result
# is an AttrDict, and the model classes in this notebook read its fields as
# attributes (e.g. config.hidden_size).
config = get_config("./weights/bert_config.json")
config

AttrDict({'attention_probs_dropout_prob': 0.1, 'hidden_act': 'gelu', 'hidden_dropout_prob': 0.1, 'hidden_size': 768, 'initializer_range': 0.02, 'intermediate_size': 3072, 'max_position_embeddings': 512, 'num_attention_heads': 12, 'num_hidden_layers': 12, 'type_vocab_size': 2, 'vocab_size': 30522})

In [24]:
# BERT model
net_bert = BertModel(config)
net_bert.eval()

# Attach the pre-training adapter module to the BERT model
net = BertForMaskedLM(config, net_bert)
net.eval()

# Load the trained weights.
# map_location="cpu" makes the load independent of the device the checkpoint
# was saved from; load_state_dict() then copies the values into net's tensors.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
weights_path = "./weights/pytorch_model.bin"
loaded_state_dict = torch.load(weights_path, map_location="cpu")

# Checkpoint entries are matched to this model's parameters by position, not
# by key name (presumably because the key names differ; verify the ordering
# if either side changes).
param_names = [name for name, _ in net.named_parameters()]

new_state_dict = net.state_dict().copy()
# zip stops at the shorter of the two sequences, which is what the manual
# index check and break did before.
for name, (key_name, value) in zip(param_names, loaded_state_dict.items()):
    new_state_dict[name] = value
    # print(str(key_name)+"→"+str(name))

# Give the new state_dict to the model
net.load_state_dict(new_state_dict)

<All keys matched successfully>

### 6. MLM 수행

In [26]:
# Two-sentence input that already contains the [CLS] and [SEP] markers; the
# word "bank" appears in both sentences.
text = "[CLS] I accessed the bank account. [SEP] We play soccer at the bank of the river. [SEP]"

# Tokenizer built from the downloaded vocabulary file
tokenizer = BertTokenizer(
    vocab_file="./vocab/bert-base-uncased-vocab.txt", do_lower_case=True)

# Tokenize: split the text into a list of token strings
tokenized_text = tokenizer.tokenize(text)

print(tokenized_text)

['[CLS]', 'i', 'accessed', 'the', 'bank', 'account', '.', '[SEP]', 'we', 'play', 'soccer', 'at', 'the', 'bank', 'of', 'the', 'river', '.', '[SEP]']


In [27]:
# Masking: replace the token at index 13 (the 'bank' of the second sentence
# in the token list printed by the previous cell) with [MASK]
masked_index = 13
tokenized_text[masked_index] = '[MASK]'

print(tokenized_text)

['[CLS]', 'i', 'accessed', 'the', 'bank', 'account', '.', '[SEP]', 'we', 'play', 'soccer', 'at', 'the', '[MASK]', 'of', 'the', 'river', '.', '[SEP]']


In [28]:
# Convert each token string to its vocabulary ID
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
print(indexed_tokens)

[101, 1045, 11570, 1996, 2924, 4070, 1012, 102, 2057, 2377, 4715, 2012, 1996, 103, 1997, 1996, 2314, 1012, 102]


In [29]:
def seq2id(indexed_tokens, sep_id=102):
    """Map a sequence of word IDs to sentence (segment) IDs, split at [SEP].

    Every position receives 0 until the first separator has been seen and 1
    afterwards. The separator itself gets the ID of the sentence it ends.

    Args:
        indexed_tokens: iterable of word IDs.
        sep_id: word ID of the [SEP] token (102 in the vocabulary used in
            this notebook).

    Returns:
        A list of 0/1 segment IDs, one per input word ID.
    """
    segments_ids = []
    seq_id = 0

    for word_id in indexed_tokens:
        segments_ids.append(seq_id)  # append the current seq_id (0 or 1)

        # Once [SEP] is found the second sentence begins, so later IDs are 1.
        if word_id == sep_id:
            seq_id = 1

    return segments_ids

# Build the sentence (segment) IDs for the token IDs computed above
segments_ids = seq2id(indexed_tokens)
print(segments_ids)

[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [30]:
# Convert to tensors; wrapping each list in another list adds a leading
# batch dimension of size 1
tokens_tensor = torch.tensor([indexed_tokens])
segments_tensors = torch.tensor([segments_ids])

# Inference (gradient tracking disabled)
with torch.no_grad():
    prediction_scores, seq_relationship_score = net(tokens_tensor, segments_tensors)

# ID -> word: take the highest-scoring vocabulary ID at the masked position
# of the first (only) batch item and convert it back to a token string
predicted_index = torch.argmax(prediction_scores[0, masked_index]).item()
predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
print(predicted_token)

bank


### 7. NSP 수행

클래스 0 : 두 문장이 연속됨 (두 번째 문장이 첫 번째 문장 바로 다음에 이어지는 문장)

클래스 1 : 두 문장은 서로 관계가 없음

In [32]:
# text = "[CLS] I accessed the bank account. [SEP] We play soccer at the bank of the river. [SEP]"
print(seq_relationship_score)
# NOTE(review): sigmoid is applied to each of the two scores independently, so
# the printed values need not sum to 1. If normalized class probabilities are
# wanted, consider torch.softmax(seq_relationship_score, dim=-1) (to confirm).
print(torch.sigmoid(seq_relationship_score))  # class 1 scores higher (the two sentences are judged unrelated, as expected)

tensor([[-1.5349,  3.1654]])
tensor([[0.1773, 0.9595]])
