In [1]:
!pip install tokenizers

Collecting tokenizers
  Downloading tokenizers-0.11.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.8 MB)
[K     |████████████████████████████████| 6.8 MB 8.0 MB/s 
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.11.4


In [2]:
import json
import spacy
 
import torch
import torch.nn as nn
from torch.utils import data
from torch.utils.data import Dataset, DataLoader
import torchtext
from torch.utils.data.sampler import SubsetRandomSampler, Sampler


import sys

from tokenizers import Tokenizer
from tokenizers.models import BPE # byte-pair encoding
from tokenizers.trainers import BpeTrainer
from spacy.lang.en import English

from tqdm.auto import tqdm


import numpy as np
device = torch.device('cuda')

In [3]:
# file 1에서 정의한 dataset class
class TokenIndexer():
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
    
    def __call__(self, tokens):
        return [self.tokenizer.token_to_id(token) for token in tokens]

    def convert_ids_to_tokens(self, ids):
        return [self.tokenizer.id_to_token(id) for id in ids]


class SentencePairDataset(Dataset):
    def __init__(self):
        # hyper parameters
        self.max_len = 128
        self.mask_prob = 0.15
        self.max_pred = 20 

        labels = ("0", "1")

        # data 다운로드 및 전처리
        dataset = torchtext.datasets.WikiText2(root='.data', split='train')
        dataset = [d.strip() for d in dataset if len(d)>10] # 길이가 짧아 무의미한 데이터는 제외하고, strip을 통해 데이터를 정리해주겠습니다.
        
        
        # data 처리
        nlp = English()
        nlp.add_pipe(nlp.create_pipe('sentencizer'))
        doc = nlp(dataset[0])
        self.data=[]
        self.data_length = 0
        self.data_index = {}
        self.sentence_lengths = []
        for i,context in enumerate(tqdm(dataset)):
            doc = nlp(context)
            sentences = [sent.string.strip() for sent in doc.sents]
            if len(sentences)<=3: # can not obtain positive, negative pair in dataset
                continue
            self.data.append(sentences)
            for j in range(len(sentences)-1): # drop last sentence (no positive pair)
                self.data_index[j+self.data_length] = (len(self.data)-1,j)
            self.data_length += len(sentences)-1
            self.sentence_lengths.append(len(sentences))

        
        
        
        # tokenizer
        save_data = '\n'.join(['\n'.join(d) for d in self.data])
        with open('data.txt','w') as f:
          f.write(save_data)
        self.tokenizer = Tokenizer(BPE(unk_token="<unk>"))
        trainer = BpeTrainer(special_tokens=["[PAD]", "<unk>", "[CLS]", "[SEP]", "[MASK]"])
        files = ['data.txt']
        self.tokenizer.train(files, trainer)
        self.vocab_words = list(self.tokenizer.get_vocab().keys())
        self.token_indexer = TokenIndexer(self.tokenizer)


    def get_negative_sample(self, index):
        i, j = self.data_index[index]
        max_j = self.sentence_lengths[i]
        while True:
            random_j = np.random.randint(0,max_j)
            if random_j != (j+1) and random_j != j :
                break
        i, new_j = self.data_index[index+(random_j-j)]
        return self.data[i][new_j]

    def get_positive_sample(self, index):
        i, j = self.data_index[index]
        return self.data[i][j+1]
    
    
    def process_instance(self, instance):
        # -3  for special tokens [CLS], [SEP], [SEP]
        is_next, tokens_a, tokens_b = instance
        while len(tokens_a)+len(tokens_b) > self.max_len-3:
            if len(tokens_a)>len(tokens_b) : 
                tokens_a = tokens_a[:-1]
            else:
                tokens_b = tokens_b[:-1]
        
        tokens = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
        segment_ids = [0]*(len(tokens_a)+2) + [1]*(len(tokens_b)+1)
        input_mask = [1]*len(tokens)
        
        #masked LM
        masked_tokens, masked_pos = [], []
        n_pred = min(self.max_pred, max(1, int(round(len(tokens)*self.mask_prob))))


        candidate_position = [i for i, token in enumerate(tokens) if token != '[CLS]' and token != '[SEP]']
        np.random.shuffle(candidate_position)

        for pos in candidate_position[:n_pred]:
            masked_tokens.append(tokens[pos])
            masked_pos.append(pos)
            if np.random.random() < 0.8: # 80%
                tokens[pos] = '[MASK]'
            elif np.random.random() < 0.5: # 10%
                random_word = self.vocab_words[np.random.randint(0, len(self.vocab_words)-1)]
                tokens[pos] = random_word

        # when n_pred < max_pred, we only calculate loss within n_pred
        masked_weights = [1]*len(masked_tokens)

        # Token Indexing
        input_ids = self.token_indexer(tokens)
        masked_ids = self.token_indexer(masked_tokens)

        # Zero Padding
        n_pad = self.max_len - len(input_ids)
        input_ids.extend([0]*n_pad)
        segment_ids.extend([0]*n_pad)
        input_mask.extend([0]*n_pad)

        # Zero Padding for masked target
        if self.max_pred > n_pred:
            n_pad = self.max_pred - n_pred
            masked_ids.extend([0]*n_pad)
            masked_pos.extend([0]*n_pad)
            masked_weights.extend([0]*n_pad)
        
        
        return (input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next)
    
    

    def __getitem__(self, index):
        i,j = self.data_index[index]
        sentence_a = self.data[i][j]
        token_a = self.tokenizer.encode(sentence_a).tokens
        
        is_next = np.random.random()
        if is_next < 0.5:
            is_next = 0
            sentence_b = self.get_negative_sample(index)
        else:
            is_next = 1
            sentence_b = self.get_positive_sample(index)
        token_b = self.tokenizer.encode(sentence_b).tokens
            
        instance = (is_next, token_a, token_b)
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = self.process_instance(instance)
        
        input_ids = torch.tensor(input_ids).long()
        segment_ids = torch.tensor(segment_ids).long()
        input_mask = torch.tensor(input_mask).long()
        masked_ids = torch.tensor(masked_ids).long()
        masked_pos = torch.tensor(masked_pos).long()
        masked_weights = torch.tensor(masked_weights).long()
        is_next = torch.tensor(is_next).long()
        
        
        return input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next
    
    def __len__(self):
        return self.data_length

In [4]:
dataset = SentencePairDataset()

100%|██████████| 4.48M/4.48M [00:00<00:00, 8.83MB/s]


  0%|          | 0/23627 [00:00<?, ?it/s]

In [5]:
# file 2에서 정의한 모델
def gelu(x):
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
class LayerNorm(nn.Module):
    def __init__(self, cfg, variance_epsilon=1e-12):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(cfg.dim))
        self.beta  = nn.Parameter(torch.zeros(cfg.dim))
        self.variance_epsilon = variance_epsilon

    def forward(self, x):
        # get mean, variance
        u = x.mean(-1, keepdim=True) # 각 단어별 mean
        s = (x - u).pow(2).mean(-1, keepdim=True) # 각 단어별 variance
        
        # normalize
        x = (x - u) / torch.sqrt(s + self.variance_epsilon) # (x - mean)/std 
        
        return self.gamma * x + self.beta # gamma, beta를 이용해 mean, std 조정

class Embeddings(nn.Module):
    "The embedding module from word, position and token_type embeddings."
    def __init__(self, cfg):
        super().__init__()
        self.tok_embed = nn.Embedding(cfg.vocab_size, cfg.dim) # token embedding
        self.pos_embed = nn.Embedding(cfg.max_len, cfg.dim) # position embedding
        self.seg_embed = nn.Embedding(cfg.n_segments, cfg.dim) # segment(token type) embedding

        self.norm = LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)

    def forward(self, x, seg):
        seq_len = x.size(1)
        pos = torch.arange(seq_len, dtype=torch.long, device=x.device) # 0,1,2,3,4,5, ..., seq_len-1
        pos = pos.unsqueeze(0).expand_as(x) # (S,) -> (B, S)

        e = self.tok_embed(x) + self.pos_embed(pos) + self.seg_embed(seg)
        return self.drop(self.norm(e))
class Attention(nn.Module):
    #Scaled Dot Product Attention
    
    def forward(self, query, key, value, mask=None, dropout=None):
        scores = torch.matmul(query, key.transpose(-2, -1)) \
                 / math.sqrt(query.size(-1)) # scale

        if mask is not None:
            scores = scores.masked_fill(mask == 0, -1e9)

        p_attn = F.softmax(scores, dim=-1)

        if dropout is not None:
            p_attn = dropout(p_attn)

        return torch.matmul(p_attn, value), p_attn

class MultiHeadedSelfAttention(nn.Module):
    """ Multi-Headed Dot Product Attention """
    def __init__(self, cfg):
        super().__init__()
        self.proj_q = nn.Linear(cfg.dim, cfg.dim)
        self.proj_k = nn.Linear(cfg.dim, cfg.dim)
        self.proj_v = nn.Linear(cfg.dim, cfg.dim)
        self.drop = nn.Dropout(cfg.p_drop_attn)
        self.scores = None # for visualization
        self.n_heads = cfg.n_heads

    def forward(self, x, mask):
        """
        x, q(query), k(key), v(value) : (B(batch_size), S(seq_len), D(dim))
        mask : (B(batch_size) x S(seq_len))
        * split D(dim) into (H(n_heads), W(width of head)) ; D = H * W
        """
        B,S,D = x.shape


        # (B, S, D) -proj-> (B, S, D) -split-> (B, S, H, W) -trans-> (B, H, S, W)
        q, k, v = self.proj_q(x), self.proj_k(x), self.proj_v(x)
        
        q, k, v = (x.reshape(B,S,self.n_heads,-1).transpose(1, 2)
                   for x in [q, k, v])
        # (B, H, S, W) @ (B, H, W, S) -> (B, H, S, S) -softmax-> (B, H, S, S)
        scores = q @ k.transpose(-2, -1) / np.sqrt(k.size(-1)) # @ == torch.matmul (dot product)
        if mask is not None:
            mask = mask[:, None, None, :].float()
            scores -= 10000.0 * (1.0 - mask)
        scores = self.drop(F.softmax(scores, dim=-1))
        # (B, H, S, S) @ (B, H, S, W) -> (B, H, S, W) -trans-> (B, S, H, W)
        h = (scores @ v).transpose(1, 2).contiguous()
        # -merge-> (B, S, D)
        h = h.reshape(B,S,D)
        self.scores = scores
        return h



class PositionWiseFeedForward(nn.Module):
    """ FeedForward Neural Networks for each position """
    def __init__(self, cfg):
        super().__init__()
        self.fc1 = nn.Linear(cfg.dim, cfg.dim_ff)
        self.fc2 = nn.Linear(cfg.dim_ff, cfg.dim)

    def forward(self, x):
        # (B, S, D) -> (B, S, D_ff) -> (B, S, D)
        return self.fc2(gelu(self.fc1(x)))

class Block(nn.Module):
    """ Transformer Block """
    def __init__(self, cfg):
        super().__init__()
        self.attn = MultiHeadedSelfAttention(cfg)
        self.proj = nn.Linear(cfg.dim, cfg.dim)
        self.norm1 = LayerNorm(cfg)
        self.pwff = PositionWiseFeedForward(cfg)
        self.norm2 = LayerNorm(cfg)
        self.drop = nn.Dropout(cfg.p_drop_hidden)

    def forward(self, x, mask):
        h = self.attn(x, mask)
        h = self.norm1(x + self.drop(self.proj(h)))
        h = self.norm2(h + self.drop(self.pwff(h)))
        return h


class Transformer(nn.Module):
    """ Transformer with Self-Attentive Blocks"""
    def __init__(self, cfg):
        super().__init__()
        self.embed = Embeddings(cfg)
        self.blocks = nn.ModuleList([Block(cfg) for _ in range(cfg.n_layers)])

    def forward(self, x, seg, mask):
        h = self.embed(x, seg)
        for block in self.blocks:
            h = block(h, mask)
        return h




In [6]:
# pretrain 을 위해 정의한 BERT 모델
class BERT4Pretrain(nn.Module):
    "Bert Model for Pretrain : Masked LM and next sentence classification"
    def __init__(self,cfg):
        super().__init__()
        self.transformer = Transformer(cfg)
        self.fc = nn.Linear(cfg.dim, cfg.dim)
        self.activ1 = nn.Tanh()
        self.linear = nn.Linear(cfg.dim, cfg.dim)
        self.activ2 = gelu
        self.norm = LayerNorm(cfg)

        # next sentence prediction classifier
        self.classifier = nn.Linear(cfg.dim, 2)

        # word classification layer 

        # decoder is shared with embedding layer

        # word embedding layer parameter 그대로 사용 

        embed_weight = self.transformer.embed.tok_embed.weight
        n_vocab, n_dim = embed_weight.size()
        self.decoder = nn.Linear(n_dim, n_vocab, bias=False)
        self.decoder.weight = embed_weight
        self.decoder_bias = nn.Parameter(torch.zeros(n_vocab)) # 0으로 초기화한다는 것을 명시하기 위함
        

    def forward(self, input_ids, segment_ids, input_mask, masked_pos):
        # transformer (encoder)
        h = self.transformer(input_ids, segment_ids, input_mask)

        # linear + activation
        pooled_h = self.activ1(self.fc(h[:, 0]))

        # mask된 곳들에 대해서만 language modeling 
        masked_pos = masked_pos[:, :, None].expand(-1, -1, h.size(-1)) # mask인 곳만 1, 나머지는 0 [B, S, D]
        h_masked = torch.gather(h, 1, masked_pos) # mask가 1인 곳에 있는 값만 가져옴 
        h_masked = self.norm(self.activ2(self.linear(h_masked))) # linear -> activation -> normalization
        logits_lm = self.decoder(h_masked) + self.decoder_bias # word classification layer [B, vocab_size]

        # NSP
        pooled_h = self.activ1(self.fc(h[:, 0])) # linear + activation [B, F]
        logits_clsf = self.classifier(pooled_h) # [CLS] -> NSP classification 전용 [B, 2]

        return logits_lm, logits_clsf

In [15]:
sample_config = {
    "dim": 768,
    "dim_ff": 3072,
    "n_layers": 12,
    "p_drop_attn": 0.1,
    "n_heads": 12,
    "p_drop_hidden": 0.1,
    "max_len": 512,
    "n_segments": 2,
    "vocab_size": 30522
}

class AttributeDict(dict):
    def __getattr__(self, name):
        return self[name]
model_config = AttributeDict(sample_config)



model = BERT4Pretrain(model_config)
model = model.to(device)

In [8]:
model

BERT4Pretrain(
  (transformer): Transformer(
    (embed): Embeddings(
      (tok_embed): Embedding(30522, 768)
      (pos_embed): Embedding(512, 768)
      (seg_embed): Embedding(2, 768)
      (norm): LayerNorm()
      (drop): Dropout(p=0.1, inplace=False)
    )
    (blocks): ModuleList(
      (0): Block(
        (attn): MultiHeadedSelfAttention(
          (proj_q): Linear(in_features=768, out_features=768, bias=True)
          (proj_k): Linear(in_features=768, out_features=768, bias=True)
          (proj_v): Linear(in_features=768, out_features=768, bias=True)
          (drop): Dropout(p=0.1, inplace=False)
        )
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (norm1): LayerNorm()
        (pwff): PositionWiseFeedForward(
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
        )
        (norm2): LayerNorm()
        (drop): Dropout(p=0.1, inplace=False)
      )


In [16]:
# 학습용 hyperparameters
train_config = AttributeDict({
    "batch_size": 16,
    "lr": 1e-4,
    "n_epochs": 5,
    "warmup": 0.1,
    "total_steps": 1000
})

In [17]:
# loss 와 optimize 기법 정의
criterion1 = nn.CrossEntropyLoss(reduction='none') # reduction='none': 평균을 내지 않음
criterion2 = nn.CrossEntropyLoss()

# [Batch size, Class 개수], [Batch size] -> loss: scalar
# reduction='none' -> [Batch size]


def get_loss(model, batch): 
    input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
    # input_ids: token index들 (CLS 문장 SEP 문장 3  0 0 0 0  0 0 0 ...)
    # segment_ids: 앞문장, 뒷문장 구분 ( 0 0 0 0 0 1 1 1 1 1 1....)
    # input_mask: 입력과 패딩을 구분 ( 1 1 1  1 0 0 0 0 ...)
    # masked_ids: MLM을 위해 [MASK]로 바꾼 애들(random word, 가만히 유지)의 원래 token index
    # masked_pos: [MASK]로 바꾼 애들의 문장 내 위치 [3 15 32]
    # masked_weights: [MASK]로 바꾼 것 만큼의 1 [1 1 1 0 0 0 0...]
    # is_next: next sentence prediction label (True, False)

    logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)

    # MLM
    # logits_lm: [B, mask 개수, class 개수(vocabulary size)] -> [B, class 개수(vocab size), mask 개수]
    # masked_ids: [B, mask 개수]
    loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids) # for masked LM

    # loss: [B]
    loss_lm = (loss_lm*masked_weights.float()).mean()

    # NSP
    loss_clsf = criterion2(logits_clsf, is_next) # for sentence classification
    return loss_lm + loss_clsf



optimizer = torch.optim.Adam(model.parameters(), train_config['lr'])

In [12]:
class Sequential_indices_Sampler(Sampler):
    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        return iter(self.data_source)

    def __len__(self):
        return len(self.data_source)


# small dataset
train_indices = range(0,2000,1) # 2000개 데이터로 학습
val_indices = range(2000,2100,1) # 100개 데이터로 평가

# train_sampler = SubsetRandomSampler(train_indices) # SubsetRandomSampler: 앞에서부터 랜덤하게 추출
# val_sampler = Sequential_indices_Sampler(val_indices) # Sequential_indices_Sampler: 앞에서부터 순서대로 추출

train_loader = data.DataLoader(dataset, batch_size=train_config.batch_size, sampler=train_sampler)
val_loader = data.DataLoader(dataset, batch_size=train_config.batch_size, sampler=val_sampler)

In [18]:
import math
import torch.nn.functional as F

# train mode
model.train() 

for epoch in range(train_config.n_epochs):
    loss_sum = 0
    for i, batch in enumerate(tqdm(train_loader)):
        batch = [t.to(device) for t in batch]

        loss = get_loss(model, batch).mean()

        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()
    
        loss_sum += loss.item()
        
    print('Epoch {}/{} : Average Loss {:.3f}'.format(epoch+1, train_config.n_epochs, loss_sum/(i+1)))

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 1/5 : Average Loss 24.509


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 2/5 : Average Loss 18.686


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 3/5 : Average Loss 16.401


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 4/5 : Average Loss 14.465


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch 5/5 : Average Loss 12.877


In [19]:
# validation mode
model.eval() 

with torch.no_grad():
    loss_sum = 0
    for i, batch in enumerate(tqdm(train_loader)):
        batch = [t.to(device) for t in batch]
        loss = get_loss(model, batch).mean()
        loss_sum += loss.item()
        
    print('Validation Average Loss {:.3f}'.format(loss_sum/(i+1)))

  0%|          | 0/125 [00:00<?, ?it/s]

Validation Average Loss 12.069


In [20]:
input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)


In [21]:
logits_lm[0].size()

torch.Size([20, 30522])

In [22]:
example_idx = 12
word2idx = dataset.tokenizer.get_vocab()
idx2word = {word2idx[word]:word for word in word2idx}

def get_sentence(logits, GT=True):
    if not GT:
        idxs = logits.argmax(1).tolist()
    else:
        idxs = logits.tolist()
    words = [idx2word[idx] for idx in idxs if idx2word[idx]!='[PAD]']
    return words

print('#  model input\n')
print(' '.join(get_sentence(input_ids[example_idx])), '\n')

print('\n\n')
print('#  model prediction at masked position\n')
print(' '.join(get_sentence(logits_lm[example_idx], GT=False)))


#  model input

[CLS] [MASK] temples  had  several  sanctu [MASK] each  with a  cult  stat ue  representing  one of the  gods  in a  group  such as a  family  [MASK] [MASK] . [SEP] The  [MASK] resi ding  [MASK] templ es of  Egypt  collectively  represented the  entire  pan the on  . [SEP] 




#  model prediction at masked position

<unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [None]:
# positional embedding
# - sinusoidal function 
# - learnable parameter
#   - 성능이 좋아서 사용
#   - 하이퍼파라미터 정도? 