In [1]:
import warnings
# Suppress every Python warning for the rest of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings (e.g. NumPy
# overflow or invalid-value warnings) that could matter while training.
warnings.filterwarnings(action='ignore')

In [2]:
import os
# Switch the working directory to the parent of the current one; presumably so
# that relative paths such as 'datasets/data.txt' resolve from the repo root.
# NOTE(review): this cell is not idempotent - every re-run climbs one more
# directory level, so run it exactly once per kernel.
os.chdir(os.path.dirname(os.getcwd()))

In [3]:
import numpy as np
from tqdm.auto import tqdm

from hcrot import layers, optim
from hcrot.dataset import *
from hcrot.utils import *

In [4]:
# Load the training corpus: one sentence per line.
with open('datasets/data.txt', 'r', encoding='utf-8') as f:
    # Drop blank lines (including the empty entry produced by a trailing
    # newline) so they never turn into samples made only of <eos>/<pad>.
    sentences = [line for line in f.read().split('\n') if line.strip()]

## Tokenize

In [None]:
import re
from collections import defaultdict

# Base alphabet of the tokenizer. The concatenation order used for
# `vocabulary` below determines every token id, so it must stay stable.
PUNCTUATIONS = ['?','!','@','#','$','%','&','*','(',')','-','=','+',',','.',';',':','\'']
LOWER_ALPHAS = [chr(i) for i in range(97,97+26)]
UPPER_ALPHAS = [chr(i) for i in range(65,65+26)]
NUMBERS = ['0','1','2','3','4','5','6','7','8','9']
# Marker character (U+0120) prepended to tokens that do not start a sentence.
WHITE = chr(0x0120)
SPECIAL_TOKENS = ["<eos>", "<pad>", "<unk>"]

# Maps each pre-token (with its WHITE prefix, if any) to its corpus frequency.
dictionary = defaultdict(int)

for sentence in sentences:
    # Pre-tokenize into runs of word characters and single punctuation marks.
    # NOTE(review): BPETokenizer.tokenize splits on whitespace only, so
    # punctuation stays glued to its word there, unlike here.
    tokenized = re.findall(r"\w+|[^\w\s]", sentence)
    for i, word in enumerate(tokenized):
        # Every token except the first one of the sentence gets the marker.
        word = WHITE + word if i != 0 else word
        dictionary[word] += 1

# Initial vocabulary: single characters, special tokens and the marker.
vocabulary =  PUNCTUATIONS + LOWER_ALPHAS + UPPER_ALPHAS + SPECIAL_TOKENS + [WHITE] + NUMBERS
# Stopping criteria and counter for the merge loop that follows.
MAX_ITERATIONS = 10000
max_vocab_size = 100000
iterations = 0

def split_word(word, vocab):
    """Segment `word` greedily, always taking the longest prefix found in `vocab`.

    Args:
        word: String to segment.
        vocab: Container of known tokens (anything supporting `in`).

    Returns:
        List of substrings whose concatenation equals `word`. A character
        that starts no known token is emitted on its own.
    """
    pieces = []
    start, total = 0, len(word)
    while start < total:
        # Scan candidate end positions from the longest slice to the shortest.
        for stop in range(total, start, -1):
            candidate = word[start:stop]
            if candidate in vocab:
                pieces.append(candidate)
                start = stop
                break
        else:
            # Nothing matched: fall back to the single character.
            pieces.append(word[start])
            start += 1
    return pieces

# Learn merges: on every iteration, segment each word with the current
# vocabulary, count adjacent token pairs (weighted by word frequency) and add
# the most frequent concatenation to the vocabulary.
#
# The lookup set is built once and updated incrementally instead of being
# rebuilt from the whole vocabulary list for every single word.
vocabulary_lookup = set(vocabulary)

while iterations < MAX_ITERATIONS and len(vocabulary) < max_vocab_size:
    pairs = defaultdict(int)
    for word, cnt in dictionary.items():
        tokens = split_word(word, vocabulary_lookup)
        # Pairs are keyed by their concatenation, so different splits that
        # spell the same string share one counter.
        for a, b in zip(tokens[:-1], tokens[1:]):
            pairs[a + b] += cnt

    # Every word is already a single token: nothing left to merge.
    if not pairs:
        break

    best_pair = max(pairs.items(), key=lambda x: x[1])[0]
    vocabulary.append(best_pair)
    vocabulary_lookup.add(best_pair)

    # `dictionary` is intentionally left untouched: joining the tokens of a
    # word always reproduces the word itself, so re-keying it was a no-op.
    iterations += 1

In [73]:
class BPETokenizer:
    """Tokenizer that applies a learned BPE vocabulary to whitespace-split text.

    Args:
        vocab: List of token strings; a token's index in the list is its id.
            Must contain "<eos>", "<pad>" and "<unk>".
        max_len: Row width (>= 1) produced by `__call__`, including the <eos> id.
    """

    def __init__(self, vocab: list, max_len = 50):
        self.vocab = vocab
        self.max_len = max_len

        self.token2ids = {token:i for i, token in enumerate(vocab)}
        self.ids2token = {i:token for token, i in self.token2ids.items()}

        # Marker (U+0120) prepended to every word except the first one.
        self.white = chr(0x0120)
        self.EOS = self.token2ids["<eos>"]
        self.PAD = self.token2ids["<pad>"]
        self.UNK = self.token2ids["<unk>"]
        self.special_tokens = [self.EOS, self.PAD, self.UNK]

    def __call__(self, inputs: list):
        """Encode `inputs` into a rectangular id array of width `max_len`.

        Each row is the token ids, then <eos>, then <pad> up to `max_len`.
        Sequences that would not fit are truncated to `max_len - 1` ids so
        that <eos> is always present and all rows have the same length.

        Args:
            inputs: A string or a list of strings.

        Returns:
            np.ndarray of shape (len(inputs), max_len).
        """
        keep = max(self.max_len - 1, 0)
        rows = []
        for ids in self.encode(inputs):
            ids = ids[:keep]
            padding = [self.PAD] * (self.max_len - len(ids) - 1)
            rows.append(ids + [self.EOS] + padding)
        return np.array(rows)

    @property
    def vocab_size(self):
        """Number of tokens in the vocabulary."""
        return len(self.vocab)

    def encode(self, inputs: list):
        """Convert a string or list of strings into lists of token ids.

        No <eos> or padding is added. Characters that cannot be matched are
        mapped to the <unk> id.

        Returns:
            List with one list of ids per input string.
        """
        if isinstance(inputs, str):
            inputs = [inputs]

        encoded = []
        for string in inputs:
            tokens = self.tokenize(string)
            ids = [self.token2ids[token] for token in tokens]
            encoded.append(ids)
        return encoded

    def decode(self, token_ids: list, skip_special_tokens: bool = False):
        """Turn sequences of ids back into strings.

        Args:
            token_ids: Iterable of id sequences (one per sentence).
            skip_special_tokens: Drop <eos>, <pad> and <unk> when True.

        Returns:
            List of decoded strings with the marker replaced by a space.
        """
        decoded = []
        for tokens in token_ids:
            if not skip_special_tokens:
                tokens = [self.ids2token[token] for token in tokens]
            else:
                tokens = [self.ids2token[token] for token in tokens if token not in self.special_tokens]
            tokens = ''.join(tokens)
            decoded.append(tokens.replace(self.white, ' '))
        return decoded

    def tokenize(self, sentence: str):
        """Split `sentence` on whitespace and BPE-tokenize every word.

        Returns:
            Flat list of token strings for the whole sentence.
        """
        tokens = []
        for i, word in enumerate(sentence.split()):
            word = self.white + word if i != 0 else word
            tokens += self.word_tokenize(word)
        return tokens

    def word_tokenize(self, word: str):
        """Tokenize one word by repeatedly merging adjacent pieces.

        Starting from single characters, each left-to-right pass merges two
        neighbouring pieces whenever their concatenation is in the
        vocabulary; passes repeat until nothing changes.

        A piece that is not in the vocabulary becomes the "<unk>" token
        string, not its integer id, so later passes keep concatenating
        strings and `encode` can look every token up.

        Returns:
            List of token strings, all of which are vocabulary entries.
        """
        known = self.token2ids  # dict lookup instead of scanning the vocab list
        unk_token = self.ids2token[self.UNK]

        pieces = list(word)
        while True:
            merged, i = [], 0
            while i < len(pieces):
                if i + 1 < len(pieces) and (pieces[i] + pieces[i + 1]) in known:
                    merged.append(pieces[i] + pieces[i + 1])
                    i += 2
                elif pieces[i] in known:
                    merged.append(pieces[i])
                    i += 1
                else:
                    merged.append(unk_token)
                    i += 1
            if merged == pieces:
                return merged
            pieces = merged
    
# Shared tokenizer instance built from the learned vocabulary; `max_len` is
# left at the class default.
tokenizer = BPETokenizer(vocab=vocabulary)

## Modeling

In [13]:
def get_sinusoid_encoding_table(n_seq, d_hidn):
    """Build a sinusoidal positional-encoding table.

    The angle for position `pos` and hidden index `i` is
    `pos / 10000 ** (2 * (i // 2) / d_hidn)`. Even hidden indices store the
    sine of that angle, odd hidden indices the cosine.

    Args:
        n_seq: Number of positions (rows).
        d_hidn: Hidden size (columns).

    Returns:
        np.ndarray of shape (n_seq, d_hidn).
    """
    # refs: https://paul-hyun.github.io/transformer-01/
    # One divisor per hidden index; indices 2k and 2k + 1 share the same value.
    divisors = [np.power(10000, 2 * (col // 2) / d_hidn) for col in range(d_hidn)]
    angles = np.array([[pos / divisor for divisor in divisors] for pos in range(n_seq)])

    even_cols, odd_cols = slice(0, None, 2), slice(1, None, 2)
    angles[:, even_cols] = np.sin(angles[:, even_cols])
    angles[:, odd_cols] = np.cos(angles[:, odd_cols])
    return angles

class GPT(layers.Module):
    """Small GPT-style language model assembled from hcrot layers.

    Token ids are embedded, summed with a fixed sinusoidal positional table,
    run through a `layers.TransformerDecoder` stack and projected to
    vocabulary-sized outputs by a linear layer.
    """
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, max_len=16):
        """Create the sub-layers.

        Args:
            vocab_size: Number of tokens; sizes the embedding and output layer.
            embed_size: Embedding / model width (`d_model`).
            num_heads: Attention heads passed to the decoder layer (`nhead`).
            num_layers: Number of decoder layers in the stack.
            max_len: Number of positions in the positional table.
        """
        super().__init__()
        self.embed_size = embed_size
        self.embedding = layers.Embedding(vocab_size, embed_size)
        # Fixed (non-trained NumPy) table with a leading axis of size 1 so it
        # broadcasts over the batch: shape (1, max_len, embed_size).
        self.positional_encoding = np.expand_dims(get_sinusoid_encoding_table(max_len, embed_size), axis=0)
        self.transformer_decoder_layer = layers.TransformerDecoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=embed_size * 4,
            batch_first=True
        )
        self.transformer_decoder = layers.TransformerDecoder(
            self.transformer_decoder_layer,
            num_layers=num_layers,
        )
        self.fc_out = layers.Linear(embed_size, vocab_size)

    def forward(self, tgt):
        """Compute per-position vocabulary outputs for a batch of token ids.

        Args:
            tgt: Array of token ids; its second axis is read as the sequence
                length. Assumes shape (batch, seq_len) - TODO confirm.

        Returns:
            Output of `fc_out`; presumably (batch, seq_len, vocab_size).
        """
        tgt_len = tgt.shape[1]
        tgt_mask = self._generate_square_subsequent_mask(tgt_len)

        # The table only holds `max_len` positions, so longer sequences get a
        # shorter slice here than the embedding output they are added to.
        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt_len, :]

        # NOTE(review): the same embeddings are passed as both target and
        # memory, and only `tgt_mask` is supplied. If the decoder layer
        # cross-attends to memory without a causal mask, every position could
        # see future tokens during training - confirm against hcrot.
        output = self.transformer_decoder(tgt_emb, tgt_emb, tgt_mask=tgt_mask)
        output = self.fc_out(output)

        return output

    def _generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) array with 1 strictly above the diagonal, else 0.

        Presumably 1 marks positions a query may not attend to; the exact
        convention is defined by the hcrot attention layers - TODO confirm.
        """
        mask = np.triu(np.ones((sz, sz)), 1)
        return mask

## Train

### GPT

In [None]:
# Model hyper-parameters.
embed_size = 64
num_heads = 4
num_layers = 1
# Length of the model's positional table; also reused as the generation
# limit in the prediction cell.
max_len = 52
vocab_size = tokenizer.vocab_size

model = GPT(vocab_size, embed_size, num_heads, num_layers, max_len=max_len)
# model.load_state_dict(load('notebooks/gpt.pickle'))

criterion = layers.CrossEntropyLoss()
optimizer = optim.AdamW(model, lr_rate=1e-4, betas=(0.9, 0.95))

# Rows are padded by the tokenizer to its own max_len (class default, since
# `tokenizer` was built without one), not to the model's `max_len` above.
tokenized = tokenizer(sentences)
# Next-token prediction: targets are the inputs shifted left by one position.
# NOTE(review): targets include <pad> positions; whether CrossEntropyLoss
# ignores them cannot be told from this notebook.
inputs = tokenized[:, :-1]
targets = tokenized[:, 1:]

dataloader = Dataloader(inputs, targets, batch_size=4, shuffle=True)
mini_batch_len = len(dataloader)

# NOTE(review): the recorded progress bar below this cell shows 200
# iterations, so the saved output predates this value.
num_epochs = 600
pbar = tqdm(range(num_epochs))
for epoch in pbar:
    total_loss = 0
    for inputs_, targets_ in dataloader:
        bsz, seq_len = inputs_.shape
        outputs = model(inputs_)
        # Flatten batch and sequence axes so each position is one sample.
        outputs = outputs.reshape(-1, vocab_size)
        targets_ = targets_.reshape(-1)
        loss = criterion(outputs, targets_)
        total_loss += loss.item()

        # Presumably the gradient of the loss w.r.t. the flattened outputs;
        # it is reshaped back to (batch, seq, -1) before the optimizer step.
        dz = criterion.backward()
        dz = dz.reshape(bsz, seq_len, -1)
        optimizer.update(dz)
    
    # Show the mean mini-batch loss of the epoch that just finished.
    pbar.set_description(f'Loss: {total_loss/mini_batch_len:.5f}')

Loss: 0.00001: 100%|██████████| 200/200 [09:30<00:00,  2.85s/it]


In [None]:
# save(model.state_dict(), 'notebooks/gpt.pickle')

## Predict

### GPT

In [95]:
def generate_sentence(model, start_sentence, max_len):
    """Continue `start_sentence` with greedy (arg-max) decoding.

    Uses the notebook-level `tokenizer`. Generation stops once the sequence
    holds `max_len` ids or the <eos> id has been produced.

    Args:
        model: Callable model exposing `eval()`; it is switched to eval mode.
        start_sentence: Prompt text.
        max_len: Maximum number of ids in the sequence, prompt included.

    Returns:
        The decoded text with special tokens removed.
    """
    sequences = tokenizer.encode(start_sentence)
    batch = np.array(sequences)

    model.eval()
    while True:
        if len(batch[0]) >= max_len:
            break
        # Scores of the last position of the last sequence in the batch.
        last_step_scores = model(batch)[-1, -1]
        token_id = np.argmax(last_step_scores).item()
        sequences[0].append(token_id)
        if token_id == tokenizer.EOS:
            break
        batch = np.array(sequences)

    decoded = tokenizer.decode(sequences, skip_special_tokens=True)
    return decoded[0]

def generate_with_top_k_and_top_p(model, start_sentence, max_len, top_k=0, top_p=1.0, rng=None):
    """Continue `start_sentence` by sampling with top-k / nucleus filtering.

    Uses the notebook-level `tokenizer` and `softmax`. Generation stops once
    the sequence holds `max_len` ids or the <eos> id has been sampled.

    Args:
        model: Callable model exposing `eval()`; it is switched to eval mode.
        start_sentence: Prompt text.
        max_len: Maximum number of ids in the sequence, prompt included.
        top_k: Keep only the `top_k` highest-scoring tokens (0 disables).
        top_p: Keep the smallest set of top tokens whose cumulative
            probability exceeds `top_p` (1.0 disables).
        rng: Optional `np.random.Generator` (or `RandomState`) used for
            sampling; defaults to NumPy's global random state.

    Returns:
        The decoded text with special tokens removed.
    """
    def top_k_top_p_filtering(logits, top_k=0, top_p=1.0):
        """Set filtered-out entries of a 2-D (rows, vocab) array to -inf, in place."""
        filter_value = float('-inf')
        if top_k > 0:
            # Value of the k-th best score in each row; anything lower goes.
            ranked = np.argsort(-logits, axis=-1)[:, :top_k]
            kth_best = np.take_along_axis(logits, ranked, axis=-1)[..., -1, None]
            logits[logits < kth_best] = filter_value

        if top_p < 1.:
            sorted_indices = np.argsort(-logits, axis=-1)
            sorted_logits = np.take_along_axis(logits, sorted_indices, axis=-1)

            cumulative_probs = np.cumsum(softmax(sorted_logits, dim=-1), axis=-1)
            sorted_indices_to_remove = cumulative_probs > top_p

            # Shift right along the vocab axis so the first token crossing
            # the threshold is kept; the best token is never removed.
            sorted_indices_to_remove = np.roll(sorted_indices_to_remove, 1, axis=-1)
            sorted_indices_to_remove[:, 0] = False

            # Map the decision from sorted order back to vocabulary order.
            indices_to_remove = np.zeros_like(logits, dtype=bool)
            np.put_along_axis(indices_to_remove, sorted_indices, sorted_indices_to_remove, axis=-1)

            logits[indices_to_remove] = filter_value

        return logits

    sampler = np.random if rng is None else rng

    generated = tokenizer.encode(start_sentence)
    input_seq = np.array(generated)

    model.eval()
    while len(input_seq[0]) < max_len:
        logits = model(input_seq)
        # Only the last position is sampled, so filter just that row. The
        # float64 copy keeps the model output untouched by in-place masking.
        next_token_logits = np.array(logits[-1, -1:], dtype=np.float64)
        next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)

        probs = np.asarray(softmax(next_token_logits, dim=-1)[-1], dtype=np.float64)
        # Renormalise so rounding error cannot trip the sum-to-one check.
        probs = probs / probs.sum()

        # Plain int keeps the id list homogeneous with the greedy decoder.
        next_token = int(sampler.choice(probs.shape[0], p=probs))
        generated[0].append(next_token)
        if next_token == tokenizer.EOS:
            break
        input_seq = np.array(generated)

    return tokenizer.decode(generated, skip_special_tokens=True)[0]

# Demo: complete a prompt with both decoding strategies.
# Intended full sentence: "Great minds think alike, but they also think differently."
input_sentence = "Great minds think alike, but"
print(f'input_sentence: {input_sentence}')
# Greedy decoding is deterministic for a fixed model.
print(f'greedy: {generate_sentence(model, input_sentence, max_len=max_len)}')
# Sampling is random and no seed is set, so this line varies between runs.
print(f'top_k_top_p: {generate_with_top_k_and_top_p(model, input_sentence, max_len=max_len, top_k=64, top_p=0.8)}')

input_sentence: Great minds think alike, but
greedy: Great minds think alike, but they also think differentlyhen clockreatreat minds shine creat minds shine brightly worth beat decre choose integrity they require what weighs
top_k_top_p: Great minds think alike, but they also think differentlyhen clockreat minds shines creat minds shine brightly scat minds shineson they some crossreat
