In [None]:
import numpy as np
from tqdm.auto import tqdm

from hcrot import layers, optim
from hcrot.utils import softmax

In [2]:
sentences = [
    "The quick brown fox jumps over the lazy dog",
    "A journey of a thousand miles begins with a single step",
    "To be or not to be that is the question",
    "All that glitters is not gold but it is very valuable",
    "Knowledge is power but enthusiasm pulls the switch",
    "The only thing we have to fear is fear itself",
    "In the end we will remember not the words of our enemies",
    "Life is what happens when you are busy making other plans",
    "To succeed in life you need two things ignorance and confidence",
    "The future belongs to those who believe in the beauty of their dreams",
    "Every cloud has a silver lining if you know where to look",
    "Actions speak louder than words when it truly matters",
    "A picture is worth a thousand words but the memory lasts longer",
    "The pen is mightier than the sword when used wisely",
    "Time waits for no one, so make every second count",
    "Good things come to those who wait with patience",
    "You miss 100% of the shots you do not take, so go for it",
    "The early bird catches the worm but the second mouse gets the cheese",
    "Do not count your chickens before they hatch or you will be disappointed",
    "Practice makes perfect but nobody is truly perfect",
    "Rome was not built in a day, and neither are dreams",
    "A friend in need is a friend indeed, cherish them",
    "The grass is always greener on the other side of the fence",
    "You can not judge a book by its cover, so read the story",
    "Better late than never, but never late is better",
    "When life gives you lemons, make lemonade with a smile",
    "Every rose has its thorn, but do not let it deter you",
    "Two heads are better than one when solving a problem",
    "The road to success is always under construction",
    "An apple a day keeps the doctor away, but laughter is the best medicine"
]

vocab = {}
for sentence in sentences:
    for word in sentence.split():
        if word not in vocab:
            vocab[word] = len(vocab)

vocab['<pad>'] = len(vocab.keys())
vocab['<eos>'] = len(vocab.keys())

inverse_vocab = {v: k for k, v in vocab.items()}
vocab_size = len(vocab)

def tokenize(sentence):
    return [vocab[word] for word in sentence.split()]

data = [tokenize(sentence) for sentence in sentences]

max_len = max(len(sentence) for sentence in data)
padded_data = [sentence + [vocab['<pad>']] * (max_len - len(sentence)) + [vocab['<eos>']] for sentence in data]
padded_data = np.array(padded_data)

In [3]:
def get_sinusoid_encoding_table(n_seq, d_hidn):
    # refs: https://paul-hyun.github.io/transformer-01/
    def cal_angle(position, i_hidn):
        return position / np.power(10000, 2 * (i_hidn // 2) / d_hidn)
    def get_posi_angle_vec(position):
        return [cal_angle(position, i_hidn) for i_hidn in range(d_hidn)]

    sinusoid_table = np.array([get_posi_angle_vec(i_seq) for i_seq in range(n_seq)])
    sinusoid_table[:, 0::2] = np.sin(sinusoid_table[:, 0::2])  # even index sin 
    sinusoid_table[:, 1::2] = np.cos(sinusoid_table[:, 1::2])  # odd index cos

    return sinusoid_table

class GPT(layers.Module):
    def __init__(self, vocab_size, embed_size, num_heads, num_layers, max_len=16):
        super().__init__()
        self.embed_size = embed_size
        self.embedding = layers.Embedding(vocab_size, embed_size)
        self.positional_encoding = np.expand_dims(get_sinusoid_encoding_table(max_len, embed_size), axis=0)
        self.transformer_decoder_layer = layers.TransformerDecoderLayer(
            d_model=embed_size,
            nhead=num_heads,
            dim_feedforward=embed_size * 4,
            batch_first=True
        )
        self.transformer_decoder = layers.TransformerDecoder(
            self.transformer_decoder_layer,
            num_layers=num_layers,
        )
        self.fc_out = layers.Linear(embed_size, vocab_size)

    def forward(self, tgt):
        tgt_len = tgt.shape[1]
        tgt_mask = self._generate_square_subsequent_mask(tgt_len)

        tgt_emb = self.embedding(tgt) + self.positional_encoding[:, :tgt_len, :]

        output = self.transformer_decoder(tgt_emb, tgt_emb, tgt_mask=tgt_mask)
        output = self.fc_out(output)

        return output

    def _generate_square_subsequent_mask(self, sz):
        mask = np.triu(np.ones((sz, sz)), 0)
        return mask

In [4]:
embed_size = 256
num_heads = 4
num_layers = 6

model = GPT(vocab_size, embed_size, num_heads, num_layers)
criterion = layers.CrossEntropyLoss()
optimizer = optim.Adam(model, lr_rate=1e-3)

inputs = padded_data[:, :-1]
targets = padded_data[:, 1:]
bsz, seq_len = inputs.shape

num_epochs = 50
pbar = tqdm(range(num_epochs))
for epoch in pbar:
    outputs = model.forward(inputs)
    
    outputs = outputs.reshape(-1, vocab_size)
    targets = targets.reshape(-1)
    loss = criterion(outputs, targets)

    dz = criterion.backward()
    dz = dz.reshape(bsz, seq_len, -1)
    optimizer.update(dz)
    
    pbar.set_description(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():5f}')

Epoch 50/50, Loss: 0.145368: 100%|██████████| 50/50 [05:35<00:00,  6.71s/it]


In [7]:
sentences

['The quick brown fox jumps over the lazy dog',
 'A journey of a thousand miles begins with a single step',
 'To be or not to be that is the question',
 'All that glitters is not gold but it is very valuable',
 'Knowledge is power but enthusiasm pulls the switch',
 'The only thing we have to fear is fear itself',
 'In the end we will remember not the words of our enemies',
 'Life is what happens when you are busy making other plans',
 'To succeed in life you need two things ignorance and confidence',
 'The future belongs to those who believe in the beauty of their dreams',
 'Every cloud has a silver lining if you know where to look',
 'Actions speak louder than words when it truly matters',
 'A picture is worth a thousand words but the memory lasts longer',
 'The pen is mightier than the sword when used wisely',
 'Time waits for no one, so make every second count',
 'Good things come to those who wait with patience',
 'You miss 100% of the shots you do not take, so go for it',
 'The ea

In [85]:
def generate_sentence(model, start_sentence, max_len):
    generated = [vocab[token] for token in start_sentence.split()]
    input_seq = np.expand_dims(np.array(generated),0)
    
    while len(input_seq[0]) < max_len:
        # print(' '.join(inverse_vocab[token] for token in input_seq[0]))
        output = model.forward(input_seq)
        next_token_logits = output[-1, -1]
        next_token = np.argmax(next_token_logits).item()
        generated.append(next_token)
        if next_token == vocab['<eos>']:
            break
        input_seq = np.array([generated])
        
    return ' '.join(inverse_vocab[token] for token in generated)

def generate_with_top_k_and_top_p(model, start_sentence, max_len, top_k=0, top_p=1.0):
    def top_k_top_p_filtering(logits, top_k=0, top_p=0.1):
        filter_value = float('-inf')
        if top_k > 0:
            indices_to_remove = logits < np.take_along_axis(logits, np.argsort(-logits,axis=-1)[:,:top_k], axis=-1)[..., -1, None]
            logits[indices_to_remove] = filter_value
        
        if top_p < 1.:
            sorted_indices = np.argsort(-logits)
            sorted_logits = -np.sort(-logits)
            
            cumulative_probs = np.cumsum(softmax(sorted_logits, dim=-1), axis=-1)
            
            sorted_indices_to_remove = cumulative_probs > top_p
            
            # shift
            sorted_indices_to_remove = np.roll(sorted_indices_to_remove, 1)
            sorted_indices_to_remove[:,0] = 0
            
            indices_to_remove = np.zeros_like(logits, dtype=bool)
            np.put_along_axis(indices_to_remove, sorted_indices, sorted_indices_to_remove, axis=-1)
            
            logits[indices_to_remove] = filter_value
            
        return logits
    
    generated = [vocab[token] for token in start_sentence.split()]
    input_seq = np.expand_dims(np.array(generated),0)
    
    while len(input_seq[0]) < max_len:
        output = model.forward(input_seq)
        next_token_logits = output[-1]
        next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
        probs = softmax(next_token_logits, dim=-1)
        next_token = np.random.choice(np.arange(probs[0].shape[0]),size=1,p=probs[0])
        generated.append(next_token[0])
        if next_token == vocab['<eos>']:
            break
        input_seq = np.array([generated])
        
    return ' '.join(inverse_vocab[token] for token in generated)

sentence = "You are"
print('greedy:', generate_sentence(model, sentence, max_len=16))
print('topk and topp:', generate_with_top_k_and_top_p(model, sentence, max_len=16, top_k=0, top_p=1))

greedy: You are <eos>
topk and topp: You are busy busy late Knowledge <eos>
