In [None]:
# read raw text 
with open("naval-ravikant.txt", "r", encoding="utf-8") as f:
    book_raw_text = f.read()


In [None]:
# basic tokenizer
import re

preprocessed = re.split(r"([,.:;?_!\"()\'\[\]]|--|\s)", book_raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed), "tokens")
print(preprocessed[:20])

49359 tokens
['PART', 'I', 'WEALTH', 'How', 'to', 'get', 'rich', 'without', 'getting', 'lucky', '.', 'BUILDING', 'WEALTH', '·', '29BUILDING', 'WEALTH', 'Making', 'money', 'is', 'not']


In [10]:
# tokenization vocabulary
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)
print("Vocabulary size:", vocab_size)
print(all_words[:10])

Vocabulary size: 6138
['!', '#1', '#2', '$1', '$100', '$200', '$5', '$50', '&', '(']


In [12]:
vocab = {token:integer for integer, token in enumerate(all_words)}
print(vocab)

{'!': 0, '#1': 1, '#2': 2, '$1': 3, '$100': 4, '$200': 5, '$5': 6, '$50': 7, '&': 8, '(': 9, ')': 10, '*': 11, '+': 12, ',': 13, '-': 14, '.': 15, '//': 16, '//fs': 17, '//idlewords': 18, '//meltingasphalt': 19, '//nav': 20, '//stratechery': 21, '00': 22, '000': 23, '00001': 24, '000X': 25, '000x': 26, '1': 27, '10': 28, '10/10': 29, '100': 30, '101': 31, '101SHED': 32, '102': 33, '103like': 34, '104': 35, '105radical': 36, '106': 37, '107of': 38, '108': 39, '109care': 40, '10x': 41, '11': 42, '110': 43, '111FALSIFIABILITY': 44, '112': 45, '113Your': 46, '114': 47, '115Read': 48, '116': 49, '117random': 50, '118': 51, '119Study': 52, '12': 53, '120': 54, '121have': 55, '122': 56, '123If': 57, '124': 58, '127LEARNING': 59, '128': 60, '129satisfaction': 61, '130': 62, '131This': 63, '132': 64, '133A': 65, '134': 66, '135Anticipation': 67, '136': 68, '137DESIRE': 69, '138': 70, '139One': 71, '14': 72, '140': 73, '140-character': 74, '141Most': 75, '142': 76, '143The': 77, '144': 78, '145O

In [None]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {v: k for k, v in vocab.items()}

    def encode(self, text):
        preprocessed = re.split(r"([,.:;?_!\"()\'\[\]]|--|\s)", text)
        tokens = [item.strip() for item in preprocessed if item.strip()]
        preprocessed = [item if item in self.str_to_int else "<|unk|>" for item in tokens]
        ids = [self.str_to_int[token] for token in preprocessed]
        return ids

    def decode(self, token_ids):
        text = ' '.join([self.int_to_str[token_id] for token_id in token_ids])
        text = re.sub(r'\s([,.:;?_!\"()\'\[\]])', r'\1', text)
        return text

In [16]:
# special tokens
all_tokens = sorted(list(set(preprocessed + ["<|unk|>", "<|endoftext|>"])))
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()), "tokens with special tokens")

6140 tokens with special tokens


In [23]:
# bpe tokenization
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

tokenizer = tiktoken.get_encoding("gpt2")

text = "akad bakad bambe"
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print("BPE token IDs:", integers)

decoded_text = tokenizer.decode(integers)
print("Decoded text:", decoded_text)

tiktoken version: 0.12.0
BPE token IDs: [461, 324, 275, 461, 324, 275, 321, 1350]
Decoded text: akad bakad bambe


In [25]:
with open("naval-ravikant.txt", "r", encoding="utf-8") as f:
    book_raw_text = f.read()

enc_text = tokenizer.encode(book_raw_text)
print("Encoded text length:", len(enc_text))

Encoded text length: 65932


In [27]:
enc_sample = enc_text[50:]
context_size = 10
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print("x:", x)
print("y:", y)

for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print("Context:", context, "->", desired)
    print("Decoded context:", tokenizer.decode(context), "->", tokenizer.decode([desired]))

x: [1270, 14128, 35219, 2257, 6981, 29630, 12887, 40818, 3180, 29244]
y: [14128, 35219, 2257, 6981, 29630, 12887, 40818, 3180, 29244, 11617]
Context: [1270] -> 14128
Decoded context: 30 ->  ·
Context: [1270, 14128] -> 35219
Decoded context: 30 · ->  UNDER
Context: [1270, 14128, 35219] -> 2257
Decoded context: 30 · UNDER -> ST
Context: [1270, 14128, 35219, 2257] -> 6981
Decoded context: 30 · UNDERST -> AND
Context: [1270, 14128, 35219, 2257, 6981] -> 29630
Decoded context: 30 · UNDERSTAND ->  HOW
Context: [1270, 14128, 35219, 2257, 6981, 29630] -> 12887
Decoded context: 30 · UNDERSTAND HOW ->  WE
Context: [1270, 14128, 35219, 2257, 6981, 29630, 12887] -> 40818
Decoded context: 30 · UNDERSTAND HOW WE -> ALTH
Context: [1270, 14128, 35219, 2257, 6981, 29630, 12887, 40818] -> 3180
Decoded context: 30 · UNDERSTAND HOW WEALTH ->  IS
Context: [1270, 14128, 35219, 2257, 6981, 29630, 12887, 40818, 3180] -> 29244
Decoded context: 30 · UNDERSTAND HOW WEALTH IS ->  CRE
Context: [1270, 14128, 35219,

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt)

        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i+max_length]
            target_chunk = token_ids[i+1: i+1 + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers)
    return dataloader

data_loader = create_dataloader_v1(book_raw_text, batch_size=8, max_length=4, stride=4, shuffle=False)
data_iter = iter(data_loader)
first_batch = next(data_iter)
print("First batch input IDs:", first_batch[0])
print("First batch target IDs:", first_batch[1])
print("Decoded first batch input:", tokenizer.decode(first_batch[0][0].tolist()))
print("Decoded first batch target:", tokenizer.decode(first_batch[1][0].tolist()))

second_batch = next(data_iter)
print("Second batch input IDs:", second_batch[0])
print("Second batch target IDs:", second_batch[1])
print("Decoded second batch input:", tokenizer.decode(second_batch[0][0].tolist()))
print("Decoded second batch target:", tokenizer.decode(second_batch[1][0].tolist()))
        

First batch input IDs: tensor([[30709,   314,   198,  8845],
        [40818,   198,  2437,   284],
        [  651,  5527,  1231,  1972],
        [ 9670,    13,   198,   198],
        [19499, 26761,  2751,   220],
        [12887, 40818, 14128,   220],
        [20571, 26761,  2751, 12887],
        [40818,   198, 23874,  1637]])
First batch target IDs: tensor([[  314,   198,  8845, 40818,   198],
        [  198,  2437,   284,   651,  5527],
        [ 5527,  1231,  1972,  9670,    13],
        [   13,   198,   198, 19499, 26761],
        [26761,  2751,   220, 12887, 40818],
        [40818, 14128,   220, 20571, 26761],
        [26761,  2751, 12887, 40818,   198],
        [  198, 23874,  1637,   318,   407]])
Decoded first batch input: PART I
WE
Decoded first batch target:  I
WEALTH

Second batch input IDs: tensor([[  318,   407,   257,  1517],
        [  345,   466,   960,   270],
        [  447,   247,    82,   257],
        [ 5032,   345,  2193,    13],
        [  198,   198,  1270, 14128

In [38]:
vocab_size = 50257
output_dimensions = 256
token_embedding_layer = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=output_dimensions)
print(token_embedding_layer)

max_length = 4
data_loader = create_dataloader_v1(book_raw_text, batch_size=8, max_length=max_length, stride=max_length, shuffle=False)
data_iter = iter(data_loader)
inputs, targets = next(data_iter)
print("Input shape:", inputs)

token_embeddings = token_embedding_layer(inputs)
print("Token embeddings shape:", token_embeddings.shape)



Embedding(50257, 256)
Input shape: tensor([[30709,   314,   198,  8845],
        [40818,   198,  2437,   284],
        [  651,  5527,  1231,  1972],
        [ 9670,    13,   198,   198],
        [19499, 26761,  2751,   220],
        [12887, 40818, 14128,   220],
        [20571, 26761,  2751, 12887],
        [40818,   198, 23874,  1637]])
Token embeddings shape: torch.Size([8, 4, 256])


In [None]:
# representation sample
sample_idx = 3
token_ids = inputs[sample_idx, :4].tolist()
pieces = [tokenizer.decode([tid]) for tid in token_ids]
print("ids:", token_ids)
for k, (tid, piece) in enumerate(zip(token_ids, pieces)):
    print(f"{k}: id={tid}, piece={repr(piece)}")

ids: [9670, 13, 198, 198]
0: id=9670, piece=' lucky'
1: id=13, piece='.'
2: id=198, piece='\n'
3: id=198, piece='\n'


In [41]:
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(num_embeddings=context_length, embedding_dim=output_dimensions)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print("Positional embeddings shape:", pos_embeddings.shape)

input_embeddings = token_embeddings + pos_embeddings
print("Input embeddings shape:", input_embeddings.shape)

Positional embeddings shape: torch.Size([4, 256])
Input embeddings shape: torch.Size([8, 4, 256])
