# read the book

In [1]:
with open('the-verdict.txt', 'r') as f:
    book_text = f.read()

In [2]:
print("Total length = {}".format(len(book_text)))
print("Sample = '{}'".format(book_text[:30]))

Total length = 20479
Sample = 'I HAD always thought Jack Gisb'


In [3]:
import re
text = "Hello, (world). Is_ th;is-- a te:st?"

In [4]:
parts = re.split(r'([,.\?"_:;\(\)\']|\-\-|\s)', text)

In [5]:
tokens = [tk for tk in parts if tk.strip()]
print(tokens)

['Hello', ',', '(', 'world', ')', '.', 'Is', '_', 'th', ';', 'is', '--', 'a', 'te', ':', 'st', '?']


In [6]:
parts = re.split(r'([,.\?"_:;\(\)\'!]|\-\-|\s)', book_text)
preprocessed = [tk for tk in parts if tk.strip()]

In [7]:
print("Total tokens = {}".format(len(preprocessed)))
print("Sample = '{}'".format(preprocessed[:30]))

Total tokens = 4690
Sample = '['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']'


In [8]:
unique_tks = sorted(set(preprocessed))
vocab = {tk: idx for idx, tk in enumerate(unique_tks)}

In [9]:
print(len(vocab))
print(list(vocab.items())[:20])

1130
[('!', 0), ('"', 1), ("'", 2), ('(', 3), (')', 4), (',', 5), ('--', 6), ('.', 7), (':', 8), (';', 9), ('?', 10), ('A', 11), ('Ah', 12), ('Among', 13), ('And', 14), ('Are', 15), ('Arrt', 16), ('As', 17), ('At', 18), ('Be', 19)]


In [10]:
class SimpleTokenizerV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}

    def encode(self, passage):
        parts = re.split(r'([,.\?"_:;\(\)\'!]|\-\-|\s)', passage)
        preprocessed = [tk for tk in parts if tk.strip()]
        return [self.str_to_int[tk] for tk in preprocessed]

    def decode(self, tokens):
        processed = " ".join(self.int_to_str[tk] for tk in tokens)
        r1 = re.sub(r'\s+([,.:;_\)"\']|--)', r'\1', processed)
        # for ( & -- we should remove the suffix spaces
        return re.sub(r'([\(]|--)\s+', r'\1', r1)

In [11]:
# test tokenizer
text = book_text[:297]
print("text = {}".format(text))

text = I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would


In [12]:
tokenizer = SimpleTokenizerV1(vocab)
encoded = tokenizer.encode(text)
print("encoded = {}".format(encoded))
decoded = tokenizer.decode(encoded)
print("decoded = '{}'".format(decoded))

encoded = [53, 44, 149, 1003, 57, 38, 818, 115, 256, 486, 6, 1002, 115, 500, 435, 392, 6, 908, 585, 1077, 709, 508, 961, 1016, 663, 1016, 535, 987, 5, 568, 988, 538, 722, 549, 496, 5, 533, 514, 370, 549, 748, 5, 661, 115, 841, 1102, 5, 157, 397, 547, 568, 115, 1066, 727, 988, 84, 7, 3, 99, 53, 818, 1003, 585, 1120]
decoded = 'I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would'


In [13]:
text == decoded

True

# Support non trained words and end of content tokens

In [14]:
UNKNOWN_TOKEN = '<|unk|>'
END_OF_TEXT_TOKEN = '<|endoftext|>'

In [15]:
unique_tks.extend([UNKNOWN_TOKEN, END_OF_TEXT_TOKEN])
len(unique_tks)

1132

In [16]:
vocab = {tk: idx for idx, tk in enumerate(unique_tks)}

In [17]:
class SimpleTokenizerV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        self.UNKNOWN_TOKEN = vocab[UNKNOWN_TOKEN]   # Change from V1

    def encode(self, passage):
        parts = re.split(r'([,.\?"_:;\(\)\'!]|\-\-|\s)', passage)
        preprocessed = [tk for tk in parts if tk.strip()]
        return [self.str_to_int.get(tk, self.UNKNOWN_TOKEN) for tk in preprocessed]   # Change from V1

    def decode(self, tokens):
        processed = " ".join(self.int_to_str[tk] for tk in tokens)
        r1 = re.sub(r'\s+([,.:;_\)"\']|--)', r'\1', processed)
        # for ( & -- we should remove the suffix spaces
        return re.sub(r'([\(]|--)\s+', r'\1', r1)

In [18]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " {} ".format(END_OF_TEXT_TOKEN).join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [19]:
print([vocab[UNKNOWN_TOKEN], vocab[END_OF_TEXT_TOKEN]])

[1130, 1131]


In [20]:
tokenizer2 = SimpleTokenizerV2(vocab)

In [21]:
encoded = tokenizer2.encode(text)
print("encoded = {}".format(encoded))
decoded = tokenizer2.decode(encoded)
print("decoded = '{}'".format(decoded))

encoded = [1130, 5, 355, 1126, 628, 975, 10, 1131, 55, 988, 956, 984, 722, 988, 1130, 7]
decoded = '<|unk|>, do you like tea ? <|endoftext|> In the sunlit terraces of the <|unk|>.'


In [22]:
decoded == text

False

# Use external Byte-pair tokenizer

In [23]:
import tiktoken
tiktoken.__version__

'0.12.0'

In [24]:
tokenizer = tiktoken.get_encoding("gpt2")

In [25]:
text = (
 "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
 "of someunknownPlace."
)
encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print("encoded = {}".format(encoded))
decoded = tokenizer.decode(encoded)
print("decoded = '{}'".format(decoded))

encoded = [15496, 11, 466, 345, 588, 8887, 30, 220, 50256, 554, 262, 4252, 18250, 8812, 2114, 1659, 617, 34680, 27271, 13]
decoded = 'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'


In [26]:
text == decoded

True

In [27]:
example_encoding = tokenizer.encode("Akwirw ier")  # This tokenizer works with anything
print(example_encoding)
print([tokenizer.decode([x]) for x in example_encoding])

[33901, 86, 343, 86, 220, 959]
['Ak', 'w', 'ir', 'w', ' ', 'ier']


# Sliding window sampling

In [28]:
book_encoded = tokenizer.encode(book_text)
len(book_encoded)

5145

In [29]:
sample_enc = book_encoded[50:]

In [30]:
context_size = 4
for i in range(context_size):
    print(tokenizer.decode(sample_enc[:(i+1)]), " ---> ", tokenizer.decode([sample_enc[i+1]]))

 and  --->   established
 and established  --->   himself
 and established himself  --->   in
 and established himself in  --->   a


In [31]:
import torch
torch.__version__

'2.10.0+rocm7.1'

In [32]:
torch.cuda.is_available()

/opt/amdgpu/share/libdrm/amdgpu.ids: No such file or directory


True

In [33]:
from torch.utils.data import Dataset, DataLoader

In [39]:
class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokenizer, window_length, stride):
        tokens = tokenizer.encode(txt)
        self.input_tokens = []
        self.output_tokens = []
        for i in range(0, len(tokens) - window_length, stride):
            self.input_tokens.append(torch.tensor(tokens[i:(i+window_length)]))
            self.output_tokens.append(torch.tensor(tokens[(i+1):(i+1+window_length)]))

    def __len__(self):
        return len(self.input_tokens)

    def __getitem__(self, idx):
        return self.input_tokens[idx], self.output_tokens[idx]

In [40]:
# Only function directly copied from the book
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
 tokenizer = tiktoken.get_encoding("gpt2")
 dataset = GPTDatasetV1(txt, tokenizer, max_length, stride)
 dataloader = DataLoader(
     dataset,
     batch_size=batch_size,
     shuffle=shuffle,
     drop_last=drop_last,
     num_workers=num_workers
     )
 return dataloader

In [45]:
dataloader = create_dataloader_v1(book_text, batch_size=2, max_length=4, stride=1, shuffle=False)
data_iter = iter(dataloader)

In [46]:
print("Batch 1: ", next(data_iter))
print("Batch 2: ", next(data_iter))

Batch 1:  [tensor([[  40,  367, 2885, 1464],
        [ 367, 2885, 1464, 1807]]), tensor([[ 367, 2885, 1464, 1807],
        [2885, 1464, 1807, 3619]])]
Batch 2:  [tensor([[2885, 1464, 1807, 3619],
        [1464, 1807, 3619,  402]]), tensor([[1464, 1807, 3619,  402],
        [1807, 3619,  402,  271]])]


# Token embeddings

In [48]:
torch.manual_seed(123)
emb = torch.nn.Embedding(6,3)
print(emb.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [51]:
token_ids = torch.tensor([2, 4, 0])
embedding_vector = emb(token_ids)
print(embedding_vector)

tensor([[ 1.2753, -0.2010, -0.1606],
        [-1.1589,  0.3255, -0.6315],
        [ 0.3374, -0.1778, -0.1690]], grad_fn=<EmbeddingBackward0>)


# Encoding word positions

In [52]:
vocab_size = 50257
output_dim = 256
emb = torch.nn.Embedding(vocab_size, output_dim)

In [54]:
max_length = 4
dataloader = create_dataloader_v1(
 book_text, batch_size=8, max_length=max_length,
 stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [57]:
embeddings = emb(inputs)
print(embeddings.shape) # each token is 256-d vector

torch.Size([8, 4, 256])


In [63]:
ctx_len = max_length
positional_embedding_layer = torch.nn.Embedding(ctx_len, output_dim)

In [69]:
torch.arange(ctx_len)

tensor([0, 1, 2, 3])

In [65]:
position_embs = positional_embedding_layer(torch.arange(ctx_len))

In [66]:
position_embs

tensor([[-0.6451, -0.6397,  0.3619,  ...,  0.4454,  0.4887,  0.5341],
        [ 0.0851, -0.3358, -0.3749,  ...,  1.2503,  0.2494,  1.2698],
        [ 1.4413,  0.1969,  0.0874,  ..., -0.6001, -0.0722,  0.4005],
        [-1.5235,  0.8094,  0.4816,  ..., -0.2414, -1.5252,  1.0934]],
       grad_fn=<EmbeddingBackward0>)

In [73]:
position_embs.shape

torch.Size([4, 256])

In [67]:
positional_embedding_layer

Embedding(4, 256)

In [68]:
positional_embedding_layer.weight

Parameter containing:
tensor([[-0.6451, -0.6397,  0.3619,  ...,  0.4454,  0.4887,  0.5341],
        [ 0.0851, -0.3358, -0.3749,  ...,  1.2503,  0.2494,  1.2698],
        [ 1.4413,  0.1969,  0.0874,  ..., -0.6001, -0.0722,  0.4005],
        [-1.5235,  0.8094,  0.4816,  ..., -0.2414, -1.5252,  1.0934]],
       requires_grad=True)

In [71]:
input_embeddings = embeddings + position_embs

In [72]:
print(input_embeddings.shape)

torch.Size([8, 4, 256])
