In [1]:
# imports
import logging
from typing import List
import re
import os

In [2]:
# Location of the training text; it is downloaded only when it is not on disk yet.
data_fp = "data/the-verdict.txt"
do_download_data = not os.path.exists(data_fp)

# Without any handler, INFO records only reach logging's last-resort handler,
# which emits WARNING and above - so every l.info(...) in this notebook was
# silently dropped. basicConfig() installs a root stream handler (it is a no-op
# when the root logger is already configured). The root level is left untouched,
# so only this notebook's logger is lowered to INFO.
logging.basicConfig()
l = logging.getLogger(__name__)
l.setLevel(logging.INFO)
l.info(f"{do_download_data=}")

In [25]:
def download_data(url:str="https://raw.githubusercontent.com/rasbt/LLMs-from-scratch/main/ch02/01_main-chapter-code/the-verdict.txt",
                  file_path:str="the-verdict.txt"):
  """Download the file at `url` and store it at `file_path`.

  Args:
    url: Address of the file to fetch.
    file_path: Destination on disk. Missing parent directories are created.

  Returns:
    None. Errors raised by `os.makedirs` or `urllib.request.urlretrieve`
    propagate unchanged.
  """
  import urllib.request

  # urlretrieve does not create directories, so a destination such as
  # "data/the-verdict.txt" failed whenever "data/" did not exist yet.
  parent_dir = os.path.dirname(file_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  l.info(f"Downloading Data to {file_path=}")
  urllib.request.urlretrieve(url, file_path)

# fetch the text only when it is not already on disk
if do_download_data:
  download_data(file_path=data_fp)

# let's have a look at the data
# Read-only mode is sufficient ("r+" additionally requested write access), and an
# explicit encoding keeps the result independent of the platform default.
with open(data_fp, "r", encoding="utf-8") as f:
  all_text = f.readlines()

# all_text is a list of lines; peek at the first one unless the file is empty
if all_text:
  l.info(all_text[0])

## Tokenizing our text

In [4]:
# Create the tokenizer
class SimpleTokenizerV1:
  """Word-level tokenizer backed by a fixed vocabulary.

  Text is split on whitespace, "--" and the punctuation characters
  , . : ; ? ! " ( ) ' - the same pattern that is used to build the vocabulary
  from the sample text, so every token that can end up in the vocabulary can
  also be produced by `encode`.

  Args:
    vocab: Mapping from token string to integer id.
  """

  # Pattern used to split text into tokens (the separators are kept as tokens).
  _SPLIT_PATTERN = r'([,.:;?!"()\']|--|\s)'
  # Pattern used by decode() to remove the space in front of punctuation.
  _JOIN_PATTERN = r'\s+([,.:;?!"()\'])'

  def __init__(self, vocab) -> None:
    self.str_to_int = vocab
    self.int_to_str = {i:s for s, i in vocab.items()}

  def encode(self, text:str) -> List[int]:
    """Convert `text` into a list of token ids.

    Raises:
      KeyError: If a token of `text` is not part of the vocabulary.
    """
    pieces = re.split(self._SPLIT_PATTERN, text)
    # drop the empty strings and pure-whitespace pieces produced by the split
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [self.str_to_int[token] for token in tokens]

  def decode(self, ids:List[int]) -> str:
    """Convert token ids back into text.

    Tokens are joined with single spaces; the space in front of punctuation
    is removed again afterwards.

    Raises:
      KeyError: If an id is not part of the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(self._JOIN_PATTERN, r'\1', text)

In [5]:
# build a toy vocabulary from a sample sentence to try the tokenizer on
text = "Hello, world. Is this-- a test?"
# split on punctuation / "--" / whitespace and keep only the non-empty,
# stripped pieces
preprocessed = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?!"()\']|--|\s)', text))
    if piece
]
# unique tokens in sorted order; the position in that order becomes the token id
all_words = sorted({*preprocessed})
vocab = dict(zip(all_words, range(len(all_words))))
print(vocab)

{',': 0, '--': 1, '.': 2, '?': 3, 'Hello': 4, 'Is': 5, 'a': 6, 'test': 7, 'this': 8, 'world': 9}


In [6]:
# instantiate the V1 tokenizer with the toy vocabulary built in the previous cell
tokenizer = SimpleTokenizerV1(vocab)

In [7]:
# round trip: every word of this sentence is part of the toy vocabulary,
# so encode() followed by decode() reproduces the sentence
test_text = "Hello test. Is this a world?"
ids = tokenizer.encode(test_text)
l.info(ids)
print(tokenizer.decode(ids))

Hello test. Is this a world?


In [8]:
# now if we ask for an unknown word - we will get an error
# "Caroline" is not in the vocabulary, so the dictionary lookup inside encode()
# raises a KeyError. Only that expected error is caught here, so that any other
# failure still surfaces instead of being logged away.
test_text = "Hello Caroline. Is this a world?"
try:
  ids = tokenizer.encode(test_text)
except KeyError as e:
  l.info(f"{e=}")

In [9]:
# we can remedy this by adding the unknown token to the vocabulary. other
# special tokens handle the concatenation of different corpora, and so on
# Build a new list instead of extending all_words in place: `all_tokens = all_words`
# only aliased the list, so re-running this cell appended the special tokens a
# second time and shifted their ids.
all_tokens = [*all_words, "<|unk|>", "<|endoftext|>"]
vocab = {t:idx for idx, t in enumerate(all_tokens)}
print(vocab)

{',': 0, '--': 1, '.': 2, '?': 3, 'Hello': 4, 'Is': 5, 'a': 6, 'test': 7, 'this': 8, 'world': 9, '<|unk|>': 10, '<|endoftext|>': 11}


In [10]:
# equipped with this extended vocabulary, we can build an updated tokenizer that
# accounts for the special tokens
class SimpleTokenizerV2:
  """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

  Text is split on whitespace, "--" and the punctuation characters
  , . : ; ? ! " ( ) ' - the same pattern that is used to build the vocabulary
  from the sample text, so punctuation such as ";" is tokenized on its own
  instead of being glued to the preceding word (and turned into "<|unk|>").

  Args:
    vocab: Mapping from token string to integer id. It has to contain
      "<|unk|>" for unknown tokens to be encodable.
  """

  # Pattern used to split text into tokens (the separators are kept as tokens).
  _SPLIT_PATTERN = r'([,.:;?!"()\']|--|\s)'
  # Pattern used by decode() to remove the space in front of punctuation.
  _JOIN_PATTERN = r'\s+([,.:;?!"()\'])'

  def __init__(self, vocab) -> None:
    self.str_to_int = vocab
    self.int_to_str = {i:s for s, i in vocab.items()}

  def encode(self, text:str) -> List[int]:
    """Convert `text` into token ids, using the "<|unk|>" id for unknown tokens.

    Raises:
      KeyError: If an unknown token occurs and "<|unk|>" is not in the vocabulary.
    """
    pieces = re.split(self._SPLIT_PATTERN, text)
    # drop the empty strings and pure-whitespace pieces produced by the split
    tokens = [piece.strip() for piece in pieces if piece.strip()]
    return [
        self.str_to_int[token if token in self.str_to_int else "<|unk|>"]
        for token in tokens
    ]

  def decode(self, ids:List[int]) -> str:
    """Convert token ids back into text.

    Tokens are joined with single spaces; the space in front of punctuation
    is removed again afterwards.

    Raises:
      KeyError: If an id is not part of the vocabulary.
    """
    text = " ".join(self.int_to_str[token_id] for token_id in ids)
    return re.sub(self._JOIN_PATTERN, r'\1', text)

In [11]:
# now test that indeed we get the unknown token
# SimpleTokenizerV2 replaces unknown words by "<|unk|>" instead of raising, so
# the call is no longer wrapped in try/except: a swallowed error would let the
# log lines below report the stale `ids` of an earlier cell.
tokenizer = SimpleTokenizerV2(vocab)
test_text = "Hello Caroline. Is this a world?"
ids = tokenizer.encode(test_text)
l.info(ids)
l.info(tokenizer.decode(ids))

In [13]:
# install step, left commented out so that it is not executed with the cell
# !pip install tiktoken
import tiktoken
# NOTE: from here on `tokenizer` refers to tiktoken's "gpt2" encoding and no
# longer to the SimpleTokenizerV2 instance created above
tokenizer = tiktoken.get_encoding("gpt2")

Collecting tiktoken
  Downloading tiktoken-0.8.0-cp311-cp311-win_amd64.whl (884 kB)
     ------------------------------------- 884.5/884.5 kB 18.6 MB/s eta 0:00:00
Collecting regex>=2022.1.18
  Downloading regex-2024.11.6-cp311-cp311-win_amd64.whl (274 kB)
     ---------------------------------------- 274.1/274.1 kB ? eta 0:00:00
Collecting requests>=2.26.0
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
     ---------------------------------------- 64.9/64.9 kB 3.6 MB/s eta 0:00:00
Collecting charset-normalizer<4,>=2
  Downloading charset_normalizer-3.4.0-cp311-cp311-win_amd64.whl (101 kB)
     -------------------------------------- 101.8/101.8 kB 5.7 MB/s eta 0:00:00
Collecting idna<4,>=2.5
  Downloading idna-3.10-py3-none-any.whl (70 kB)
     ---------------------------------------- 70.4/70.4 kB ? eta 0:00:00
Collecting urllib3<3,>=1.21.1
  Downloading urllib3-2.2.3-py3-none-any.whl (126 kB)
     -------------------------------------- 126.3/126.3 kB 7.3 MB/s eta 0:00:00
Colle


[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
# let's see how this is different
# encode the sentence stored in test_text with the current `tokenizer`
# and decode the resulting ids back into text
tokens = tokenizer.encode(test_text)
l.info(f"{tokens=}")
l.info(tokenizer.decode(tokens))

# Implementing a Dataloader

In [15]:
# remember we have the data in all_text
# all_text is the list returned by readlines(); joining it with "" restores
# the text as one string
the_text = "".join(all_text)
encoded_text = tokenizer.encode(the_text)
# number of token ids the whole text was encoded into
print(len(encoded_text))

5145


In [16]:
# next-token prediction pairs a prefix of the token sequence (the input) with
# the token that directly follows it (the target).
# for demonstration purposes, log these pairs for prefixes of length
# 1 .. context_size
context_size = 4
for i in range(1, context_size + 1):
  # prefix of i tokens and the single token that comes right after it
  x, y = encoded_text[:i], encoded_text[i]
  # decode both sides so that the log shows text instead of ids
  xhat, yhat = tokenizer.decode(x), tokenizer.decode([y])
  l.info(f"{xhat=} --> {yhat=}")

In [17]:
# now implement a torch data loader
!pip install torch
import torch
from torch.utils.data import Dataset, DataLoader


[notice] A new release of pip available: 22.3 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting torch
  Downloading torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
     -------------------------------------- 203.1/203.1 MB 3.1 MB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.16.1-py3-none-any.whl (16 kB)
Collecting networkx
  Downloading networkx-3.4.2-py3-none-any.whl (1.7 MB)
     ---------------------------------------- 1.7/1.7 MB 12.2 MB/s eta 0:00:00
Collecting jinja2
  Using cached jinja2-3.1.4-py3-none-any.whl (133 kB)
Collecting fsspec
  Downloading fsspec-2024.10.0-py3-none-any.whl (179 kB)
     ------------------------------------- 179.6/179.6 kB 11.3 MB/s eta 0:00:00
Collecting sympy==1.13.1
  Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
     ---------------------------------------- 6.2/6.2 MB 18.0 MB/s eta 0:00:00
Collecting mpmath<1.4,>=1.1.0
  Using cached mpmath-1.3.0-py3-none-any.whl (536 kB)
Collecting MarkupSafe>=2.0
  Downloading MarkupSafe-3.0.2-cp311-cp311-win_amd64.whl (15 kB)
Installing collected packages: mpmath, sympy, netwo

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [18]:
class GPTDatasetV1(Dataset):
  """Sliding-window dataset of (input, target) token-id tensors.

  The text is encoded once with `tokenizer.encode`. Every window holds
  `max_length` consecutive token ids; its target is the same window moved one
  token to the right. Window start positions advance by `stride`, and only
  windows whose target fits completely into the encoded text are kept.

  Args:
    txt: Text to encode.
    tokenizer: Object with an `encode(txt)` method returning a list of token ids.
    max_length: Number of tokens per window.
    stride: Distance between the start positions of consecutive windows.
  """

  def __init__(self, txt, tokenizer, max_length, stride) -> None:
    token_ids = tokenizer.encode(txt)
    # stopping at len - max_length leaves room for the shifted target window
    window_starts = range(0, len(token_ids) - max_length, stride)

    self.input_ids = [
        torch.tensor(token_ids[start:start + max_length])
        for start in window_starts
    ]
    self.target_ids = [
        torch.tensor(token_ids[start + 1:start + max_length + 1])
        for start in window_starts
    ]

  def __len__(self):
    """Number of (input, target) windows."""
    return len(self.input_ids)

  def __getitem__(self, idx):
    """Return the (input, target) tensor pair at position `idx`."""
    return self.input_ids[idx], self.target_ids[idx]

In [19]:
# now create a data loader for this data set
def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128,
                         shuffle=True, drop_last=True,
                         num_workers=0):
  """Build a DataLoader over the sliding-window dataset of `txt`.

  The text is tokenized with tiktoken's "gpt2" encoding and wrapped in a
  GPTDatasetV1 using `max_length` and `stride`; the remaining arguments are
  passed on to the DataLoader.

  Returns:
    The DataLoader yielding (input, target) batches.
  """
  loader_options = dict(
      batch_size=batch_size,
      shuffle=shuffle,
      drop_last=drop_last,
      num_workers=num_workers,
  )
  gpt2_encoding = tiktoken.get_encoding("gpt2")
  windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
  return DataLoader(windows, **loader_options)

In [20]:
# so let's load
# batch_size=1 with max_length=4 and stride=1 gives one 4-token window per
# batch, each window starting one token after the previous one; shuffle=False
# keeps the windows in text order
dl = create_dataloader_v1(the_text, batch_size=1, max_length=4, stride=1, shuffle=False)
# keep the iterator around: a later cell pulls the next batch from it
data_iter = iter(dl)
first_batch = next(data_iter)
l.info(first_batch)

# Embed the tokenized data

In [21]:
# the embedding weights are initialised randomly; fix the seed so that a
# re-run of the notebook reproduces the same numbers
torch.manual_seed(123)

# tokens are embedded using a token embedding.
vocab_size = 50257 # from BPE
output_dim = 256   # embedding size
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# to preserve the position a positional embedding - relative or absolute
# is used. here we use an absolute position encoding scheme
max_length = 4
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
# look up the embedding of every position index 0, 1, .., context_length - 1;
# the result has one row of size output_dim per position
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
l.info(pos_embeddings.shape)

In [22]:
# to test the embedding, take the next (input, target) batch from the
# iterator created over the data loader above
x, y = next(data_iter)
l.info(f"{x=} => {y=}")

In [23]:
# and embed: look up the embedding vector of every token id in x
token_embed = token_embedding_layer(x)

In [24]:
# the positional embeddings were looked up for the indices 0 .. context_length - 1
# and do not depend on the batch - only the token embedding changes with the input.
# adding them assumes x holds context_length tokens per sample, so that the
# shapes broadcast - true for the loader above (max_length=4)
x_embed = token_embed + pos_embeddings
l.info(x_embed)