In [None]:
# Load the whole sample text into memory, then report its length in
# characters and show its first 99 characters.
with open("the-verdict.txt", mode="r", encoding="utf-8") as f:
    raw_text = f.read()

print(f"Total number of character: {len(raw_text)}")
print(raw_text[:99])

In [None]:
# tokenize based on whitespace

import re
text = "Hello, world. This, is a test."
result = re.split(r'(\s)', text)
print(result)

In [None]:
# Include punctuation as tokens
result = re.split(r'([,.]|\s)', text)
print(result)

In [None]:
# Now remove the spaces - note that for some models we'll keep the spaces, as they can be significant, e.g. in Python syntax
result = [item for item in result if item.strip()]
print(result)

In [None]:
# Expand to include additional punctuation, etc.
# The capturing group keeps every separator in the split output; stripping
# each piece and dropping the empty ones removes the whitespace entries.

text = "Hello, world. Is this-- a test?"
result = [
    piece
    for piece in map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', text))
    if piece
]
print(result)

In [None]:
# Tokenize the example text: split on punctuation, "--" and whitespace,
# strip each piece, and keep only the non-empty ones.
preprocessed = list(
    filter(None, map(str.strip, re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)))
)
print(len(preprocessed))

In [None]:
print(preprocessed[:30])

### Building a vocabulary

A vocabulary defines how we map each word and special character to a unique identifier. To build a vocabulary we tokenize the entire dataset, sort it, and remove duplicates.

In [None]:
# Remove duplicates and sort the tokens from our tokenized dataset
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)
print(vocab_size)

In [None]:
# Create the vocabulary: each token maps to its index in the sorted,
# deduplicated token list.
vocab = dict(zip(all_words, range(len(all_words))))

# Preview the first 51 (token, id) pairs.
for item in list(vocab.items())[:51]:
    print(item)

In [None]:
enumerate(all_words)

Note that in addition to mapping tokens to integers, we also want to take the integer output of LLMs and map it back to tokens.

In [None]:
# Simple tokenizer class
class SimpleTokenizerV1:
    """Minimal word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, on ``--`` and on the punctuation characters
    ``, . : ; ? _ ! " ( ) '``. Every non-empty piece must be a key of the
    vocabulary; there is no fallback for unknown tokens.
    """

    def __init__(self, vocab):
        """Store the token -> id mapping and build the inverse id -> token mapping.

        Args:
            vocab: mapping from token string to integer id.
        """
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        Raises:
            KeyError: if any token of ``text`` is not in the vocabulary.
        """
        # ':' and ';' have to be split out as well; otherwise a word followed
        # by one of them (e.g. "said:") stays glued together and cannot be
        # found in a vocabulary whose tokens were split on those characters.
        preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        preprocessed = [
            item.strip() for item in preprocessed if item.strip()
        ]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        """Convert a sequence of token ids back into a string.

        Tokens are joined with single spaces, then the whitespace in front of
        punctuation characters is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])

        # Remove the space that the join inserted before punctuation,
        # including ':' and ';' so they round-trip like the other marks.
        text = re.sub(r'\s+([,.:;?!"()\'])', r'\1', text)
        return text

In [None]:
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," 
       Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

In [None]:
print(tokenizer.decode(ids))

In [None]:
# What if the token is not in the vocab? We get a KeyError. Having a large dataset can mitigate this somewhat...
text = "Hello, do you like tea?"
print(tokenizer.encode(text))

## Special Context Tokens

Modify the tokenizer to handle unknown words, and address the usage and addition of special context tokens that can enhance a model's understanding of context or other relevant information in the text.

Below we'll use <|unk|> to represent unknown words, and <|endoftext|> to signal that the text following the token starts a new text.

In [None]:
# Add the special tokens. They are appended after sorting, so they come
# last in the token list.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token: integer for integer, token in enumerate(all_tokens)}

print(len(vocab))

In [None]:
# Updated tokenizer that can deal with unknown words
class SimpleTokenizerV2:
    """Word-level tokenizer that substitutes ``<|unk|>`` for unknown tokens."""

    def __init__(self, vocab):
        """Keep the token -> id mapping and derive the id -> token mapping from it."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Convert ``text`` into a list of token ids.

        The text is split on whitespace, ``--`` and punctuation. Pieces that
        are empty after stripping are dropped, and any piece missing from the
        vocabulary is replaced by ``<|unk|>`` before the id lookup.

        Raises:
            KeyError: if an unknown token occurs and the vocabulary has no
                ``<|unk|>`` entry.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = piece.strip()
            if not token:
                continue
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """Convert token ids back into a string.

        Tokens are joined with single spaces, after which the whitespace in
        front of punctuation characters is removed.

        Raises:
            KeyError: if an id is not in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [None]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
print(tokenizer.encode(text))

In [None]:
print(tokenizer.decode(tokenizer.encode(text)))

## Byte Pair Encoding Tokenizer

BPE tokenizers break down unknown words into subwords and individual characters. This way, a BPE tokenizer can parse any word and doesn’t need to replace unknown words with special tokens, such as <|unk|>.

Both GPT-2 and GPT-3 used this tokenizer algorithm.

Implementation library - [tiktoken](https://github.com/openai/tiktoken)

In [None]:
from importlib.metadata import version
import tiktoken
print("tiktoken version:", version("tiktoken"))

In [None]:
tokenizer = tiktoken.get_encoding("gpt2")

In [None]:
# Adjacent string literals are concatenated exactly as written, so the first
# piece ends with a space to keep "terraces" and "of" as separate words.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace."
)
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
strings = tokenizer.decode(integers)
print(strings)

## Data Sampling with Sliding Window

The next step in creating the embeddings for the LLM is to generate the input–target pairs required for training an LLM. What do these input–target pairs look like? As we already learned, LLMs are pretrained by predicting the next word in a text.

Consider a text sample "LLMs learn to predict one word at a time".

Pairings for training using sliding windows are \[LLMs\]\[learn\],  \[LLMs learn\]\[to\],  \[LLMs learn to\]\[predict\], and so on.





In [None]:
# Tokenize the input using the BPE tokenizer
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))

In [None]:
# Remove the first 50 tokens from the dataset - doing so makes for a better demo
enc_sample = enc_text[50:]

In [None]:
# To create the input–target pairs for the next-word prediction task, create two variables, x and y,
# where x contains the input tokens and y contains the targets, which are the inputs shifted by 1:
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x: {x}")
print(f"y:      {y}")

In [None]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "---->", desired)

In [None]:
# Same thing, except decode the token ids
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokenizer.decode(context), "---->", tokenizer.decode([desired]))