In [22]:
# Load the raw training text. The encoding is given explicitly so decoding
# does not depend on the platform's default locale encoding.
with open("the-verdict.txt", "r", encoding="utf-8") as file:
    raw_content = file.read()

In [23]:
import re

def preprocess_text(text):
    """Split text into word and punctuation tokens, discarding whitespace.

    The text is split on whitespace, on the double dash ``--`` and on each of
    the punctuation characters matched by the pattern below. Punctuation and
    ``--`` are kept as their own tokens; empty and whitespace-only pieces are
    dropped.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty token strings in their original order.
    """
    # The capturing group makes re.split keep the delimiters in the result.
    # The whitespace class already covers newlines, so no separate newline
    # alternative is needed.
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', text)
    return [piece.strip() for piece in pieces if piece.strip()]

In [24]:
# Preprocess the raw content.
# `preprocessed` is the full token list; later cells build the vocabulary from it.
preprocessed = preprocess_text(raw_content)
preprocessed[:10]  # Display first 10 tokens for verification

['I',
 'HAD',
 'always',
 'thought',
 'Jack',
 'Gisburn',
 'rather',
 'a',
 'cheap',
 'genius']

In [25]:
def create_vocab(preprocessed):
    """Build a token-to-id vocabulary from a collection of tokens.

    Unique tokens are sorted and numbered from 0. The special tokens
    ``<|endoftext|>`` and ``<|unk|>`` are always placed last, in that order.

    Args:
        preprocessed: Iterable of token strings.

    Returns:
        A dict mapping each token to a unique integer id. Ids are contiguous,
        running from 0 to ``len(vocab) - 1``.
    """
    special_tokens = ['<|endoftext|>', '<|unk|>']
    # Remove special tokens that already occur in the input before appending
    # them; otherwise they would be listed twice and the dict would skip an id.
    ordinary_tokens = sorted(set(preprocessed).difference(special_tokens))
    return {token: idx for idx, token in enumerate(ordinary_tokens + special_tokens)}

In [26]:
# Build the vocabulary from the preprocessed tokens and show its last ten
# entries (token followed by id).
vocab = create_vocab(preprocessed)
last_entries = list(vocab.items())[-10:]
for k, v in last_entries:
    print(k, v)

year 1122
years 1123
yellow 1124
yet 1125
you 1126
younger 1127
your 1128
yourself 1129
<|endoftext|> 1130
<|unk|> 1131


In [83]:
# Define a simple tokenizer.
class SimpleTokenizer:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split with ``preprocess_text`` into words and punctuation marks
    (whitespace is dropped). Tokens missing from the vocabulary are encoded
    with the id of ``<|unk|>``.
    """

    def __init__(self, vocab):
        """Store the vocabulary and derive the reverse id-to-token mapping.

        Args:
            vocab: Dict mapping token strings to integer ids.
        """
        self.text_to_int = vocab
        self.int_to_text = {token_id: token for token, token_id in vocab.items()}

    def encode(self, text):
        """Convert text into a list of token ids.

        Args:
            text: The string to tokenize and encode.

        Returns:
            A list with one id per token; tokens that are not in the
            vocabulary get the id of ``<|unk|>``.
        """
        ids = []
        for token in preprocess_text(text):
            if token not in self.text_to_int:
                # Out-of-vocabulary tokens fall back to the unknown marker.
                token = '<|unk|>'
            ids.append(self.text_to_int[token])
        return ids

    def decode(self, tokens):
        """Convert a sequence of token ids back into text.

        Args:
            tokens: Iterable of integer ids present in the vocabulary.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of punctuation marks and ``--`` removed.
        """
        joined = " ".join(self.int_to_text[token_id] for token_id in tokens)
        return re.sub(r'\s+([,.:;?_!"()\']|--)', r'\1', joined)

In [84]:
# Init. tokenizer.
tokenizerV1 = SimpleTokenizer(vocab)

# Sample use of the tokenizer: encode a string once, then decode those ids.
text = "this Hello world <|endoftext|> is!! was--"
encoded = tokenizerV1.encode(text)
print(encoded)  # Example encoding.
print(tokenizerV1.decode(encoded))  # Example decoding.
del text, encoded

[999, 1131, 1131, 1130, 584, 0, 0, 1077, 6]
this <|unk|> <|unk|> <|endoftext|> is!! was--


## Using Byte Pair Encoding

- [x] Create GPT dataset (txt, tokenizer, max_length, stride)
- [x] Create dataloader using dataset (txt, batch_size, max_length, stride, shuffle, drop_last, num_workers)

In [85]:
# Create dataset.

from torch.utils.data import Dataset
import torch

class DatasetGPTV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once with ``tokenizer.encode``. Each sample is a
    window of ``max_length`` ids, paired with the same window shifted one
    position to the right. Consecutive windows start ``stride`` ids apart.
    """

    def __init__(self, txt: str, tokenizer, max_length: int, stride: int):
        """Tokenize ``txt`` and build all input/target windows.

        Args:
            txt: Text to encode.
            tokenizer: Object whose ``encode(txt)`` returns a sequence of ids.
            max_length: Number of ids per window.
            stride: Offset between the starts of consecutive windows.
        """
        token_ids = tokenizer.encode(txt)
        # The range stops before len - max_length so the shifted target
        # window never runs past the end of the id sequence.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input, target)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [86]:
# Sanity check: show the first (input, target) pair built with the word-level
# tokenizer, using windows of 3 ids whose starts are 2 ids apart.
DatasetGPTV1(raw_content, SimpleTokenizer(vocab), max_length=3, stride=2)[0]

(tensor([ 53,  44, 149]), tensor([  44,  149, 1003]))

In [96]:
from torch.utils.data import DataLoader
import tiktoken

def create_dataloader_v1(txt, batch_size=4, max_length=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    """Build a DataLoader over sliding windows of token ids.

    The text is tokenized with the tiktoken encoding for the "gpt2" model and
    wrapped in a ``DatasetGPTV1``.

    Args:
        txt: Text to tokenize.
        batch_size: Number of samples per batch.
        max_length: Number of ids per window, forwarded to ``DatasetGPTV1``.
        stride: Offset between window starts, forwarded to ``DatasetGPTV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping the dataset built from ``txt``.
    """
    bpe_tokenizer = tiktoken.encoding_for_model("gpt2")
    return DataLoader(
        DatasetGPTV1(txt, bpe_tokenizer, max_length, stride),
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [97]:
# Fetch the first batch without shuffling. stride equals max_length here, so
# consecutive windows do not overlap.
dataloader = create_dataloader_v1( raw_content, batch_size=8, max_length=4, stride=4, shuffle=False )
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
# Each row of `targets` is the matching row of `inputs` shifted by one token.
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   40,   367,  2885,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  198, 11274,  5891,  1576],
        [  438,   568,   340,   373],
        [  645,  1049,  5975,   284],
        [  502,   284,  3285,   326]])

Targets:
 tensor([[  367,  2885,  1464,  1807],
        [ 3619,   402,   271, 10899],
        [ 2138,   257,  7026, 15632],
        [  438,  2016,   257,   198],
        [11274,  5891,  1576,   438],
        [  568,   340,   373,   645],
        [ 1049,  5975,   284,   502],
        [  284,  3285,   326,    11]])


## Create Token Embedding Layers

- [x] Create token embeddings (torch.nn.Embedding)
- [x] Create positional embeddings (same)

In [107]:
# Create token embeddings.
vocab_size = 50257 # Number of rows in the embedding table (size of the BPE vocabulary).
output_dim = 256   # Output embedding size.
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
# Look up one embedding vector per token id in the `inputs` batch.
token_embedding = token_embedding_layer(inputs)
print(token_embedding.shape)

torch.Size([8, 4, 256])


In [108]:
# Create positional embeddings (absolute position).
context_length = max_length = 4
# Size the table from context_length rather than a literal, so the layer
# always has one row for every position index requested below.
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embedding = pos_embedding_layer(torch.arange(context_length))
print(pos_embedding.shape)

torch.Size([4, 256])


In [109]:
# Make a single input embedding as the sum.
# pos_embedding has one fewer dimension than token_embedding, so the addition
# broadcasts it across the leading (batch) dimension.
input_embeddings = token_embedding + pos_embedding
print(input_embeddings.shape)

torch.Size([8, 4, 256])
