In [2]:
import os 
import torch
import tiktoken
import re

  cpu = _conversion_method_template(device=torch.device("cpu"))


## 1. Tokenizing text

In [3]:
# Folder whose .txt files are read into raw_text_dict by the next cell.
data_dir = "./porsche_text"

In [4]:
# Load every .txt file in data_dir into a dict keyed by the file name without
# its extension (e.g. "origin.txt" -> "origin").
raw_text_dict = {}
for file in sorted(os.listdir(data_dir)):  # sorted: listdir order is arbitrary
    if not file.endswith(".txt"):
        continue
    # splitext strips only the final extension, so "a.b.txt" is keyed "a.b"
    # instead of being truncated to "a" (which could collide with "a.txt").
    filename = os.path.splitext(file)[0]
    with open(os.path.join(data_dir, file), "r", encoding="utf-8") as f:
        raw_text_dict[filename] = f.read()

# Start by exploring the "origin" document
raw_text = raw_text_dict["origin"]
print("Total number of character in origin.txt:", len(raw_text))
print(raw_text[:99])

Total number of character in origin.txt: 17938
Dr. Ing. h.c. F. Porsche AG, commonly known as Porsche,[a] is a German automobile manufacturer spec


In [5]:
# Split on punctuation, "--" and whitespace. The capture group makes re.split
# keep the delimiters; blank or whitespace-only pieces are then discarded.
tokens = [
    piece.strip()
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(tokens[:30])

['Dr', '.', 'Ing', '.', 'h', '.', 'c', '.', 'F', '.', 'Porsche', 'AG', ',', 'commonly', 'known', 'as', 'Porsche', ',', '[a]', 'is', 'a', 'German', 'automobile', 'manufacturer', 'specializing', 'in', 'luxury', ',', 'high-performance', 'sports']


In [6]:
print(len(tokens))

3437


## 2. Converting Tokens to Token IDs

In [7]:
# Deduplicate the tokens, then order them so the listing is stable
all_words = list(set(tokens))
all_words.sort()
vocab_size = len(all_words)

print(vocab_size)

1080


In [8]:
# Map each word to its position in the sorted word list
vocabulary = dict(zip(all_words, range(len(all_words))))

In [9]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and the punctuation characters
    ``, . : ; ? _ ! " ( ) '``; every resulting token is looked up in ``vocab``.
    """

    # Compiled once and shared by all instances instead of on every call.
    _SPLIT_PATTERN = re.compile(r'([,.:;?_!"()\']|--|\s)')
    # ':' and ';' are listed because encode() splits them off as separate
    # tokens; without them decode() would leave a space in front of them.
    _SPACE_BEFORE_PUNCT = re.compile(r'\s+([,.:;?!"()\'])')

    def __init__(self, vocab):
        """Store ``vocab`` (token string -> integer ID) and build its inverse."""
        self.str_to_int = vocab
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):
        """Return the list of token IDs for ``text``.

        Raises:
            KeyError: if ``text`` contains a token that is not in the vocabulary.
        """
        pieces = (piece.strip() for piece in self._SPLIT_PATTERN.split(text))
        # Empty strings come from adjacent delimiters and from whitespace.
        return [self.str_to_int[piece] for piece in pieces if piece]

    def decode(self, ids):
        """Return the text for ``ids``: tokens joined by single spaces, with
        the space in front of punctuation removed.

        Raises:
            KeyError: if an ID is not in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        return self._SPACE_BEFORE_PUNCT.sub(r'\1', text)

- The `encode` function turns text into token IDs
- The `decode` function turns token IDs back into text

In [13]:
tokenizer = SimpleTokenizerV1(vocabulary)

# The line break and indentation inside this string are whitespace, which
# encode() splits on and discards, so they do not produce tokens.
text_example = """The origins of the company date to the 1930s
            when German Bohemian automotive engineer Ferdinand Porsche founded Porsche"""
token_ids = tokenizer.encode(text_example)
print(token_ids)

[324, 816, 795, 997, 527, 564, 1011, 997, 23, 1061, 202, 147, 449, 601, 190, 284, 644, 284]


In [14]:
tokenizer.decode(token_ids)

'The origins of the company date to the 1930s when German Bohemian automotive engineer Ferdinand Porsche founded Porsche'

## 3. Manage special context tokens

We will try to encode the second .txt file with the current Tokenizer.

Some changes have to be made so that encoding also works on text containing words outside the vocabulary.

In [21]:
first_tokenizer = SimpleTokenizerV1(vocabulary)

text = raw_text_dict["production_and_sales"][:50]

# SimpleTokenizerV1 cannot encode words missing from its vocabulary. Catch the
# expected KeyError and display it, so that "Restart & Run All" does not stop
# at this cell.
try:
    print(first_tokenizer.encode(text))
except KeyError as err:
    print(f"KeyError: {err}")

KeyError: 'located'

In [22]:
# Placeholder that SimpleTokenizerV2 substitutes for out-of-vocabulary tokens
unknown_token = "<unk>"
# Marker inserted between two independent text excerpts further below
end_of_seq_token = "<eos>"

In [23]:
# Rebuild the vocabulary with the two special tokens appended after the
# sorted regular tokens, so they receive the two highest IDs.
all_tokens = [*sorted(set(tokens)), unknown_token, end_of_seq_token]

vocabulary = dict(zip(all_tokens, range(len(all_tokens))))

In [24]:
len(vocabulary)

1082

In [25]:
class SimpleTokenizerV2:
    """Vocabulary tokenizer that maps out-of-vocabulary tokens to ``"<unk>"``."""

    def __init__(self, vocab):
        """Keep the token -> ID mapping and derive the ID -> token mapping."""
        self.str_to_int = vocab
        self.int_to_str = dict((idx, tok) for tok, idx in vocab.items())

    def encode(self, text):
        """Split ``text`` into tokens and return their IDs.

        Tokens absent from the vocabulary are replaced by ``"<unk>"`` before
        the lookup, so the vocabulary is expected to contain that entry.
        """
        ids = []
        for raw_piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            piece = raw_piece.strip()
            if not piece:
                # Empty strings and pure whitespace carry no token
                continue
            if piece not in self.str_to_int:
                piece = "<unk>"
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Join the tokens for ``ids`` with spaces, then drop the space that
        ends up in front of punctuation characters."""
        words = [self.int_to_str[token_id] for token_id in ids]
        joined = " ".join(words)
        return re.sub(r'\s+([,.:;?!"()\'])', r'\1', joined)

In [31]:
text_1 = raw_text_dict["origin"][:54]
text_2 = raw_text_dict["production_and_sales"][:61]

# Separate the two unrelated excerpts with the end-of-sequence marker. The
# end_of_seq_token constant is reused rather than re-typing the literal, so
# the marker in the text cannot drift from the one in the vocabulary.
text = f" {end_of_seq_token} ".join((text_1, text_2))
text

'Dr. Ing. h.c. F. Porsche AG, commonly known as Porsche <eos> The headquarters and main factory are located in Zuffenhausen'

In [36]:
tokenizer = SimpleTokenizerV2(vocabulary)
print(tokenizer.encode(text))

[176, 10, 226, 10, 659, 10, 489, 10, 187, 10, 284, 108, 8, 525, 709, 440, 284, 1081, 324, 667, 426, 742, 620, 435, 1080, 681, 1080]


In [33]:
tokenizer.decode(tokenizer.encode(text))

'Dr. Ing. h. c. F. Porsche AG, commonly known as Porsche <eos> The headquarters and main factory are <unk> in <unk>'

## 4. BytePair encoding

- GPT-2 used BytePair encoding (BPE) as its tokenizer

In [38]:
tokenizer = tiktoken.get_encoding("gpt2")

In [41]:
# Sample containing "commonlyknownas" (a word written without spaces) plus
# the "<eos>" and "<unk>" markers used in the earlier sections.
text = (
    """Dr. Ing. h. c. F. Porsche AG, commonlyknownas Porsche <eos> 
    The headquarters and main factory are <unk> in <unk>"""
)

# NOTE(review): in the recorded output "<eos>" comes back as several ordinary
# token IDs (1279, 68, 418, 29), much like "<unk>" (1279, 2954, 29), so it does
# not appear to be handled as a single special token by this encoding —
# confirm whether allowed_special={"<eos>"} has any effect here.
integers = tokenizer.encode(text, allowed_special={"<eos>"})

print(integers)

[6187, 13, 17589, 13, 289, 13, 269, 13, 376, 13, 28367, 13077, 11, 8811, 4002, 292, 28367, 1279, 68, 418, 29, 220, 198, 220, 220, 220, 383, 10043, 290, 1388, 8860, 389, 1279, 2954, 29, 287, 1279, 2954, 29]


In [42]:
strings = tokenizer.decode(integers)

print(strings)

Dr. Ing. h. c. F. Porsche AG, commonlyknownas Porsche <eos> 
    The headquarters and main factory are <unk> in <unk>


- BPE tokenizers break down unknown words into subwords and individual characters

That is why `commonlyknownas` is encoded as 3 sub-word tokens (`commonly`, `known`, `as`) instead of an unknown token, and decoding those tokens reconstructs `commonlyknownas` exactly.

## 5. Data sampling with a sliding window

In [43]:
from torch.utils.data import Dataset, DataLoader


class GPTDataset(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    Each sample is a window of ``max_length`` token IDs; its target is the
    same window shifted one position to the right (next-token prediction).
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and cut it into windows.

        Args:
            txt: Text to tokenize.
            tokenizer: Object whose ``encode(text, allowed_special=...)``
                returns a list of integer token IDs.
            max_length: Number of token IDs per window.
            stride: Offset between the starts of consecutive windows; must
                be positive.

        Raises:
            ValueError: if ``stride`` is not positive.
            AssertionError: if ``txt`` yields fewer than ``max_length + 1``
                tokens.
        """
        # A negative stride would make the range below empty and silently
        # produce a dataset with no samples.
        if stride <= 0:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text. "<|endoftext|>" is the special token that
        # the GPT-2 encoding defines; "<eos>" is not one of its special tokens,
        # so listing it here had no effect.
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # Slide a window over the token IDs. The last start index leaves room
        # for the target, which needs one extra token past the input window.
        for start in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[start:start + max_length]
            target_chunk = token_ids[start + 1:start + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [44]:
def create_dataloader(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding-window samples of ``txt``.

    The text is tokenized with the tiktoken "gpt2" encoding and wrapped in a
    ``GPTDataset`` using ``max_length`` and ``stride``; the remaining
    arguments are forwarded to ``DataLoader`` unchanged.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDataset(txt, gpt2_encoding, max_length, stride)

    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windows, **loader_options)

In [45]:
# NOTE(review): this 100-character excerpt is not used by the cells that
# follow, which pass `raw_text` to create_dataloader — confirm which one was
# intended.
text = raw_text_dict["origin"][:100]

In [47]:
dataloader = create_dataloader(
    raw_text, batch_size=8, max_length=4, stride=1, shuffle=False
)

# Show only the first batch. The loop variables avoid the name `input`, which
# would shadow Python's built-in input() for the rest of the kernel session.
for input_batch, target_batch in dataloader:
    print(input_batch)
    print(target_batch)
    break

tensor([[ 6187,    13, 17589,    13],
        [   13, 17589,    13,   289],
        [17589,    13,   289,    13],
        [   13,   289,    13,    66],
        [  289,    13,    66,    13],
        [   13,    66,    13,   376],
        [   66,    13,   376,    13],
        [   13,   376,    13, 28367]])
tensor([[   13, 17589,    13,   289],
        [17589,    13,   289,    13],
        [   13,   289,    13,    66],
        [  289,    13,    66,    13],
        [   13,    66,    13,   376],
        [   66,    13,   376,    13],
        [   13,   376,    13, 28367],
        [  376,    13, 28367, 13077]])


## 6. Creating token embeddings

- The BytePair encoder has a vocabulary size of 50,257:
- Suppose we want to encode the input tokens into a 256-dimensional vector representation:

In [48]:
# Embedding table with one output_dim-sized vector per token ID
vocab_size, output_dim = 50257, 256

token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=output_dim
)

In [51]:
# Stride equal to the window length, so consecutive windows do not overlap
max_length = 4
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)

# Pull just the first batch from the loader
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [52]:
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[ 6187,    13, 17589,    13],
        [  289,    13,    66,    13],
        [  376,    13, 28367, 13077],
        [   11,  8811,  1900,   355],
        [28367, 17414,    64,    60],
        [  318,   257,  2679, 27930],
        [11554, 40847,   287, 13064],
        [   11,  1029,    12, 26585]])

Inputs shape:
 torch.Size([8, 4])


In [53]:
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


## 7. Encoding word positions

- GPT-2 uses absolute position embeddings, so we just create another embedding layer:

In [54]:
# One embedding vector per position inside the context window
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(
    num_embeddings=context_length, embedding_dim=output_dim
)

In [55]:
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


- To create the input embeddings used in an LLM, we simply add the token and the positional embeddings:

In [56]:
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)


torch.Size([8, 4, 256])
