### Set env variable value from Google Colab secret


In [1]:
import os
from google.colab import userdata
# Read the project ID from the Colab secret stored under the name 'GCP_PROJECT_ID'.
GCP_PROJECT_ID = userdata.get('GCP_PROJECT_ID')

# Also publish it as an environment variable of this process so that it is
# visible outside the Python namespace (the gcloud cell references $GCP_PROJECT_ID).
os.environ["GCP_PROJECT_ID"] = GCP_PROJECT_ID

### Login to GCP project

In [None]:
# Authenticate the gcloud CLI without trying to open a local browser window.
!gcloud auth login --no-launch-browser
# Make the project loaded from the Colab secret the active gcloud project.
!gcloud config set project $GCP_PROJECT_ID

### Download necessary input files from GCP bucket

In [3]:
%%capture

# Copy the image archive, the requirements file and the image -> captions JSON
# from the bucket into the current working directory.
!gsutil cp gs://virtual-home-studio/vhome_medium_ds/v0_t1680_images.tar.gz .
!gsutil cp gs://virtual-home-studio/vhome_medium_ds/requirements.txt .
!gsutil cp gs://virtual-home-studio/vhome_medium_ds/v0_t1680_image_2_captions.json .
# Unpack the image archive in place.
!tar -xzvf v0_t1680_images.tar.gz
# NOTE(review): %%capture silences this cell's output, so the nvidia-smi report
# is never displayed; move it to its own cell if the GPU status should be visible.
!nvidia-smi

### Concatenate captions into a single file

In [4]:
import numpy as np
import json

def write_concatenated_captions_to_file(captions_dict, output_file):
  """
  Writes the first caption of every image to a single text file.

  Captions are written in dictionary order, each followed by '. '. No newline
  is ever written, so the resulting file is one long line of text.

  Args:
    captions_dict: A dictionary where keys are image filenames and values are
      sequences of CLIP generated image captions. Only the first caption
      (index 0) of each entry is written.
    output_file: The path to the output text file. It is overwritten if it
      already exists.
  """

  # Write UTF-8 explicitly: the corpus is read back with encoding="utf-8", so
  # relying on the platform default encoding could corrupt non-ASCII captions.
  with open(output_file, 'w', encoding='utf-8') as f:
    f.writelines(captions[0] + '. ' for captions in captions_dict.values())

def read_captions_from_file(input_file):
  """
  Loads the image-to-captions mapping from a JSON file.

  Args:
    input_file: The path to the input JSON file.

  Returns:
    The deserialized JSON content. For the captions file used in this notebook
    this is a dictionary keyed by image filename.
  """

  # JSON is exchanged as UTF-8; do not depend on the platform default encoding.
  with open(input_file, encoding='utf-8') as f:
    return json.load(f)


# Load the image -> captions mapping, then flatten the first caption of every
# image into the plain-text corpus 'captions.txt' used by the tokenizer cells.
v0_t1680_image_2_captions = read_captions_from_file('v0_t1680_image_2_captions.json')
write_concatenated_captions_to_file(v0_t1680_image_2_captions, 'captions.txt')


### Preview captions generated from VirtualHome images.

In [None]:
!head captions.txt

### Install dependencies

In [6]:
%%capture
!pip install -r requirements.txt

#### Capture dependency versions

In [7]:
from importlib.metadata import version

import tiktoken
import torch
import re

# Record the installed version of each core dependency so the run can be reproduced.
libs = ["torch", "tiktoken", "tensorflow", "numpy", "pandas", "matplotlib"]
for lib in libs:
    installed_version = version(lib)  # looked up from the package metadata
    print(f"{lib} version:", installed_version)

torch version: 2.3.1+cu121
tiktoken version: 0.7.0
tensorflow version: 2.15.0
numpy version: 1.25.2
pandas version: 2.2.2
matplotlib version: 3.7.1


### Build a tokenizer for captions

In [55]:
# Load the whole concatenated-captions corpus into memory as a single string.
with open("captions.txt", "r", encoding="utf-8") as captions_file:
    captions_text = captions_file.read()

# Everything before the first '.' is the first caption.
first_caption = captions_text.partition('.')[0]

print("Total number of character:", len(captions_text))
print(first_caption)

Total number of character: 81176
a red refrigerator freezer sitting next to a sink


In [56]:
# Split on punctuation, double dashes and whitespace. The capture group makes
# re.split keep the separators, so punctuation marks become tokens of their own.
split_pattern = r'([,.:;?_!"()\']|--|\s)'

preprocessed = []
for fragment in re.split(split_pattern, captions_text):
    cleaned = fragment.strip()
    if cleaned:  # drop empty and whitespace-only fragments
        preprocessed.append(cleaned)

print(f'Total number of tokens present in corpus: {len(preprocessed)}')
print(preprocessed[:30])

Total number of tokens present in corpus: 19154
['a', 'red', 'refrigerator', 'freezer', 'sitting', 'next', 'to', 'a', 'sink', '.', 'a', 'woman', 'standing', 'next', 'to', 'a', 'table', 'with', 'a', 'pie', 'on', 'it', '.', 'a', 'bathroom', 'with', 'a', 'sink', 'and', 'a']


In [10]:
# Unique tokens in sorted order; a token's position in this list becomes its ID.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(f'Vocab size: {vocab_size}')
vocab = dict(zip(all_words, range(vocab_size)))

# Preview the first 51 (token, ID) pairs of the vocabulary.
for item in list(vocab.items())[:51]:
    print(f'Vocab <> token ID: {item}')

Vocab size: 243
Vocab <> token ID: ("'", 0)
Vocab <> token ID: (',', 1)
Vocab <> token ID: ('.', 2)
Vocab <> token ID: ('a', 3)
Vocab <> token ID: ('above', 4)
Vocab <> token ID: ('aerial', 5)
Vocab <> token ID: ('against', 6)
Vocab <> token ID: ('air', 7)
Vocab <> token ID: ('an', 8)
Vocab <> token ID: ('and', 9)
Vocab <> token ID: ('another', 10)
Vocab <> token ID: ('apartment', 11)
Vocab <> token ID: ('are', 12)
Vocab <> token ID: ('at', 13)
Vocab <> token ID: ('back', 14)
Vocab <> token ID: ('background', 15)
Vocab <> token ID: ('backpack', 16)
Vocab <> token ID: ('bath', 17)
Vocab <> token ID: ('bathroom', 18)
Vocab <> token ID: ('bathtub', 19)
Vocab <> token ID: ('bed', 20)
Vocab <> token ID: ('bedroom', 21)
Vocab <> token ID: ('behind', 22)
Vocab <> token ID: ('bench', 23)
Vocab <> token ID: ('black', 24)
Vocab <> token ID: ('blackboard', 25)
Vocab <> token ID: ('blue', 26)
Vocab <> token ID: ('book', 27)
Vocab <> token ID: ('bookcase', 28)
Vocab <> token ID: ('books', 29)
Vocab

In [34]:
class TokenizerV1:
    """Word-level tokenizer backed by a fixed token -> ID vocabulary.

    Text is split on punctuation, double dashes and whitespace. Every token
    must already be present in the vocabulary: encoding an unseen token
    raises ``KeyError``.
    """

    def __init__(self, vocab):
        """
        Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
          vocab: A dictionary mapping token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """
        Convert text into a list of token IDs.

        Args:
          text: A string containing the text to be tokenized.

        Returns:
          A list of integers, one per token found in the text.

        Raises:
          KeyError: If a token is not part of the vocabulary.
        """
        ids = []
        for fragment in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = fragment.strip()
            if token:  # skip empty and whitespace-only fragments
                ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """
        Convert a list of token IDs back into text.

        Args:
          ids: A list of integers representing the tokenized text.

        Returns:
          A string with the tokens joined by single spaces and with the
          whitespace in front of punctuation marks removed.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Joining put a space before every token; remove it in front of punctuation
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

# Round-trip a caption taken from the corpus, so every word is in the vocabulary.
tokenizer = TokenizerV1(vocab)

text = "a red refrigerator freezer sitting next to a sink"
ids = tokenizer.encode(text)
decoded_text = tokenizer.decode(ids)

print("Tokenized ids:", ids)
print("Decoded text:", decoded_text)

Tokenized ids: [3, 164, 165, 82, 182, 138, 209, 3, 180]
Decoded text: a red refrigerator freezer sitting next to a sink


Can you spot any issues with this tokenizer? How would it handle words it hasn't seen before?

#### Support out of vocabulary words

In [14]:
# Extend the corpus vocabulary with two special tokens, appended after the
# sorted corpus tokens so they receive the two highest IDs.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer, token in enumerate(all_tokens)}

# Report the vocabulary size, special tokens included.
print(f'Number of tokens: {len(vocab)}')

# Show the last five entries to confirm the special tokens sit at the end.
for item in list(vocab.items())[-5:]:
    print(item)

('women', 240)
('wooden', 241)
('yellow', 242)
('<|endoftext|>', 243)
('<|unk|>', 244)


In [42]:
class TokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to "<|unk|>".

    Splitting works on punctuation, double dashes and whitespace. Any token
    missing from the vocabulary is replaced by the "<|unk|>" entry before the
    ID lookup.
    """

    def __init__(self, vocab):
        """
        Store the vocabulary and build the reverse (ID -> token) lookup.

        Args:
          vocab: A dictionary mapping token strings to integer IDs.
        """
        self.str_to_int = vocab
        self.int_to_str = dict((token_id, token) for token, token_id in vocab.items())

    def encode(self, text):
        """
        Convert text into a list of token IDs.

        Args:
          text: A string containing the text to be tokenized.

        Returns:
          A list of integers, one per token; tokens that are not in the
          vocabulary contribute the ID of "<|unk|>".
        """
        ids = []
        for fragment in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            token = fragment.strip()
            if not token:
                continue  # empty or whitespace-only fragment
            if token not in self.str_to_int:
                token = "<|unk|>"
            ids.append(self.str_to_int[token])
        return ids

    def decode(self, ids):
        """
        Convert a list of token IDs back into text.

        Args:
          ids: A list of integers representing the tokenized text.

        Returns:
          A string with the tokens joined by single spaces and with the
          whitespace in front of punctuation marks removed.
        """
        spaced = " ".join(self.int_to_str[token_id] for token_id in ids)
        # Joining put a space before every token; remove it in front of punctuation
        return re.sub(r'\s+([,.?!"()\'])', r'\1', spaced)

In [43]:
tokenizer = TokenizerV2(vocab)

# One sentence made mostly of unseen words and one caption from the corpus.
text1 = "Test is a new string not in original vocabulary."
text2 = "a red refrigerator freezer sitting next to a sink"

# NOTE(review): the separator is spelled "<|EndOfText|>", which does not match
# the lowercase "<|endoftext|>" vocabulary entry, so it is encoded as "<|unk|>".
# Confirm whether the lowercase special token was intended.
separator = " <|EndOfText|> "
text = separator.join([text1, text2])

print(text)

Test is a new string not in original vocabulary. <|EndOfText|> a red refrigerator freezer sitting next to a sink


In [44]:
# Encode the combined text; tokens missing from the vocabulary map to the "<|unk|>" ID.
tokenizer.encode(text)


[244,
 107,
 3,
 244,
 244,
 244,
 105,
 244,
 244,
 2,
 244,
 3,
 164,
 165,
 82,
 182,
 138,
 209,
 3,
 180]

In [45]:
# Round-trip the text through encode/decode to see which words survive tokenization.
tokenizer.decode(tokenizer.encode(text))


'<|unk|> is a <|unk|> <|unk|> <|unk|> in <|unk|> <|unk|>. <|unk|> a red refrigerator freezer sitting next to a sink'

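Words that are not in the vocabulary are now encoded as `<|unk|>`. Note that the `<|EndOfText|>` separator is also mapped to `<|unk|>`, because it does not match the lowercase `<|endoftext|>` entry in the vocabulary.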

### Compare w/ BytePair encoding

In [57]:
# NOTE: this rebinds `tokenizer` (a TokenizerV2 instance in the cells above) to
# tiktoken's GPT-2 encoding; cells below that call `tokenizer` rely on this binding.
tokenizer = tiktoken.get_encoding("gpt2")
# Encode the full captions corpus into a list of BPE token IDs.
enc_text = tokenizer.encode(captions_text)
print(f"Length of encoded text: {len(enc_text)}")

Length of encoded text: 19256


#### Inspect encoded text (context and next word)

In [None]:
# Work on the corpus from the 51st token ID onwards.
enc_sample = enc_text[50:]
context_size = 4

# x is the input window; y is the same window shifted one token to the right,
# i.e. the next-token targets for x.
x, y = enc_sample[:context_size], enc_sample[1:context_size + 1]

print(f"x: {x}")
print(f"y:      {y}")

# Every growing prefix of the sample is a context; the token that follows it
# is what the model should predict.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    print(f"""Token Id Context: {context}, "---->", Desired Id: {desired}""")

In [54]:
# Same context -> target pairs as before, decoded back to text for readability.
for i in range(1, context_size + 1):
    context, desired = enc_sample[:i], enc_sample[i]
    context_text = tokenizer.decode(context)
    desired_text = tokenizer.decode([desired])  # decode expects a list of IDs

    print(f"""Context: {context_text}, "---->", Desired: {desired_text}""")

Context:  television, "---->", Desired: .
Context:  television., "---->", Desired:  a
Context:  television. a, "---->", Desired:  living
Context:  television. a living, "---->", Desired:  room


#### Create dataset and dataloader to extract chunks from the input text dataset


In [21]:
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    The text is tokenized once. Each sample pairs a window of ``max_length``
    token IDs with the same window shifted one position to the right, which
    is the next-token target for every position of the input.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """
        Tokenize the text and pre-compute all windows.

        Args:
          txt: The text to tokenize.
          tokenizer: An object whose ``encode(text, allowed_special=...)``
            returns a list of token IDs.
          max_length: Number of token IDs per input/target window.
          stride: Offset between the starts of consecutive windows.
        """
        # Tokenize the entire text, allowing the end-of-text special token
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Window starts stop early enough that every input window still has a
        # full-length target one token further along.
        window_starts = range(0, len(token_ids) - max_length, stride)

        self.input_ids = [
            torch.tensor(token_ids[start:start + max_length])
            for start in window_starts
        ]
        self.target_ids = [
            torch.tensor(token_ids[start + 1:start + max_length + 1])
            for start in window_starts
        ]

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """
    Build a DataLoader of sliding-window (input, target) batches over a text.

    Args:
      txt: The text to tokenize with the GPT-2 byte-pair encoding.
      batch_size: Number of windows per batch.
      max_length: Number of token IDs per window.
      stride: Offset between the starts of consecutive windows.
      shuffle: Passed through to the DataLoader.
      drop_last: Passed through to the DataLoader.
      num_workers: Passed through to the DataLoader.

    Returns:
      A torch DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    # The dataset tokenizes the text with GPT-2's BPE tokenizer and windows it.
    dataset = GPTDatasetV1(
        txt, tiktoken.get_encoding("gpt2"), max_length, stride
    )

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )
    return DataLoader(dataset, **loader_options)

Test dataloader with a batch size of 1 for an LLM with a context size of 4...

In [22]:
# stride=1 advances the window one token at a time; shuffle=False keeps corpus order.
dataloader = create_dataloader_v1(
    captions_text, batch_size=1, max_length=4, stride=1, shuffle=False
)

data_iter = iter(dataloader)
# A batch is a pair [inputs, targets]; the targets are the inputs shifted by one token.
first_batch = next(data_iter)
print(first_batch)

[tensor([[   64,  2266, 30500, 30967]]), tensor([[ 2266, 30500, 30967,  5586]])]


In [23]:
# stride equals max_length here, so consecutive input windows do not overlap.
dataloader = create_dataloader_v1(captions_text, batch_size=8, max_length=4, stride=4, shuffle=False)

data_iter = iter(dataloader)
# Unpack the first batch into its input and target tensors.
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[   64,  2266, 30500, 30967],
        [ 5586,  1306,   284,   257],
        [14595,    13,   257,  2415],
        [ 5055,  1306,   284,   257],
        [ 3084,   351,   257,  2508],
        [  319,   340,    13,   257],
        [12436,   351,   257, 14595],
        [  290,   257, 16146,    13]])

Targets:
 tensor([[ 2266, 30500, 30967,  5586],
        [ 1306,   284,   257, 14595],
        [   13,   257,  2415,  5055],
        [ 1306,   284,   257,  3084],
        [  351,   257,  2508,   319],
        [  340,    13,   257, 12436],
        [  351,   257, 14595,   290],
        [  257, 16146,    13,   257]])


The GPT-2 byte-pair encoder has a vocabulary size of 50,257.
Below, we sample a batch from the dataloader and embed each of its tokens into a 256-dimensional vector.

In [24]:
# Size of the GPT-2 BPE vocabulary and the dimensionality of each embedding vector.
vocab_size = 50257
output_dim = 256

# Embedding weights are randomly initialized; fix the seed so that re-running
# the notebook reproduces the same embedding values.
torch.manual_seed(123)
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

# Non-overlapping windows of max_length tokens, kept in corpus order.
max_length = 4
dataloader = create_dataloader_v1(
    captions_text, batch_size=8, max_length=max_length,
    stride=max_length, shuffle=False
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [25]:
# Show the raw token-ID batch and its shape before it is embedded.
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

Token IDs:
 tensor([[   64,  2266, 30500, 30967],
        [ 5586,  1306,   284,   257],
        [14595,    13,   257,  2415],
        [ 5055,  1306,   284,   257],
        [ 3084,   351,   257,  2508],
        [  319,   340,    13,   257],
        [12436,   351,   257, 14595],
        [  290,   257, 16146,    13]])

Inputs shape:
 torch.Size([8, 4])


In [26]:
# Look up an output_dim-sized vector for every token ID in the batch.
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


This results in an 8 x 4 x 256 tensor: a batch of 8 sequences with 4 tokens each, where every token is embedded as a 256-dimensional vector.


In [27]:
# One embedding vector per position in the context window, with the same
# dimensionality as the token embeddings so the two can be added.
context_length = max_length
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

GPT-2 uses absolute position embeddings, so we just create another embedding layer...


In [28]:
# Embed the position indices 0 .. max_length-1; the result has one row per position.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


To create the input embeddings used in an LLM, add the token and the positional embeddings...

In [29]:
# The (4, 256) positional embeddings broadcast across the batch dimension of
# the (8, 4, 256) token embeddings.
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])


In [30]:
import tiktoken
import torch
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-ID tensors.

    NOTE(review): this re-defines the class of the same name from the earlier
    dataloader cell; running this cell replaces that definition.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """
        Tokenize the text and pre-compute all windows.

        Args:
          txt: The text to tokenize.
          tokenizer: An object whose ``encode(text, allowed_special=...)``
            returns a list of token IDs.
          max_length: Number of token IDs per input/target window.
          stride: Offset between the starts of consecutive windows.
        """
        # Tokenize the entire text, allowing the end-of-text special token
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        self.input_ids = []
        self.target_ids = []

        # Slide a window over the token IDs; the target is the input window
        # shifted one token to the right.
        last_start = len(token_ids) - max_length
        for start in range(0, last_start, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (input, target) windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at position ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]


def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True, num_workers=0):
    """
    Build a DataLoader of sliding-window (input, target) batches over a text.

    NOTE(review): this re-defines the function of the same name from the
    earlier dataloader cell; running this cell replaces that definition.

    Args:
      txt: The text to tokenize with the GPT-2 byte-pair encoding.
      batch_size: Number of windows per batch.
      max_length: Number of token IDs per window.
      stride: Offset between the starts of consecutive windows.
      shuffle: Passed through to the DataLoader.
      drop_last: Passed through to the DataLoader.
      num_workers: Passed through to the DataLoader.

    Returns:
      A torch DataLoader over a GPTDatasetV1 built from ``txt``.
    """
    # Tokenize with GPT-2's BPE tokenizer and window the result.
    bpe_tokenizer = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, bpe_tokenizer, max_length, stride)

    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

tokenizer = tiktoken.get_encoding("gpt2")
# NOTE(review): `encoded_text` is not referenced by the cells visible after this
# one; the dataloader tokenizes `captions_text` itself.
encoded_text = tokenizer.encode(captions_text)

# GPT-2 BPE vocabulary size, embedding width, and number of positions the
# positional embedding table can address.
vocab_size = 50257
output_dim = 256
context_length = 1024


token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)

# Non-overlapping windows of max_length tokens. `shuffle` is left at its
# default of True here, so the batch order differs from run to run.
max_length = 4
dataloader = create_dataloader_v1(captions_text, batch_size=8, max_length=max_length, stride=max_length)

Iterate and inspect input embeddings to sanity check...

In [31]:
# The position indices 0 .. max_length-1 are the same for every batch.
position_ids = torch.arange(max_length)

for batch in dataloader:
    x, y = batch

    # Token embeddings are (batch, max_length, output_dim); the positional
    # embeddings are (max_length, output_dim) and broadcast over the batch.
    token_embeddings = token_embedding_layer(x)
    pos_embeddings = pos_embedding_layer(position_ids)
    input_embeddings = token_embeddings + pos_embeddings

    break  # a single batch is enough for the sanity check