In [53]:
import requests
import torch

In [4]:
# Download the Tiny Shakespeare dataset so we can quickly build and experiment.
# NOTE: For a large dataset similar to what OpenAI used, see: https://huggingface.co/datasets/openwebtext
URL = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
REQUEST_TIMEOUT_S = 30  # requests applies no timeout by default, so a stalled connection would hang the cell

http_response = requests.get(URL, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on a 4xx/5xx reply instead of silently treating an error page as the corpus.
http_response.raise_for_status()
response = http_response.text  # the full corpus as a single string

In [8]:
# Preview the first 200 characters of the downloaded text as a quick sanity check.
print(response[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [9]:
# We are building a character-level model (it predicts the next character, not the next word),
# so the vocabulary is simply the set of distinct characters that occur in the text.
# TODO: In the future, we can implement tokenizers via HuggingFace's tokenizers library
chars = sorted(set(response))  # sorted() already returns a list and gives a deterministic order
vocab_size = len(chars)

print("Total number of characters: ", vocab_size)

Total number of characters:  65


In [60]:
# Let's build a basic tokenizer; this one is a character-level tokenizer.
# NOTE: a more practical choice would be a library such as tiktoken, OpenAI's tokenizer for their GPT-x models.
# Link: https://github.com/openai/tiktoken  TODO: implement this tokenizer
class Tokenizer:
    """Character-level tokenizer that maps each character to an integer id.

    Args:
        chars: Ordered sequence of the characters in the vocabulary; a
            character's position in this sequence is its token id.
        vocab_size: Size of the vocabulary. Stored as given (not re-derived
            from ``chars``).
    """

    def __init__(self, chars, vocab_size):
        self.chars = chars
        self.vocab_size = vocab_size
        self.s2i = {char: idx for idx, char in enumerate(chars)} # character to index
        self.i2s = {idx: char for idx, char in enumerate(chars)} # index to character

    def encode(self, text, return_tensor=False):
        """Encode ``text`` into token ids, one id per character.

        Args:
            text: String (or iterable of characters) to encode.
            return_tensor: If True, return a 1-D ``torch.long`` tensor instead
                of a Python list.

        Returns:
            A list of ints, or a ``torch.long`` tensor when ``return_tensor``
            is True.

        Raises:
            KeyError: If ``text`` contains a character not in the vocabulary.
        """
        encoded_list = [self.s2i[char] for char in text]
        if return_tensor:
            return torch.tensor(encoded_list, dtype=torch.long)
        return encoded_list

    def decode(self, tokens):
        """Decode token ids back into a string.

        Args:
            tokens: Iterable of int token ids, or a tensor of ids. A 0-d
                tensor is treated as a single token.

        Returns:
            The decoded string.

        Raises:
            KeyError: If a token id is not in the vocabulary.
        """
        # isinstance (not an exact type comparison) so tensor subclasses such as
        # nn.Parameter are converted too; iterating a tensor directly yields 0-d
        # tensors, which hash by identity and would never match the int keys.
        if isinstance(tokens, torch.Tensor):
            tokens = tokens.tolist()
            if not isinstance(tokens, list):
                # tolist() on a 0-d tensor returns a bare Python scalar
                tokens = [tokens]
        return "".join(self.i2s[token] for token in tokens)

    @classmethod
    def from_text(cls, text):
        """Build a tokenizer whose vocabulary is the sorted set of characters in ``text``."""
        chars = sorted(set(text))
        return cls(chars, len(chars))

In [61]:
# Instantiate the tokenizer, building its vocabulary from the downloaded text.
tokenizer = Tokenizer.from_text(response) # from our text

# As we can see, it contains the same number of characters as before.
print("Total number of characters: ", tokenizer.vocab_size)

Total number of characters:  65


In [62]:
# Encode a sample string; with the default return_tensor=False this returns a plain list of ids, one per character.
tokenizer.encode("hii there")

[46, 47, 47, 1, 58, 46, 43, 56, 43]

In [63]:
# Round trip: decoding the encoded ids gives back the original string.
tokenizer.decode(tokenizer.encode("hii there"))

'hii there'

In [64]:
from torch.utils.data import DataLoader

In [66]:
# Let's now build a dataset from our text data, which uses our tokenizer to encode the text.
class CharDataset:
    """Map-style dataset serving consecutive, non-overlapping windows of token ids.

    The whole text is encoded once up front; item ``i`` is the slice
    ``[i * seq_len, (i + 1) * seq_len)`` of the encoded text. A trailing
    remainder shorter than ``seq_len`` is dropped, so every item has exactly
    ``seq_len`` tokens.

    Args:
        text: The raw text corpus.
        tokenizer: Object with an ``encode(text, return_tensor=True)`` method
            returning the encoded sequence (a tensor for ``Tokenizer``).
        seq_len: Number of tokens per item; must be positive.

    Raises:
        ValueError: If ``seq_len`` is not positive.
    """

    def __init__(self, text, tokenizer, seq_len=128):
        if seq_len <= 0:
            raise ValueError(f"seq_len must be positive, got {seq_len}")
        self.text = text
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.text_encoded = self.tokenizer.encode(self.text, return_tensor=True) # all our text encoded
        self.total_len = len(self.text_encoded)

    @classmethod
    def from_text(cls, text, tokenizer_cls=None, seq_len=128):
        """Build the dataset and its tokenizer from ``text``.

        Args:
            text: The raw text corpus.
            tokenizer_cls: Class exposing a ``from_text(text)`` constructor;
                defaults to ``Tokenizer`` when None.
            seq_len: Number of tokens per item.
        """
        if tokenizer_cls is None:
            tokenizer_cls = Tokenizer
        return cls(text, tokenizer_cls.from_text(text), seq_len)

    def __len__(self):
        """Number of full ``seq_len``-token windows in the encoded text."""
        return self.total_len // self.seq_len

    def __getitem__(self, idx):
        """Return the ``idx``-th window of ``seq_len`` token ids.

        Negative indices count from the end.

        Raises:
            IndexError: If ``idx`` is out of range. This also lets plain
                ``for item in dataset`` iteration terminate.
        """
        num_items = len(self)
        if idx < 0:
            idx += num_items
        if not 0 <= idx < num_items:
            raise IndexError(f"index out of range for dataset of length {num_items}")
        # Scale by seq_len so windows match __len__: without this, consecutive
        # items would overlap and only the start of the corpus would be served.
        start = idx * self.seq_len
        return self.text_encoded[start:start + self.seq_len]

In [67]:
# Instantiate the dataset. No tokenizer class is passed, so from_text builds a Tokenizer from the text itself.
# seq_len=128*2 means each item holds 256 tokens.
dataset = CharDataset.from_text(response, seq_len=128*2)

print("The length of our dataset is: ", len(dataset))
print("The vocab size is : ", dataset.tokenizer.vocab_size)

The length of our dataset is:  4357
The vocab size is :  65


In [69]:
# Show an example item from the dataset: the first sequence of token ids.
print(dataset[0])

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
        47, 59, 57,  1, 47, 57,  1, 41, 

In [70]:
# The same item can be decoded back into readable text with the dataset's tokenizer.
print(dataset.tokenizer.decode(dataset[0]))

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:

