In [5]:
# Check GPU availability
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Device: {device}")

# Hyperparameters
block_size = 8 # No of tokens the model look at once during training (e.g., context window)
batch_size = 4 # No of sequences fed to the model in each training step

Device: cuda


## Load data

In [2]:
with open('wizard_of_oz.txt', 'r', encoding="utf-8") as f:
    text = f.read()

print(text[:100])

The Wonderful Wizard of Oz




Chapter I
The Cyclone


Dorothy lived in the midst of the great Kans


## Create a basic character-level tokenizer 

In [7]:
# Check the info of unique tokens in the text

chars = sorted(set(text))
print(chars)
vocabulary_size = len(chars)
print(f"No of unique chars in the entire text: {vocabulary_size}")

['\n', ' ', '!', '(', ')', '*', ',', '-', '.', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '—', '‘', '’', '“', '”', '\ufeff']
No of unique chars in the entire text: 70


In [4]:
# Create basic encoder-decoder

string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

# Test encoding-decoding
sample_encoding = encode("hello")
print(sample_encoding)

sample_decoding = decode(sample_encoding)
print(sample_decoding)

[45, 42, 49, 49, 52]
hello


In [9]:
# Encode the complete text
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:100])

tensor([69, 31, 45, 42,  1, 34, 52, 51, 41, 42, 55, 43, 58, 49,  1, 34, 46, 63,
        38, 55, 41,  1, 52, 43,  1, 26, 63,  0,  0,  0,  0,  0, 14, 45, 38, 53,
        57, 42, 55,  1, 20,  0, 31, 45, 42,  1, 14, 62, 40, 49, 52, 51, 42,  0,
         0,  0, 15, 52, 55, 52, 57, 45, 62,  1, 49, 46, 59, 42, 41,  1, 46, 51,
         1, 57, 45, 42,  1, 50, 46, 41, 56, 57,  1, 52, 43,  1, 57, 45, 42,  1,
        44, 55, 42, 38, 57,  1, 22, 38, 51, 56])
