In [19]:
with open("dorothy_and_the_wizard_in_oz.txt", "r", encoding="utf-8") as f:
    corpus_text = f.read()

corpus_text_length = len(corpus_text)

In [20]:
vocabulary = sorted(set(corpus_text))
vocabulary_length = len(vocabulary)

# ---------------------------------------- string_to_int dictionary
# string_to_int = {ch: i for i, ch in enumerate(chars)}
string_to_int_dictionary = {}
for i, c in enumerate(vocabulary):
    string_to_int_dictionary[c] = i


string_to_int_dictionary_length = len(string_to_int_dictionary)


# ---------------------------------------- int_to_string dictionary
# int_to_string = {i: ch for i, ch in enumerate(chars)}
int_to_string_dictionary = {}
for i, c in enumerate(vocabulary):
    int_to_string_dictionary[i] = c


int_to_string_dictionary_length = len(int_to_string_dictionary)

# ---------------------------------------- encode string
# encode = lambda s: [string_to_int[c] for c in s]
def encode(string):
    result = []
    for c in string:
        result.append(string_to_int_dictionary[c])

    return result


# ---------------------------------------- decode int
# decode = lambda l: ''.join([int_to_string[i] for i in l])
def decode(int_list):
    result = ""
    for i in int_list:
        result = result + int_to_string_dictionary[i]
    
    return result

In [21]:
import torch as torch

tensor_data = torch.tensor(encode(corpus_text), dtype=torch.long)
tensor_data_length = len(tensor_data)

In [22]:
train_data_percentage = 0.8

train_data_length = int(train_data_percentage * tensor_data_length)
train_data = tensor_data[:train_data_length]

validation_data_length = tensor_data_length - train_data_length
validation_data = tensor_data[train_data_length:]

In [23]:
print("==================== Corpus =====================================")
print(f"corpus_text (first 50 chars)   => {corpus_text[:50]}")
print(f"corpus_text_length             => {corpus_text_length}")
print()

print("==================== Vocabulary & Dictionary ====================")
print(f"vocabulary (first 20) => {vocabulary[:20]}")
print(f"vocabulary_length     => {vocabulary_length}")
print(f"int_to_string_dictionary_length     => {int_to_string_dictionary_length}")
print(f"string_to_int_dictionary_length     => {string_to_int_dictionary_length}")
print()

print("==================== Tensor Data ================================")
print(f"tensor_data (first 10) => {tensor_data[:10]}")
print(f"tensor_data_length     => {tensor_data_length}")
print()
print(f"train_data (percentage) => {train_data_percentage*100}%")
print(f"train_data (first 10)   => {train_data[:10]}")
print(f"train_data_length       => {train_data_length}")
print()
print(f"validation_data (first 10) => {validation_data[:10]}")
print(f"validation_data_length     => {validation_data_length}")
print()

corpus_text (first 50 chars)   => ﻿The Project Gutenberg eBook of Dorothy and the Wi
corpus_text_length             => 252022

vocabulary (first 20) => ['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3']
vocabulary_length     => 92
int_to_string_dictionary_length     => 92
string_to_int_dictionary_length     => 92

tensor_data (first 10) => tensor([91, 48, 65, 62,  1, 44, 75, 72, 67, 62])
tensor_data_length     => 252022

train_data (percentage) => 80.0%
train_data (first 10)   => tensor([91, 48, 65, 62,  1, 44, 75, 72, 67, 62])
train_data_length       => 201617

validation_data (first 10) => tensor([ 1, 77, 65, 62,  1, 76, 62, 58, 77,  1])
validation_data_length     => 50405



In [24]:
block_size = 8

x = train_data[:block_size]
y = train_data[1:block_size+1]

print(f"x => {x}")
print(f"y => {y}")
print()

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"{t} => When input is {context} target is {target}")

x => tensor([91, 48, 65, 62,  1, 44, 75, 72])
y => tensor([48, 65, 62,  1, 44, 75, 72, 67])

0 => When input is tensor([91]) target is 48
1 => When input is tensor([91, 48]) target is 65
2 => When input is tensor([91, 48, 65]) target is 62
3 => When input is tensor([91, 48, 65, 62]) target is 1
4 => When input is tensor([91, 48, 65, 62,  1]) target is 44
5 => When input is tensor([91, 48, 65, 62,  1, 44]) target is 75
6 => When input is tensor([91, 48, 65, 62,  1, 44, 75]) target is 72
7 => When input is tensor([91, 48, 65, 62,  1, 44, 75, 72]) target is 67
