In [2]:
# Load the raw training corpus (Shakespeare text) into memory as one string.
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# The vocabulary is every distinct character in the corpus. Sorting makes the
# character <-> id assignment deterministic from run to run.
chars = sorted(set(text))
vocab_size = len(chars)
print("Number of unique characters: ", vocab_size)
print(''.join(chars))

# Lookup tables: id -> character, and its inverse character -> id.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}

def encode_text(text):
    """Convert a string into a list of integer ids, one per character.

    Each character is looked up in the module-level `stoi` table; a character
    that is not a key of `stoi` raises KeyError.
    """
    ids = []
    for ch in text:
        ids.append(stoi[ch])
    return ids
def decode_text(encoded_text):
    """Convert an iterable of integer ids back into a string.

    Each id is looked up in the module-level `itos` table; an id that is not a
    key of `itos` raises KeyError.
    """
    pieces = (itos[i] for i in encoded_text)
    return ''.join(pieces)

# Round-trip sanity check: encoding then decoding should give back the input.
hello_encoded = encode_text("hello")
hello_decoded = decode_text(hello_encoded)
print(hello_encoded)
print(hello_decoded)

import torch
# Encode the whole corpus once as a 1-D tensor of integer token ids.
data = torch.tensor(encode_text(text), dtype=torch.long)
print(data.dtype)
print(data.size())

# Train on the first 90% of the tokens and hold out the final 10% for validation.
n = int(0.9*len(data))

train_data = data[:n]
val_data = data[n:]

# Seed the RNG so that randomly sampled batches are reproducible.
torch.manual_seed(1337)
batch_size = 4 # how many different sequences are processed at once
block_size = 8 # what is the context length (max)

def get_batch(split):
    """Sample a random batch of (input, target) sequences from one data split.

    `split == 'train'` reads from `train_data`; any other value reads from
    `val_data`. Returns `(x, y)`, each built by stacking `batch_size` slices of
    length `block_size`; `y` holds the same windows as `x` shifted one position
    later in the source tensor, i.e. the next token for every input position.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # One random start offset per sequence. The exclusive upper bound leaves
    # room for the target window, which begins one position after the input.
    starts = torch.randint(0, source.size(0) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start+block_size])
        targets.append(source[start+1:start+block_size+1])

    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the input and target tensors.
xb, yb = get_batch('train')
print("inputs:", xb.shape, xb, sep="\n")
print("targets:", yb.shape, yb, sep="\n")

print("----")

# Every row of the batch packs block_size training examples: the context is the
# first t+1 tokens of the row and the target is the token that follows it.
for b in range(batch_size):
    for t in range(block_size):
        context, target = xb[b, :t+1], yb[b, t]
        print("when we see the text", context, "we predict the next character is", target)

# data prep

Number of unique characters:  65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
[46, 43, 50, 50, 53]
hello
torch.int64
torch.Size([1115394])
inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
----
when we see the text tensor([24]) we predict the next character is tensor(43)
when we see the text tensor([24, 43]) we predict the next character is tensor(58)
when we see the text tensor([24, 43, 58]) we predict the next character is tensor(5)
when we see the text tensor([24, 43, 58,  5]) we predict the next character is tensor(57)
when we see the text tensor([24, 43, 58,  5, 57]) we predict the next character is tensor(1)
when we see th

In [3]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

# xb is the input batch sampled in the previous cell.
print("shape of input: ", xb.shape) # 4 sequences of 8 characters
# A vocab_size x vocab_size lookup table: row i holds the vocab_size values for token id i.
token_embedding_table = nn.Embedding(vocab_size, vocab_size)
    
# Look up one table row per token id in xb.
logits = token_embedding_table(xb)
# NOTE: this is not the last expression of the cell, so the notebook does not display it.
logits.shape # 4 sequences of 8 characters, each mapped to its row of the embedding table (size 65); these rows are table weights, not one-hot vectors

# let's recreate the embedding lookup with matrix multiplication
# one hot vector for each character
one_hot = F.one_hot(xb, vocab_size).float()
print("shape of one hot vector: ", one_hot.shape)

# Last expression of the cell, so its value is what the notebook displays.
one_hot[0, 2].shape # batch 0, character 2

shape of input:  torch.Size([4, 8])
shape of one hot vector:  torch.Size([4, 8, 65])


torch.Size([65])