In [7]:
import requests

# Source corpus: the "tiny shakespeare" text file from the char-rnn repository
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# A timeout keeps the cell from hanging indefinitely on a stalled connection
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of treating an error page as the corpus
response.raise_for_status()

# Keep a local copy of the downloaded text
with open("tinyshakespeare.txt", "w", encoding="utf-8") as file:
    file.write(response.text)

In [9]:
# Report the size of the downloaded corpus, counted in characters
print(f"Characters in data: {len(response.text)}")

Characters in data: 1115394


In [10]:
# Find the unique characters in the corpus; sorted, they form the vocabulary
chars = sorted(set(response.text))
vocab_size = len(chars)
# Show the vocabulary as one string, then its size on the next line
print(''.join(chars), vocab_size, sep='\n')


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [11]:
# Map characters to integers (stoi) and integers back to characters (itos)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: convert a string into a list of integer ids, one per character.

    Raises:
        KeyError: if `s` contains a character that is not a key of `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: convert an iterable of integer ids back into a string.

    Raises:
        KeyError: if an id is not a key of `itos`.
    """
    return ''.join(itos[i] for i in l)


# Round-trip sanity check: decoding an encoded string gives the string back
print(encode("Lucas"))
print(decode(encode("Lucas")))

[24, 59, 41, 39, 57]
Lucas


In [14]:
import torch
# Encode the entire corpus as a tensor of 64-bit integer character ids
data = torch.tensor(encode(response.text), dtype=torch.int64)
print(data.size(), data.dtype)
# Peek at the first 1000 encoded characters
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [15]:
# Split point: the first 90% of the data is for training, the remainder for validation
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [None]:
# Number of characters used to predict at a time
context_size = 8
# One window holds context_size inputs plus one extra element for the final target
train_data[: context_size + 1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [18]:
# Explanation of context_size:
# context_size is the largest number of characters the model looks at when
# predicting the next character. The loop below prints one (context, target)
# pair for every prefix length from 1 up to context_size.

x = train_data[:context_size]        # Input sequence
y = train_data[1:context_size + 1]   # Targets: the same window shifted one position ahead
for i in range(context_size):
    # Context is everything up to and including position i; target is what follows it
    context, target = x[:i + 1], y[i]
    print(f"Context: {context}, Target: {target}")

Context: tensor([18]), Target: 47
Context: tensor([18, 47]), Target: 56
Context: tensor([18, 47, 56]), Target: 57
Context: tensor([18, 47, 56, 57]), Target: 58
Context: tensor([18, 47, 56, 57, 58]), Target: 1
Context: tensor([18, 47, 56, 57, 58,  1]), Target: 15
Context: tensor([18, 47, 56, 57, 58,  1, 15]), Target: 47
Context: tensor([18, 47, 56, 57, 58,  1, 15, 47]), Target: 58


In [21]:
# Batch geometry: batch_size sequences per batch, context_size characters per sequence
batch_size, context_size = 4, 8
# Fixed seed for torch's default random number generator
torch.manual_seed(1337)

def get_batch(split):
    """Sample a random batch of input/target windows from one data split.

    Reads the module-level `train_data`, `val_data`, `batch_size` and
    `context_size`.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple (x, y) of stacked windows. Each holds `batch_size` windows of
        `context_size` elements, and y is x's window shifted one position
        to the right.
    """
    source = val_data if split != 'train' else train_data
    # Start offsets are drawn from [0, len(source) - context_size), which leaves
    # room for the one-position-shifted target window.
    num_starts = len(source) - context_size
    ix = torch.randint(num_starts, (batch_size,))
    inputs, targets = [], []
    for i in ix:
        inputs.append(source[i:i + context_size])
        targets.append(source[i + 1:i + context_size + 1])
    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and show the input and target tensors
xb, yb = get_batch('train')
print('inputs: ', xb.shape, xb, sep='\n')
print('targets: ', yb.shape, yb, sep='\n')

print()

# Spell out every (context, target) pair contained in the batch
for b in range(batch_size):
    for t in range(context_size):
        # Context is row b up to and including position t; target is the element that follows
        context, target = xb[b, :t + 1], yb[b, t]
        print(f"Batch {b}, Context: {context.tolist()}, Target: {target}")

inputs: 
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets: 
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])

Batch 0, Context: [24], Target: 43
Batch 0, Context: [24, 43], Target: 58
Batch 0, Context: [24, 43, 58], Target: 5
Batch 0, Context: [24, 43, 58, 5], Target: 57
Batch 0, Context: [24, 43, 58, 5, 57], Target: 1
Batch 0, Context: [24, 43, 58, 5, 57, 1], Target: 46
Batch 0, Context: [24, 43, 58, 5, 57, 1, 46], Target: 43
Batch 0, Context: [24, 43, 58, 5, 57, 1, 46, 43], Target: 39
Batch 1, Context: [44], Target: 53
Batch 1, Context: [44, 53], Target: 56
Batch 1, Context: [44, 53, 56], Target: 1
Batch 1, Context: [44, 53, 56, 1], Target: 58
Batch 1, Context: [44, 53, 56, 1, 58], Target: 46
Batch 1, C