In [None]:
import torch

# Run `text` through `encode` and keep the result as one torch.long (int64) tensor.
# NOTE(review): `encode` and `text` are defined outside this excerpt; presumably
# `encode` maps a string to a list of integer token ids. Confirm against their definitions.
data = torch.tensor(encode(text), dtype=torch.long)

torch.Size([314937]) torch.int64
tensor([59, 21, 16,  ..., 72, 77, 73])


In [3]:
# Hold out the tail of the token stream for validation: the first 90% of `data`
# (rounded down) becomes the training split and the remainder the validation split.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [4]:
# FT: Context length: the length of the text chunk used for training. Feeding the whole text at once would be computationally expensive.
# FT: Chunking matters for more than performance: we also want the network to learn to predict from short contexts (e.g. a single char).
block_size = 10
# Show block_size + 1 tokens: a chunk of that length yields block_size (context, target) pairs.
train_data[:block_size+1]

tensor([59, 21, 16, 24, 16, 19, 17, 19, 22,  2, 18])

In [5]:
# Inputs are the first block_size tokens; targets are the same window shifted one
# position to the right, so y[t] is the token that follows the prefix x[:t + 1].
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t, target in enumerate(y):
    # Every prefix of the chunk is its own training example.
    context = x[:t + 1]
    print(f'When input is {context} the target is {target}')

When input is tensor([59]) the target is 21
When input is tensor([59, 21]) the target is 16
When input is tensor([59, 21, 16]) the target is 24
When input is tensor([59, 21, 16, 24]) the target is 16
When input is tensor([59, 21, 16, 24, 16]) the target is 19
When input is tensor([59, 21, 16, 24, 16, 19]) the target is 17
When input is tensor([59, 21, 16, 24, 16, 19, 17]) the target is 19
When input is tensor([59, 21, 16, 24, 16, 19, 17, 19]) the target is 22
When input is tensor([59, 21, 16, 24, 16, 19, 17, 19, 22]) the target is 2
When input is tensor([59, 21, 16, 24, 16, 19, 17, 19, 22,  2]) the target is 18


In [6]:
# Build batches of context blocks: several blocks are stacked into one tensor for performance (keeps the GPU busy).
# Note: block_size is rebound here to 8; the earlier single-chunk demo used 10.
torch.manual_seed(1111) # Fixed seed so the random batch sampled below is repeatable
batch_size = 4 # How many independent sequences will we process in parallel?
block_size = 8 # What is the maximum context length for predictions?

def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A pair `(x, y)` of stacked tensors with one row per sampled window.
        Row k of `y` is the window of row k of `x` shifted one position to the
        right in the source data, so `y[k, t]` is the token that follows
        `x[k, :t + 1]`.

    The number of windows and their length are read from the module-level
    `batch_size` and `block_size` at call time.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data

    # The upper bound of randint is exclusive, so even the largest start leaves
    # room for the target window, which ends at start + block_size + 1.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        inputs.append(source[start:start + block_size])           # e.g. source[4:12]
        targets.append(source[start + 1:start + block_size + 1])  # e.g. source[5:13]
    return torch.stack(inputs), torch.stack(targets)

xb, yb = get_batch('train')

# Spell out every training example packed into the batch: for each sequence b,
# each prefix of its inputs is paired with the token that follows that prefix.
for b in range(batch_size):
    for t, target in enumerate(yb[b]):
        context = xb[b, :t+1]
        print(f'When input is {context.tolist()} the target is {target}')


When input is [72] the target is 67
When input is [72, 67] the target is 2
When input is [72, 67, 2] the target is 81
When input is [72, 67, 2, 81] the target is 84
When input is [72, 67, 2, 81, 84] the target is 67
When input is [72, 67, 2, 81, 84, 67] the target is 2
When input is [72, 67, 2, 81, 84, 67, 2] the target is 73
When input is [72, 67, 2, 81, 84, 67, 2, 73] the target is 77
When input is [76] the target is 63
When input is [76, 63] the target is 2
When input is [76, 63, 2] the target is 68
When input is [76, 63, 2, 68] the target is 77
When input is [76, 63, 2, 68, 77] the target is 76
When input is [76, 63, 2, 68, 77, 76] the target is 1
When input is [76, 63, 2, 68, 77, 76, 1] the target is 59
When input is [76, 63, 2, 68, 77, 76, 1, 59] the target is 18
When input is [73] the target is 81
When input is [73, 81] the target is 63
When input is [73, 81, 63] the target is 2
When input is [73, 81, 63, 2] the target is 52
When input is [73, 81, 63, 2, 52] the target is 80
Whe

In [7]:
# Show the input batch followed by the target batch, one tensor per line group.
print(xb, yb, sep='\n')

tensor([[72, 67,  2, 81, 84, 67,  2, 73],
        [76, 63,  2, 68, 77, 76,  1, 59],
        [73, 81, 63,  2, 52, 80, 71, 84],
        [22, 21,  2, 48, 45, 61,  2, 33]])
tensor([[67,  2, 81, 84, 67,  2, 73, 77],
        [63,  2, 68, 77, 76,  1, 59, 18],
        [81, 63,  2, 52, 80, 71, 84, 63],
        [21,  2, 48, 45, 61,  2, 33, 74]])


In [8]:
# Simplest bigram language model: it works on only 2 chars at a time, looking at one char and trying to predict the next.
# A bigram is the (char1, char2) tuple.
from classes.BigramLanguageModel import BigramLanguageModel

model = BigramLanguageModel(vocab_size)
# Call the module itself instead of model.forward(...): nn.Module.__call__ runs
# forward() plus any registered hooks, which a direct forward() call silently skips.
# NOTE(review): assumes BigramLanguageModel subclasses torch.nn.Module (the training
# cell passes model.parameters() to the optimizer) — confirm in classes/BigramLanguageModel.
logits, loss = model(xb, yb)
# The recorded run printed torch.Size([32, 159]) for the logits: 32 rows is
# batch_size * block_size (4 * 8); 159 is presumably vocab_size — TODO confirm.
print(logits.shape)
print(loss)

torch.Size([32, 159])
tensor(5.3450, grad_fn=<NllLossBackward0>)


In [None]:
# Encode a short prompt and wrap it in an outer list so the tensor has a leading
# batch dimension of 1, then let the model continue it and decode the result.
# NOTE(review): the second argument to generate() looks like the number of new
# tokens to append (recorded output shape is [1, 37] for a 7-id prompt) — confirm.
prompt_ids = encode('e brate')
context = torch.tensor([prompt_ids], dtype=torch.long)
prediction = model.generate(context, 30)
print(decode(prediction[0].tolist()))

tensor([[67,  2, 64, 80, 63, 82, 67]])
torch.Size([1, 37])
e brateü•π2≈°‚úã3—öü§£—öƒÜXüòçü§ù5–≤yOZ0üòÄF–∫üëç‚ù§–º*–ªwüòç2≈°


In [None]:
# Train the model with AdamW on random mini-batches drawn from the training split.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
# Rebinding the notebook-global batch_size changes what get_batch() returns from
# here on (32 sequences per batch instead of the 4 used in the demo cells).
batch_size = 32
# Despite its name, `epoch` counts optimisation steps: each iteration trains on
# one freshly sampled batch, not on a full pass over the data.
for epoch in range(10000):
    xb, yb = get_batch('train')
    # Call the module rather than .forward() so nn.Module.__call__ (and any
    # registered hooks) runs; the returned (logits, loss) pair is the same.
    logits, loss = model(xb, yb)
    # Clear gradients left over from the previous step before backpropagating.
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    # Report the loss of the current batch every 100 steps; this is a noisy
    # single-batch value, so it fluctuates from print to print.
    if epoch % 100 == 0:
        print(loss.item())

2.018098831176758
2.21431565284729
2.0775020122528076
2.1575984954833984
1.9534363746643066
2.0726282596588135
2.297912120819092
2.4960439205169678
2.0833330154418945
2.1392390727996826
2.2052831649780273
2.2020103931427
1.9181740283966064
2.2327539920806885
2.0558528900146484
2.1309473514556885
2.194337844848633
2.072873115539551
2.172433376312256
2.106092929840088
2.2093262672424316
2.28464937210083
2.3201231956481934
2.0127007961273193
2.2236480712890625
2.030179977416992
1.8751254081726074
2.1253509521484375
2.373624086380005
2.3188934326171875
2.116074323654175
2.0170481204986572
2.4597511291503906
2.252742290496826
1.9551255702972412
2.292646884918213
2.0085737705230713
1.9547629356384277
2.1476359367370605
2.12552547454834
2.2439870834350586
2.06255841255188
2.202749252319336
2.1941516399383545
2.0588877201080322
2.068826675415039
2.1794629096984863
2.340195894241333
2.2345762252807617
2.2233266830444336
1.9820386171340942
2.2657017707824707
1.9370473623275757
2.2795801162719727

In [29]:
# Sample from the model again with the same prompt as the earlier sampling cell,
# this time passing 100 instead of 30 as the second argument to generate().
prompt_ids = encode('e brate')
context = torch.tensor([prompt_ids], dtype=torch.long)  # outer list adds a batch dimension of 1
prediction = model.generate(context, 100)
# Take the single sequence in the batch as a plain list of ids and decode it.
print(decode(prediction.tolist()[0]))

tensor([[67,  2, 64, 80, 63, 82, 67]])
e bratenipri
[1/1E ko ksalovan
[4 tova sebr
[4 dasekoviman:110/8 bri m si Alipo n:11 s:14/2020202:02 n sile
