In [1]:
# Tiny sample corpus.
text = 'Kupac: Koliko kosta Bosch GBH 141? Prodavac: Postovani, Bosch GBH 141 kosta 10000 RSD.'

# The vocabulary is every distinct character of the corpus, in sorted order.
chars = list(set(text))
chars.sort()
vocab_size = len(chars)
print(''.join(chars), vocab_size, sep='\n')

# Lookup tables between characters and their integer ids.
stoi = dict(zip(chars, range(vocab_size)))  # char -> id, e.g. 'A' -> 0
itos = dict(enumerate(chars))               # id -> char, e.g. 0 -> 'A'

def encode(chars):
    """Map every character of `chars` to its integer id using the `stoi` table.

    Returns a list of ids; a character missing from `stoi` raises KeyError.
    """
    return list(map(stoi.__getitem__, chars))

def decode(chars):
    """Turn a sequence of integer ids back into a string using the `itos` table.

    An id missing from `itos` raises KeyError.
    """
    return ''.join(itos[token_id] for token_id in chars)

# Round trip: encode a sample string to ids, then decode the ids back to text.
encoded = encode('Prodavac: Postovani')
decoded = decode(encoded)
print(encoded, decoded, sep='\n')

 ,.014:?BDGHKPRSacdhiklnoprstuv
31
[13, 26, 24, 18, 16, 30, 16, 17, 6, 0, 13, 24, 27, 28, 24, 30, 16, 23, 20]
Prodavac: Postovani


In [2]:
import torch

# Encode the entire corpus into one flat tensor of 64-bit integer ids.
data = torch.tensor(encode(text), dtype=torch.int64)  # torch.long is an alias of torch.int64
print(data.shape, data.dtype)
print(data)

torch.Size([86]) torch.int64
tensor([12, 29, 25, 16, 17,  6,  0, 12, 24, 22, 20, 21, 24,  0, 21, 24, 27, 28,
        16,  0,  8, 24, 27, 17, 19,  0, 10,  8, 11,  0,  4,  5,  4,  7,  0, 13,
        26, 24, 18, 16, 30, 16, 17,  6,  0, 13, 24, 27, 28, 24, 30, 16, 23, 20,
         1,  0,  8, 24, 27, 17, 19,  0, 10,  8, 11,  0,  4,  5,  4,  0, 21, 24,
        27, 28, 16,  0,  4,  3,  3,  3,  3,  0, 14, 15,  9,  2])


In [3]:
# The first ~90% of the ids are used for training, the remainder for validation.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

In [4]:
# Context length: the size of each chunk of text used for training. Training on the
# whole text at once would be computationally expensive.
# Chunking is not only about performance: it also lets the network learn to predict
# from short contexts (e.g. a single character).
block_size = 10
# Preview one chunk: block_size + 1 consecutive ids from the start of the training data.
train_data[0:block_size + 1]

tensor([12, 29, 25, 16, 17,  6,  0, 12, 24, 22, 20])

In [5]:
# Inputs are the first block_size ids; targets are the same ids shifted by one position.
x, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    # The context grows by one id per step; the target is the id that follows it.
    context, target = x[:t + 1], y[t]
    print(f'When input is {context} the target is {target}')

When input is tensor([12]) the target is 29
When input is tensor([12, 29]) the target is 25
When input is tensor([12, 29, 25]) the target is 16
When input is tensor([12, 29, 25, 16]) the target is 17
When input is tensor([12, 29, 25, 16, 17]) the target is 6
When input is tensor([12, 29, 25, 16, 17,  6]) the target is 0
When input is tensor([12, 29, 25, 16, 17,  6,  0]) the target is 12
When input is tensor([12, 29, 25, 16, 17,  6,  0, 12]) the target is 24
When input is tensor([12, 29, 25, 16, 17,  6,  0, 12, 24]) the target is 22
When input is tensor([12, 29, 25, 16, 17,  6,  0, 12, 24, 22]) the target is 20


In [6]:
# Batching: several context blocks are stored in a single tensor so they can be
# processed together, which is done for performance (keeps GPUs busy).
# Fixed seed for the random batch sampling below.
torch.manual_seed(1111)
# Number of independent sequences processed in parallel.
batch_size = 4
# Maximum context length for predictions.
block_size = 8

def get_batch(split):
    """Sample a random batch of (input, target) blocks from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        A tuple (x, y) of tensors, each of shape (batch_size, block_size).
        y is x shifted by one position, so y[b, t] is the id that follows
        x[b, t] in the source data.

    Raises:
        ValueError: if the selected split holds fewer than block_size + 1 ids,
            so that not even one full (input, target) window fits.
    """
    data = train_data if split == 'train' else val_data
    # A start offset i is valid only if data[i + 1:i + block_size + 1] is a full
    # window, i.e. i ranges over 0 .. len(data) - block_size - 1.
    n_starts = len(data) - block_size
    if n_starts <= 0:
        # Without this guard torch.randint fails with an opaque range error.
        raise ValueError(
            f'split {split!r} has {len(data)} ids but block_size={block_size} '
            f'needs at least {block_size + 1}'
        )
    ix = torch.randint(0, n_starts, (batch_size,))  # batch_size random start offsets
    x = torch.stack([data[i:i + block_size] for i in ix])          # e.g. data[4:12], data[5:13], ...
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])  # e.g. data[5:13], data[6:14], ...
    return x, y

xb, yb = get_batch('train')

# Print every (context, target) pair contained in the sampled batch.
for b in range(batch_size):        # one row per sampled sequence
    for t in range(block_size):    # one position per step inside the row
        context, target = xb[b, :t + 1], yb[b, t]
        print(f'When input is {context.tolist()} the target is {target}')


When input is [10] the target is 8
When input is [10, 8] the target is 11
When input is [10, 8, 11] the target is 0
When input is [10, 8, 11, 0] the target is 4
When input is [10, 8, 11, 0, 4] the target is 5
When input is [10, 8, 11, 0, 4, 5] the target is 4
When input is [10, 8, 11, 0, 4, 5, 4] the target is 7
When input is [10, 8, 11, 0, 4, 5, 4, 7] the target is 0
When input is [5] the target is 4
When input is [5, 4] the target is 0
When input is [5, 4, 0] the target is 21
When input is [5, 4, 0, 21] the target is 24
When input is [5, 4, 0, 21, 24] the target is 27
When input is [5, 4, 0, 21, 24, 27] the target is 28
When input is [5, 4, 0, 21, 24, 27, 28] the target is 16
When input is [5, 4, 0, 21, 24, 27, 28, 16] the target is 0
When input is [13] the target is 26
When input is [13, 26] the target is 24
When input is [13, 26, 24] the target is 18
When input is [13, 26, 24, 18] the target is 16
When input is [13, 26, 24, 18, 16] the target is 30
When input is [13, 26, 24, 18, 16

In [7]:
# Show the input batch followed by the target batch.
print(xb, yb, sep='\n')

tensor([[10,  8, 11,  0,  4,  5,  4,  7],
        [ 5,  4,  0, 21, 24, 27, 28, 16],
        [13, 26, 24, 18, 16, 30, 16, 17],
        [24, 27, 17, 19,  0, 10,  8, 11]])
tensor([[ 8, 11,  0,  4,  5,  4,  7,  0],
        [ 4,  0, 21, 24, 27, 28, 16,  0],
        [26, 24, 18, 16, 30, 16, 17,  6],
        [27, 17, 19,  0, 10,  8, 11,  0]])


In [8]:
# Simplest bigram language model: it works on only two characters at a time, looking at one and trying to predict the next.
# A bigram is a (char1, char2) tuple.
from classes.BigramLanguageModel import BigramLanguageModel

# Build the model and run one forward pass on the sampled batch.
model = BigramLanguageModel(vocab_size)
# Call the module itself instead of .forward(): for a torch.nn.Module, __call__
# also runs any registered hooks, which a direct forward() call silently skips.
# NOTE(review): assumes BigramLanguageModel subclasses torch.nn.Module - confirm.
logits, loss = model(xb, yb)
print(logits.shape)
print(loss)

torch.Size([32, 31])
tensor(3.7138, grad_fn=<NllLossBackward0>)


In [14]:
# Ask the model to generate from the current input batch; 30 is passed as the
# second argument to generate().
# NOTE(review): presumably 30 is the number of new ids appended per row - confirm
# against BigramLanguageModel.generate.
prediction = model.generate(xb, 30)
print(prediction.shape)
# Decode only the first row of the result back into text.
print(decode(prediction[0].tolist()))

torch.Size([4, 38])
GBH 141?B,lHHp,s.HRki. uiScd4PGaauiPrd
