In [41]:
import requests

# URL for the Tiny Shakespeare dataset
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

# Download the dataset
response = requests.get(url)
data = response.text

# Save the dataset to a local file
with open('tiny_shakespeare.txt', 'w') as file:
    file.write(data)

print("Dataset downloaded and saved as tiny_shakespeare.txt.")


Dataset downloaded and saved as tiny_shakespeare.txt.


In [1]:
with open('tiny_shakespeare.txt', 'r') as file:
    text = file.read()

In [2]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)

65


In [3]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

In [4]:
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join([itos[i] for i in l])

In [5]:
decode([17,33])

'EU'

In [6]:
import torch
data = torch.tensor(encode(text), dtype=torch.long)



pasplitint i test ir train

In [7]:
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [8]:
block_size = 8
train_data[:block_size+1]

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [9]:
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f'when input is {context} then target is {target}')

when input is tensor([18]) then target is 47
when input is tensor([18, 47]) then target is 56
when input is tensor([18, 47, 56]) then target is 57
when input is tensor([18, 47, 56, 57]) then target is 58
when input is tensor([18, 47, 56, 57, 58]) then target is 1
when input is tensor([18, 47, 56, 57, 58,  1]) then target is 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) then target is 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) then target is 58


taip gauname visus atvejus, kuriuos panaudoja transsformeris besimokydamas, kadangi pirma ima character 2 su kontekstu 1 ir tt, kol praeina viso bloko konteksta

In [11]:
torch.manual_seed(1337)
batch_size = 4 # how many independant sequences we will process in parallel 
block_size = 8 # what is the maximum context for those batch sequences

def get_batch(split):
    #generating a small batch of data of inputs for x and y
    data = train_data if split == 'train' else val_data # we ste our data set
    ix = torch.randint(len(data) - block_size, (batch_size,)) # here we randomize 4 (since this is the batch size) x's to have as a starting point 
    x = torch.stack([data[i:i+block_size] for i in ix]) # we fill in the full x vector from the one we randomized
    y = torch.stack([data[i+1:i+block_size+1] for i in ix]) # we fill in the y vector to be able to follow the context
    return x,y

xb, yb = get_batch('train')
print('inputs:')
print(xb.shape)
print(xb)
print('targets:')
print(yb.shape)
print(yb)
print('-----')


for b in range(batch_size): # batch dimnesion
    for t in range(block_size): # block or time dimension 
        context = xb[b, :t+1]
        target = yb[b,t]
        print(f'when input is {context.tolist} then target is {target}')


inputs:
torch.Size([4, 8])
tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])
targets:
torch.Size([4, 8])
tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
-----
when input is <built-in method tolist of Tensor object at 0x12fc33590> then target is 43
when input is <built-in method tolist of Tensor object at 0x12fc334d0> then target is 58
when input is <built-in method tolist of Tensor object at 0x1068d2690> then target is 5
when input is <built-in method tolist of Tensor object at 0x12fc33590> then target is 57
when input is <built-in method tolist of Tensor object at 0x12fc334d0> then target is 1
when input is <built-in method tolist of Tensor object at 0x1068d2690> then target is 46
when input is <built-in method tolist of Tensor object at 0x12fc33590

In [14]:
print(xb.shape) # input for transformer
# THIS IS LATER REFERED TO AS IDX in the constructor

torch.Size([4, 8])


here we construct a embeding table that will hold the values of relations between the characetrs
b*t so that we would see how all of the different independant batches affect each other so we strech thee logit table to contain all of the batch exmaples


In [25]:
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()
        # each token directly looks up next token from lookup array of all token relation values
        self.token_embeding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, idx, targets=None):

        #idx and targets are both (B,T) tensor of integers
        logits = self.token_embeding_table(idx) #(B,T,C)
        #B - batch size, how many sequences we look over
        #T - time, which means how long of a context we look into
        #C - channels or how many different characters we have

        if targets is None:
            loss = None
        else:
        
            B, T, C = logits.shape # this is needed after the investigation of loss function required inputs 
            logits = logits.view(B*T, C) 
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss


    def generate(self, idx, max_new_tokens):
        #idx is (B,T) array of indices in the current context

        for _ in range(max_new_tokens):
            #get the predictions 
            logits, loss = self(idx)
            #focus only on the last time step
            #we reduce dimensionality to 2 dimension basically since we need only the last output to make the new token prediction
            logits = logits[:, -1, :] # becomes B,C
            #apply softmax to probabilities
            probs = F.softmax(logits, dim=-1)
            #sample from distribution 
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            #append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)

        return idx

m = BigramLanguageModel(vocab_size)
logits, loss = m(xb, yb)
print(loss)

tensor(4.8786, grad_fn=<NllLossBackward0>)


In [26]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))


SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


since the output is random (giving the result based on no training), it is bascially useless. Thus we need to train the created embeding architecture in order for it to give out meaningful results, or in this case generated text.
This will be started by implementing training process based on adamW optimizer.

In [27]:
#create pytorch optimizer 
optimizer = torch.optim.AdamW(m.parameters(), lr=1e-3)

In [31]:
batch_size = 32
for steps in range(10000):

    #first we need to sample the training data 
    xb, yb = get_batch('train')
    
    #then we evaluate the loss 
    logits, loss = m(xb, yb)
    #we zeroing out all of the gradients from the previous step
    #this is essential since after every iteration we want to have cleared
    #so that the newly evaluated gradients accumulates only for that specific iteration
    #and calling this function is necessary to clear the gradients before each backpropagation pass
    optimizer.zero_grad(set_to_none=True) #set_to_none is used to have them not as 0's but rather as none values
    #then we compute all of the gradients 
    loss.backward()
    #then to recalculate all of the weights and biases using the calculated gradients
    optimizer.step()

    print(loss.item())

2.599949359893799
2.4685709476470947
2.467829704284668
2.448638439178467
2.536531686782837
2.4843695163726807
2.353912591934204
2.437197208404541
2.446620225906372
2.39513897895813
2.401313066482544
2.5695745944976807
2.5136396884918213
2.45255708694458
2.389624834060669
2.4585049152374268
2.368389844894409
2.4304115772247314
2.3746204376220703
2.5275089740753174
2.4111266136169434
2.322298288345337
2.494917869567871
2.5799038410186768
2.433724880218506
2.308471918106079
2.3241360187530518
2.534543037414551
2.4952995777130127
2.397226095199585
2.4748382568359375
2.437386989593506
2.544435501098633
2.4318687915802
2.4357409477233887
2.3984274864196777
2.548039197921753
2.375275135040283
2.3791306018829346
2.5857717990875244
2.351057291030884
2.503523826599121
2.4406611919403076
2.571166753768921
2.5360209941864014
2.512481927871704
2.473360300064087
2.5368430614471436
2.4201526641845703
2.4292309284210205
2.431366443634033
2.601490020751953
2.5642662048339844
2.419848680496216
2.4692595

In [36]:
print(decode(m.generate(idx = torch.zeros((1,1), dtype=torch.long), max_new_tokens=100)[0].tolist()))



This!
So tean fo I, t t tomavest-ind mom artotithitt
Tut en inhed is wengsi'd ag tenobuteefou pat.



self attention block

In [38]:
torch.manual_seed(1337)

B,T,C = 4,8,2
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

In [None]:
#at this time we want to evaluate the mean of previous tokens 'atention' 
#x[b,t] = mean{i<=t}x[b,i]
xbow = torch.zeros((B,T,C))
for b in range(B):
    for t in range(T):
        xprev = x[b, :t+1] # (t,C)
        # (t,C) is of dimension where t is all of the elements in the past and then C all the two dimensional information from these tokens
        xbow[b, t] = torch.mean(xprev, 0) # 0 stands for 0th dimension


but this is very inefficient and there is a better solution how to get the same result using matrix multiplication
this is done using matrix multiplication and diagonal matrixes


In [None]:
wei = torch.tril(torch.ones(T, T))
wei = wei / wei.sum(1, keep_dim=True)
xbow2 = wei @ x # (B (CREATED AUTO), T, T) @ (B, T, C) ---> (B, T, C) => xbow2 == xbow

and we can improve it even more 

In [None]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))
# this is preventing the communication of the tokens from the future and thus keeping the infomration flow only between the tokens in the past
wei = wei.masked_fill(tril == 0, float('-inf')) 

wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x


self attention block
the main principle of self attention is to find which past values are relevant for the specific token 
this main problem is solved using key and query vectors, where 
    keyt vector is the tensor that defines what does this token contain 
    query vector contains the information about what are we looking for 

after that the dot product of these vectors is what matters for attention evalution, because if the two vectors alling thus we can say that the information from that key is relevant in the prediction of the token

and then after this is applied we evaluate the vector v, which is the vector that represents the elements that we aggregate

In [47]:
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B, T, C) #batch, time, channel

#adding the head of self attention 
head_size = 16
key = nn.Linear(C, head_size, bias=F)
query = nn.Linear(C, head_size, bias=F)
value = nn.Linear(C, head_size, bias=F)
k = key(x)      # (B, T, 16)
q = query(x)    # (B, T, 16)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T)



tril = torch.tril(torch.ones(T,T))
#wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf')) 
wei = F.softmax(wei, dim=-1)

v = value(x)
#out = wei @ x


out = wei @ v

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.8368, 0.1632, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0585, 0.2252, 0.7164, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4540, 0.0818, 0.3104, 0.1538, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0442, 0.1218, 0.1840, 0.3724, 0.2776, 0.0000, 0.0000, 0.0000],
        [0.0214, 0.1360, 0.1688, 0.0460, 0.5743, 0.0536, 0.0000, 0.0000],
        [0.0592, 0.0265, 0.4947, 0.0623, 0.2254, 0.0120, 0.1199, 0.0000],
        [0.0351, 0.0286, 0.3691, 0.1940, 0.1005, 0.0118, 0.0331, 0.2277]],
       grad_fn=<SelectBackward0>)

In [None]:
from typing import Any


class BatchNorm1d:

    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        #parameters
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)
    
    def __call__(self, x):
        xmean = x.mean(1, keepdim=True)
        xvar = x.var(1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalization to unit vairance
        self.out = self.gamma * xhat + self.beta
        return self.out 
    
    def parameters(self):
        return [self.gamma, self.beta]
    
    
        
        








