In [1]:
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open until
# the garbage collector gets to it.
with open('input.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [2]:
# Peek at the first 1000 characters to sanity-check that the corpus loaded.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
# --- character-level tokenizer ---
# NOTE: despite its name, `vocab_size` is the sorted list of unique characters
# found in the corpus; the number of symbols is `vocab_length`. Both names are
# kept unchanged so that other cells referring to them keep working.
vocab_size = sorted(set(text))
vocab_length = len(vocab_size)
print(f'We have vocab size of: {len(vocab_size)}')

# Lookup tables: character -> integer id, and integer id -> character.
stoi = {s: i for i, s in enumerate(vocab_size)}
itos = dict(enumerate(vocab_size))


def encoder(f):
    """Encode a string as a list of integer token ids, one id per character.

    Raises KeyError if `f` contains a character that is not in `stoi`.
    """
    return [stoi[ch] for ch in f]


def decoder(f):
    """Decode an iterable of integer token ids back into a string.

    Raises KeyError if an id is not present in `itos`.
    """
    return ''.join(itos[i] for i in f)


# Round-trip sanity check: decoding the encoded string must give it back.
x = encoder('hello')
print(x)
y = decoder(x)
print(y)



We have vocab size of: 65
[46, 43, 50, 50, 53]
hello


In [4]:
import torch
# Turn the whole corpus into one flat tensor of 64-bit integer token ids.
data = torch.tensor(encoder(text), dtype=torch.int64)
print(data.size())  # total number of tokens (characters) in the corpus
print(data[0:10])   # ids of the first ten characters


torch.Size([1115394])
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47])


In [5]:
# Hold out the last 10% of the tokens for validation; train on the first 90%.
n = int(0.9 * len(data))
train_data, val_data = data[:n], data[n:]

# Maximum context length (in tokens) used for one training sequence.
block_size = 8

# A chunk of block_size + 1 consecutive tokens packs block_size training
# examples: y is x shifted by one token, so y[i] is the token that follows
# the prefix x[:i + 1].
x = train_data[:block_size]
y = train_data[1:block_size + 1]

# Print each (context -> target) pair contained in that single chunk.
for i in range(block_size):
    context, target = x[:i + 1], y[i]
    print(f'{context=}=={target=}')

context=tensor([18])==target=tensor(47)
context=tensor([18, 47])==target=tensor(56)
context=tensor([18, 47, 56])==target=tensor(57)
context=tensor([18, 47, 56, 57])==target=tensor(58)
context=tensor([18, 47, 56, 57, 58])==target=tensor(1)
context=tensor([18, 47, 56, 57, 58,  1])==target=tensor(15)
context=tensor([18, 47, 56, 57, 58,  1, 15])==target=tensor(47)
context=tensor([18, 47, 56, 57, 58,  1, 15, 47])==target=tensor(58)


In [6]:
# Batch configuration used by get_batch().
block_size = 8  # context length: tokens per sequence
batch_size = 4  # number of independent sequences per batch


def get_batch(label):
    """Sample a random batch of (input, target) sequences.

    Args:
        label: 'train' draws from `train_data`; any other value draws from
            `val_data`.

    Returns:
        A tuple (x, y) of tensors, each of shape (batch_size, block_size).
        y holds the same windows as x shifted by one token, so y[b, t] is the
        token that follows x[b, t] in the source data.

    The module-level `batch_size` and `block_size` are read at call time.
    """
    source = train_data if label == 'train' else val_data
    # A window starting at i needs tokens i .. i + block_size (inclusive) for
    # the shifted targets, so the last valid start is
    # len(source) - block_size - 1. torch.randint's upper bound is exclusive,
    # hence `len(source) - block_size` (the previous bound skipped the final
    # valid window and failed on a source of exactly block_size + 1 tokens).
    ix = torch.randint(len(source) - block_size, (batch_size,))
    # Stack the per-sample 1-D windows into 2-D (batch_size, block_size) tensors.
    x = torch.stack([source[i:i + block_size] for i in ix])
    y = torch.stack([source[i + 1:i + block_size + 1] for i in ix])
    return x, y
    
# Draw one training batch and display it: `xb` holds the inputs and `xy` the
# matching next-token targets.
xb, xy = get_batch('train')
print(f'inputs: {xb}')
print(f'outputs: {xy}')

# Spell out every (context -> target) example packed into the batch: each row
# contributes block_size examples, one per prefix length.
for inputs_row, targets_row in zip(xb, xy):
    for j in range(block_size):
        context, target = inputs_row[:j + 1], targets_row[j]
        print(f'when context is: {context.tolist()} target is: {target}')


inputs: tensor([[47, 57,  1, 52, 39, 51, 43, 10],
        [ 1, 49, 52, 53, 61,  1, 58, 46],
        [49, 57,  8,  0,  0, 23, 21, 26],
        [56, 43,  2,  0,  0, 13, 26, 19]])
outputs: tensor([[57,  1, 52, 39, 51, 43, 10,  1],
        [49, 52, 53, 61,  1, 58, 46, 43],
        [57,  8,  0,  0, 23, 21, 26, 19],
        [43,  2,  0,  0, 13, 26, 19, 17]])
when context is: [47] target is: 57
when context is: [47, 57] target is: 1
when context is: [47, 57, 1] target is: 52
when context is: [47, 57, 1, 52] target is: 39
when context is: [47, 57, 1, 52, 39] target is: 51
when context is: [47, 57, 1, 52, 39, 51] target is: 43
when context is: [47, 57, 1, 52, 39, 51, 43] target is: 10
when context is: [47, 57, 1, 52, 39, 51, 43, 10] target is: 1
when context is: [1] target is: 49
when context is: [1, 49] target is: 52
when context is: [1, 49, 52] target is: 53
when context is: [1, 49, 52, 53] target is: 61
when context is: [1, 49, 52, 53, 61] target is: 1
when context is: [1, 49, 52, 53, 61, 1]

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class BiagramModel(nn.Module):
    """Bigram language model: next-token logits depend only on the current token.

    The whole model is a single embedding table; row `i` of the table holds
    the unnormalised next-token scores (logits) for token id `i`.

    Args:
        xdim: number of rows in the table, i.e. how many distinct token ids
            can be looked up (ids must lie in [0, xdim)).
        ydim: size of each row, i.e. how many logits are produced per token.
            Cross-entropy compares these logits against target ids, so every
            target id must be < ydim. For next-token prediction both
            arguments are therefore normally the vocabulary size.
    """

    def __init__(self, xdim, ydim):
        super().__init__()
        self.token_embedding_table = nn.Embedding(xdim, ydim)

    def forward(self, idx, targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            idx: (B, T) integer tensor of token ids.
            targets: optional (B, T) integer tensor of next-token ids.

        Returns:
            A tuple (logits, loss). Without targets, logits has shape
            (B, T, C) and loss is None. With targets, logits is flattened to
            (B * T, C) and loss is a scalar tensor.
        """
        logits = self.token_embedding_table(idx)  # (B, T, C)
        if targets is None:
            return logits, None

        B, T, C = logits.shape
        # F.cross_entropy takes (N, C) logits and (N,) class indices, so the
        # batch and time dimensions are merged. reshape (unlike view) also
        # accepts non-contiguous tensors.
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_tokens):
        """Extend each sequence in `idx` by `max_tokens` sampled tokens.

        Args:
            idx: (B, T) integer tensor holding the starting context.
            max_tokens: number of tokens to append.

        Returns:
            (B, T + max_tokens) integer tensor: the context followed by the
            sampled continuation.
        """
        for _ in range(max_tokens):
            logits, _ = self(idx)  # no targets, so the loss slot is None
            # Keep only the prediction made at the last time step: (B, C).
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            # Draw one token id per sequence from that distribution: (B, 1).
            idx_next = torch.multinomial(probs, num_samples=1)
            # Append along the time dimension (dim=1): (B, T) + (B, 1) -> (B, T + 1).
            idx = torch.cat((idx, idx_next), dim=1)

        return idx


# Build the model with a vocabulary-sized table in both dimensions and run it
# once on the current batch to check the output shape and the initial loss.
m = BiagramModel(xdim=vocab_length, ydim=vocab_length)
out, loss = m(xb, xy)
print(out.shape)
print(loss.item())

# Sample 200 tokens from the model, starting from a single token with id 0.
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled_ids = m.generate(start_idx, 200)[0].tolist()
print(decoder(sampled_ids))
# With 65 symbols, a model that predicts uniformly would score
# -ln(1/65) ~= 4.174. The freshly initialised model scores worse than that,
# which suggests the initial weights should be normalised / scaled down.

torch.Size([256, 65])
4.820847034454346

N3RgbEzo$eSD&FzydNMx.:aCM3MVXjfmFMkXjhyKM$bYeS.b&Ot'KZkDrAYGz3eYIldNILLQWpZFUnO K-KYyTpuPfzxBmqmC!.CbK?uUgMW3SdH;&ku3OPiUiTnUKqb:QYRQWgN:Nx'dupE;ZrOIIbU&KA&AMFrfzWyx?LcjbxXgWb:opN!,ZbE:u3K:oRXzK&wwCa$


In [19]:
# Adam optimizer over all parameters of `m`, with a learning rate of 1e-3.
optimizer = torch.optim.Adam(m.parameters(), lr=1e-3)

In [23]:
# Train with a larger batch; get_batch() reads this global on every call.
batch_size = 32

for steps in range(10000):
    # Fresh random training batch for this step.
    xb, xy = get_batch('train')

    # Clear stale gradients, then forward -> backward -> parameter update.
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, xy)
    loss.backward()
    optimizer.step()

# Loss of the final training batch only (a noisy, single-batch estimate).
print(loss.item())

2.430536985397339


In [30]:
# Sample 200 tokens from `m`, starting from a single token with id 0, and decode them to text.
print(decoder(m.generate(torch.zeros((1, 1), dtype=torch.long), 200)[0].tolist()))


LAloitemariarderecos iend CENCO:
II ES: is d?
AMul g sweas ingseveven:

ANorwin'ss:
S:
S:
ist bur the avigembed If; d t:
ngousin fin y'llld h'ste thapert t y nowaind y My hest;
Ange:

CERueve akiamey 


In [70]:
#### attention section ####
# Toy input for the averaging experiments below:
# B sequences (batch), T positions per sequence (time), C features per position (channels).
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
x.shape


torch.Size([4, 8, 2])

In [71]:
# "xbow" = x bag-of-words: for every position, the average of the feature
# vectors of that position and all positions before it. Averaging is the
# simplest way to let a position summarise its history, but it is lossy: the
# order and individual identity of the earlier tokens are thrown away.
xbow = torch.zeros((B, T, C))

for b in range(B):
    for t in range(T):
        history = x[b, :t + 1]                    # (t + 1, C): positions 0..t
        xbow[b, t] = torch.mean(history, dim=0)   # (C,): mean over those positions
xbow.shape

torch.Size([4, 8, 2])

In [73]:
# Same causal average as `xbow`, computed with one matrix multiplication
# instead of a Python double loop.
torch.manual_seed(3)
# Lower-triangular ones: row t has ones in columns 0..t and zeros for the
# future positions, so a position never receives information from the future.
lower_tri = torch.tril(torch.ones(T, T))
# Divide each row by its sum so the row becomes a uniform distribution over
# the positions it is allowed to see.
wei = lower_tri / lower_tri.sum(dim=1, keepdim=True)
print(wei)
# (T, T) @ (B, T, C): wei is broadcast over the batch dimension -> (B, T, C).
wout = torch.matmul(wei, x)
# Check that the matrix form reproduces the loop result.
torch.allclose(wout, xbow)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [79]:
# Third formulation of the causal average, this time via a masked softmax.
# This is the form attention uses: `wei` starts as all zeros here, but it can
# later be replaced by data-dependent scores, so that some tokens attend more
# strongly to particular earlier tokens than to others.
tril = torch.tril(torch.ones((T, T)))
# Interaction strengths between positions; zeros mean "all equally interesting".
wei = torch.zeros((T, T)).masked_fill(tril == 0, float('-inf'))
# Softmax exponentiates and normalises each row: exp(-inf) = 0 removes the
# masked (future) positions, exp(0) = 1 gives equal weight to the visible ones.
wei = torch.softmax(wei, dim=1)
print(wei)
xbow2 = torch.matmul(wei, x)
# Check that this matches the loop-based result.
torch.allclose(xbow2, xbow)



tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


True

In [111]:
# ---- single-head self-attention on random data ----
# The statement order matters: the random input and the three linear layers all draw from the
# RNG seeded on the next line.
torch.manual_seed(1337)
B, T, C = 4, 8, 32
x = torch.randn(B,T,C)
head_size = 16
# Three bias-free linear projections, each mapping the C input channels to head_size values.
key = nn.Linear(C, head_size, bias=False) 
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
# Single-head attention: every position emits a query ("what am I looking for?") and a key
# ("what do I contain?"), both obtained by projecting x. The dot product of a query with a key
# measures how well they match; the larger it is, the more that position contributes to the
# weighted sum computed below.

k = key(x) # (B, T, head_size)
q = query(x) # (B, T, head_size)

# Intuition: think of head_size as the number of questions we ask and T as the number of people
# answering them. Every question is put to every person. C is how much detail (how many words)
# each person has available to answer with: a larger C allows more detailed answers, and a
# larger head_size allows more questions, both helping to capture the context.

# (B, T, head_size) @ (B, T, head_size) is not a valid matrix product, so the last two
# dimensions of k are swapped to give (B, head_size, T).
wei = q @ k.transpose(-2, -1) # (B, T, T): how strongly each query aligns with each key

tril = torch.tril(torch.ones((T,T)))
# wei is deliberately not initialised with torch.zeros((T, T)) here: the weights should be
# data-dependent, i.e. come from q @ k^T.
wei = wei.masked_fill(tril == 0, float('-inf')) # (B, T, T); future positions are set to -inf
wei = F.softmax(wei, dim=-1) # (B, T, T); rows sum to 1 and masked entries become 0
print(wei[0])
print(wei.shape)

v = value(x) # (B, T, head_size)
# The value vector is the information a position actually hands over. Queries and keys only
# decide how much each position matters (the weights in wei); the values are what gets
# aggregated with those weights.


out = wei @ v # (B, T, head_size)
# Notes on attention:
#
# 1. Every token (here: character) has an information vector, and attention updates it with a
#    weighted sum over all nodes that point to it. With the causal mask, the first node only
#    receives from itself, the second from the first and itself, the third from the first,
#    second and itself, and so on; the last node (index T - 1) aggregates over all nodes.
#
# 2. Attention is a general communication mechanism and can be applied to any arbitrary
#    directed graph of nodes.
#
# 3. The nodes have no notion of where they are in the sequence, so position has to be
#    encoded into the inputs separately.
#
# 4. Examples in a batch never talk to each other: attention is computed independently for
#    each of the B entries of the (B, T, T) weight tensor.
#
# 5. The triangular mask makes this a "decoder" block: text is processed left to right and a
#    token never looks at future tokens. For tasks such as sentiment analysis the masking line
#    can be removed, so that all tokens communicate with each other.
#
# 6. This is "self-attention": keys, queries and values are all produced from the same input x.
#    In "cross-attention" the queries are still produced from x, but the keys and values come
#    from a separate source (for example an encoder block whose context we want to condition
#    on). It is used when there is a separate set of nodes to pool information from.
#
# 7. NOTE(review): one term from the paper's formula, softmax(Q @ K^T / sqrt(d_k)) @ V with
#    d_k = head_size, is missing here. Without the 1/sqrt(d_k) scaling, large scores push the
#    softmax towards a one-hot vector at initialisation, so each position would aggregate
#    information from essentially a single node. That is undesirable, especially at init; the
#    scaling keeps the scores in a moderate range before the softmax.
#    TODO: multiply wei by head_size ** -0.5 before masking and softmax.


print(out[0])



tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)
torch.Size([4, 8, 8])
tensor([[-0.1571,  0.8801,  0.1615, -0.7824, -0.1429,  0.7468,  0.1007, -0.5239,
         -0.8873,  0.1907,  0.1762, -0.5943, -0.4812, -0.4860,  0.2862,  0.5710],
        [ 0.6764, -0.5477, -0.2478,  0.3143, -0.1280, -0.2952, -0.4296, -0.1089,
         -0.0493,  0.7268,  0.7130, -0.1164,  0.3266,  0.3431, -0.0710,  1.2716],
        [ 0.4823, -0.1069

In [109]:
# Small matrix-multiplication refresher with random integers in {0, 1, 2}:
# a (2, 4) matrix times a (4, 3) matrix gives a (2, 3) matrix.
a = torch.randint(0, 3, size=(2, 4))
b = torch.randint(0, 3, size=(4, 3))
print(a)
print(b)
c = torch.matmul(a, b)
print(c)


tensor([[2, 1, 0, 0],
        [0, 2, 0, 2]])
tensor([[2, 2, 1],
        [1, 2, 1],
        [0, 2, 2],
        [0, 0, 0]])
tensor([[5, 6, 3],
        [2, 4, 2]])
