<a href="https://colab.research.google.com/github/dhruvchopra2003/Paper2/blob/main/LLM_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-09-18 13:16:10--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-09-18 13:16:10 (26.0 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 16 # how many independent sequences will we process in
block_size = 32 # maximum context length for
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200

#_________________

torch.manual_seed(1337)
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Here we are trying to create a character level language
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]

decode = lambda l: ''.join([itos[i] for i in l])

# train test split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading
def get_batch(split):
  # Generate small batches of data of inputs x and y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])

  # This is being done to push the tensors onto the GPU to accelerate training
  x, y = x.to(device), y.to(device)

  return x, y


#This is basically to optimize pytorch. It tells that whatever intermediate
# variables are created, don't store them, coz we're never gonna call backwards
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out

# The bigram language model
class BigramLanguageModel(nn.Module):
  def __init__(self, vocab_size):
    super().__init__()
    # each token directly reads from the logits for the next token from the lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

  def forward(self, idx, targets=None):
    # idx and targets are both (B, T) tensor of integers
    logits = self.token_embedding_table(idx) # (B, T, C)

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current
    for _ in range(max_new_tokens):
      # get the prediction
      logits, loss = self(idx)
      # Focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # Apply softmax to get probabilities
      probs = F.softmax(logits, dim=-1) # (B, C)
      # sample from the distribution
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # Applied sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

model = BigramLanguageModel(vocab_size)
m = model.to(device)

# Create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
  xb, yb = get_batch('train')
  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# Generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))



step 0: train loss 4.7260, val loss 4.7259
step 100: train loss 4.5986, val loss 4.5953
step 200: train loss 4.4763, val loss 4.4755
step 300: train loss 4.3572, val loss 4.3574
step 400: train loss 4.2446, val loss 4.2459
step 500: train loss 4.1332, val loss 4.1303
step 600: train loss 4.0279, val loss 4.0317
step 700: train loss 3.9268, val loss 3.9321
step 800: train loss 3.8344, val loss 3.8387
step 900: train loss 3.7425, val loss 3.7476
step 1000: train loss 3.6578, val loss 3.6587
step 1100: train loss 3.5823, val loss 3.5821
step 1200: train loss 3.5049, val loss 3.5055
step 1300: train loss 3.4314, val loss 3.4358
step 1400: train loss 3.3648, val loss 3.3694
step 1500: train loss 3.2985, val loss 3.3034
step 1600: train loss 3.2409, val loss 3.2487
step 1700: train loss 3.1900, val loss 3.1855
step 1800: train loss 3.1357, val loss 3.1408
step 1900: train loss 3.0850, val loss 3.0887
step 2000: train loss 3.0422, val loss 3.0514
step 2100: train loss 3.0007, val loss 3.0041


## Mathematical Logic behind self attention

In [None]:
torch.manual_seed(1337)
B,T,C = 4,8,2 # batch, time, channels
x = torch.randn(B,T,C)
x.shape

torch.Size([4, 8, 2])

Now we need the tokens to interact with each other. We basically want each token to have some representation in terms of it's preceeding tokens. Simplest way is to take the average of the channels of each predecessor.

In [None]:
# We want x[b,t] = mean_{i<=t} x[b,i]
xbow = torch.zeros((B,T,C)) # bow aka: bag of words -> used for averaging a set of words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        xbow[b,t] = torch.mean(xprev, 0)


In [None]:
xbow

tensor([[[ 0.1808, -0.0700],
         [-0.0894, -0.4926],
         [ 0.1490, -0.3199],
         [ 0.3504, -0.2238],
         [ 0.3525,  0.0545],
         [ 0.0688, -0.0396],
         [ 0.0927, -0.0682],
         [-0.0341,  0.1332]],

        [[ 1.3488, -0.1396],
         [ 0.8173,  0.4127],
         [-0.1342,  0.4395],
         [ 0.2711,  0.4774],
         [ 0.2421,  0.0694],
         [ 0.0084,  0.0020],
         [ 0.0712, -0.1128],
         [ 0.2527,  0.2149]],

        [[-0.6631, -0.2513],
         [ 0.1735, -0.0649],
         [ 0.1685,  0.3348],
         [-0.1621,  0.1765],
         [-0.2312, -0.0436],
         [-0.1015, -0.2855],
         [-0.2593, -0.1630],
         [-0.3015, -0.2293]],

        [[ 1.6455, -0.8030],
         [ 1.4985, -0.5395],
         [ 0.4954,  0.3420],
         [ 1.0623, -0.1802],
         [ 1.1401, -0.4462],
         [ 1.0870, -0.4071],
         [ 1.0430, -0.1299],
         [ 1.1138, -0.1641]]])

In [None]:
x[0]

tensor([[ 0.1808, -0.0700],
        [-0.3596, -0.9152],
        [ 0.6258,  0.0255],
        [ 0.9545,  0.0643],
        [ 0.3612,  1.1679],
        [-1.3499, -0.5102],
        [ 0.2360, -0.2398],
        [-0.9211,  1.5433]])

In [None]:
xbow[0] # each row is the average of all the elements above it (in the same column)
# This implies that the last row has the average of the entire set

tensor([[ 0.1808, -0.0700],
        [-0.0894, -0.4926],
        [ 0.1490, -0.3199],
        [ 0.3504, -0.2238],
        [ 0.3525,  0.0545],
        [ 0.0688, -0.0396],
        [ 0.0927, -0.0682],
        [-0.0341,  0.1332]])

In [None]:
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=False)
a

tensor([[1.0000, 0.0000, 0.0000],
        [1.0000, 0.5000, 0.0000],
        [1.0000, 0.5000, 0.3333]])

In [None]:
# Now we can be much more efficient doing this by using matrix multiplication
torch.manual_seed(42)
a = torch.tril(torch.ones(3,3))
a = a / torch.sum(a, 1, keepdim=True)
b = torch.randint(0, 10, (3,2)).float()
c = a @ b
print('a=')
print(a)
print('--')
print('b=')
print(b)
print('--')
print('c=')
print(c)

a=
tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
--
b=
tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
--
c=
tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


In [None]:
# In the above code, after using multiplying tensor b with the row averaged lower triangular matrix, we find that in tensor c, each row is the average of the rows above it.
# This means that c[n-1] row has the average of all the rows before it

In [None]:
# Now let's create use this averaging logic (using a matrix a [lower triangular matrix with row averaged weights]) and apply it to vectorize our bag of words tensor before
# This would help us to associate each word with all it's predecessors

# The lower triangular matrix actually ensures that each element only gets information from it's predecessors and not it's successors.

In [None]:
wei = torch.tril(torch.ones(T, T))
wei = wei/torch.sum(wei, 1, keepdim=True)
print(f"Weight matrix: \n{wei}") # each row of it sums to 1

# our b here is: x
print(f"  Input matrix: \n{x}")

# Notice that there is a slight difference in the dimensions. X has dimensions: (B, T, C) however wei has dimensions (T, T). Torch manages itself and creates batches (B) for
# the wei matrix to be able to multiply correctly

xbow2 = wei @ x # (B, T, T) @ (B, T, C) ----> (B, T, C)
torch.allclose(xbow, xbow2)
torch.allclose(xbow, xbow2, rtol=1e-04, atol=1e-06)

Weight matrix: 
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])
  Input matrix: 
tensor([[[ 0.1808, -0.0700],
         [-0.3596, -0.9152],
         [ 0.6258,  0.0255],
         [ 0.9545,  0.0643],
         [ 0.3612,  1.1679],
         [-1.3499, -0.5102],
         [ 0.2360, -0.2398],
         [-0.9211,  1.5433]],

        [[ 1.3488, -0.1396],
         [ 0.2858,  0.9651],
         [-2.0371,  0.4931],
         [ 1.4870,  0.5910],
         [ 0.1260, -1.56

True

In [None]:
# version 3: use Softmax
tril = torch.tril(torch.ones(T, T))
wei = torch.zeros((T,T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = F.softmax(wei, dim=-1)
xbow3 = wei @ x
# torch.allclose(xbow, xbow3)
torch.allclose(xbow, xbow3, rtol=1e-04, atol=1e-06)



True

## Making some small changes to the script

In [None]:
""" Changes:
  -> remove vocab size from the BigramLanguageModel constructor as it is already defined.
  -> Introducing a new variable n_embd: short for number of embedding dimensions
"""
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 16 # how many independent sequences will we process in
block_size = 32 # maximum context length for
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32 # number of embedding dimensions

#__________________________________________________________________________

torch.manual_seed(1336)
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Here we are trying to create a character level language
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]

decode = lambda l: ''.join([itos[i] for i in l])

# train test split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading
def get_batch(split):
  # Generate small batches of data of inputs x and y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])

  # This is being done to push the tensors onto the GPU to accelerate training
  x, y = x.to(device), y.to(device)

  return x, y


#This is basically to optimize pytorch. It tells that whatever intermediate
# variables are created, don't store them, coz we're never gonna call backwards
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out


# _______________________________________________________________________________________

# The bigram language model
class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads from the logits for the next token from the lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size) # This layer is used to go from token embeddings to the logits
    # LM_head stands for language model head

    # Now we create a provision to use positional embeddings along with the embeddings of the token identities (token_embeddings)
    # self.position_embedding_table = nn.Embedding(block_size, n_embd) # each position from 0 to block size would also get it's own vector


  def forward(self, idx, targets=None):
    B, T = idx.shape


    # idx and targets are both (B, T) tensor of integers
    # When we replace the vocab size with n_embd, it wouldn't give us logits directly, but rather token embeddings
    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    # pos_embedding = self.position_embedding_table(torch.arange(T, device=device)) # (T, C) | torch.arange gives integers from 0 to T-1
    # x = tok_emb + pos_embedding # (B, T, C) TODO: Check working. the lower dimensional vector (T, C) gets right aligned and another col is added to perform the operation
    logits = self.lm_head(tok_emb) # (B, T, vocab_size())

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current
    for _ in range(max_new_tokens):
      # get the prediction
      logits, loss = self(idx)
      # Focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # Apply softmax to get probabilities
      # Introduce a small value (epsilon) to add to the logits before softmax
      epsilon = 1e-6
      probs = F.softmax(logits + epsilon, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # sample from the distribution
      probs = F.softmax(logits, dim=-1) # (B, C)

      # Applied sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

# _____________________________________________________________________________

# Create model
model = BigramLanguageModel()
m = model.to(device)

# Create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
  xb, yb = get_batch('train')
  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# Generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))



step 0: train loss 4.3797, val loss 4.3634
step 100: train loss 3.5072, val loss 3.5054
step 200: train loss 3.0373, val loss 3.0381
step 300: train loss 2.8189, val loss 2.8195
step 400: train loss 2.7122, val loss 2.7211
step 500: train loss 2.6485, val loss 2.6498
step 600: train loss 2.6063, val loss 2.6238
step 700: train loss 2.5839, val loss 2.5861
step 800: train loss 2.5690, val loss 2.5772
step 900: train loss 2.5456, val loss 2.5653
step 1000: train loss 2.5300, val loss 2.5446
step 1100: train loss 2.5272, val loss 2.5370
step 1200: train loss 2.5156, val loss 2.5309
step 1300: train loss 2.5094, val loss 2.5161
step 1400: train loss 2.5146, val loss 2.5175
step 1500: train loss 2.5019, val loss 2.5209
step 1600: train loss 2.5077, val loss 2.5130
step 1700: train loss 2.4954, val loss 2.5114
step 1800: train loss 2.4934, val loss 2.5107
step 1900: train loss 2.4866, val loss 2.5067
step 2000: train loss 2.4887, val loss 2.5094
step 2100: train loss 2.4894, val loss 2.5096


## There is an error while implementing positional encodings

In [None]:
""" Changes:
  -> remove vocab size from the BigramLanguageModel constructor as it is already defined.
  -> Introducing a new variable n_embd: short for number of embedding dimensions
"""
import torch
import torch.nn as nn
from torch.nn import functional as F

# Hyperparameters
batch_size = 16 # how many independent sequences will we process in
block_size = 32 # maximum context length for
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 32 # number of embedding dimensions

#__________________________________________________________________________

torch.manual_seed(137)
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Here we are trying to create a character level language
chars = sorted(list(set(text)))
vocab_size = len(chars)

# Create mapping from characters to integers
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]

decode = lambda l: ''.join([itos[i] for i in l])

# train test split
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

# Data loading
def get_batch(split):
  # Generate small batches of data of inputs x and y
  data = train_data if split == 'train' else val_data
  ix = torch.randint(len(data) - block_size, (batch_size,))
  x = torch.stack([data[i:i+block_size] for i in ix])
  y = torch.stack([data[i+1:i+block_size+1] for i in ix])

  # This is being done to push the tensors onto the GPU to accelerate training
  x, y = x.to(device), y.to(device)

  return x, y


#This is basically to optimize pytorch. It tells that whatever intermediate
# variables are created, don't store them, coz we're never gonna call backwards
@torch.no_grad()
def estimate_loss():
  out = {}
  model.eval()
  for split in ['train', 'val']:
    losses = torch.zeros(eval_iters)
    for k in range(eval_iters):
      X, Y = get_batch(split)
      logits, loss = model(X, Y)
      losses[k] = loss.item()
    out[split] = losses.mean()
  model.train()
  return out


# _______________________________________________________________________________________

# The bigram language model
class BigramLanguageModel(nn.Module):
  def __init__(self):
    super().__init__()
    # each token directly reads from the logits for the next token from the lookup table
    self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
    self.lm_head = nn.Linear(n_embd, vocab_size) # This layer is used to go from token embeddings to the logits
    # LM_head stands for language model head

    # Now we create a provision to use positional embeddings along with the embeddings of the token identities (token_embeddings)
    self.position_embedding_table = nn.Embedding(block_size, n_embd) # each position from 0 to block size would also get it's own vector


  def forward(self, idx, targets=None):
    B, T = idx.shape


    # idx and targets are both (B, T) tensor of integers
    # When we replace the vocab size with n_embd, it wouldn't give us logits directly, but rather token embeddings
    tok_emb = self.token_embedding_table(idx) # (B, T, C)
    pos_embedding = self.position_embedding_table(torch.arange(T, device=device)) # (T, C) | torch.arange gives integers from 0 to T-1
    x = tok_emb + pos_embedding # (B, T, C) TODO: Check working. the lower dimensional vector (T, C) gets right aligned and another col is added to perform the operation
    logits = self.lm_head(x) # (B, T, vocab_size())

    if targets is None:
      loss = None
    else:
      B, T, C = logits.shape
      logits = logits.view(B*T, C)
      targets = targets.view(B*T)
      loss = F.cross_entropy(logits, targets)
    return logits, loss

  def generate(self, idx, max_new_tokens):
    # idx is (B, T) array of indices in the current
    for _ in range(max_new_tokens):
      # get the prediction
      logits, loss = self(idx)
      # Focus only on the last time step
      logits = logits[:, -1, :] # becomes (B, C)
      # Apply softmax to get probabilities
      # Introduce a small value (epsilon) to add to the logits before softmax
      epsilon = 1e-6
      probs = F.softmax(logits + epsilon, dim=-1) # (B, C)
      idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
      # sample from the distribution
      probs = F.softmax(logits, dim=-1) # (B, C)

      # Applied sampled index to the running sequence
      idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

# _____________________________________________________________________________

# Create model
model = BigramLanguageModel()
m = model.to(device)

# Create pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

for iter in range(max_iters):
  # every once in a while evaluate the loss on train and val sets
  if iter % eval_interval == 0:
    losses = estimate_loss()
    print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
  xb, yb = get_batch('train')
  # evaluate the loss
  logits, loss = model(xb, yb)
  optimizer.zero_grad(set_to_none=True)
  loss.backward()
  optimizer.step()

# Generate from the model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decode(m.generate(context, max_new_tokens=1000)[0].tolist()))



step 0: train loss 4.5478, val loss 4.5503
step 100: train loss 3.5436, val loss 3.5492
step 200: train loss 3.0673, val loss 3.0798
step 300: train loss 2.8837, val loss 2.9120
step 400: train loss 2.7939, val loss 2.8095
step 500: train loss 2.7232, val loss 2.7400
step 600: train loss 2.6854, val loss 2.6863
step 700: train loss 2.6521, val loss 2.6619
step 800: train loss 2.6289, val loss 2.6468
step 900: train loss 2.6177, val loss 2.6160
step 1000: train loss 2.5881, val loss 2.6035
step 1100: train loss 2.5865, val loss 2.5923
step 1200: train loss 2.5792, val loss 2.5805
step 1300: train loss 2.5663, val loss 2.5784
step 1400: train loss 2.5477, val loss 2.5668
step 1500: train loss 2.5484, val loss 2.5535
step 1600: train loss 2.5387, val loss 2.5541
step 1700: train loss 2.5345, val loss 2.5466
step 1800: train loss 2.5281, val loss 2.5423
step 1900: train loss 2.5203, val loss 2.5392
step 2000: train loss 2.5225, val loss 2.5354
step 2100: train loss 2.5226, val loss 2.5315


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


## Self Attention Logic

We are implementing self attention for a single individual head
using: B, T, C = 4, 8, 32

we have a 4x8 arrangement of tokens and info on each token is 32 dimensional.

The code we had before did a simple average of all the past tokens and the current token. So the previous info and the current info is being mixed together in an average. (We are masking out the wei matrix)

Each number in the previous matrices we developed represented the affinities of each token with one another, which we had uniformly initialized.

- Now we don't want to initialize the affinities to be uniform as different tokens would find other tokens more or less interesting

- So the problem that self attention solves is that it allows us to gather info from the past in a data dependant way (Need More explanation on this)

- The way self attention solves this:
  - every single token/node at each position produces 2 vectors: Query(Q) and Key(K)
    - Query: What am I looking for?
    - Key: What do I contain?
  - To get the affinities between these sequenced tokens, we do: Q.K = wei




In [13]:
# version 4: self-attention!
torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C) # contains the private info for each token. We have randomized it in this case for understanding

# Creating a single head for self attention
head_size = 16
key = nn.Linear(C, head_size, bias=False) # bias false just makes it apply simple matrix multiplication with it's weights
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)

k = key(x)   # (B, T, 16) 16 is the head size
q = query(x) # (B, T, 16)
wei =  q @ k.transpose(-2, -1) # Transposing the last two dimensions: Tx16 becomes 16xT ==> (B, T, 16) @ (B, 16, T) ---> (B, T, T)
# For every row B, we are now gonna have a T^2 matrix representing the affinities

tril = torch.tril(torch.ones(T, T))
#wei = torch.zeros((T,T))

# Till here, wei only contains raw affinities between each token and the others.
wei = wei.masked_fill(tril == 0, float('-inf'))

# # The reason for doing the triangular matrix above is that if I'm from the 6th node/token, I don't want any information about the 7th, 8th .... nodes, coz they're in the future, I'm
# # learning from the present and past. The upper triangular matrix does that for us


wei = F.softmax(wei, dim=-1)
# Softmax helps us to exponentiate and divide by the sum => normalizing the data to be between 0 and 1
v = value(x)

# Understand the relation between v and x.
out = wei @ v # is used to modify the output channels

out.shape

torch.Size([4, 8, 16])

In [14]:
wei[0] # previously every single batch element had uniform wei, but now, every batch element is different, coz every batch element contains different tokens

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
        [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
        [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
        [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],
       grad_fn=<SelectBackward0>)