In [3245]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3247]:
emb_dim = 4
num_heads=2
b = 2
s = 8
x = torch.randn(b,s,emb_dim) #b,s,d

In [3394]:
class MultiHeadAttentionWithCrossAttention(nn.Module):
    def __init__(self,emb_dim=emb_dim,num_heads=num_heads):
        super().__init__()
        self.emb_dim = emb_dim
        self.num_heads = num_heads
        self.Q = nn.Linear(emb_dim,emb_dim,bias=False,dtype=float)
        self.K = nn.Linear(emb_dim,emb_dim,bias=False,dtype=float)
        self.V = nn.Linear(emb_dim,emb_dim,bias=False,dtype=float)
        self.proj = nn.Linear(emb_dim,emb_dim,dtype=float)
    def forward(self,x,x2=None,x3=None,mask=None):
        x = x.to(float)
        # print("forwards")
        b,s,d = x.shape
        q = self.Q(x) #b,s,d d,d = b,s,d
        if x2!=None:
            x2 = x2.to(float)
            k = self.K(x2)
        else:
            k = self.K(x)
        if x3!=None:
            x3 = x3.to(float)
            v = self.V(x3)
        else:
            v = self.V(x)
        # q,k,v = b,s,d ()
        q = q.view(b,self.num_heads,s,d//self.num_heads)
        k = k.view(b,self.num_heads,s,d//self.num_heads)
        v = v.view(b,self.num_heads,s,d//self.num_heads)
        # q,k,v = b,h,s,d/h -> q.kT (b,h,s,d/h * b,h,d/h,s)
        attn_scores = (q@k.transpose(-1,-2)/(self.emb_dim//self.num_heads)**(0.5))
        # print(attn_scores.shape)
        if mask is not None:
            attn_scores = torch.masked_fill(attn_scores,mask,-torch.inf)
        # print(attn_scores.shape)
            
        attn_weights = torch.softmax(attn_scores,dim=-1)
        attn_weights = F.dropout(attn_weights,p=0.1,training=self.training)
        # attn_weights = b,h,s,s v = b,h,s,d/h => b,h,s,d/h => b,s,d 
        attn = attn_weights@v
        # print(attn.shape)
        attn = attn.view(b,s,d)
        output = self.proj(attn)
        return output.to(float)

In [3396]:
class MLP(nn.Module):
    def __init__(self,emb_dim,exp_factor=4):
        super().__init__()
        self.l1 = nn.Linear(emb_dim , exp_factor*emb_dim).to(float)
        self.r  = nn.ReLU()
        self.l2 = nn.Linear(exp_factor*emb_dim ,emb_dim).to(float)
    def forward(self,x):
        x = x.to(float)
        return self.l2(self.r(self.l1(x))).to(float)

In [3398]:
class Attention(nn.Module):
    def __init__(self,emb_dim,num_heads,exp_factor=4):
        super().__init__()
        self.mha = MultiHeadAttentionWithCrossAttention(emb_dim,num_heads).to(float)
        self.mlp = MLP(emb_dim,exp_factor).to(float)
        self.ln1 = nn.LayerNorm(emb_dim).to(float)
        self.ln2 = nn.LayerNorm(emb_dim).to(float)
    def forward(self,x):
        x = x.to(float)
        y = self.mha(x,None,None,mask=None).to(float)
        o = self.ln1(y+x)
        o2 = self.mlp(o)
        return self.ln2(o2+o).to(float)

In [3400]:
class Encoder(nn.Module):
    def __init__(self,emb_dim,num_heads,exp_factor=4,n_layers=6):
        super().__init__()
        self.emb_dim = emb_dim 
        self.num_heads = num_heads
        self.exp_factor = exp_factor
        self.n_layers = n_layers
        self.seq = nn.ModuleList()
        for i in range(n_layers):
            attn = Attention(emb_dim,num_heads,exp_factor).to(float)
            self.seq.append(attn)
    def forward(self,x):
        x = x.to(float)
        output = x
        for i in self.seq:
            output = i(output)
        return output.to(float)

In [3402]:
class decoder_block(nn.Module):
    def __init__(self,emb_dim,num_heads,exp_factor):
        super().__init__()
        self.mha = MultiHeadAttentionWithCrossAttention(emb_dim,num_heads).to(float)
        self.mlp = MLP(emb_dim,exp_factor).to(float)
        self.ln1 = nn.LayerNorm(emb_dim).to(float)
        self.ln2 = nn.LayerNorm(emb_dim).to(float)
        self.ln3 = nn.LayerNorm(emb_dim).to(float)
    def forward(self,x,input,mask=None):
        x = x.to(float)
        input = input.to(float)
        output = self.mha(x,mask=mask)
        l1 = self.ln1(x+output)
        cha = self.mha(l1,input,input)
        l2 = self.ln2(cha+l1)
        o1 = self.mlp(l2)
        return self.ln3(o1+l2).to(float)

In [3404]:
class Decoder(nn.Module):
    def __init__(self,emb_dim,num_heads,exp_factor,n_layers=6):
        super().__init__()
        self.seq  =nn.ModuleList()
        for i in range(n_layers):
            d  = decoder_block(emb_dim,num_heads,exp_factor=4).to(float)
            self.seq.append(d)
    def forward(self,x,y,mask=None):
        x = x.to(float)
        y = y.to(float)
        output  = x
        for i in self.seq:
            output = i(output,y,mask = mask)
        return output.to(float)

In [3418]:
class Transformer(nn.Module):
    def __init__(self,emb_dim,num_heads,vocab_size,exp_factor=4,n_layers=6):
        super().__init__()
        self.enc = Encoder(emb_dim,num_heads,exp_factor=4,n_layers=6)
        self.dec = Decoder(emb_dim,num_heads,exp_factor=4,n_layers=6)
# Shared weight matrix
        self.num_heads=num_heads
        self.emb_weight = nn.Parameter(
            torch.randn(vocab_size, emb_dim, dtype=float)*math.sqrt(emb_dim)
        )

        # Embedding layer (ensure weight dtype matches)
        self.embedding = nn.Embedding(vocab_size, emb_dim).to(float)
        self.embedding.weight = self.emb_weight

        # Linear layer (tied weights, ensure dtype matches)
        self.linear = nn.Linear(emb_dim, vocab_size, bias=False).to(float)
        self.linear.weight = self.emb_weight
        # self.linear = nn.Linear(emb_dim,vocab_size).to(float)
        # self.embedding = nn.Embedding(vocab_size,emb_dim).to(float)
    def forward(self, x, output, mask=None):
        # print(f"Input x shape: {x.shape}")
        # print(f"Input output shape: {output.shape}")
    
        x = self.embedding(x).to(float)
        output = self.embedding(output).to(float)
        
        # print(f"Embedded x shape: {x.shape}")
        # print(f"Embedded output shape: {output.shape}")
    
        y = self.enc(x)
        # print(f"Encoder output y shape: {y.shape}")
    
        o1 = self.dec(output, y, mask)
        # print(f"Decoder output o1 shape: {o1.shape}")
    
        o2 = self.linear(o1).to(float)
        # print(f"Linear output o2 shape: {o2.shape}")
    
        output = torch.softmax(o2, dim=-1)
        # print(f"Softmax output shape: {output.shape}")
    
        return output.to(float)

In [3609]:
model = Transformer(4,4,6)

In [3611]:
from torch.utils.tensorboard import SummaryWriter

In [3613]:
writer = SummaryWriter(log_dir='exp1')

In [3654]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
epochs=25
for epoch in range(epochs):
    epoch_losses = [] 
    for j,i in tqdm(enumerate(dataloader)):
        inp = [j[0] for j in i] 
        out = [j[1] for j in i]
        X = torch.tensor(np.array(inp))
        Y = torch.tensor(np.array(out))
        bat,s = Y.shape
        a = torch.tensor([4])
        a = a.expand(bat,1)
        y_dec = torch.cat([a,Y],dim=-1)
        X = torch.cat([a,X],dim=-1)
        mask = create_decoder_mask(y_dec)
        num_heads = model.num_heads  # Assuming your Transformer model has this attribute
        mask = mask.unsqueeze(1).expand(-1, num_heads, -1, -1)
        b = torch.tensor([5])
        b = b.expand(bat,1)
        y_true = torch.cat([Y,b],dim=-1)
        y_out = model(X,y_dec,mask)
        # print(y_out.shape)
        y_predicted = torch.argmax(y_out,dim=-1)
        # print(y_true.shape,y_predicted.shape)
        y_out2 = y_out.view(-1,6)
        y_true2 = y_true.view(-1)
        # print(y_out2.shape,y_true2.shape)
        loss = criterion(y_out2,y_true2)
        # print(loss.item())
        # writer.add_scalar('loss',loss.item(),global_step=i)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # print(loss.item())
        epoch_losses.append(loss.item())
        # print(epoch,len(dataloader),i)
        writer.add_scalar('batch_loss', loss.item(), global_step=epoch * int(len(dataloader)) + j)
    
    # Calculate and log average epoch loss
    avg_epoch_loss = sum(epoch_losses) / len(epoch_losses)
    writer.add_scalar('epoch_loss', avg_epoch_loss, global_step=epoch)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {avg_epoch_loss:.4f}")
writer.close()

313it [00:16, 18.52it/s]


Epoch 1/25 - Average Loss: 1.2331


313it [00:17, 17.68it/s]


Epoch 2/25 - Average Loss: 1.2328


313it [00:16, 18.63it/s]


Epoch 3/25 - Average Loss: 1.2319


313it [00:16, 18.49it/s]


Epoch 4/25 - Average Loss: 1.2194


313it [00:17, 17.68it/s]


Epoch 5/25 - Average Loss: 1.1741


313it [00:16, 18.55it/s]


Epoch 6/25 - Average Loss: 1.1746


313it [00:17, 17.86it/s]


Epoch 7/25 - Average Loss: 1.1737


313it [00:18, 16.99it/s]


Epoch 8/25 - Average Loss: 1.1748


313it [00:16, 18.81it/s]


Epoch 9/25 - Average Loss: 1.1730


313it [00:16, 18.65it/s]


Epoch 10/25 - Average Loss: 1.1753


313it [00:17, 17.71it/s]


Epoch 11/25 - Average Loss: 1.1737


313it [00:16, 19.06it/s]


Epoch 12/25 - Average Loss: 1.1742


313it [00:16, 19.09it/s]


Epoch 13/25 - Average Loss: 1.1740


313it [00:16, 19.22it/s]


Epoch 14/25 - Average Loss: 1.1740


313it [00:16, 19.13it/s]


Epoch 15/25 - Average Loss: 1.1747


313it [00:16, 19.07it/s]


Epoch 16/25 - Average Loss: 1.1735


313it [00:16, 19.14it/s]


Epoch 17/25 - Average Loss: 1.1749


313it [00:16, 19.11it/s]


Epoch 18/25 - Average Loss: 1.1741


313it [00:16, 19.15it/s]


Epoch 19/25 - Average Loss: 1.1747


313it [00:16, 19.01it/s]


Epoch 20/25 - Average Loss: 1.1743


313it [00:16, 19.01it/s]


Epoch 21/25 - Average Loss: 1.1735


313it [00:16, 19.11it/s]


Epoch 22/25 - Average Loss: 1.1749


313it [00:16, 19.10it/s]


Epoch 23/25 - Average Loss: 1.1736


313it [00:16, 19.10it/s]


Epoch 24/25 - Average Loss: 1.1739


313it [00:16, 19.09it/s]

Epoch 25/25 - Average Loss: 1.1737





In [3637]:
# pip install --upgrade torch numpy

In [3656]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
from tqdm import tqdm
import torch

# After training loop
model.eval()  # Set model to evaluation mode
test_losses = []
with torch.no_grad():
    for j, i in tqdm(enumerate(val_dataloader)):
        inp = [j[0] for j in i]
        out = [j[1] for j in i]
        X = torch.tensor(np.array(inp))
        Y = torch.tensor(np.array(out))
        bat, s = Y.shape
        
        # Add start token (e.g., 4) to the input and output sequences
        a = torch.tensor([4])
        a = a.expand(bat, 1)
        y_dec = torch.cat([a, Y], dim=-1)
        X = torch.cat([a, X], dim=-1)
        
        # Create decoder mask
        mask = create_decoder_mask(y_dec)
        num_heads = model.num_heads
        mask = mask.unsqueeze(1).expand(-1, num_heads, -1, -1)
        
        # Add end token (e.g., 5) to the true labels
        b = torch.tensor([5])
        b = b.expand(bat, 1)
        y_true = torch.cat([Y, b], dim=-1)
        
        # Forward pass
        y_out = model(X, y_dec, mask)
        y_predicted = torch.argmax(y_out, dim=-1)
        
        # Calculate loss
        y_out2 = y_out.view(-1, 6)  # Reshape for loss calculation
        y_true2 = y_true.view(-1)   # Flatten true labels
        loss = criterion(y_out2, y_true2)
        test_losses.append(loss.item())
        
        # Decode predictions and labels
        decoded_strings = ["".join(reverse_vocab[i.item()] for i in row) for row in y_predicted]
        decoded_true = ["".join(reverse_vocab[i.item()] for i in row) for row in y_true]
    
    print(f"Test Loss: {avg_test_loss:.4f} - BLEU Score: {avg_bleu_score:.2f}")
    
    # Print example prediction
    if len(decoded_strings) > 0:
        print("\nExample Prediction:")
        print(f"Input: {inp[0]}")
        print(f"True Label: {decoded_true[0]}")
        print(f"Predicted Label: {decoded_strings[0]}")

# Reset model to training mode
# model.train()

32it [00:00, 49.86it/s]

Test Loss: 1.3113 - BLEU Score: 0.00

Example Prediction:
Input: tensor([1, 2, 1, 2, 1, 2, 0, 2, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
True Label: 0 0 1 1 1<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><eos>
Predicted Label: 0 0 0 1 1 <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>





In [3658]:
len(decoded_strings)

8

In [3660]:
for i in range(len(decoded_strings)):
    print("\nExample Prediction:")
    print(f"Input: {inp[i]}")
    print(f"True Label: {decoded_true[i]}")
    print(f"Predicted Label: {decoded_strings[i]}")


Example Prediction:
Input: tensor([1, 2, 1, 2, 1, 2, 0, 2, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
True Label: 0 0 1 1 1<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><eos>
Predicted Label: 0 0 0 1 1 <pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

Example Prediction:
Input: tensor([1, 2, 1, 2, 1, 2, 0, 2, 0, 2, 1, 2, 0, 2, 0, 3, 3, 3, 3])
True Label: 0 0 0 0 1 1 1 1<pad><pad><pad><pad><eos>
Predicted Label: 0 1 1 0 0 1 1 1<pad><pad><pad><pad><pad>

Example Prediction:
Input: tensor([1, 2, 0, 2, 1, 2, 1, 2, 1, 2, 0, 2, 0, 3, 3, 3, 3, 3, 3])
True Label: 0 0 0 1 1 1 1<pad><pad><pad><pad><pad><pad><eos>
Predicted Label: 0 0 0 1 1 1<pad>1<pad><pad><pad><pad><pad><pad><pad>

Example Prediction:
Input: tensor([0, 2, 0, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])
True Label: 0 0 1<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><eos>
Predicted Label: 0 0 0<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>

Example Prediction:
Input: tensor([

In [3641]:
y_out.shape , y_true.shape

(torch.Size([8, 20, 6]), torch.Size([8, 20]))

In [3430]:
vocab = {'0': 0, '1': 1, ' ': 2, '<pad>': 3,'<sos>':4,'<eos>':5}
reverse_vocab = {v: k for k, v in vocab.items()}

In [3434]:
decoded_strings = ["".join(reverse_vocab[i.item()] for i in row) for row in y_predicted]

In [3438]:
decoded_strings[0]

'<pad><pad>0<pad><pad><pad>0<pad>0<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>'

In [3410]:
import random
from torch.utils.data import Dataset,DataLoader

class NumberSortingDataset(Dataset):
    def __init__(self, num_samples=10000, max_length=10):
        self.num_samples = num_samples
        self.max_length = max_length
        self.vocab = {'0': 0, '1': 1, ' ': 2, '<pad>': 3,'<sos>':4,'<eos>':5}
        
    def __len__(self):
        return self.num_samples
    
    def __getitem__(self, idx):
        # Generate random numbers
        length = random.randint(3, self.max_length)
        numbers = [str(random.randint(0, 1)) for _ in range(length)]
        input_seq = ' '.join(numbers)
        target_seq = ' '.join(sorted(numbers))  # Sorted sequence
        
        # Convert to index list
        input_tensor = [self.vocab[c] for c in input_seq]
        target_tensor = [self.vocab[c] for c in target_seq]

        # Compute padding length
        pad_length = self.max_length * 2 - 1
        input_tensor += [self.vocab['<pad>']] * (pad_length - len(input_tensor))
        target_tensor += [self.vocab['<pad>']] * (pad_length - len(target_tensor))

        # Convert to PyTorch tensors after padding
        return torch.tensor(input_tensor, dtype=torch.long), torch.tensor(target_tensor, dtype=torch.long)

# Initialize
dataset = NumberSortingDataset()
dataloader = DataLoader(dataset, batch_size=32, collate_fn=lambda x: x)
val_dataset = NumberSortingDataset(num_samples=1000)
val_dataloader = DataLoader(val_dataset, batch_size=32, collate_fn=lambda x: x)

In [3412]:
x = next(iter(dataloader))

In [3390]:
x[1][1]

tensor([0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 3, 3, 3, 3, 3, 3])

In [3392]:
l = [ 1,2,3]

In [3332]:
inp = [i[0] for i in x]

In [3334]:
out = [i[1] for i in x]

In [3336]:
inp[7] , out[7]

IndexError: list index out of range

In [3338]:
import numpy as np
X = torch.tensor(np.array(inp)).unsqueeze(dim=-1)
Y = torch.tensor(np.array(out)).unsqueeze(dim=-1)

In [3340]:
X.dtype


torch.float32

In [3342]:
Y.dtype

torch.float32

In [3344]:
# X[0], Y[0]

In [3346]:
model = Transformer(20,1,4)

model(X,Y)

In [3349]:
output = model(X,Y)

Input x shape: torch.Size([2, 4, 1])
Input output shape: torch.Size([2, 4, 1])


RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [3350]:
output.shape

torch.Size([])

In [3353]:
output = torch.argmax(output,dim=-1)

In [3355]:
output

tensor(0)

In [3357]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x304269bb0>

In [3359]:
def create_decoder_mask(y_dec, pad_token=3):
    batch_size, seq_len = y_dec.shape

    # Padding mask (True for valid tokens, False for pad tokens)
    padding_mask = y_dec != pad_token  # Shape: (batch_size, seq_len)

    # Causal mask (Look-ahead mask)
    causal_mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))  # (seq_len, seq_len)

    # Combine both: expand padding mask to match causal mask dimensions
    final_mask = padding_mask.unsqueeze(1) & causal_mask  # Shape: (batch_size, seq_len, seq_len)
    return ~final_mask

In [3361]:
from tqdm import tqdm
epochs = 1
for epoch in range(epochs):
    model.train()
    for i in tqdm(dataloader):
        inp = [j[0] for j in i] 
        out = [j[1] for j in i]
        X = torch.tensor(np.array(inp))
        Y = torch.tensor(np.array(out))
        b,s = Y.shape
        a = torch.tensor(4)
        y_dec = torch.cat([a,Y],dim=-1)
        print(y_dec.shape)
        
        # y_pred = model(X,Y)
        break

  0%|                                                   | 0/313 [00:00<?, ?it/s]


RuntimeError: zero-dimensional tensor (at position 0) cannot be concatenated

In [3363]:
import matplotlib.pyplot as plt

In [3366]:
model = Transformer(7,1,6)
criterion = nn.CrossEntropyLoss(ignore_index=3)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
for i in tqdm(dataloader):
    inp = [j[0] for j in i] 
    out = [j[1] for j in i]
    X = torch.tensor(np.array(inp))
    Y = torch.tensor(np.array(out))
    bat,s = Y.shape
    a = torch.tensor([4])
    a = a.expand(bat,1)
    y_dec = torch.cat([a,Y],dim=-1)
    X = torch.cat([a,X],dim=-1)
    mask = create_decoder_mask(y_dec)
    num_heads = model.num_heads  # Assuming your Transformer model has this attribute
    mask = mask.unsqueeze(1).expand(-1, num_heads, -1, -1)
    b = torch.tensor([5])
    b = b.expand(bat,1)
    y_true = torch.cat([Y,b],dim=-1)
    print(mask.shape)
    # y_out = model(X,y_dec,mask)
    
    break

  0%|                                                   | 0/313 [00:00<?, ?it/s]

torch.Size([32, 1, 20, 20])





In [None]:
y_pred = torch.argmax(y_out,dim=-1)

In [3205]:
y_true.shape , y_pred.shape

(torch.Size([32, 20]), torch.Size([32, 20]))

In [3207]:
y_dec[0] , y_pred[0]

(tensor([4, 0, 2, 0, 2, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1]),
 tensor([4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 3, 4, 3, 4, 3, 3, 3]))

In [3080]:
y_dec.shape

torch.Size([32, 20])

In [3027]:
import torch

def create_decoder_mask(y_dec, pad_token=3):
    """
    Creates a mask for the decoder input where padding tokens are marked as False.

    Args:
        y_dec (torch.Tensor): Tensor of shape (batch_size, seq_len) containing tokenized sequences.
        pad_token (int): The token representing padding.

    Returns:
        torch.Tensor: Boolean mask of shape (batch_size, seq_len) where True means valid token, False means padding.
    """
    return y_dec != pad_token  # Mask where True means the token is valid

# Example usage
y_dec = torch.tensor([
    [4, 0, 2, 0, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3],
    [4, 0, 2, 0, 2, 0, 2, 0, 2, 1, 2, 1, 3, 3, 3, 3, 3, 3, 3, 3]
])  # Example input

mask = create_decoder_mask(y_dec)
print(mask)

tensor([[ True,  True,  True,  True,  True,  True, False, False, False, False,
         False, False, False, False, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True, False, False, False, False, False, False, False, False]])


In [2500]:
import torch

def create_decoder_mask(y_dec, pad_token=3):
    batch_size, seq_len = y_dec.shape

    # Padding mask (True for valid tokens, False for pad tokens)
    padding_mask = y_dec != pad_token  # Shape: (batch_size, seq_len)

    # Causal mask (Look-ahead mask)
    causal_mask = torch.tril(torch.ones((seq_len, seq_len), dtype=torch.bool))  # (seq_len, seq_len)

    # Combine both: expand padding mask to match causal mask dimensions
    final_mask = padding_mask.unsqueeze(1) & causal_mask  # Shape: (batch_size, seq_len, seq_len)
    return ~final_mask

# Example usage
y_dec = torch.tensor([
    [4, 0, 2, 0, 2, 1, 3, 3, 3, 3],
    [4, 0, 2, 0, 2, 0, 2, 1, 2, 1]
])  # Example input

mask = create_decoder_mask(y_dec)
print(mask)

tensor([[[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False, False,  True,  True,  True,  True,  True,  True],
         [False, False, False, False, False,  True,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True]],

        [[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, Fa

In [1487]:
mask

tensor([[[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False,  True,  True,  True,  True,  True,  True,  True],
         [False, False, False, False,  True,  True,  True,  True,  True,  True],
         [False, False, False, False, False,  True,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True],
         [False, False, False, False, False, False,  True,  True,  True,  True]],

        [[False,  True,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False,  True,  True,  True,  True,  True,  True,  True,  True],
         [False, False, Fa

In [1376]:
attn_scores = torch.masked_fill(y_dec,mask,-torch.inf)

RuntimeError: The size of tensor a (10) must match the size of tensor b (2) at non-singleton dimension 1

In [None]:
torch.masked_fill()

In [None]:
<sos>daksh is GooD boy<eos><pad>

In [3850]:
def positional_encoding(emb):
    b,s,d = emb.shape
    for i in range(1,s+1):
        for j in range(d):
            if(j%2==0):
                # print(emb[:,i2,j])
                # print(torch.sin((i2)/(10000**(2j/d))))
                emb[:,i-1,j]+=math.sin(i/(10000**(2*j/d)))
            else:
                emb[:,i-1,j]+=math.cos(i/(10000**(2*j/d)))
    return emb

In [3852]:
temp  = torch.randn(2,4,8)

In [3854]:
positional_encoding(temp).shape

torch.Size([2, 4, 8])

In [3842]:
temp.shape

torch.Size([2, 4, 8])

In [3742]:
temp[:,0,0]+=math.sin(1/2)

In [3744]:
temp[:,0,0]

tensor([3.2385, 1.4396])

In [3822]:
b,s,d = temp.shape
i2 = 0 
j=0
i2/(10000**(2*j/d))

0.0

In [3824]:
i2

0

In [3820]:
i2/1000**(2*j/d)

0.0

In [3860]:
import torch
from torch.utils.data import Dataset
class dataset(Dataset):
    def __init__(self,df):
        super().__init__()
        self.df = df
    def len(self):
        return len(self.df)
    def get(self,idx):
        return self.df.iloc[idx][0],self.df.iloc[idx][0]