In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected into queries, keys and values by three
    independent linear layers. Each position's output is a softmax-weighted
    mix of all value vectors.

    Args:
        embed_dim: Size of the last input dimension; also the output size of
            every projection.
    """

    def __init__(self, embed_dim):
        super().__init__()
        self.embed_dim = embed_dim

        # No device placement happens here: the layers stay device-neutral
        # and the caller moves the whole module with `.to(device)`.
        self.query = nn.Linear(embed_dim, embed_dim)
        self.key = nn.Linear(embed_dim, embed_dim)
        self.value = nn.Linear(embed_dim, embed_dim)

    def forward(self, x):
        """Apply self-attention to `x`.

        Args:
            x: Tensor of shape (batch, seq_len, embed_dim).

        Returns:
            Tensor with the same shape as `x`.
        """
        queries, keys, values = self.query(x), self.key(x), self.value(x)

        # Pairwise query/key similarity: Q @ K^T over the last two dims.
        similarity = torch.matmul(queries, keys.transpose(-2, -1))

        # Divide by sqrt(embed_dim) before the softmax.
        scaled = similarity / (self.embed_dim ** 0.5)

        # Normalize along the key axis so each row sums to 1.
        weights = F.softmax(scaled, dim=-1)

        # Weighted combination of the value vectors.
        return torch.matmul(weights, values)

In [7]:
# 1. Pick the device: CUDA when available, otherwise CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Sto usando il device: {device}")

# 2. Build the layer and move the MODEL's parameters onto that device.
#    nn.Module.to() returns the module itself, so the call can be chained.
attention = SelfAttention(embed_dim=64).to(device)

# 3. Create the input and move the DATA onto the same device.
#    The model and its input must live on the same device for the forward
#    pass to work.
x = torch.randn(2, 10, 64).to(device)

# 4. Apply attention now that model and input share a device.
output = attention(x)

print(f"Input shape: {x.shape}")
print(f"Output shape: {output.shape}")
print(f"Output device: {output.device}")  # Should match the device selected above

Sto usando il device: cuda
Input shape: torch.Size([2, 10, 64])
Output shape: torch.Size([2, 10, 64])
Output device: cuda:0


### Working Through a Manual Example

In [8]:
# Toy input: 3 word embeddings with 4 dimensions each, placed on `device`.
x = torch.tensor(
    [
        [1.0, 0.0, 1.0, 0.0],  # Word 1: "The"
        [0.0, 1.0, 0.0, 1.0],  # Word 2: "cat"
        [1.0, 1.0, 0.0, 0.0],  # Word 3: "sat"
    ]
).to(device)
print(f"Input shape: {x.shape}")  # torch.Size([3, 4])

Input shape: torch.Size([3, 4])


In [9]:
# Initialize random weight matrices.
# In practice, these are learned during training.
#
# A dedicated, explicitly seeded generator makes the numbers printed by the
# following cells reproducible after Restart & Run All, without changing the
# global RNG state that other cells may rely on.
weight_rng = torch.Generator().manual_seed(0)
W_q = torch.randn(4, 4, generator=weight_rng).to(device)  # Query projection
W_k = torch.randn(4, 4, generator=weight_rng).to(device)  # Key projection
W_v = torch.randn(4, 4, generator=weight_rng).to(device)  # Value projection

In [10]:
# Project the input into queries, keys and values.
# Note that all three projections read the SAME input x; only the weight
# matrix differs. Each result has shape (3, 4).
Q, K, V = (torch.matmul(x, weight) for weight in (W_q, W_k, W_v))

# One line per projection; every shape is torch.Size([3, 4]).
print(
    f"Q shape: {Q.shape}",
    f"K shape: {K.shape}",
    f"V shape: {V.shape}",
    sep="\n",
)


Q shape: torch.Size([3, 4])
K shape: torch.Size([3, 4])
V shape: torch.Size([3, 4])


In [11]:
# Similarity between all query-key pairs, scaled by sqrt(d_k).
# d_k is read from the key tensor's last dimension instead of being
# hard-coded, so the scaling stays correct if the embedding size changes.
d_k = K.shape[-1]
scores = Q @ K.T / (d_k ** 0.5)

print(f"Scores shape: {scores.shape}")  # torch.Size([3, 3])
print("Scores:")
print(scores)

Scores shape: torch.Size([3, 3])
Scores:
tensor([[-2.5850,  0.3489, -2.3574],
        [ 1.1196, -0.1771,  1.1724],
        [-0.8364, -0.5975, -0.4580]], device='cuda:0')


In [12]:
# Normalize each row of scores along the last axis so it sums to 1.
attn_weights = F.softmax(scores, dim=-1)

print("Attention weights:")
print(attn_weights)
# "\n" is a real newline escape, giving a blank line before the row-sum check.
print(f"\nRow 0 sum: {attn_weights[0].sum()}")  # ~1.0, up to float rounding

Attention weights:
tensor([[0.0475, 0.8929, 0.0596],
        [0.4296, 0.1175, 0.4529],
        [0.2681, 0.3405, 0.3914]], device='cuda:0')
\nRow 0 sum: 0.9999999403953552


In [13]:
# Weighted combination of all value vectors: each output row mixes the rows
# of V according to the matching row of attention weights.
output = attn_weights @ V

# "\n" is a real newline escape, giving a blank line before the shape.
print(f"\nOutput shape: {output.shape}")  # torch.Size([3, 4])

\nOutput shape: torch.Size([3, 4])
