In [1]:
import torch
import torch.nn.functional as F

  cpu = _conversion_method_template(device=torch.device("cpu"))


In [None]:
from sentence_transformers import SentenceTransformer
# Instantiate the model named 'paraphrase-multilingual-MiniLM-L12-v2' and
# encode one Italian and one English sentence with it.
# NOTE(review): presumably the checkpoint is fetched on first use and
# `embeddings` holds one vector per input sentence — confirm against the
# sentence-transformers documentation.
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
embeddings = model.encode(['Ciao mondo', 'Hello world'])

In [2]:
# The moment of truth: ask PyTorch whether CUDA is available.
is_cuda = torch.cuda.is_available()
print(f"GPU Disponibile: {is_cuda}")

GPU Disponibile: True


### Weighted Averaging

In [3]:
# Input sequence (3 words, each 4-dim embedding).
# The tensor is created directly on the GPU when CUDA is available and on the
# CPU otherwise, so this cell also runs on machines without a GPU.
sequence = torch.tensor(
    [[0.1, 0.2, 0.3, 0.4],   # word 1: "The"
     [0.5, 0.6, 0.7, 0.8],   # word 2: "cat"
     [0.9, 1.0, 1.1, 1.2]],  # word 3: "sat"
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [4]:
# Weights for each word (they sum to 1). Placed on the GPU when CUDA is
# available, on the CPU otherwise.
attention_weights = torch.tensor(
    [0.1, 0.3, 0.6],
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [5]:
# Weighted average: accumulate every word embedding scaled by its weight.
# The accumulator is built with zeros_like so its length, dtype and device
# follow `sequence` instead of being hard-coded to 4 elements on the GPU.
output = torch.zeros_like(sequence[0])
for i, weight in enumerate(attention_weights):
    output += weight * sequence[i]

print(output)
# tensor([0.7000, 0.8000, 0.9000, 1.0000])

tensor([0.7000, 0.8000, 0.9000, 1.0000], device='cuda:0')


$$(\text{W}_1 \cdot \text{Seq}_1) + (\text{W}_2 \cdot \text{Seq}_2) + (\text{W}_3 \cdot \text{Seq}_3)$$

### Concrete Example

In [8]:
# Query: "Looking for subject-related information"
# Created on the GPU when CUDA is available, on the CPU otherwise.
query = torch.tensor(
    [1.0, 0.0, 1.0],
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [9]:
# Keys: what each position represents.
# Created on the GPU when CUDA is available, on the CPU otherwise.
keys = torch.tensor(
    [[1.0, 0.0, 1.0],   # Position 0: matches query well!
     [0.0, 1.0, 0.0],   # Position 1: completely different
     [1.0, 0.0, 0.8]],  # Position 2: somewhat similar
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [10]:
# Values: actual information at each position.
# Created on the GPU when CUDA is available, on the CPU otherwise.
values = torch.tensor(
    [[10.0, 20.0],   # Info at position 0
     [30.0, 40.0],   # Info at position 1
     [50.0, 60.0]],  # Info at position 2
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [11]:
# The dot product measures similarity: `keys @ query` yields one score per
# row of `keys`, namely that row's dot product with `query`.
scores = keys @ query
print("Scores:", scores)
# Expected values: tensor([2.0000, 0.0000, 1.8000])

Scores: tensor([2.0000, 0.0000, 1.8000], device='cuda:0')


In [12]:
# Softmax over dim 0 turns the scores into positive weights that sum to 1.
weights = F.softmax(scores, dim=0)
print("Weights:", weights)
# Expected values: tensor([0.5118, 0.0693, 0.4190])
# i.e. exp(2.0), exp(0.0), exp(1.8) each divided by their sum (about 14.4387).

Weights: tensor([0.5118, 0.0693, 0.4190], device='cuda:0')


In [13]:
# Combine values using attention weights.
# The accumulator is built with zeros_like so its length, dtype and device
# follow `values` instead of being hard-coded to 2 elements on the GPU.
output = torch.zeros_like(values[0])
for i, weight in enumerate(weights):
    output += weight * values[i]

print("Output:", output)
# Expected values: tensor([28.1447, 38.1447])

Output: tensor([28.1447, 38.1447], device='cuda:0')


### Build a simple attention layer

In [21]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SimpleAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected by three learned linear layers into queries, keys
    and values; the values are then mixed using softmax-normalised query/key
    similarities.

    Args:
        embed_dim: Size of the last dimension of the input. Each of the three
            projections maps ``embed_dim -> embed_dim``.
        device: Device on which to place the parameters. ``None`` (default)
            selects CUDA when it is available and falls back to the CPU
            otherwise, so the layer can also be built on hosts without a GPU.
    """

    def __init__(self, embed_dim, device=None):
        super().__init__()
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"
        # Three learned linear transformations. Each layer is created first
        # and moved afterwards, so parameter initialisation is unaffected by
        # the target device.
        self.query = nn.Linear(embed_dim, embed_dim).to(device)
        self.key = nn.Linear(embed_dim, embed_dim).to(device)
        self.value = nn.Linear(embed_dim, embed_dim).to(device)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape (batch, seq_len, embed_dim), e.g. (1, 10, 64)
                = 1 sentence, 10 words, 64-dim embeddings. It must live on
                the same device as the layer's parameters.

        Returns:
            Tensor of shape (batch, seq_len, embed_dim): for every position,
            the attention-weighted combination of the value vectors.
        """
        # Transform input into Query, Key, Value
        Q = self.query(x)  # What each position is looking for
        K = self.key(x)    # What each position represents
        V = self.value(x)  # What info each position has

        # Similarity between queries and keys, shape (batch, seq_len, seq_len):
        # scores[..., i, j] = how much position i attends to position j.
        # Dividing by sqrt(d_k) scales the dot products by the square root of
        # the feature dimension.
        d_k = Q.size(-1)
        scores = (Q @ K.transpose(-2, -1)) / (d_k ** 0.5)

        # Softmax over the last dim gives a probability distribution:
        # each row sums to 1.
        attn_weights = F.softmax(scores, dim=-1)

        # Weighted combination of values
        return attn_weights @ V


In [22]:
# Create attention layer
attention = SimpleAttention(embed_dim=64)

# Input: 1 batch, 10 words, 64-dim embeddings.
# The tensor is moved to whichever device the layer's parameters live on,
# rather than unconditionally to the GPU.
x = torch.randn(1, 10, 64).to(next(attention.parameters()).device)

# Apply attention
output = attention(x)

print(f"Input shape: {x.shape}")      # torch.Size([1, 10, 64])
print(f"Output shape: {output.shape}") # torch.Size([1, 10, 64])

Input shape: torch.Size([1, 10, 64])
Output shape: torch.Size([1, 10, 64])
