In [3]:
pip install torch

Collecting torch
  Downloading torch-2.10.0-cp312-cp312-win_amd64.whl.metadata (31 kB)
Collecting sympy>=1.13.3 (from torch)
  Using cached sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.10.0-cp312-cp312-win_amd64.whl (113.8 MB)
   ---------------------------------------- 0.0/113.8 MB ? eta -:--:--
   ---------------------------------------- 0.3/113.8 MB ? eta -:--:--
   ---------------------------------------- 0.8/113.8 MB 3.1 MB/s eta 0:00:37
    --------------------------------------- 1.8/113.8 MB 3.6 MB/s eta 0:00:32
    --------------------------------------- 2.6/113.8 MB 4.0 MB/s eta 0:00:28
   - -------------------------------------- 3.9/113.8 MB 4.2 MB/s eta 0:00:27
   - -------------------------------------- 5.0/113.8 MB 4.4 MB/s eta 0:00:25
   -- ------------------------------------- 6.0/113.8 MB 4.5 MB/s eta 0:00:24
   -- ------------------------------------- 7.1/113.8 MB 4.7 MB/s eta 0:00:23
   -- ------------------------------------- 8.4/113.8 MB 4.7 MB

In [25]:
# Toy sentence from which a tiny word-level vocabulary is built.
sentence = 'Enjoy every moment, death is unexpected'

# Drop the comma, split on whitespace and sort the words; a word's position in the
# sorted list becomes its integer id.
sorted_words = sorted(sentence.replace(',', '').split())
dc = dict(zip(sorted_words, range(len(sorted_words))))

print(dc)

{'Enjoy': 0, 'death': 1, 'every': 2, 'is': 3, 'moment': 4, 'unexpected': 5}


In [9]:
# Input embeddings
import torch

# Fix the global RNG seed so that subsequent random draws are repeatable.
torch.manual_seed(123)

# Example sentence embeddings: six rows, three features per row.
inputs = torch.tensor([
    [1.0, 0.0, 1.0],
    [0.0, 2.0, 0.0],
    [1.0, 1.0, 1.0],
    [0.0, 1.0, 2.0],
    [1.0, 0.0, 0.0],
    [2.0, 1.0, 0.0],
])

d_out = 2                # output dimension
d_in = inputs.size(1)    # input embedding dimension, read from the tensor's second axis


In [27]:
import torch

# Map every word of the sentence (comma removed, original word order) to its
# vocabulary id from `dc`, then pack the ids into a 1-D tensor.
token_ids = [dc[word] for word in sentence.replace(',', '').split()]
sentence_int = torch.tensor(token_ids)
print(sentence_int)

tensor([0, 2, 4, 1, 3, 5])


In [29]:
# Embedding table with vocab_size rows of 3 features each, initialised under a fixed seed.
vocab_size = 50_000
torch.manual_seed(123)
embed = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=3)

# Look up the rows for the sentence's token ids; detach() returns a tensor that is
# no longer attached to the autograd graph.
embedded_sentence = embed(sentence_int).detach()
print(embedded_sentence, embedded_sentence.shape, sep='\n')


tensor([[ 0.3374, -0.1778, -0.3035],
        [-0.2196, -0.3792,  0.7671],
        [ 0.1794,  1.8951,  0.4954],
        [-0.5880,  0.3486,  0.6603],
        [-1.1925,  0.6984, -1.4097],
        [ 0.2692, -0.0770, -1.0205]])
torch.Size([6, 3])


In [11]:
# Defining the Weight Matrices
# Seed right before sampling: otherwise the values depend on whatever consumed the
# global RNG in cells that ran earlier, and a top-to-bottom re-run of the notebook
# would not reproduce the same projection matrices.
torch.manual_seed(123)

# Three independent (d_in, d_out) matrices drawn uniformly from [0, 1); the draw
# order (query, key, value) determines which random values each matrix receives.
W_query = torch.rand(d_in, d_out)
W_key   = torch.rand(d_in, d_out)
W_value = torch.rand(d_in, d_out)

In [13]:
 # Computing Queries, Keys, and Values
# Project the inputs with each weight matrix in turn (query, key, value order).
queries, keys, values = (
    inputs @ weight for weight in (W_query, W_key, W_value)
)

In [15]:
# Attention Score Calculation (Dot Product)
# Entry [i, j] is the dot product of query row i with key row j.
attention_scores = queries.matmul(keys.t())

In [17]:
# Scaling the Attention Scores (√dₖ)
# d_k is the size of the keys' second axis; every score is divided by its square root.
d_k = keys.size(1)
scaled_attention_scores = attention_scores.div(d_k ** 0.5)

In [19]:
 # Softmax to Obtain Attention Weights
# Softmax over the last axis turns each row of scaled scores into weights that sum to 1.
attention_weights = scaled_attention_scores.softmax(dim=-1)

In [21]:
# Context Vectors (Final Output)
# Each output row is the attention-weighted combination of the value rows.
context_vectors = torch.matmul(attention_weights, values)
print(context_vectors)

tensor([[0.4944, 1.3908],
        [0.4953, 1.4006],
        [0.5197, 1.5423],
        [0.5272, 1.5856],
        [0.4474, 1.1954],
        [0.5106, 1.4955]])


In [23]:
# Weight Matrices
import torch
# Seed first so the three matrices below are the same on every run.
torch.manual_seed(123)

d_in = 3
d_out = 2

# Draw the matrices one after another in query, key, value order; the order matters
# because each draw advances the global RNG.
W_query, W_key, W_value = (torch.rand(d_in, d_out) for _ in range(3))

# Print each matrix under its heading.
for heading, weight in (
    ("W_query:\n", W_query),
    ("\nW_key:\n", W_key),
    ("\nW_value:\n", W_value),
):
    print(heading, weight)

W_query:
 tensor([[0.2961, 0.5166],
        [0.2517, 0.6886],
        [0.0740, 0.8665]])

W_key:
 tensor([[0.1366, 0.1025],
        [0.1841, 0.7264],
        [0.3153, 0.6871]])

W_value:
 tensor([[0.0756, 0.1966],
        [0.3164, 0.4017],
        [0.1186, 0.8274]])
