Q23

In [1]:
import numpy as np
# Input token embeddings: rows are tokens, columns are model dimensions
# (the shape is later read as (sequence_len, d_model)).
word_embedding = np.array(
    [
        [0.8, 1.2, 0.3, 1.7],
        [1.1, 0.6, 0.9, 1.4],
        [0.5, 1.3, 0.7, 1.0],
    ]
)

def get_position_encoding(sequence_len, d_model):
    """Build a sinusoidal positional-encoding table.

    Even columns hold sin(pos / 10000**(2i / d_model)) and odd columns hold
    the cosine of the same angle, where 2i is the even column index.

    Args:
        sequence_len: Number of positions (rows) to encode.
        d_model: Width of the encoding (columns). May be odd, in which case
            the last column is a sine column without a cosine partner.

    Returns:
        Array of shape (sequence_len, d_model).
    """
    # Positions [0, 1, ..., sequence_len-1] as a column vector so they
    # broadcast against the per-column denominators.
    position = np.arange(sequence_len)[:, np.newaxis]  # Shape: (seq_len, 1)

    # Even column indices [0, 2, 4, ...] below d_model.
    div_term = np.arange(0, d_model, 2)

    denominator = np.power(10000, div_term / d_model)

    # One angle per (position, even column) pair.
    angles = position / denominator  # Shape: (seq_len, ceil(d_model / 2))

    pe = np.zeros((sequence_len, d_model))

    pe[:, 0::2] = np.sin(angles)
    # There are only d_model // 2 odd columns; when d_model is odd that is one
    # fewer than the number of even columns, so drop the surplus angle column.
    pe[:, 1::2] = np.cos(angles[:, : d_model // 2])

    return pe

# The embedding matrix is (tokens, features); reuse its shape for the encoding.
sequence_len, d_model = word_embedding.shape

positional_encoding = get_position_encoding(sequence_len, d_model)

# Add the position signal element-wise to the token embeddings.
positional_embedding = word_embedding + positional_encoding
print(np.round(positional_embedding, 1))


    

[[0.8 2.2 0.3 2.7]
 [1.9 1.1 0.9 2.4]
 [1.4 0.9 0.7 2. ]]


Q24

In [2]:
# Query projection parameters. Each bias below is one row of 4 values repeated
# for 3 rows, so it is built with np.tile instead of being spelled out.
query_weights = np.array(
    [
        [0.5, 0.3, 0.7, 0.2],
        [0.8, 0.1, 0.5, 0.2],
        [0.3, 0.8, 0.9, 0.5],
        [0.5, 0.8, 0.9, 0.2],
    ]
)
query_bias = np.tile([0.2, 0.7, 0.8, 0.8], (3, 1))

# Key projection parameters.
key_weights = np.array(
    [
        [0.2, 0.7, 0.2, 0.5],
        [0.8, 0.8, 0.2, 0.8],
        [0.5, 0.3, 0.8, 0.8],
        [0.2, 0.9, 0.2, 0.7],
    ]
)
key_bias = np.tile([0.5, 0.5, 0.8, 0.2], (3, 1))

# Value projection parameters.
value_weights = np.array(
    [
        [0.07, 0.33, 0.03, 0.37],
        [0.12, 0.28, 0.18, 0.22],
        [0.27, 0.13, 0.23, 0.17],
        [0.38, 0.04, 0.32, 0.06],
    ]
)
value_bias = np.tile([0.28, 0.47, 0.72, 0.95], (3, 1))

# Linear projections of the position-aware embeddings into query, key and
# value matrices: X @ W.T + b (one projection each, i.e. a single head).
query_matrix = positional_embedding @ query_weights.T + query_bias
key_matrix = positional_embedding @ key_weights.T + key_bias
value_matrix = positional_embedding @ value_weights.T + value_bias

print(np.round(query_matrix, 1))
print(np.round(key_matrix, 1))
print(np.round(value_matrix, 1))



[[2.  2.2 4.4 3.8]
 [2.6 3.3 4.3 4. ]
 [2.1 2.7 3.6 3.3]]
[[3.6 5.1 4.3 4.3]
 [3.1 5.1 4.8 3.5]
 [2.5 4.1 3.9 2.8]]
[[2.1 1.8 1.8 1.6]
 [1.7 1.7 2.  2.2]
 [1.4 1.5 1.7 1.9]]


Q25

In [4]:
# Attention scores: Q @ K.T divided by sqrt(d_model), kept at one decimal place.
scaled_dot_product_attention = np.round(
    (query_matrix @ key_matrix.T) / np.sqrt(d_model), 1
)
print(scaled_dot_product_attention)


[[26.9 25.9 21.2]
 [30.9 29.6 24.2]
 [25.2 24.1 19.8]]


Q26

In [6]:
# NOTE(review): these scores are hard-coded and differ slightly from the values
# printed by the Q25 cell (e.g. 27.1 vs 26.9, 24.7 vs 24.1) — confirm whether
# they are given inputs for this question or should come from the computation.
scaled_dot_product = np.array(
    [
        [27.1, 26.0, 21.2],
        [30.9, 29.6, 24.2],
        [25.3, 24.7, 19.8],
    ]
)

# Additive mask: -inf strictly above the diagonal, 0 on and below it.
mask = np.where(np.triu(np.ones((3, 3), dtype=bool), k=1), -np.inf, 0.0)

def softmax(x):
    """Softmax along the last axis, computed in a numerically stable way.

    Args:
        x: Array-like of scores with at least one dimension. An entry of -inf
            gets probability 0 as long as its row has a finite maximum.

    Returns:
        Array of the same shape whose last-axis slices sum to 1.
    """
    x = np.asarray(x)
    # Shifting by the row maximum does not change the result mathematically,
    # but keeps np.exp from overflowing to inf (which would give inf/inf = nan).
    shifted = x - np.max(x, axis=-1, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=-1, keepdims=True)

# Attention weights: softmax over the masked scores (masked entries add -inf).
scaled_dot_product_attention = softmax(scaled_dot_product + mask)

# Each output row is the attention-weighted combination of the value rows.
attention_output = scaled_dot_product_attention @ value_matrix
print(np.round(attention_output, 1))



[[2.1 1.8 1.8 1.6]
 [2.  1.8 1.8 1.7]
 [1.9 1.8 1.8 1.8]]


Q27

In [12]:
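# Reference values, kept commented out: the rounded outputs printed by the
# Q26 cell (attention_output) and the Q23 cell (positional_embedding).
# attention_output = np.array([
#     [2.1, 1.8, 1.8, 1.6],
#     [2.0, 1.8, 1.8, 1.7],
#     [1.9, 1.8, 1.8, 1.8]
# ])

# positional_embedding = np.array([
#     [0.8, 2.2, 0.3, 2.7],
#     [1.9, 1.1, 0.9, 2.4],
#     [1.4, 0.9, 0.7, 2.0]
# ])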

def layer_norm(x: np.ndarray, gamma: float = 1.0, beta: float = 0.0, epsilon: float = 0.0) -> np.ndarray:
    """
    Layer-normalize `x` over its last axis, then scale and shift the result.

    Each last-axis slice is centered on its mean and divided by
    sqrt(variance + epsilon); the result is multiplied by `gamma` and `beta`
    is added. With the default epsilon of 0.0, a slice with zero variance
    divides by zero.

    Args:
        x: Input tensor
        gamma: Multiplicative scale applied after normalization (default = 1.0)
        beta: Additive shift applied after scaling (default = 0.0)
        epsilon: Constant added to the variance before the square root (default = 0.0)

    Returns:
        Tensor with the same shape as `x`
    """
    # Reduce over the last axis but keep it, so the statistics broadcast back.
    last_axis = dict(axis=-1, keepdims=True)

    mu = np.mean(x, **last_axis)
    sigma = np.sqrt(np.var(x, **last_axis) + epsilon)

    return gamma * ((x - mu) / sigma) + beta

# Sum of the attention output and the attention input, with both operands
# rounded to one decimal place before adding.
combined = np.round(positional_embedding, 1) + np.round(attention_output, 1)

# Plain normalization: unit scale, zero shift, no epsilon.
layer_norm_output = layer_norm(combined, gamma=1.0, beta=0.0, epsilon=0.0)

print(np.round(layer_norm_output, 2))



[[-0.48  0.77 -1.39  1.11]
 [ 0.82 -0.82 -1.15  1.15]
 [ 0.44 -0.73 -1.12  1.42]]
