In [1]:
import numpy as np

In [2]:
# Toy hyper-parameters shared by the cells below.
MODEL_DIM = 64    # width of each embedding vector (passed as dim_model)
SEQ_LENGTH = 10   # number of token ids drawn for the example input sequence
VOCAB_SIZE = 100  # number of distinct token ids; also the width of the output layer

In [3]:
def embedding(input, vocab_size, dim_model, rng=None):
    """Look up randomly initialised embedding vectors for a set of token ids.

    A fresh ``(vocab_size, dim_model)`` table of standard-normal values is
    drawn on every call. Two calls therefore only agree when the random
    source is in the same state: seed NumPy's global RNG beforehand, or pass
    a seeded ``rng``.

    Parameters
    ----------
    input : array_like of int
        Token ids used as row indices into the table. Any shape is accepted;
        an out-of-range id raises ``IndexError``.
    vocab_size : int
        Number of rows in the embedding table.
    dim_model : int
        Length of each embedding vector.
    rng : numpy.random.Generator, optional
        Source of randomness for the table. When omitted, the legacy global
        ``np.random`` state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        Array of shape ``np.shape(input) + (dim_model,)``.
    """
    if rng is None:
        embed = np.random.randn(vocab_size, dim_model)
    else:
        embed = rng.standard_normal((vocab_size, dim_model))

    indices = np.asarray(input)
    if indices.size == 0:
        # An empty list/array defaults to a float dtype, which cannot be used
        # as an index; return a correctly shaped empty result instead.
        return np.empty(indices.shape + (dim_model,))

    # Integer-array indexing gathers all rows in one vectorised step.
    return embed[indices]

In [4]:
# Sanity check: three token ids should give three rows of width MODEL_DIM.
embedding(np.array([1, 2, 4]), vocab_size=VOCAB_SIZE, dim_model=MODEL_DIM).shape

(3, 64)

In [5]:
def softmax(x):
    """Numerically stable softmax over the last axis.

    Parameters
    ----------
    x : array_like
        Scores. A 1-D input is treated as a single row.

    Returns
    -------
    numpy.ndarray
        Probabilities that sum to 1 along the last axis. The result has the
        same shape as ``x``, except that 1-D input of length ``n`` is
        returned with shape ``(1, n)``.
    """
    scores = np.atleast_2d(x)

    # Subtract each row's own maximum. Using one global maximum would let
    # rows far below it underflow to all zeros and produce 0/0 = NaN.
    shifted = scores - np.max(scores, axis=-1, keepdims=True)
    e_x = np.exp(shifted)

    # keepdims keeps the normaliser broadcastable for any number of leading
    # (batch) dimensions.
    return e_x / e_x.sum(axis=-1, keepdims=True)

In [6]:
# Quick check of softmax on a small 1-D vector of scores.
softmax(
    np.array([1, 2, 3])
)

array([[0.09003057, 0.24472847, 0.66524096]])

In [8]:
def scaled_dot_product_attention(Q, K, V):
    """Compute ``softmax(Q K^T / sqrt(d_k)) V`` with ``d_k = K.shape[-1]``.

    Parameters
    ----------
    Q, K, V : numpy.ndarray
        Query, key and value matrices. Presumably 2-D with one row per
        position, as in the call made by ``transformer_model`` -- other
        ranks are not exercised in this notebook.

    Returns
    -------
    numpy.ndarray
        The value rows combined using the attention weights.
    """
    # Scaling by sqrt(d_k) keeps the magnitude of the scores from growing
    # with the key dimensionality.
    scale = np.sqrt(K.shape[-1])
    scores = np.dot(Q, K.T) / scale

    weights = softmax(scores)
    return np.dot(weights, V)

In [16]:
def linear_and_softmax(input, dim_model, vocab_size, rng=None):
    """Project features onto the vocabulary and normalise with softmax.

    The projection matrix is freshly drawn from a standard normal
    distribution on every call (there is no bias term and nothing is
    learned). Results are therefore only repeatable when the random source
    is in a known state: seed NumPy's global RNG, or pass a seeded ``rng``.

    Parameters
    ----------
    input : array_like
        Feature matrix whose last dimension is ``dim_model``.
    dim_model : int
        Size of the incoming feature dimension.
    vocab_size : int
        Number of output classes (columns of the projection matrix).
    rng : numpy.random.Generator, optional
        Source of randomness for the projection matrix. When omitted, the
        legacy global ``np.random`` state is used, exactly as before.

    Returns
    -------
    numpy.ndarray
        The result of ``softmax`` applied to the projected logits.
    """
    if rng is None:
        weights = np.random.randn(dim_model, vocab_size)
    else:
        weights = rng.standard_normal((dim_model, vocab_size))

    logits = np.dot(input, weights)

    return softmax(logits)

In [17]:
def transformer_model(input, vocab_size, dim_model):
    """Run one forward pass of a toy attention model with random weights.

    Pipeline: embed the token ids, apply self-attention, project onto the
    vocabulary, then take the most probable token at each position.

    Parameters
    ----------
    input : array_like of int
        Token ids to feed through the model.
    vocab_size : int
        Vocabulary size, used for both the embedding table and the output
        projection.
    dim_model : int
        Embedding / feature width.

    Returns
    -------
    numpy.ndarray
        Index of the highest-probability vocabulary entry along the last
        axis of the output distribution.
    """
    tokens = embedding(input, vocab_size, dim_model)

    # Self-attention: the same embedded sequence is query, key and value.
    attended = scaled_dot_product_attention(tokens, tokens, tokens)

    probabilities = linear_and_softmax(attended, dim_model, vocab_size)

    return np.argmax(probabilities, axis=-1)

In [18]:
# Draw SEQ_LENGTH random token ids from [0, VOCAB_SIZE) as a toy input.
# No seed is set in this notebook, so the values differ from run to run.
input_sequence = np.random.randint(0, VOCAB_SIZE, SEQ_LENGTH)
input_sequence

array([75,  5, 28,  8, 68, 24, 57, 82, 20, 89])

In [None]:
# One forward pass over the example sequence; weights are random, so the
# predicted token ids are not meaningful and change between runs.
output = transformer_model(input_sequence, VOCAB_SIZE, MODEL_DIM) # forward step
output

array([40, 61, 65, 47, 94, 13, 10, 92, 57, 54])