In [1]:
import os
# Expose no CUDA devices to this process; this is set before TensorFlow is
# imported in the next cell.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

In [2]:
import tensorflow as tf

In [3]:
from model import get_attention_mask, Attention, Transformer, PositionEmbedding, GPT

In [4]:
# Tensor dimensions shared by every check below:
#   B = batch_size, L = seq_len, C = context_size (passed as the embedding size)
B, L, C = 4, 10, 6

# Model hyper-parameters handed to the layers under test.
vocab_size = 123
max_position = 40
layer_size = 4
attention_head = 3

# Dropout rates for the embedding, attention and residual paths.
embedding_dropout = 0.1
attention_dropout = 0.1
residual_dropout = 0.1

In [9]:
# For size 3 the mask must be lower-triangular: row i has ones in
# columns 0..i and zeros after that.
expected_mask = [
    [1.0, 0.0, 0.0],
    [1.0, 1.0, 0.0],
    [1.0, 1.0, 1.0],
]
actual_mask = get_attention_mask(3).numpy().tolist()
assert actual_mask == expected_mask

In [10]:
# The first element of the Attention result must keep the (B, L, C) shape
# of its input.
attention_layer = Attention(
    embedding_size=C,
    num_attention_heads=attention_head,
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout,
)
attention_result = attention_layer(tf.random.uniform((B, L, C)))
assert attention_result[0].shape == [B, L, C]

In [11]:
# The first element of the Transformer result must keep the (B, L, C) shape
# of its input.
transformer_block = Transformer(
    C,
    num_attention_heads=attention_head,
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout,
)
transformer_result = transformer_block(tf.random.uniform((B, L, C)))
assert transformer_result[0].shape == [B, L, C]

In [12]:
# The position embedding output is expected to have a leading dimension of 1
# (not B), followed by the sequence length and embedding size of the input.
position_embedding = PositionEmbedding(max_position, C)
position_output = position_embedding(tf.random.uniform((B, L, C)))
assert position_output.shape == [1, L, C]

In [13]:
# A GPT forward pass over a (B, L) batch of integer token ids must yield
# one vocab_size-wide vector per position.
gpt_model = GPT(
    vocab_size=vocab_size,
    layer_size=layer_size,
    block_size=max_position,
    embedding_dropout=embedding_dropout,
    embedding_size=C,
    num_attention_heads=attention_head,
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout,
)
token_ids = tf.random.uniform((B, L), dtype=tf.int64, maxval=10)
assert gpt_model(token_ids).shape == [B, L, vocab_size]

In [14]:
# Incremental-decoding check for a single Transformer block: feeding the
# first L-1 steps, then the last step together with the returned `kv`,
# must reproduce the last step of a single pass over the whole sequence.
transformer = Transformer(C, attention_head, 0.1, 0.1)
x = tf.random.uniform((B, L, C))

prefix_steps = x[:, :-1, :]   # every step except the last
final_step = x[:, -1:, :]     # last step only, sequence axis kept

y, kv = transformer(prefix_steps)
y2, kv2 = transformer(final_step, kv)   # second argument: kv from the prefix call
y3, kv3 = transformer(x[:, :, :])       # one pass over the whole sequence

assert y.shape == (B, L - 1, C)
assert y2.shape == (B, 1, C)
assert y3.shape == (B, L, C)

# Compare the incrementally computed last step with the full-pass last step
# (first batch element only) using the summed squared difference.
last_step_gap = y2[0, 0] - y3[0, -1]
assert tf.reduce_sum(tf.pow(last_step_gap, 2)).numpy() < 1e-8

In [15]:
# Consistency check for the full GPT model.
#
# The previous version of this cell built `gpt` but then repeated the
# single-block `transformer` check, so the GPT model itself was never
# exercised. This version calls `gpt` directly, using only the call form
# already used elsewhere in this notebook (integer token ids in, logits
# tensor out).
#
# Property checked: the logits for the first L-1 positions must be the same
# whether the model sees only those L-1 tokens or the whole L-token sequence,
# i.e. a position's output must not depend on later tokens.
#
# NOTE(review): GPT's key/value-cache call signature is not visible from this
# notebook, so the cached-decoding path is not tested here. TODO: add a
# cached-step comparison once that interface is confirmed.
# NOTE(review): assumes dropout is inactive when the model is called without
# a training flag; confirm against the model implementation.
gpt = GPT(
    vocab_size=vocab_size,
    layer_size=layer_size,
    block_size=max_position,
    embedding_dropout=embedding_dropout,
    embedding_size=C,
    num_attention_heads=attention_head,
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout
)

# Token ids are drawn from [0, 10), which is inside the vocabulary.
gpt_tokens = tf.random.uniform((B, L), dtype=tf.int64, maxval=10)

gpt_logits_full = gpt(gpt_tokens)             # whole sequence
gpt_logits_prefix = gpt(gpt_tokens[:, :-1])   # all but the last token

assert gpt_logits_full.shape == [B, L, vocab_size]
assert gpt_logits_prefix.shape == [B, L - 1, vocab_size]

# Max absolute difference rather than a summed square: the comparison spans
# B * (L - 1) * vocab_size values, so a sum would scale with tensor size.
gpt_prefix_gap = tf.abs(gpt_logits_prefix - gpt_logits_full[:, :-1])
assert tf.reduce_max(gpt_prefix_gap).numpy() < 1e-5