In [1]:
import os
import re
from copy import deepcopy

import random
import numpy as np

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf

In [3]:
from model import GPT

In [4]:
# Open a reader over the checkpoint `model.ckpt-220000` found under `model_path`.
# Variable names and tensors are pulled from this reader by the cells below.
model_path = '.'
ckpt_reader = tf.train.load_checkpoint(
        os.path.join(model_path, 'model.ckpt-220000'))

In [5]:
# Names of every variable stored in the checkpoint (keys of its name -> dtype map).
weights = list(ckpt_reader.get_variable_to_dtype_map().keys())

In [6]:
# Model hyper-parameters. Presumably they have to match the checkpoint being
# converted: the weight-mapping cell further down compares tensor shapes and
# prints every checkpoint entry that does not line up with a model weight.
embedding_size = 1536
# B, L, C = batch_size, seq_len, context_size; only C is used by the visible cells.
B, L, C = [4, 1024, embedding_size]
vocab_size = 8021
max_position = 1024
attention_head = 24
attention_dropout = 0.1
residual_dropout = 0.1
layer_size = 48
embedding_dropout = 0.1

In [7]:
def GPTModel(**kwargs):
    """Wrap ``GPT`` in a functional Keras model fed by int64 token ids.

    Every keyword argument is forwarded unchanged to the ``GPT`` constructor.
    The returned ``tf.keras.Model`` has a single variable-length input named
    ``input_ids`` and outputs whatever the ``GPT`` instance returns for it.
    """
    token_ids = tf.keras.layers.Input(
        shape=(None, ), name='input_ids', dtype=tf.int64)
    gpt_layer = GPT(**kwargs)
    return tf.keras.Model(inputs=token_ids, outputs=gpt_layer(token_ids))

In [8]:
# Alternative kept for reference: build the model through the functional
# GPTModel wrapper instead of instantiating GPT directly.
# gpt = GPTModel(
#     vocab_size=vocab_size,
#     layer_size=layer_size,
#     block_size=max_position,
#     embedding_dropout=embedding_dropout,
#     embedding_size=C,
#     num_attention_heads=attention_head,
#     attention_dropout=attention_dropout,
#     residual_dropout=residual_dropout
# )
# Instantiate GPT directly with the hyper-parameters from the config cell.
gpt = GPT(
    vocab_size=vocab_size,
    layer_size=layer_size,
    block_size=max_position,
    embedding_dropout=embedding_dropout,
    embedding_size=C,
    num_attention_heads=attention_head,
    attention_dropout=attention_dropout,
    residual_dropout=residual_dropout
)
# NOTE(review): `_set_inputs` is a private Keras method. Presumably it is called
# so the model has declared int64 inputs of unknown batch/length before it is
# built and saved — confirm it still exists in the TensorFlow version in use.
gpt._set_inputs(
    tf.keras.backend.placeholder((None, None), dtype=tf.int64))

In [9]:
# Build with both input dimensions left unknown (presumably batch and sequence
# length) so that `gpt.weights` is populated for the cells below.
gpt.build(tf.TensorShape([None, None]))

In [10]:
# Number of weight tensors exposed by the built model.
print(len(gpt.weights))

772


In [11]:
# Peek at the first ten variable names: this is the naming scheme the
# checkpoint keys have to be translated into.
[x.name for x in gpt.weights[:10]]

['embedding/embeddings:0',
 'position_embeddings:0',
 'LayerNorm_embed_norm/gamma:0',
 'LayerNorm_embed_norm/beta:0',
 'layer00/attention/query_layer/kernel:0',
 'layer00/attention/query_layer/bias:0',
 'layer00/attention/key_layer/kernel:0',
 'layer00/attention/key_layer/bias:0',
 'layer00/attention/value_layer/kernel:0',
 'layer00/attention/value_layer/bias:0']

In [12]:
# Index the model's variables by name with the last two characters removed
# (the ":0" suffix visible in the listing above), keeping model order in `names`.
names = [weight.name[:-2] for weight in gpt.weights]
name_shape = {weight.name[:-2]: weight.shape for weight in gpt.weights}

In [13]:
# Should equal the number of model weights printed earlier.
print(len(names))

772


In [14]:
# Checkpoint variable names in sorted order, leaving out every key that contains
# 'adafactor_' (presumably Adafactor optimizer slots, which the model has no use for).
all_layer = sorted(
    ckpt_name for ckpt_name in weights if 'adafactor_' not in ckpt_name)

In [15]:
# One more entry than the model has weights; the mapping cell below reports
# `global_step` as the only checkpoint variable without a model counterpart.
print(len(all_layer))

773


In [16]:
# Translate checkpoint variable names into the Keras model's naming scheme and
# collect every tensor whose translated name AND shape match a model weight.
# The substitutions are applied to each name one after another, in this order.
ckpt_to_keras_renames = [
    ('newslm/', ''),
    ('context_projection_layer', 'attention/context_projection_layer'),
    ('key_layer', 'attention/key_layer'),
    ('query_layer', 'attention/query_layer'),
    ('value_layer', 'attention/value_layer'),
    ('embeddings/LayerNorm_embed_norm', 'LayerNorm_embed_norm'),
    ('embeddings/pos_embed', 'position_embeddings'),
    ('embeddings/word_embed', 'embedding/embeddings'),
]

# Model weights still waiting for a value; entries are removed as they are matched.
copy_name_shape = deepcopy(name_shape)
# Model weight name -> tensor read from the checkpoint.
output = {}
for ckpt_name in all_layer:
    tensor = ckpt_reader.get_tensor(ckpt_name)
    keras_name = ckpt_name
    for old, new in ckpt_to_keras_renames:
        keras_name = keras_name.replace(old, new)

    matched = (keras_name in copy_name_shape
               and tensor.shape == copy_name_shape[keras_name])
    if matched:
        output[keras_name] = tensor
        del copy_name_shape[keras_name]
    else:
        # Checkpoint entry with no usable counterpart in the model
        # (unknown name, shape mismatch, or a name that was already consumed).
        print(keras_name, tensor.shape, matched)

# Anything left here would make the following `set_weights` cell fail with a
# bare KeyError, so name the missing weights explicitly.
if copy_name_shape:
    print('model weights without a checkpoint tensor:', sorted(copy_name_shape))

global_step () False


In [17]:
# Load the converted tensors in the exact order of `gpt.weights`. A KeyError
# here means a model weight was never matched by the mapping cell above.
gpt.set_weights([output[w.name[:-2]] for w in gpt.weights])

In [18]:
# Input  [687, 1646, 1646, 3134, 581, 5774]
# Output [1646, 2741, 3134, 2157, 5774, 6294]

In [19]:
# WordPiece tokenizer loaded from the vocabulary file `./clue-vocab.txt`.
# `add_special_tokens=False` presumably keeps marker ids such as [CLS]/[SEP] out
# of the prompt: the four-character prompt encoded further down yields exactly
# four ids.
from tokenizers import BertWordPieceTokenizer
tokenizer = BertWordPieceTokenizer(
    './clue-vocab.txt',
    lowercase=True,
    add_special_tokens=False)

In [20]:
def batch_gather(a, b):
    """Gather entries of ``a`` at indices ``b`` row by row (axis 0 is the batch axis)."""
    gathered = tf.gather(params=a, indices=b, batch_dims=1)
    return gathered


def top_p_sample(logits, num_samples=1, p=0.95):
    """Draw vocabulary ids with nucleus (top-p) sampling.

    Args:
        logits: float tensor of shape [batch_size, vocab_size] holding
            unnormalised scores. The vocabulary dimension may be unknown at
            trace time; it is read from the tensor at run time.
        num_samples: number of ids to draw for each row.
        p: cumulative-probability cutoff. Either a Python number applied to
            every row, or a tensor of shape [batch_size] with one cutoff per row.

    Returns:
        int64 tensor of shape [batch_size, num_samples] with sampled vocabulary ids.
    """
    probs = tf.nn.softmax(logits, axis=-1)
    # [batch_size, vocab_size]: vocabulary ids ordered from most to least likely.
    indices = tf.argsort(probs, direction='DESCENDING')
    sorted_probs = batch_gather(probs, indices)
    cumulative_probabilities = tf.math.cumsum(sorted_probs, axis=-1, exclusive=False)

    # A Python number (int or float) is one cutoff for all rows; anything else is
    # treated as a per-row vector and given a trailing axis so it broadcasts.
    p_expanded = p if isinstance(p, (int, float)) else p[:, None]

    # Keep the sorted positions whose running probability mass is still below
    # the cutoff, and always keep position 0 so a row can never be emptied.
    vocab_size = tf.shape(logits)[-1]
    keep_mask = tf.logical_or(
        cumulative_probabilities < p_expanded,
        tf.range(vocab_size)[None] < 1)
    exclude_mask = tf.logical_not(keep_mask)

    # Sample in the sorted space, then map the sampled positions back to
    # vocabulary ids. Excluded positions get a large negative offset so they are
    # effectively never drawn.
    logits_to_use = batch_gather(logits, indices) - tf.cast(exclude_mask, tf.float32) * 1e10
    sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
    sample = batch_gather(indices, sample_perm)

    return tf.cast(sample, tf.int64)

In [21]:
# def sample(tokenizer, gpt, sentence,
#            number=5, length=15, layer_size=48,
#            embedding_size=1536, attention_head=24):

#     tokens = tokenizer.encode(sentence).ids
#     i = tf.constant(0, dtype=tf.int64)
#     initial_inputs = tf.constant([tokens] * number, dtype=tf.int64)
#     initial_logits, kv_cache = gpt(initial_inputs, use_cache=True)
#     inputs = top_p_sample(initial_logits[:, -1, :])
#     stores = tf.concat([initial_inputs, inputs], axis=1)
    
#     def _cond(i, inputs, kv_cache, stores):
#         return i < length

#     def _body(i, inputs, kv_cache, stores):
#         new_logits, new_kv_cache = gpt(inputs, kv_cache=kv_cache, use_cache=True)
#         new_inputs = top_p_sample(new_logits[:, -1, :])
#         new_stores = tf.concat([stores, new_inputs], axis=-1)
#         new_kv_cache = tf.concat([
#             kv_cache,
#             new_kv_cache
#         ], axis=-2)
#         new_i = i + 1
#         return [new_i, new_inputs, new_kv_cache, new_stores]

#     result = tf.while_loop(
#         _cond, _body,
#         loop_vars=[i, inputs, kv_cache, stores],
#         shape_invariants=[
#             tf.TensorShape(None),
#             tf.TensorShape([number, None]),
#             tf.TensorShape([
#                 layer_size, number, 2,
#                 attention_head, None,
#                 embedding_size // attention_head
#             ]),
#             tf.TensorShape([
#                 number, None
#             ])
#         ]
#     )
#     return result[-1]

In [22]:
# ret = sample(tokenizer, gpt, '生活总是要继续', length=50)

In [23]:
# for x in ret.numpy():
#     print(tokenizer.decode(x))
#     print()

In [24]:
@tf.function
def serve(inputs):
    """Run ``gpt`` on ``inputs`` with no previous key/value cache and caching enabled.

    Returns whatever ``gpt`` returns for that call; other cells in this notebook
    unpack it as ``(logits, kv_cache)``.
    """
    no_cache = None
    return gpt(inputs, kv_cache=no_cache, use_cache=True)


@tf.function
def serve_cache(inputs, kv_cache):
    """Run ``gpt`` on ``inputs`` while handing it an existing ``kv_cache``.

    Caching stays enabled, so the call returns whatever ``gpt`` returns with
    ``use_cache=True``; other cells unpack it as ``(logits, kv_cache)``.
    """
    outputs = gpt(inputs, kv_cache=kv_cache, use_cache=True)
    return outputs

In [25]:
# Concrete trace of `serve` for int64 ids with both dimensions left unknown.
serve_concrete = serve.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp")
)

In [26]:
# Concrete trace of `serve_cache`: int64 ids plus a float32 cache laid out as
# [layer_size, ?, 2, attention_head, ?, embedding_size // attention_head].
# The two unknown axes are presumably batch and cached sequence length, and the
# size-2 axis presumably stacks keys and values — TODO confirm against model.py.
serve_cache_concrete = serve_cache.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp"),
    tf.TensorSpec(shape=[
        layer_size, None, 2, attention_head,
        None, embedding_size // attention_head
    ], dtype=tf.float32, name="kv_cache")
)

In [27]:
# Smoke test: run the traced function on a single token id (a 1x1 batch).
r = serve_concrete(
    tf.constant([[1]], tf.int64)
)

In [28]:
# Shapes of the two outputs: r[0] ends in vocab_size, r[1] is the cache.
print(r[0].shape, r[1].shape)

(1, 1, 8021) (48, 1, 2, 24, 1, 64)


In [29]:
# Feed the same single token again, this time with the cache from the first call.
r2 = serve_cache_concrete(
    tf.constant([[1]], tf.int64),
    r[1]
)

In [30]:
# The returned cache has the same shape as the one passed in, i.e. it is not
# already merged with the input cache; `sample` concatenates the two itself.
print(r2[0].shape, r2[1].shape)

(1, 1, 8021) (48, 1, 2, 24, 1, 64)


In [31]:
@tf.function
def sample(initial_inputs, length):
    """Extend a batch of prompts token by token using top-p sampling.

    The prompt is run once through ``serve`` to obtain the first sampled token
    and the initial key/value cache. After that, each loop iteration feeds only
    the most recent token together with the accumulated cache to ``serve_cache``.

    Args:
        initial_inputs: int64 tensor [batch, prompt_len] of prompt token ids.
        length: int64 tensor giving the number of loop iterations to run after
            the first token has been sampled.

    Returns:
        int64 tensor [batch, prompt_len + length + 1]: the prompt followed by
        the sampled ids. One id comes from the prompt pass and one from each
        loop iteration, hence ``length + 1`` new ids.
    """
    # Shape constants come from the notebook's config cell so the loop invariants
    # cannot drift away from the model that was actually built.
    head_size = embedding_size // attention_head

    i = tf.constant(0, dtype=tf.int64)
    initial_logits, kv_cache = serve(initial_inputs)
    # Only the logits at the last prompt position decide the next token.
    inputs = top_p_sample(initial_logits[:, -1, :])
    stores = tf.concat([initial_inputs, inputs], axis=1)

    def _cond(i, inputs, kv_cache, stores):
        return i < length

    def _body(i, inputs, kv_cache, stores):
        new_logits, new_kv_cache = serve_cache(inputs, kv_cache)
        new_inputs = top_p_sample(new_logits[:, -1, :])
        new_stores = tf.concat([stores, new_inputs], axis=-1)
        # Append the cache entries produced by this step along the
        # second-to-last axis, the one left unknown in the invariant below.
        new_kv_cache = tf.concat([kv_cache, new_kv_cache], axis=-2)
        return [i + 1, new_inputs, new_kv_cache, new_stores]

    result = tf.while_loop(
        _cond, _body,
        loop_vars=[i, inputs, kv_cache, stores],
        # The cache and the collected ids grow on every iteration, so their
        # growing axes (and the batch axis) must be declared as unknown.
        shape_invariants=[
            tf.TensorShape(None),
            tf.TensorShape([None, None]),
            tf.TensorShape([
                layer_size, None, 2,
                attention_head, None,
                head_size
            ]),
            tf.TensorShape([None, None])
        ]
    )
    return result[-1]

In [32]:
# Encode the prompt text into vocabulary ids.
tokens = tokenizer.encode('天气不错').ids

In [33]:
# Show the prompt as a one-row int64 batch, the form `sample` is called with below.
tf.constant([tokens], dtype=tf.int64)

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[1646, 3134,  581, 5774]])>

In [34]:
# Single prompt with length=15: the printed result has 20 ids per row,
# i.e. the 4 prompt ids followed by 16 sampled ids.
ret = sample(
    tf.constant([tokens], dtype=tf.int64),
    tf.constant(15, dtype=tf.int64)
)
print(ret)
for s in ret.numpy():
    print(tokenizer.decode(s))

tf.Tensor(
[[1646 3134  581 5774 6294 2302 1153 3290 5494 3684 6294 2377  705 1181
   579 2805 1160 6294 4384 2863]], shape=(1, 20), dtype=int64)
天 气 不 错 ， 想 去 海 边 玩 ， 所 以 叫 上 朋 友 ， 结 果


In [35]:
# The same prompt twice in one batch with length=6: each row gets its own
# sampled continuation (the two printed rows differ after the prompt).
ret = sample(
    tf.constant([tokens, tokens], dtype=tf.int64),
    tf.constant(6, dtype=tf.int64)
)
print(ret)
for s in ret.numpy():
    print(tokenizer.decode(s))

tf.Tensor(
[[1646 3134  581 5774 6294 1893 1840 5306  708 3261 3241]
 [1646 3134  581 5774 6294 2804  660  678 2413 5507 1047]], shape=(2, 11), dtype=int64)
天 气 不 错 ， 小 宝 贝 们 活 泼
天 气 不 错 ， 有 些 人 把 运 动


In [36]:
# gpt.save('./gpt_model_tf2', include_optimizer=False, signatures={
#     'serving_default': serve_concrete,
#     'serving_cache': serve_cache_concrete
# })

# v = gpt.signatures['serving_default'](
#     inp=tf.constant([
#         [1,],
#         [1,],
#         [1,],
#     ], dtype=tf.int64)
# )
# print(v['output_0'].shape, v['output_1'].shape)
# v2 = gpt.signatures['serving_cache'](
#     inp=tf.constant([
#         [1,],
#         [1,],
#         [1,],
#     ], dtype=tf.int64),
#     kv_cache=v['output_1']
# )
# print(v2['output_0'].shape, v2['output_1'].shape)

# Export a SavedModel whose default serving signature is the whole sampling
# loop: it takes prompt ids `inp` and the iteration count `length`.
# NOTE(review): `length` is declared here with shape [None] (rank 1), whereas
# the calls above pass a scalar. The loop condition `i < length` then yields a
# rank-1 boolean — confirm this is what the serving client is expected to send.
gpt.save('./gpt_model_tf2', include_optimizer=False, signatures={
    'serving_default': sample.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp"),
        tf.TensorSpec(shape=[None,], dtype=tf.int64, name="length")
    )
})

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: ./gpt_model_tf2/assets
