In [1]:
import re

import torch
import numpy as np
import tensorflow as tf

from tf2gpt.model import GPT

In [6]:
# Report the on-disk size of the merged checkpoint file (IPython shell escape).
!du -sh ../merged/iter_0076000/mp_rank_00/model_optim_rng.pt

24G	../merged/iter_0076000/mp_rank_00/model_optim_rng.pt


In [7]:
# Load the whole merged checkpoint onto the CPU (map_location='cpu').
# NOTE(review): torch.load unpickles the file by default, which can execute
# arbitrary code - only load checkpoints from a trusted source.
m0 = torch.load('../merged/iter_0076000/mp_rank_00/model_optim_rng.pt', map_location='cpu')

In [16]:
# Flat list of (parameter name, tensor) pairs, filled in by extract_weight().
m0_weights = []

def extract_weight(w = m0['model'], root=''):
    """Walk the nested dict `w` and append every tensor leaf to `m0_weights`.

    Each leaf is stored as a `(name, tensor)` pair, where `name` is the
    dot-joined path of dict keys with two rewrites applied: the
    '.language_model.' segment is removed and '.topQueryLayer.' becomes
    '.layers.39.'.

    Args:
        w: nested dict to traverse. The default is evaluated once, when this
            function is defined, and is `m0['model']`.
        root: dotted path of the keys already descended through.

    Leaves that are neither a dict nor a `torch.Tensor` are reported with a
    'what?' message and skipped. Calling the function again appends again;
    `m0_weights` is not cleared here.
    """
    for key, value in w.items():
        dotted = root + '.' + key

        if isinstance(value, dict):
            # Descend into the sub-dict, carrying the path so far.
            extract_weight(value, dotted)
            continue

        if not isinstance(value, torch.Tensor):
            print('what?', type(value))
            continue

        name = dotted.replace('.language_model.', '')
        name = name.replace('.topQueryLayer.', '.layers.39.')
        m0_weights.append((name, value))

In [17]:
# Flatten m0['model'] (the default argument) into the module-level m0_weights list.
extract_weight()

In [18]:
# Number of tensors collected from the checkpoint.
len(m0_weights)

645

In [19]:
# Index the flattened checkpoint tensors by parameter name, echoing each
# name and shape along the way.
pangu_weights = {}
for param_name, tensor in m0_weights:
    print(param_name, tensor.shape)
    pangu_weights[param_name] = tensor

embedding.word_embeddings.weight torch.Size([40064, 5120])
embedding.position_embeddings.weight torch.Size([1024, 5120])
topQueryEmbedding.top_query_embeddings.weight torch.Size([1024, 5120])
transformer.layers.0.input_layernorm.weight torch.Size([5120])
transformer.layers.0.input_layernorm.bias torch.Size([5120])
transformer.layers.0.attention.query.weight torch.Size([5120, 5120])
transformer.layers.0.attention.query.bias torch.Size([5120])
transformer.layers.0.attention.key.weight torch.Size([5120, 5120])
transformer.layers.0.attention.key.bias torch.Size([5120])
transformer.layers.0.attention.value.weight torch.Size([5120, 5120])
transformer.layers.0.attention.value.bias torch.Size([5120])
transformer.layers.0.attention.dense.weight torch.Size([5120, 5120])
transformer.layers.0.attention.dense.bias torch.Size([5120])
transformer.layers.0.post_attention_layernorm.weight torch.Size([5120])
transformer.layers.0.post_attention_layernorm.bias torch.Size([5120])
transformer.layers.0.mlp.d

In [12]:
# Build the TF2 model. vocab_size, embedding_size and block_size equal the
# embedding shapes printed from the checkpoint above (40064 x 5120 word
# embeddings, 1024 x 5120 position embeddings). All dropout rates are 0.0.
# layer_size=40 presumably covers layers 0..38 plus the checkpoint's
# topQueryLayer, which extract_weight renames to layers.39 - TODO confirm.
gpt = GPT(
    vocab_size=40_064,
    layer_size=40,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=5120,
    num_attention_heads=40,
    attention_dropout=0.0,
    residual_dropout=0.0)

In [13]:
# Run one forward pass on a single dummy token and print the output shape.
# Presumably this also creates the model variables so gpt.weights is
# populated for the cells below - TODO confirm.
print(gpt(tf.constant([[1]])).shape)

(1, 1, 40064)


In [14]:
# List the model variables, showing per-layer variables for layer 00 only.
for x in gpt.weights:
    belongs_to_layer = 'gpt/layer' in x.name
    if belongs_to_layer and 'gpt/layer00' not in x.name:
        # Per-layer variables of layers other than 00 are not printed.
        continue
    print(x.name, x.shape)

gpt/embedding/embeddings:0 (40064, 5120)
position_embeddings:0 (1024, 5120)
top_query:0 (1024, 5120)
gpt/layer00/attention/query_layer/kernel:0 (5120, 5120)
gpt/layer00/attention/query_layer/bias:0 (5120,)
gpt/layer00/attention/key_layer/kernel:0 (5120, 5120)
gpt/layer00/attention/key_layer/bias:0 (5120,)
gpt/layer00/attention/value_layer/kernel:0 (5120, 5120)
gpt/layer00/attention/value_layer/bias:0 (5120,)
gpt/layer00/attention/context_projection_layer/kernel:0 (5120, 5120)
gpt/layer00/attention/context_projection_layer/bias:0 (5120,)
gpt/layer00/LayerNorm_mlp_ln0/gamma:0 (5120,)
gpt/layer00/LayerNorm_mlp_ln0/beta:0 (5120,)
gpt/layer00/LayerNorm_mlp_ln1/gamma:0 (5120,)
gpt/layer00/LayerNorm_mlp_ln1/beta:0 (5120,)
gpt/layer00/intermediate/kernel:0 (5120, 20480)
gpt/layer00/intermediate/bias:0 (20480,)
gpt/layer00/output/kernel:0 (20480, 5120)
gpt/layer00/output/bias:0 (5120,)
gpt/LayerNorm_final_norm/gamma:0 (5120,)
gpt/LayerNorm_final_norm/beta:0 (5120,)


In [20]:
# Map every TF variable in `gpt.weights` to its PanGu checkpoint tensor.
#
# Each rule is (substring of the TF variable name,
#               PanGu parameter name or per-layer suffix,
#               whether the checkpoint tensor must be transposed).
# Rules are tried in order and the first substring match wins.

# Variables that are matched before the per-layer check.
_EMBEDDING_RULES = (
    ('gpt/embedding/embeddings:', 'embedding.word_embeddings.weight', False),
    ('position_embeddings', 'embedding.position_embeddings.weight', False),
    ('top_query', 'topQueryEmbedding.top_query_embeddings.weight', False),
)

# Variables inside 'gpt/layerNN/...'; the PanGu name is
# 'transformer.layers.<N>.<suffix>'. Dense kernels are transposed; the shape
# check in the loop below confirms the orientation for the non-square MLP kernels.
_LAYER_RULES = (
    ('query_layer/kernel', 'attention.query.weight', True),
    ('key_layer/kernel', 'attention.key.weight', True),
    ('value_layer/kernel', 'attention.value.weight', True),
    ('query_layer/bias', 'attention.query.bias', False),
    ('key_layer/bias', 'attention.key.bias', False),
    ('value_layer/bias', 'attention.value.bias', False),
    ('attention/context_projection_layer/kernel', 'attention.dense.weight', True),
    ('attention/context_projection_layer/bias', 'attention.dense.bias', False),
    ('LayerNorm_mlp_ln0/gamma', 'input_layernorm.weight', False),
    ('LayerNorm_mlp_ln1/gamma', 'post_attention_layernorm.weight', False),
    ('LayerNorm_mlp_ln0/beta', 'input_layernorm.bias', False),
    ('LayerNorm_mlp_ln1/beta', 'post_attention_layernorm.bias', False),
    ('intermediate/kernel', 'mlp.dense_h_to_4h.weight', True),
    ('intermediate/bias', 'mlp.dense_h_to_4h.bias', False),
    ('/output/kernel', 'mlp.dense_4h_to_h.weight', True),
    ('/output/bias', 'mlp.dense_4h_to_h.bias', False),
)

# Variables that are matched only when the name is not a per-layer variable.
_FINAL_NORM_RULES = (
    ('gpt/LayerNorm_final_norm/gamma', 'transformer.final_layernorm.weight', False),
    ('gpt/LayerNorm_final_norm/beta', 'transformer.final_layernorm.bias', False),
)


def _first_rule(tf_name, rules):
    """Return (pangu_name, transposed) of the first rule whose substring is in `tf_name`, else None."""
    for needle, pangu_name, transposed in rules:
        if needle in tf_name:
            return pangu_name, transposed
    return None


def _pangu_source(tf_name):
    """Resolve a TF variable name to `(PanGu parameter name, transposed)`.

    Raises:
        ValueError: if no rule covers `tf_name`.
    """
    hit = _first_rule(tf_name, _EMBEDDING_RULES)
    if hit is not None:
        return hit

    if 'gpt/layer' in tf_name:
        # Read the layer index with a regex so any number of digits works.
        layer = re.search(r'gpt/layer(\d+)', tf_name)
        hit = _first_rule(tf_name, _LAYER_RULES) if layer else None
        if hit is not None:
            suffix, transposed = hit
            return f'transformer.layers.{int(layer.group(1))}.{suffix}', transposed
    else:
        hit = _first_rule(tf_name, _FINAL_NORM_RULES)
        if hit is not None:
            return hit

    # Fail loudly instead of printing and leaving a partial list behind.
    raise ValueError(f'BAD: no PanGu parameter mapped to TF variable {tf_name!r}')


# One (tf name, tf shape as tuple, pangu name, tensor) record per TF variable,
# in the same order as gpt.weights so the list can feed gpt.set_weights().
new_weights = []

for x in gpt.weights:
    xs = tuple(x.shape)
    pname, transposed = _pangu_source(x.name)
    w = pangu_weights[pname]
    if transposed:
        w = np.transpose(w)
    # Every tensor, embeddings included, is checked against the TF variable's shape.
    assert xs == tuple(w.shape), (x.name, xs, pname, tuple(w.shape))
    new_weights.append((x.name, xs, pname, w))

In [21]:
# Every TF variable must have exactly one source tensor of matching shape.
assert len(new_weights) == len(gpt.weights)
for _tf_name, tf_shape, _pangu_name, tensor in new_weights:
    assert tuple(tf_shape) == tensor.shape

In [22]:
# Number of variables in the TF model; the assert above requires it to equal len(new_weights).
len(gpt.weights)

645

In [23]:
# Copy the mapped tensors (last element of each record) into the TF model,
# relying on new_weights being in the same order as gpt.weights.
gpt.set_weights([x[-1] for x in new_weights])

In [24]:
# Build the tokenizer from the vocab/model files of the PanGu-Alpha-GPU checkout.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from tokenization_jieba import JIEBATokenizer
cbpe = JIEBATokenizer(
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.vocab',
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.model')

In [25]:
# Tokenizer vocabulary size, for comparison with the model's vocab_size=40_064.
cbpe.vocab_size

40000

In [26]:
# Greedy decoding smoke test: ten times, feed the whole sequence to the model,
# append the arg-max token of the last position, and print the decoded text.
ids = cbpe.encode('青椒肉丝的做法：')

for step in range(10):
    logits = gpt(tf.constant([ids]))
    next_id = int(np.argmax(logits[0, -1]))
    ids += [next_id]
    print(step, cbpe.decode(ids))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.513 seconds.
Prefix dict has been built successfully.


0 青椒肉丝的做法:
1 青椒肉丝的做法: 
2 青椒肉丝的做法: 1
3 青椒肉丝的做法: 1.
4 青椒肉丝的做法: 1.青
5 青椒肉丝的做法: 1.青椒
6 青椒肉丝的做法: 1.青椒洗净
7 青椒肉丝的做法: 1.青椒洗净去
8 青椒肉丝的做法: 1.青椒洗净去籽
9 青椒肉丝的做法: 1.青椒洗净去籽,


In [28]:
@tf.function
def batch_gather(a, b):
    """Gather from `a` at indices `b`, treating the leading axis of both as a batch axis."""
    gathered = tf.gather(params=a, indices=b, batch_dims=1)
    return gathered


@tf.function
def top_k_top_p_sample(logits, num_samples=1, top_k=0, p=0.95):
    batch_size, vocab_size = logits.shape
    probs = tf.nn.softmax(logits, axis=-1)
    
    # [batch_size, vocab_perm]
    indices = tf.argsort(probs, direction='DESCENDING')
    logits_to_use = batch_gather(logits, indices)
    cumulative_probabilities = tf.math.cumsum(batch_gather(probs, indices), axis=-1, exclusive=False)

    # find the top pth index to cut off. careful we don't want to cutoff everything!
    # result will be [batch_size, vocab_perm]
    if p > 0.0:
        exclude_mask = tf.logical_not(
            tf.logical_or(cumulative_probabilities < p, tf.range(vocab_size)[None] < 1))
        # OPTION A - sample in the sorted space, then unsort.
        logits_to_use = logits_to_use - tf.cast(exclude_mask, tf.float32) * 1e10
    
    if top_k > 0:
        logits_to_use = logits_to_use - tf.cast(
            tf.argsort(logits_to_use, direction='DESCENDING') >= top_k,
            dtype=tf.float32
        ) * 1e10
    
    sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
    sample = batch_gather(indices, sample_perm)

    return tf.cast(sample, tf.int64)

@tf.function
def serve(inputs):
    """Call `gpt` on `inputs` with caching enabled and no previous key/value cache."""
    outputs = gpt(inputs, use_cache=True, kv_cache=None)
    return outputs


@tf.function
def serve_cache(inputs, kv_cache):
    """Call `gpt` on `inputs` with caching enabled, passing `kv_cache` as the previous cache."""
    outputs = gpt(inputs, use_cache=True, kv_cache=kv_cache)
    return outputs

# Trace `serve` for int64 token ids with two unknown dimensions
# (presumably batch and sequence - TODO confirm).
serve_concrete = serve.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp")
)

# Model dimensions used to describe the key/value cache shape.
# They repeat the values passed to GPT(...) and the locals inside sample().
layer_size = 40
attention_head = 40
embedding_size = 5120

# Trace `serve_cache` for the same token spec plus a float32 cache of shape
# [layer_size, ?, 2, attention_head, ?, embedding_size // attention_head].
serve_cache_concrete = serve_cache.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp"),
    tf.TensorSpec(shape=[
        layer_size, None, 2, attention_head,
        None, embedding_size // attention_head
    ], dtype=tf.float32, name="kv_cache")
)

@tf.function
def sample(initial_inputs, length, top_k, top_p, temperature):
    """Generate tokens after `initial_inputs` by repeated filtered sampling.

    One token is sampled from the model output for the prompt, then the loop
    body runs while its counter is below `length`, sampling one more token
    per iteration from the previous token plus the accumulated cache.

    Args:
        initial_inputs: int64 token ids of the prompt.
        length: loop bound compared against an int64 counter starting at 0.
        top_k: forwarded to `top_k_top_p_sample`.
        top_p: forwarded to `top_k_top_p_sample` as `p`.
        temperature: when > 0, the last-position logits are divided by it.

    Returns:
        `initial_inputs` with every sampled token concatenated after it.
    """
    # Cache dimensions, needed for the loop's shape invariants.
    n_layers = 40
    hidden_size = 5120
    n_heads = 40
    head_size = hidden_size // n_heads

    counter = tf.constant(0, dtype=tf.int64)

    # First token: run the whole prompt without a cache.
    prompt_logits, kv_cache = serve(initial_inputs)
    prompt_last = prompt_logits[:, -1, :]
    if temperature > 0.0:
        prompt_last /= temperature
    first_tokens = top_k_top_p_sample(prompt_last, 1, top_k, top_p)
    generated = tf.concat([initial_inputs, first_tokens], axis=1)

    def _keep_going(step, tokens, cache, history):
        return step < length

    def _decode_one(step, tokens, cache, history):
        # Feed only the latest tokens; earlier context comes from `cache`.
        step_logits, fresh_cache = serve_cache(tokens, cache)
        step_last = step_logits[:, -1, :]
        if temperature > 0.0:
            step_last /= temperature
        next_tokens = top_k_top_p_sample(step_last, 1, top_k, top_p)
        longer_history = tf.concat([history, next_tokens], axis=-1)
        # The cache grows along its second-to-last axis on every step.
        longer_cache = tf.concat([cache, fresh_cache], axis=-2)
        return [step + 1, next_tokens, longer_cache, longer_history]

    # The cache and the token history change size per iteration, so their
    # growing dimensions are declared unknown.
    cache_invariant = tf.TensorShape([
        n_layers, None, 2,
        n_heads, None,
        head_size
    ])
    _, _, _, final_history = tf.while_loop(
        _keep_going, _decode_one,
        loop_vars=[counter, first_tokens, kv_cache, generated],
        shape_invariants=[
            tf.TensorShape(None),
            tf.TensorShape([None, None]),
            cache_invariant,
            tf.TensorShape([None, None]),
        ]
    )
    return final_history

In [29]:
# Sample a continuation of the prompt and print both the ids and the decoded text.
ids = cbpe.encode('今天天气不错')

prompt_ids = tf.constant([ids], dtype=tf.int64)
gen_length = tf.constant(15, dtype=tf.int64)
gen_top_k = tf.constant(15, dtype=tf.int32)
gen_top_p = tf.constant(0.95, dtype=tf.float32)
gen_temperature = tf.constant(0.9, dtype=tf.float32)

ret = sample(prompt_ids, gen_length, gen_top_k, gen_top_p, gen_temperature)
print(ret)
# Decode the first (and only) row of the batch.
print(cbpe.decode(ret.numpy()[0].tolist()))

tf.Tensor(
[[  465   235   464  1123    12  1896   256 25468    12    27    23    68
   1710   156   256 25468  1539    13     3  8703]], shape=(1, 20), dtype=int64)
今天天气不错。晚上吃火锅。你有多久没吃火锅啦
好久


In [30]:
# Export the model without optimizer state, with `sample` as the
# 'serving_default' signature.
# NOTE(review): length/top_k/top_p/temperature are declared rank-1 ([None])
# here, while the demo cell above passes scalars. Confirm that clients send
# one-element vectors and that `i < length` and `temperature > 0.0` inside
# `sample` behave as intended for that shape.
gpt.save('./pangu-13B-tf2', include_optimizer=False, signatures={
    'serving_default': sample.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp"),
        tf.TensorSpec(shape=[None,], dtype=tf.int64, name="length"),
        tf.TensorSpec(shape=[None,], dtype=tf.int32, name="top_k"),
        tf.TensorSpec(shape=[None,], dtype=tf.float32, name="top_p"),
        tf.TensorSpec(shape=[None,], dtype=tf.float32, name="temperature")
    )
})





INFO:tensorflow:Assets written to: ./pangu-13B-tf2/assets


INFO:tensorflow:Assets written to: ./pangu-13B-tf2/assets
