In [1]:
import re

import torch
import numpy as np
import tensorflow as tf

from tf2gpt.model import GPT

In [2]:
# Show the on-disk size of the checkpoint file that is loaded in the next cell.
!du -sh ../Pangu-alpha_2.6B_mgt/iter_0001000/mp_rank_00/model_optim_rng.pt

4.9G	../Pangu-alpha_2.6B_mgt/iter_0001000/mp_rank_00/model_optim_rng.pt


In [3]:
# Load the complete PanGu-alpha checkpoint onto the CPU; the 'model' entry is
# what the weight-extraction code reads.
# NOTE(review): torch.load unpickles arbitrary Python objects -- only run this
# on a checkpoint obtained from a trusted source.
m0 = torch.load('../Pangu-alpha_2.6B_mgt/iter_0001000/mp_rank_00/model_optim_rng.pt', map_location='cpu')

In [4]:
# Flat list of (normalised_name, tensor) pairs filled in by extract_weight().
m0_weights = []


def extract_weight(w=None, root=''):
    """Flatten a nested checkpoint dict into the module-level ``m0_weights`` list.

    Nested dict keys are joined with '.' to form one dotted name per tensor.
    The name is then normalised: the '.language_model.' prefix is dropped and
    '.topQueryLayer.' is rewritten to '.layers.31.', so the top-query layer is
    addressed like an ordinary transformer layer with index 31.

    Args:
        w: mapping to walk. When omitted, ``m0['model']`` is looked up at call
            time (not at definition time, so a reloaded ``m0`` is honoured)
            and ``m0_weights`` is emptied first, which keeps a re-run of the
            calling cell from appending every tensor a second time.
        root: dotted prefix accumulated while recursing; callers normally
            leave it at the default.

    Returns:
        None. Results are appended to ``m0_weights``; values that are neither
        dicts nor tensors are reported on stdout and skipped.
    """
    if w is None:
        w = m0['model']
        # Fresh extraction of the default checkpoint: start from an empty list.
        del m0_weights[:]

    for k, v in w.items():
        path = root + '.' + k
        if isinstance(v, dict):
            extract_weight(v, path)
        elif isinstance(v, torch.Tensor):
            name = path.replace('.language_model.', '')
            name = name.replace('.topQueryLayer.', '.layers.31.')
            m0_weights.append((name, v))
        else:
            print('what?', type(v))

In [5]:
# Walk the default mapping (m0['model']) and fill the module-level m0_weights list.
extract_weight()

In [6]:
# Number of (name, tensor) pairs collected by extract_weight().
len(m0_weights)

517

In [7]:
# Echo every extracted tensor name with its shape, then index the tensors by
# name so the mapping code can look them up directly.
for name, tensor in m0_weights:
    print(name, tensor.shape)

pangu_weights = dict(m0_weights)

embedding.word_embeddings.weight torch.Size([40064, 2560])
embedding.position_embeddings.weight torch.Size([1024, 2560])
topQueryEmbedding.top_query_embeddings.weight torch.Size([1024, 2560])
transformer.layers.0.input_layernorm.weight torch.Size([2560])
transformer.layers.0.input_layernorm.bias torch.Size([2560])
transformer.layers.0.attention.query.weight torch.Size([2560, 2560])
transformer.layers.0.attention.query.bias torch.Size([2560])
transformer.layers.0.attention.key.weight torch.Size([2560, 2560])
transformer.layers.0.attention.key.bias torch.Size([2560])
transformer.layers.0.attention.value.weight torch.Size([2560, 2560])
transformer.layers.0.attention.value.bias torch.Size([2560])
transformer.layers.0.attention.dense.weight torch.Size([2560, 2560])
transformer.layers.0.attention.dense.bias torch.Size([2560])
transformer.layers.0.post_attention_layernorm.weight torch.Size([2560])
transformer.layers.0.post_attention_layernorm.bias torch.Size([2560])
transformer.layers.0.mlp.d

In [8]:
# Hyper-parameters for the TF2 model, chosen to line up with the checkpoint
# tensor shapes printed above.
gpt_config = dict(
    vocab_size=40_064,       # rows of embedding.word_embeddings.weight
    layer_size=32,           # the checkpoint's topQueryLayer is addressed as layer index 31
    block_size=1024,         # rows of embedding.position_embeddings.weight
    embedding_dropout=0.0,
    embedding_size=2560,     # width of every embedding / layer-norm tensor
    num_attention_heads=32,
    attention_dropout=0.0,
    residual_dropout=0.0,
    use_cache=True,
)
gpt = GPT(**gpt_config)

In [10]:
# Push a single dummy token through the model and print the shape of its first
# output. NOTE(review): presumably this call is also what creates the model's
# variables before they are listed and overwritten -- confirm against tf2gpt.
print(gpt(tf.constant([[1]]))[0].shape)

(1, 1, 40064)


In [11]:
# List the TF variables; per-layer variables are shown for layer 00 only so
# the repeated layers do not flood the output.
for var in gpt.weights:
    is_layer_var = 'gpt/layer' in var.name
    if not is_layer_var or 'gpt/layer00' in var.name:
        print(var.name, var.shape)

gpt/embedding/embeddings:0 (40064, 2560)
position_embeddings:0 (1024, 2560)
top_query:0 (1024, 2560)
gpt/layer00/attention/query_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/query_layer/bias:0 (2560,)
gpt/layer00/attention/key_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/key_layer/bias:0 (2560,)
gpt/layer00/attention/value_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/value_layer/bias:0 (2560,)
gpt/layer00/attention/context_projection_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/context_projection_layer/bias:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/beta:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/beta:0 (2560,)
gpt/layer00/intermediate/kernel:0 (2560, 10240)
gpt/layer00/intermediate/bias:0 (10240,)
gpt/layer00/output/kernel:0 (10240, 2560)
gpt/layer00/output/bias:0 (2560,)
gpt/LayerNorm_final_norm/gamma:0 (2560,)
gpt/LayerNorm_final_norm/beta:0 (2560,)


In [12]:
# Mapping tables: (substring of the TF variable name, PanGu checkpoint key).
# These variables are copied as-is, without a transpose.
_EMBEDDING_SOURCES = (
    ('gpt/embedding/embeddings:', 'embedding.word_embeddings.weight'),
    ('position_embeddings', 'embedding.position_embeddings.weight'),
    ('top_query', 'topQueryEmbedding.top_query_embeddings.weight'),
)

# Per-layer variables: (substring of the TF variable name, key suffix below
# 'transformer.layers.<n>.', whether the checkpoint tensor must be transposed).
# Dense kernels are transposed: the checkpoint stores them with the opposite
# axis order from the TF kernels (the shape check below would fail otherwise
# for the non-square MLP kernels).
_LAYER_SOURCES = (
    ('query_layer/kernel', 'attention.query.weight', True),
    ('key_layer/kernel', 'attention.key.weight', True),
    ('value_layer/kernel', 'attention.value.weight', True),
    ('query_layer/bias', 'attention.query.bias', False),
    ('key_layer/bias', 'attention.key.bias', False),
    ('value_layer/bias', 'attention.value.bias', False),
    ('attention/context_projection_layer/kernel', 'attention.dense.weight', True),
    ('attention/context_projection_layer/bias', 'attention.dense.bias', False),
    ('LayerNorm_mlp_ln0/gamma', 'input_layernorm.weight', False),
    ('LayerNorm_mlp_ln1/gamma', 'post_attention_layernorm.weight', False),
    ('LayerNorm_mlp_ln0/beta', 'input_layernorm.bias', False),
    ('LayerNorm_mlp_ln1/beta', 'post_attention_layernorm.bias', False),
    ('intermediate/kernel', 'mlp.dense_h_to_4h.weight', True),
    ('intermediate/bias', 'mlp.dense_h_to_4h.bias', False),
    ('/output/kernel', 'mlp.dense_4h_to_h.weight', True),
    ('/output/bias', 'mlp.dense_4h_to_h.bias', False),
)

_FINAL_NORM_SOURCES = (
    ('gpt/LayerNorm_final_norm/gamma', 'transformer.final_layernorm.weight'),
    ('gpt/LayerNorm_final_norm/beta', 'transformer.final_layernorm.bias'),
)


def _pangu_source(tf_name):
    """Return ``(checkpoint_key, needs_transpose)`` for a TF variable name.

    The lookup order is: embedding tables, per-layer variables (layer index
    parsed from 'gpt/layer<NN>'), then the final layer norm. Returns None when
    no rule matches the name.
    """
    for marker, key in _EMBEDDING_SOURCES:
        if marker in tf_name:
            return key, False

    layer = re.search(r'gpt/layer(\d+)', tf_name)
    if layer is not None:
        n_layer = int(layer.group(1))
        for marker, suffix, needs_transpose in _LAYER_SOURCES:
            if marker in tf_name:
                return f'transformer.layers.{n_layer}.{suffix}', needs_transpose
        return None

    for marker, key in _FINAL_NORM_SOURCES:
        if marker in tf_name:
            return key, False
    return None


# One (tf_name, tf_shape, pangu_name, tensor) entry per TF variable, in the
# same order as gpt.weights.
new_weights = []

for x in gpt.weights:
    xs = tuple(x.shape)

    source = _pangu_source(x.name)
    if source is None:
        # Fail loudly: a skipped variable would leave new_weights truncated.
        raise ValueError(f'no PanGu checkpoint tensor is mapped to TF variable {x.name} {xs}')

    pname, needs_transpose = source
    w = pangu_weights[pname]
    if needs_transpose:
        w = np.transpose(w)

    # Every tensor must match the shape of the variable it is going to fill.
    assert xs == tuple(w.shape), (x.name, xs, pname, tuple(w.shape))
    new_weights.append((x.name, xs, pname, w))

In [13]:
# Sanity checks before loading: one checkpoint tensor per TF variable, and
# each tensor's shape equals the recorded TF variable shape.
assert len(gpt.weights) == len(new_weights)
for _tf_name, tf_shape, _pangu_name, tensor in new_weights:
    assert tensor.shape == tuple(tf_shape)

In [14]:
# Number of TF variables in the model.
len(gpt.weights)

517

In [15]:
# Load the checkpoint tensors into the model. new_weights was built by
# iterating gpt.weights, so its entries are already in the variable order.
gpt.set_weights([x[-1] for x in new_weights])

In [16]:
# NOTE(review): this import sits mid-notebook rather than in the import cell;
# the tokenization_jieba module must be importable from the working directory
# or sys.path.
from tokenization_jieba import JIEBATokenizer
# Build the tokenizer from the vocab/model files; both paths are relative to
# the directory the notebook runs in.
cbpe = JIEBATokenizer(
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.vocab',
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.model')

In [17]:
# Tokenizer vocabulary size. The value shown below (40000) is smaller than the
# 40_064 rows of the model's word-embedding table.
cbpe.vocab_size

40000

In [19]:
# Greedy decoding smoke test: extend the prompt by ten arg-max tokens and
# print the decoded text after every step.
ids = cbpe.encode('青椒肉丝的做法：')

for step in range(10):
    # The full sequence is fed again on each step; only the scores at the
    # last position decide the next token.
    logits = gpt(tf.constant([ids]))[0]
    next_id = int(np.argmax(logits[0, -1]))
    ids += [next_id]
    print(step, cbpe.decode(ids))

0 青椒肉丝的做法:是
1 青椒肉丝的做法:是青
2 青椒肉丝的做法:是青椒
3 青椒肉丝的做法:是青椒洗净
4 青椒肉丝的做法:是青椒洗净切
5 青椒肉丝的做法:是青椒洗净切丝
6 青椒肉丝的做法:是青椒洗净切丝<eot>
7 青椒肉丝的做法:是青椒洗净切丝<eot>青
8 青椒肉丝的做法:是青椒洗净切丝<eot>青椒
9 青椒肉丝的做法:是青椒洗净切丝<eot>青椒肉


In [20]:
@tf.function
def batch_gather(a, b):
    """Gather entries of `a` at indices `b`, treating the first axis of both as the batch axis."""
    gathered = tf.gather(params=a, indices=b, batch_dims=1)
    return gathered


@tf.function
def top_k_top_p_sample(logits, num_samples=1, top_k=0, p=0.95):
    """Sample token ids from `logits` after optional nucleus (top-p) and top-k filtering.

    Filtering happens in "sorted space": the vocabulary columns are reordered
    from most to least probable, unwanted columns receive a large negative
    offset, a column is sampled, and the sampled column is finally mapped back
    to its original token id.

    Args:
        logits: float32 tensor of unnormalised scores, expected as
            [batch_size, vocab_size].
        num_samples: number of independent draws per row.
        top_k: when > 0, only the `top_k` highest-ranked columns stay eligible.
        p: when > 0.0, a column stays eligible while the cumulative
            probability up to and including it is below `p`; the single most
            probable column is always kept so a row is never emptied.

    Returns:
        int64 tensor of sampled token ids, [batch_size, num_samples].
    """
    # Read the vocabulary size dynamically so tracing with an unknown last
    # dimension still works.
    vocab_size = tf.shape(logits)[-1]
    probs = tf.nn.softmax(logits, axis=-1)

    # [batch_size, vocab_perm]: token ids ordered from most to least probable.
    indices = tf.argsort(probs, direction='DESCENDING')
    logits_to_use = batch_gather(logits, indices)
    cumulative_probabilities = tf.math.cumsum(batch_gather(probs, indices), axis=-1, exclusive=False)

    # Rank of each column in sorted space (0 = most probable), shape [1, vocab_perm].
    ranks = tf.range(vocab_size)[None]

    # find the top pth index to cut off. careful we don't want to cutoff everything!
    # result will be [batch_size, vocab_perm]
    if p > 0.0:
        exclude_mask = tf.logical_not(
            tf.logical_or(cumulative_probabilities < p, ranks < 1))
        # OPTION A - sample in the sorted space, then unsort.
        logits_to_use = logits_to_use - tf.cast(exclude_mask, tf.float32) * 1e10

    if top_k > 0:
        # The columns are already in descending order, so a column's position
        # is its rank; no second sort is needed to find the top_k entries.
        logits_to_use = logits_to_use - tf.cast(ranks >= top_k, dtype=tf.float32) * 1e10

    sample_perm = tf.random.categorical(logits=logits_to_use, num_samples=num_samples)
    # Map the sampled sorted-space columns back to token ids.
    sample = batch_gather(indices, sample_perm)

    return tf.cast(sample, tf.int64)

@tf.function
def serve(inputs):
    """Run the model on `inputs` with caching enabled and no incoming key/value cache."""
    outputs = gpt(inputs, use_cache=True, kv_cache=None)
    return outputs


@tf.function
def serve_cache(inputs, kv_cache):
    """Run the model on `inputs`, passing the caller-supplied `kv_cache` through to it."""
    outputs = gpt(inputs, use_cache=True, kv_cache=kv_cache)
    return outputs

# Model dimensions used to describe the kv-cache tensor; they repeat the
# values passed to the GPT constructor.
layer_size = 32
attention_head = 32
embedding_size = 2560

# Token ids: int64 with two unconstrained dimensions.
inp_spec = tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp")

# Rank-6 float32 cache with two unconstrained dimensions.
# NOTE(review): presumably (layer, batch, key/value, head, position, head_dim)
# -- confirm against tf2gpt.
kv_cache_spec = tf.TensorSpec(
    shape=[
        layer_size, None, 2, attention_head,
        None, embedding_size // attention_head
    ],
    dtype=tf.float32,
    name="kv_cache",
)

# Trace both serving functions once for these input signatures.
serve_concrete = serve.get_concrete_function(inp_spec)
serve_cache_concrete = serve_cache.get_concrete_function(inp_spec, kv_cache_spec)

In [21]:
# ids = cbpe.encode('今天天气不错')

# ret = sample(
#     tf.constant([ids], dtype=tf.int64),
#     tf.constant(15, dtype=tf.int64),
#     tf.constant(15, dtype=tf.int32),
#     tf.constant(0.95, dtype=tf.float32),
#     tf.constant(0.9, dtype=tf.float32)
# )
# print(ret)
# print(cbpe.decode(ret.numpy().tolist()[0]))

In [22]:
# Export a SavedModel whose default serving signature takes only token ids.
serving_signature = serve.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids"),
)
gpt.save(
    './pangu-2.6B-tf2',
    include_optimizer=False,
    signatures={'serving_default': serving_signature},
)





INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2/assets


INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2/assets


In [23]:
# Export a second SavedModel whose default serving signature takes token ids
# plus a key/value cache tensor.
kv_serving_signature = serve_cache.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids"),
    tf.TensorSpec(
        shape=[
            layer_size, None, 2, attention_head,
            None, embedding_size // attention_head
        ],
        dtype=tf.float32,
        name="kv_cache",
    ),
)
gpt.save(
    './pangu-2.6B-tf2-kv',
    include_optimizer=False,
    signatures={'serving_default': kv_serving_signature},
)

































































































































INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2-kv/assets


INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2-kv/assets


In [24]:
# Start from an empty onnx/ directory (any earlier conversion result is deleted).
!rm -rf onnx
!mkdir -p onnx
# Convert the ids-only SavedModel to ONNX (opset 13). With --large_model the
# result is written as a zip archive, which a later cell unpacks.
!python -m tf2onnx.convert \
    --saved-model pangu-2.6B-tf2 \
    --output onnx/pangu.zip --large_model --opset=13

2021-10-13 23:43:00.016579: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-13 23:43:00.016608: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-10-13 23:43:02.475426: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-13 23:43:02.475453: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-13 23:43:02.475478: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-10-13 23:43:02.475722: I tensorflow/core/platfor

2021-10-13 23:47:53,182 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/position_embedding/ReadVariableOp
2021-10-13 23:47:53,198 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 23:47:53,198 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 23:47:53,198 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:53,225 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:53,225 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:53,259 - INFO - folding node using tf type=Identity, name=Stateful

2021-10-13 23:47:55,141 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:55,141 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:55,176 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:55,177 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:55,219 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:55,220 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 23:47:55,220 - INFO -

2021-10-13 23:47:57,303 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/intermediate/BiasAdd/ReadVariableOp
2021-10-13 23:47:57,304 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/output/Tensordot/ReadVariableOp
2021-10-13 23:47:57,564 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/output/BiasAdd/ReadVariableOp
2021-10-13 23:47:57,564 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 23:47:57,565 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 23:47:57,565 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:57,595 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-13 23:47:59,379 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:59,379 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:59,414 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:59,414 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 23:47:59,471 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 23:47:59,473 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 23:47:59,473 - INFO -

2021-10-13 23:48:01,502 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/intermediate/BiasAdd/ReadVariableOp
2021-10-13 23:48:01,503 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/output/Tensordot/ReadVariableOp
2021-10-13 23:48:01,704 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/output/BiasAdd/ReadVariableOp
2021-10-13 23:48:01,704 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 23:48:01,704 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 23:48:01,704 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:01,741 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-13 23:48:03,334 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:03,334 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:03,362 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:03,362 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:03,390 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:03,390 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:03,416 - INFO - folding node u

2021-10-13 23:48:04,997 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:04,998 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 23:48:04,998 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-13 23:48:04,998 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/intermediate/Tensordot/ReadVariableOp
2021-10-13 23:48:05,192 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/intermediate/BiasAdd/ReadVariableOp
2021-10-13 23:48:05,193 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/output/Tensordot/ReadVariableOp
2021-10-13 23:48:05,387 - INFO - folding node using tf type=Identity, name=State

2021-10-13 23:48:06,946 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/output/BiasAdd/ReadVariableOp
2021-10-13 23:48:06,947 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 23:48:06,947 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 23:48:06,947 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:06,974 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:06,974 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:07,014 - INFO - folding node using tf type=Identity, name=Stat

2021-10-13 23:48:08,567 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:08,567 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:08,597 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:08,597 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 23:48:08,633 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 23:48:08,633 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 23:48:08,633 - INFO -

2021-10-13 23:48:10,172 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 23:48:10,172 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-13 23:48:10,172 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/intermediate/Tensordot/ReadVariableOp
2021-10-13 23:48:10,366 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/intermediate/BiasAdd/ReadVariableOp
2021-10-13 23:48:10,366 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/output/Tensordot/ReadVariableOp
2021-10-13 23:48:10,559 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/output/BiasAdd/ReadVariableOp
2021-10-13 23:48:10,559 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/Layer

In [28]:
# Unpack the converted model archive inside onnx/ (quiet mode).
!cd onnx && unzip -q pangu.zip

In [29]:
# Start from an empty onnx_kv/ directory (any earlier conversion result is deleted).
!rm -rf onnx_kv
!mkdir -p onnx_kv
# Convert the kv-cache SavedModel to ONNX (opset 13), again written as a zip
# archive because of --large_model.
!python -m tf2onnx.convert \
    --saved-model pangu-2.6B-tf2-kv \
    --output onnx_kv/pangu.zip --large_model --opset=13

2021-10-14 00:15:59.812010: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-14 00:15:59.812039: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-10-14 00:16:01.012691: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-14 00:16:01.012718: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-14 00:16:01.012742: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-10-14 00:16:01.012975: I tensorflow/core/platfor

2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/attention/query_layer/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/attention/context_projection_layer/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/intermediate/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/output/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer29/attention/key_layer/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer29/attention/value_layer/Tensordot/concat
2021-10-14 00:19:14,134 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/laye

2021-10-14 00:19:14,197 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:14,197 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/value_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:14,224 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:14,224 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/query_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:14,254 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:14,254 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:14,282 - INFO - folding node u

2021-10-14 00:19:15,822 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:15,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:15,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-14 00:19:15,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/intermediate/Tensordot/ReadVariableOp
2021-10-14 00:19:16,023 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/intermediate/BiasAdd/ReadVariableOp
2021-10-14 00:19:16,024 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/output/Tensordot/ReadVariableOp
2021-10-14 00:19:16,226 - INFO - folding node using tf type=Identity, name=State

2021-10-14 00:19:17,753 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/output/BiasAdd/ReadVariableOp
2021-10-14 00:19:17,753 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:17,753 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-14 00:19:17,753 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/key_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:17,779 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:17,779 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/value_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:17,807 - INFO - folding node using tf type=Identity, name=Stat

2021-10-14 00:19:19,307 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:19,307 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:19,341 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:19,341 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:19,342 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-14 00:19:19,342 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/intermediate/Tensordot/ReadVariableOp
2021-10-14 00:19:19,531 - INFO - folding no

2021-10-14 00:19:21,040 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/intermediate/BiasAdd/ReadVariableOp
2021-10-14 00:19:21,041 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/output/Tensordot/ReadVariableOp
2021-10-14 00:19:21,230 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/output/BiasAdd/ReadVariableOp
2021-10-14 00:19:21,230 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:21,230 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-14 00:19:21,230 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/key_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:21,267 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-14 00:19:22,770 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:22,770 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/value_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:22,797 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:22,797 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/query_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:22,831 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:22,831 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:22,859 - INFO - folding node u

2021-10-14 00:19:24,888 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:24,889 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:24,889 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-14 00:19:24,889 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/intermediate/Tensordot/ReadVariableOp
2021-10-14 00:19:25,181 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/intermediate/BiasAdd/ReadVariableOp
2021-10-14 00:19:25,181 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/output/Tensordot/ReadVariableOp
2021-10-14 00:19:25,477 - INFO - folding node using tf type=Identity, name=State

2021-10-14 00:19:27,683 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/output/BiasAdd/ReadVariableOp
2021-10-14 00:19:27,683 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:27,683 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-14 00:19:27,683 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/key_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:27,721 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:27,721 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/attention/value_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:27,760 - INFO - folding node using tf type=Identity, name=Stat

2021-10-14 00:19:29,938 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:29,939 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/query_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:29,978 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:29,979 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-14 00:19:30,011 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-14 00:19:30,011 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:30,011 - INFO -

2021-10-14 00:19:32,497 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/intermediate/BiasAdd/ReadVariableOp
2021-10-14 00:19:32,497 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/output/Tensordot/ReadVariableOp
2021-10-14 00:19:32,792 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/output/BiasAdd/ReadVariableOp
2021-10-14 00:19:32,792 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/LayerNorm_final_norm/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:32,792 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/LayerNorm_final_norm/batchnorm/ReadVariableOp
2021-10-14 00:19:32,792 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer31/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-14 00:19:32,793 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer3

In [30]:
# Extract pangu.zip inside onnx_kv/ (-q suppresses unzip's per-file listing).
# The `cd` only applies to this one shell invocation, so it has to stay chained
# with `&&` on the same line as the unzip.
# NOTE(review): presumably the archive contains the exported model that is read
# later as ./onnx_kv/__MODEL_PROTO.onnx — confirm.
!cd onnx_kv && unzip -q pangu.zip

In [31]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

In [32]:
# Start from an empty onnx_q/ directory: delete any previous copy, then recreate
# it. The quantization call that follows writes ./onnx_q/pangu.onnx into it.
!rm -rf onnx_q && mkdir -p onnx_q

In [33]:
# Dynamically quantize the model at ./onnx/__MODEL_PROTO.onnx and write the
# result to ./onnx_q/pangu.onnx, with weights quantized to unsigned 8-bit.
# use_external_data_format=True is passed through to the saver.
# NOTE(review): presumably needed because the model is larger than the 2 GB
# protobuf limit, so tensor data must live outside the .onnx file — confirm.
#
# The call is made purely for its side effect (the file on disk). Its result is
# intentionally not bound to a name: quantize_dynamic does not hand back a model
# object (it returns None in the onnxruntime implementation — verify against the
# installed version), so a `quantized_model` variable would only mislead.
quantize_dynamic(
    './onnx/__MODEL_PROTO.onnx',
    './onnx_q/pangu.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True,
)

In [34]:
# Start from an empty onnx_kv_q/ directory: delete any previous copy, then
# recreate it. The quantization call that follows writes
# ./onnx_kv_q/pangu.onnx into it.
!rm -rf onnx_kv_q && mkdir -p onnx_kv_q

In [35]:
# Dynamically quantize the model at ./onnx_kv/__MODEL_PROTO.onnx and write the
# result to ./onnx_kv_q/pangu.onnx, with weights quantized to unsigned 8-bit.
# use_external_data_format=True is passed through to the saver.
# NOTE(review): presumably needed because the model is larger than the 2 GB
# protobuf limit, so tensor data must live outside the .onnx file — confirm.
#
# The call is made purely for its side effect (the file on disk). Its result is
# intentionally not bound to a name: quantize_dynamic does not hand back a model
# object (it returns None in the onnxruntime implementation — verify against the
# installed version), so a `quantized_model` variable would only mislead.
quantize_dynamic(
    './onnx_kv/__MODEL_PROTO.onnx',
    './onnx_kv_q/pangu.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True,
)

In [36]:
# Delete the onnx/ and onnx_kv/ directories (the inputs of the two quantization
# calls) in a single rm invocation. -f means a missing directory is not an error.
!rm -rf onnx onnx_kv