In [3]:
import re

import torch
import numpy as np
import tensorflow as tf
import onnx
import onnxruntime
from onnxruntime.quantization import quantize_dynamic, QuantType

from tf2gpt.model import GPT

In [4]:
# Record the versions of the three libraries this conversion was run with.
for lib in (tf, onnx, onnxruntime):
    print(lib.__version__)

2.9.0
1.9.0
1.11.1


In [11]:
# Download pangu-alpha-evolution_2.6B_fp16.zip from
# https://git.openi.org.cn/PCL-Platform.Intelligence/PanGu-Alpha-Evolution
# and unzip it in the parent directory (..). The md5 of the archive is printed
# here so the download can be compared against a known-good value.
!md5sum ../pangu-alpha-evolution_2.6B_fp16.zip

e5c8cbb713fd916b12cbed7fb94a1242  ../pangu-alpha-evolution_2.6B_fp16.zip


In [12]:
# Show the on-disk size of the unzipped checkpoint file that is loaded below.
!du -sh ../pangu-alpha-evolution_2.6b_fp16/iter_0055000/mp_rank_00/model_optim_rng.pt

4.9G	../pangu-alpha-evolution_2.6b_fp16/iter_0055000/mp_rank_00/model_optim_rng.pt


In [13]:
# Load the whole checkpoint onto the CPU; its 'model' entry is flattened below.
# NOTE(review): torch.load unpickles the file, so only use a checkpoint from a
# trusted source (the archive's md5 is printed in an earlier cell for comparison).
m0 = torch.load(
    '../pangu-alpha-evolution_2.6b_fp16/iter_0055000/mp_rank_00/model_optim_rng.pt',
    map_location='cpu'
)

In [14]:
# Flat list of (name, tensor) pairs filled in by extract_weight().
m0_weights = []


def extract_weight(w=None, root=''):
    """Flatten a nested checkpoint dict into the module-level ``m0_weights``.

    Walks ``w`` recursively and appends one ``(name, tensor)`` pair for every
    ``torch.Tensor`` leaf. ``name`` is the '.'-joined path of dict keys with
    the ``language_model`` level removed and ``topQueryLayer`` renamed to
    ``layers.31``. (31 is presumably the index of the last of the 32 layers
    the TF model is built with -- confirm if the layer count changes.)

    Args:
        w: nested dict to walk. When omitted, ``m0['model']`` is used; it is
            looked up at call time, so a re-loaded ``m0`` is picked up and the
            function can be defined before ``m0`` exists.
        root: dotted path of the keys already descended through. Callers
            normally leave this at ''.

    Returns:
        None. Results accumulate in ``m0_weights``; calling this twice without
        resetting the list first duplicates every entry.
    """
    if w is None:
        w = m0['model']
    for k, v in w.items():
        path = root + '.' + k
        if isinstance(v, dict):
            extract_weight(v, path)
        elif isinstance(v, torch.Tensor):
            # Every path starts with '.', so the top-level key shows up as
            # '.language_model.' and is stripped together with both dots.
            name = path.replace('.language_model.', '')
            name = name.replace('.topQueryLayer.', '.layers.31.')
            m0_weights.append((name, v))
        else:
            # Neither a sub-dict nor a tensor: report where it was found.
            print('what?', path, type(v))

In [15]:
# Flatten m0['model'] into m0_weights. The function appends to that list, so
# re-running this cell without re-running the previous one duplicates entries.
extract_weight()

In [16]:
# Number of (name, tensor) pairs collected from the checkpoint.
len(m0_weights)

517

In [17]:
# Index the flattened checkpoint tensors by name for lookup during conversion,
# and list every name with its shape.
pangu_weights = dict(m0_weights)
for name, tensor in m0_weights:
    print(name, tensor.shape)

embedding.word_embeddings.weight torch.Size([40000, 2560])
embedding.position_embeddings.weight torch.Size([1024, 2560])
transformer.layers.0.input_layernorm.weight torch.Size([2560])
transformer.layers.0.input_layernorm.bias torch.Size([2560])
transformer.layers.0.attention.query.weight torch.Size([2560, 2560])
transformer.layers.0.attention.query.bias torch.Size([2560])
transformer.layers.0.attention.key.weight torch.Size([2560, 2560])
transformer.layers.0.attention.key.bias torch.Size([2560])
transformer.layers.0.attention.value.weight torch.Size([2560, 2560])
transformer.layers.0.attention.value.bias torch.Size([2560])
transformer.layers.0.attention.dense.weight torch.Size([2560, 2560])
transformer.layers.0.attention.dense.bias torch.Size([2560])
transformer.layers.0.post_attention_layernorm.weight torch.Size([2560])
transformer.layers.0.post_attention_layernorm.bias torch.Size([2560])
transformer.layers.0.mlp.dense_h_to_4h.weight torch.Size([10240, 2560])
transformer.layers.0.mlp.

In [18]:
# Build the TF2 GPT with dimensions matching the checkpoint tensors printed above.
gpt = GPT(
    vocab_size=40_000,  # rows of embedding.word_embeddings.weight
    layer_size=32,
    block_size=1024,  # rows of embedding.position_embeddings.weight
    embedding_dropout=0.0,  # all dropout rates are set to 0.0
    embedding_size=2560,  # width of the checkpoint's embedding tensors
    num_attention_heads=32,
    attention_dropout=0.0,
    residual_dropout=0.0,
    use_cache=True
)

In [19]:
# Push a single dummy token through the model and show the shape of its first
# output (the logits in later cells).
# NOTE(review): presumably this first call is also what creates the variables
# listed in the next cell -- confirm.
print(gpt(tf.constant([[1]]))[0].shape)

(1, 1, 40000)


In [20]:
# List the model's variables, showing only layer00 as the representative of
# the per-layer variables.
for var in gpt.weights:
    in_some_layer = 'gpt/layer' in var.name
    if not in_some_layer or 'gpt/layer00' in var.name:
        print(var.name, var.shape)

gpt/embedding/embeddings:0 (40000, 2560)
position_embeddings:0 (1024, 2560)
top_query:0 (1024, 2560)
gpt/layer00/attention/query_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/query_layer/bias:0 (2560,)
gpt/layer00/attention/key_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/key_layer/bias:0 (2560,)
gpt/layer00/attention/value_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/value_layer/bias:0 (2560,)
gpt/layer00/attention/context_projection_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/context_projection_layer/bias:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/beta:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/beta:0 (2560,)
gpt/layer00/intermediate/kernel:0 (2560, 10240)
gpt/layer00/intermediate/bias:0 (10240,)
gpt/layer00/output/kernel:0 (10240, 2560)
gpt/layer00/output/bias:0 (2560,)
gpt/LayerNorm_final_norm/gamma:0 (2560,)
gpt/LayerNorm_final_norm/beta:0 (2560,)


In [21]:
# Pair every TF variable of `gpt` with the PanGu checkpoint tensor that fills it.
# new_weights follows the order of gpt.weights; each entry is
# (tf_variable_name, tf_shape_tuple, checkpoint_key, tensor).

# Rules are (substring of the TF variable name, checkpoint key, transpose?).
# These cover variables outside the transformer layers and are checked first.
_EMBEDDING_RULES = (
    ('gpt/embedding/embeddings:', 'embedding.word_embeddings.weight', False),
    ('position_embeddings', 'embedding.position_embeddings.weight', False),
    ('top_query', 'task_embedding.top_query_embeddings.weight', False),
)

# Rules for variables under 'gpt/layerNN/'; the checkpoint key is
# 'transformer.layers.<N>.' + suffix. Dense kernels are transposed because the
# checkpoint shape is the reverse of the TF kernel shape (compare
# mlp.dense_h_to_4h.weight and intermediate/kernel in the listings above).
_LAYER_RULES = (
    ('query_layer/kernel', 'attention.query.weight', True),
    ('key_layer/kernel', 'attention.key.weight', True),
    ('value_layer/kernel', 'attention.value.weight', True),
    ('query_layer/bias', 'attention.query.bias', False),
    ('key_layer/bias', 'attention.key.bias', False),
    ('value_layer/bias', 'attention.value.bias', False),
    ('attention/context_projection_layer/kernel', 'attention.dense.weight', True),
    ('attention/context_projection_layer/bias', 'attention.dense.bias', False),
    ('LayerNorm_mlp_ln0/gamma', 'input_layernorm.weight', False),
    ('LayerNorm_mlp_ln1/gamma', 'post_attention_layernorm.weight', False),
    ('LayerNorm_mlp_ln0/beta', 'input_layernorm.bias', False),
    ('LayerNorm_mlp_ln1/beta', 'post_attention_layernorm.bias', False),
    ('intermediate/kernel', 'mlp.dense_h_to_4h.weight', True),
    ('intermediate/bias', 'mlp.dense_h_to_4h.bias', False),
    ('/output/kernel', 'mlp.dense_4h_to_h.weight', True),
    ('/output/bias', 'mlp.dense_4h_to_h.bias', False),
)

# Rules for the final layer norm, checked only for non-layer variables.
_FINAL_NORM_RULES = (
    ('gpt/LayerNorm_final_norm/gamma', 'transformer.final_layernorm.weight', False),
    ('gpt/LayerNorm_final_norm/beta', 'transformer.final_layernorm.bias', False),
)


def _first_match(tf_name, rules, prefix=''):
    """Return (prefix + key, transpose) of the first rule whose marker is in tf_name, else None."""
    for marker, key, transpose in rules:
        if marker in tf_name:
            return prefix + key, transpose
    return None


def _pangu_source(tf_name):
    """Return ``(checkpoint_key, transpose)`` for a TF variable name.

    Raises:
        ValueError: if no rule covers ``tf_name``. The conversion must not
            continue with a partial weight list.
    """
    hit = _first_match(tf_name, _EMBEDDING_RULES)
    if hit is None and 'gpt/layer' in tf_name:
        # Read the full run of digits so layer indices of any width work.
        layer = re.search(r'gpt/layer(\d+)', tf_name)
        if layer is None:
            raise ValueError(f'BAD {tf_name}: cannot parse layer index')
        hit = _first_match(
            tf_name, _LAYER_RULES,
            prefix=f'transformer.layers.{int(layer.group(1))}.',
        )
    elif hit is None:
        hit = _first_match(tf_name, _FINAL_NORM_RULES)
    if hit is None:
        raise ValueError(f'BAD {tf_name}: no checkpoint tensor mapped to this variable')
    return hit


new_weights = []

for x in gpt.weights:
    xs = tuple(x.shape)
    pname, needs_transpose = _pangu_source(x.name)
    w = pangu_weights[pname]
    if needs_transpose:
        # NOTE(review): np.transpose is applied to the checkpoint tensor as-is;
        # whether the result is a torch.Tensor or an ndarray depends on
        # numpy/torch interop. Only .shape is relied on here.
        w = np.transpose(w)
    assert xs == tuple(w.shape), (
        f'{x.name}: TF shape {xs} != {tuple(w.shape)} from {pname}'
    )
    new_weights.append((x.name, xs, pname, w))

In [22]:
# Every model variable must have exactly one checkpoint tensor of its shape.
assert len(gpt.weights) == len(new_weights)
assert all(tuple(entry[1]) == entry[-1].shape for entry in new_weights)

In [23]:
# Number of variables in the TF model, for comparison with the checkpoint count.
len(gpt.weights)

517

In [24]:
# Copy the checkpoint tensors into the model. set_weights is positional, which
# works because new_weights was built by iterating gpt.weights in order.
gpt.set_weights([x[-1] for x in new_weights])

In [25]:
# Tokenizer from the local tokenization_jieba module; the vocab files are read
# from a PanGu-Alpha-GPU checkout relative to the working directory.
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so dependencies are visible in one place.
from tokenization_jieba import JIEBATokenizer
cbpe = JIEBATokenizer(
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.vocab',
    'PanGu-Alpha-GPU/panguAlpha_pytorch/megatron/tokenizer/bpe_4w_pcl/vocab.model')

In [26]:
# Should equal the vocab_size the model was built with (40_000).
cbpe.vocab_size

40000

In [27]:
# Greedy decoding without a cache: the whole sequence so far is fed to the
# model on every step and the arg-max token of the last position is appended.
ids = cbpe.encode('青椒肉丝的做法：')

for i in range(10):
    nid = np.argmax(gpt(tf.constant([ids]))[0][0, -1])
    ids += [int(nid)]
    print(i, cbpe.decode(ids))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.506 seconds.
Prefix dict has been built successfully.


0 青椒肉丝的做法:1
1 青椒肉丝的做法:1.
2 青椒肉丝的做法:1.青
3 青椒肉丝的做法:1.青椒
4 青椒肉丝的做法:1.青椒洗净
5 青椒肉丝的做法:1.青椒洗净切
6 青椒肉丝的做法:1.青椒洗净切丝
7 青椒肉丝的做法:1.青椒洗净切丝,
8 青椒肉丝的做法:1.青椒洗净切丝,肉
9 青椒肉丝的做法:1.青椒洗净切丝,肉丝


In [31]:
# Plain model combined with the kv-cache model: the prompt is run once without
# a cache, then each new token is run alone against the accumulated cache.
ids = cbpe.encode('青椒肉丝的做法：')

prompt = tf.constant([ids])
logits, kv_cache = gpt(prompt, use_cache=True)
nid = np.argmax(logits[0, -1])
print(cbpe.decode([int(nid)]))

for step in range(10):
    last_token = tf.constant([[nid]])
    logits, kv_cache_new = gpt(last_token, kv_cache=kv_cache, use_cache=True)
    # The model returns only the new step's entries; grow the cache along axis -2.
    kv_cache = tf.concat([kv_cache, kv_cache_new], axis=-2)
    nid = np.argmax(logits[0, -1])
    print(cbpe.decode([int(nid)]))

1
.
青
椒
洗净
切
丝
,
肉
丝
用


In [30]:
# Compute an initial kv_cache to use as a constant; per the author, 13 is the
# token id that was calculated to affect the model the least.
# np.save adds the '.npy' suffix, so this writes kv_cache.npy, which the
# kv-only cell loads.
_, kv_cache_start = gpt(tf.constant([[13]]), use_cache=True)
np.save('kv_cache', kv_cache_start.numpy())

In [33]:
# Use only the kv-cache model: even the prompt step receives a cache, namely
# the constant one saved to kv_cache.npy. `ids` is the prompt encoded in an
# earlier cell.
start_cache = np.load('kv_cache.npy')
logits, kv_cache = gpt(tf.constant([ids]), kv_cache=start_cache, use_cache=True)
nid = np.argmax(logits[0, -1])
print(cbpe.decode([int(nid)]))

for step in range(10):
    last_token = tf.constant([[nid]])
    logits, kv_cache_new = gpt(last_token, kv_cache=kv_cache, use_cache=True)
    # Append this step's entries to the running cache along axis -2.
    kv_cache = tf.concat([kv_cache, kv_cache_new], axis=-2)
    nid = np.argmax(logits[0, -1])
    print(cbpe.decode([int(nid)]))

1
.
青
椒
洗净
切
丝
,
肉
丝
用


In [34]:
@tf.function
def serve(inputs):
    """Call ``gpt`` on ``inputs`` with caching enabled and no incoming cache."""
    return gpt(inputs, use_cache=True, kv_cache=None)


@tf.function
def serve_cache(inputs, kv_cache):
    """Call ``gpt`` on ``inputs`` with caching enabled and the given ``kv_cache``."""
    return gpt(inputs, use_cache=True, kv_cache=kv_cache)


# Dimensions describing the kv_cache input; they repeat the values passed to
# GPT(...) when the model was built.
layer_size = 32
attention_head = 32
embedding_size = 2560

# Token ids: two dynamic axes of int64.
token_spec = tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="inp")
# kv_cache: six axes, two of them dynamic; the last is the per-head width.
cache_spec = tf.TensorSpec(
    shape=[
        layer_size, None, 2, attention_head,
        None, embedding_size // attention_head,
    ],
    dtype=tf.float32,
    name="kv_cache",
)

# Trace both entry points once with these signatures.
serve_concrete = serve.get_concrete_function(token_spec)
serve_cache_concrete = serve_cache.get_concrete_function(token_spec, cache_spec)

In [37]:
# gpt.save('./pangu-2.6B-tf2', include_optimizer=False, signatures={
#     'serving_default': serve.get_concrete_function(
#         tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids"),
#     )
# })

# !rm -rf onnx
# !mkdir -p onnx
# !python -m tf2onnx.convert \
#     --saved-model pangu-2.6B-tf2 \
#     --output onnx/pangu.zip --large_model --opset=13

# !cd onnx && unzip -q pangu.zip

In [35]:
# Export the model with the kv-cache entry point as its serving signature.
# The inputs are named "input_ids" and "kv_cache" in the exported signature.
kv_signature = serve_cache.get_concrete_function(
    tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids"),
    tf.TensorSpec(
        shape=[
            layer_size, None, 2, attention_head,
            None, embedding_size // attention_head,
        ],
        dtype=tf.float32,
        name="kv_cache",
    ),
)
gpt.save(
    './pangu-2.6B-tf2-kv',
    include_optimizer=False,
    signatures={'serving_default': kv_signature},
)





INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2-kv/assets


INFO:tensorflow:Assets written to: ./pangu-2.6B-tf2-kv/assets




In [40]:
# Convert the SavedModel to ONNX (opset 13) with tf2onnx into a fresh onnx_kv/
# directory. The result is written as a zip archive (pangu.zip), which the
# next cell unpacks.
!rm -rf onnx_kv
!mkdir -p onnx_kv
!python -m tf2onnx.convert \
    --saved-model pangu-2.6B-tf2-kv \
    --output onnx_kv/pangu.zip --large_model --opset=13

2022-05-25 02:02:48.268623: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-25 02:02:51.934274: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-05-25 02:02:51.934649: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-05-25 02:03:20,770 - INFO - Signatures found in model: [serving_default].
2022-05-25 02:03:20,773 - INFO - Output names: ['output_0', 'output_1']
2022-05-25 02:04:13,128 - INFO - Using tensorflow=2.9.0, onnx=1.9.0, tf2onnx=1.10.1/a37f29
2022-05-25 02:04:13,129 - INFO - Using opset <onnx, 13>
2022-05-25 02:05:44,811 - INFO - Computed 808 values for constant folding
2022-05-25 02:06:30,874 - INFO - folding node using t

2022-05-25 02:06:31,757 - INFO - folding node using tf type=Pack, name=StatefulPartitionedCall/gpt/layer11/attention/strided_slice_2/stack_1
2022-05-25 02:06:31,758 - INFO - folding node using tf type=Pack, name=StatefulPartitionedCall/gpt/layer11/attention/strided_slice_2/stack_2
2022-05-25 02:06:31,758 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer11/attention/context_projection_layer/Tensordot/concat
2022-05-25 02:06:31,758 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer11/intermediate/Tensordot/concat
2022-05-25 02:06:31,759 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer11/output/Tensordot/concat
2022-05-25 02:06:31,759 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer12/attention/key_layer/Tensordot/concat
2022-05-25 02:06:31,760 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer12/attention/valu

2022-05-25 02:06:31,809 - INFO - folding node using tf type=Pack, name=StatefulPartitionedCall/gpt/layer27/attention/strided_slice_2/stack_2
2022-05-25 02:06:31,809 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer27/attention/context_projection_layer/Tensordot/concat
2022-05-25 02:06:31,810 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer27/intermediate/Tensordot/concat
2022-05-25 02:06:31,810 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer27/output/Tensordot/concat
2022-05-25 02:06:31,811 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/attention/key_layer/Tensordot/concat
2022-05-25 02:06:31,811 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/attention/value_layer/Tensordot/concat
2022-05-25 02:06:31,812 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer28/atten

2022-05-25 02:06:31,874 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/position_embedding/ReadVariableOp
2022-05-25 02:06:31,894 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2022-05-25 02:06:31,894 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2022-05-25 02:06:31,895 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/key_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:32,072 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/key_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:32,073 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/value_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:32,148 - INFO - folding node using tf type=Identity, name=Stateful

2022-05-25 02:06:36,326 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/value_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:36,326 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/query_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:36,383 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/query_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:36,384 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/context_projection_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:36,444 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/attention/context_projection_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:36,444 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2022-05-25 02:06:36,445 - INFO -

2022-05-25 02:06:40,143 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/intermediate/BiasAdd/ReadVariableOp
2022-05-25 02:06:40,143 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/output/Tensordot/ReadVariableOp
2022-05-25 02:06:40,533 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer06/output/BiasAdd/ReadVariableOp
2022-05-25 02:06:40,534 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2022-05-25 02:06:40,535 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2022-05-25 02:06:40,536 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/key_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:40,578 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2022-05-25 02:06:44,620 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/key_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:44,627 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/value_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:44,771 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/value_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:44,796 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/query_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:44,924 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/query_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:44,924 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/attention/context_projection_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:45,058 - INFO - folding node u

2022-05-25 02:06:51,720 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/attention/context_projection_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:51,720 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2022-05-25 02:06:51,720 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2022-05-25 02:06:51,721 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/intermediate/Tensordot/ReadVariableOp
2022-05-25 02:06:52,024 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/intermediate/BiasAdd/ReadVariableOp
2022-05-25 02:06:52,025 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer13/output/Tensordot/ReadVariableOp
2022-05-25 02:06:52,307 - INFO - folding node using tf type=Identity, name=State

2022-05-25 02:06:55,526 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer16/output/BiasAdd/ReadVariableOp
2022-05-25 02:06:55,527 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2022-05-25 02:06:55,528 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2022-05-25 02:06:55,528 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/key_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:55,605 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/key_layer/BiasAdd/ReadVariableOp
2022-05-25 02:06:55,607 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/attention/value_layer/Tensordot/ReadVariableOp
2022-05-25 02:06:55,681 - INFO - folding node using tf type=Identity, name=Stat

2022-05-25 02:07:00,707 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/value_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:00,817 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/value_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:00,818 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/query_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:00,921 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/query_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:00,922 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/context_projection_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:01,085 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer20/attention/context_projection_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:01,086 - INFO -

2022-05-25 02:07:05,099 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/attention/context_projection_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:05,099 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2022-05-25 02:07:05,100 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2022-05-25 02:07:05,101 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/intermediate/Tensordot/ReadVariableOp
2022-05-25 02:07:05,432 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/intermediate/BiasAdd/ReadVariableOp
2022-05-25 02:07:05,433 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer23/output/Tensordot/ReadVariableOp
2022-05-25 02:07:05,914 - INFO - folding node using tf type=Identity, name=State

2022-05-25 02:07:09,953 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer26/output/BiasAdd/ReadVariableOp
2022-05-25 02:07:09,954 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2022-05-25 02:07:09,955 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2022-05-25 02:07:09,955 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/key_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:09,991 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/key_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:09,992 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer27/attention/value_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:10,029 - INFO - folding node using tf type=Identity, name=Stat

2022-05-25 02:07:12,738 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/attention/value_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:12,739 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/attention/query_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:12,784 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/attention/query_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:12,784 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/attention/context_projection_layer/Tensordot/ReadVariableOp
2022-05-25 02:07:12,826 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/attention/context_projection_layer/BiasAdd/ReadVariableOp
2022-05-25 02:07:12,826 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer30/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2022-05-25 02:07:12,826 - INFO -

In [41]:
# Unpack the archive in place; the quantization cell reads
# onnx_kv/__MODEL_PROTO.onnx from it.
!cd onnx_kv && unzip -q pangu.zip

In [42]:
# !rm -rf onnx_q && mkdir -p onnx_q
# quantized_model = quantize_dynamic(
#     './onnx/__MODEL_PROTO.onnx',
#     './onnx_q/pangu.onnx',
#     weight_type=QuantType.QUInt8,
#     use_external_data_format=True,
#     extra_options={
#         'DisableShapeInference': True,
#     }
# )

In [43]:
# Dynamically quantize the kv-cache ONNX model's weights to unsigned 8-bit and
# write the result to a fresh onnx_kv_q/ directory. External data format is
# enabled and shape inference is disabled; see the links for background:
# https://github.com/microsoft/onnxruntime/issues/7017#issuecomment-900716895
# https://github.com/microsoft/onnxruntime/blob/60089f7093e4e26f837be4bbf74d38cb97b43e4b/onnxruntime/python/tools/quantization/quantize.py#L189
# NOTE(review): the return value is stored in quantized_model but not used in
# the cells shown; the output of interest is the file at ./onnx_kv_q/pangu.onnx.
!rm -rf onnx_kv_q && mkdir -p onnx_kv_q
quantized_model = quantize_dynamic(
    './onnx_kv/__MODEL_PROTO.onnx',
    './onnx_kv_q/pangu.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True,
    extra_options={
        'DisableShapeInference': True,
    }
)

Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer00/attention/MatMul]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer00/attention/MatMul_1]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer01/attention/MatMul]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer01/attention/MatMul_1]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer02/attention/MatMul]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer02/attention/MatMul_1]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer03/attention/MatMul]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer03/attention/MatMul_1]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer04/attention/MatMul]
Ignore MatMul due to non constant B: /[StatefulPartitionedCall/gpt/layer04/attention/MatMul_1]
Ignore MatMul due to non constant B: /[StatefulPartitionedCa

In [44]:
# Remove the intermediate (unquantized) ONNX directories; onnx_kv_q/ is not
# deleted in the cells shown.
!rm -rf onnx
!rm -rf onnx_kv

In [45]:
# Remove the intermediate TensorFlow SavedModel written by gpt.save above.
!rm -rf pangu-2.6B-tf2-kv