In [1]:
from tqdm import tqdm

In [2]:
import struct
import numpy as np
from collections import OrderedDict


def load_dtype(fp):
    """Read a one-byte dtype tag from ``fp`` and return the matching NumPy type.

    Tag mapping: 0 -> ``np.int8``, 1 -> ``np.float16``, 2 -> ``np.float32``.

    Args:
        fp: binary file-like object positioned at the tag byte.

    Returns:
        The NumPy scalar type selected by the tag.

    Raises:
        TypeError: if the tag is not 0, 1 or 2.
    """
    (code,) = struct.unpack("B", fp.read(1))
    known = {0: np.int8, 1: np.float16, 2: np.float32}
    if code not in known:
        raise TypeError("Unknown dtype %d" % code)
    return known[code]

def load_string(fp):
    """Read a length-prefixed UTF-8 string from ``fp``.

    The prefix is one ``struct`` "I" field (unsigned int, native byte order)
    holding the payload length in bytes; the payload follows immediately.

    Args:
        fp: binary file-like object positioned at the length prefix.

    Returns:
        The decoded ``str``.
    """
    (n_bytes,) = struct.unpack("I", fp.read(4))
    return fp.read(n_bytes).decode("utf-8")

def load_tuple(fp):
    """Read a tuple of unsigned ints from ``fp``.

    Layout: one byte giving the number of elements, followed by that many
    ``struct`` "I" fields.

    Args:
        fp: binary file-like object positioned at the count byte.

    Returns:
        A tuple of ints (empty when the count byte is 0).
    """
    (count,) = struct.unpack("B", fp.read(1))
    return tuple(struct.unpack("I", fp.read(4))[0] for _ in range(count))

def load_parameter(fp):
    """Read one serialized parameter record from ``fp``.

    Fields are consumed in this order: shape tuple, payload byte count
    (one "I" field), dtype tag, then the raw payload bytes.

    Args:
        fp: binary file-like object positioned at the start of the record.

    Returns:
        ``(shape, value, dtype)`` where ``value`` is the undecoded payload
        as ``bytes``; converting it to an array is left to the caller.
    """
    dims = load_tuple(fp)
    (n_bytes,) = struct.unpack("I", fp.read(4))
    elem_type = load_dtype(fp)
    raw = fp.read(n_bytes)
    return dims, raw, elem_type

def load(fp, parent_name=''):
    """Recursively read a layer record and flatten it into a parameter list.

    A layer record starts with two "I" fields: the number of parameters it
    owns and the number of sub-layers. The named parameters follow, then each
    sub-layer as a name plus a nested layer record.

    Args:
        fp: binary file-like object positioned at the start of a layer record.
        parent_name: dotted name of the enclosing layer. Child names are
            formed as ``parent_name + '.' + name``, so names produced from the
            default empty parent start with a dot.

    Returns:
        A list of ``(dotted_name, ndarray)`` pairs: this layer's own
        parameters first, then those of each sub-layer in stream order. Each
        array is created with ``np.frombuffer`` over the raw payload and
        reshaped to the stored shape.
    """
    n_params, n_children = struct.unpack("II", fp.read(8))
    collected = []

    # Parameters owned directly by this layer.
    for _ in range(n_params):
        key = parent_name + '.' + load_string(fp)
        dims, raw, elem_type = load_parameter(fp)
        collected.append((key, np.frombuffer(raw, elem_type).reshape(dims)))

    # Nested layers: the name is read first, then the nested record.
    for _ in range(n_children):
        child_name = parent_name + '.' + load_string(fp)
        collected.extend(load(fp, child_name))
    return collected

In [3]:
# Parse the whole checkpoint into a flat list of (dotted name, ndarray) pairs.
# NOTE(review): the path is relative to the notebook's working directory.
with open('../zhiyuan/cpm/checkpoint.pt', 'rb') as fp:
    parameters = load(fp)

In [4]:
# Name -> array lookup over the (name, array) pairs; a later duplicate name wins.
pindex = dict(parameters)

In [5]:
# Build the final name -> tensor mapping.
# Entries whose name contains '_scale' are dropped. Any tensor that has a
# companion '<name>_scale' entry is cast to float16 and multiplied by it
# (presumably undoing weight quantization — confirm against the checkpoint writer).
npara = {}
for name, value in tqdm(parameters):
    if '_scale' in name:
        continue
    scale = pindex.get(name + '_scale')
    if scale is not None:
        value = value.astype(np.float16) * scale
    npara[name] = value

100%|██████████| 516/516 [00:45<00:00, 11.39it/s]


In [6]:
# List every retained tensor name next to its shape.
for key, tensor in npara.items():
    print(key, tensor.shape)

.input_embedding.weight (30000, 2560)
.position_embedding.weight (1024, 2560)
.layers.0.layer_nrom_before_self_attn.weight (2560,)
.layers.0.layer_nrom_before_self_attn.bias (2560,)
.layers.0.self_attention.w_project_qkv (3, 2560, 2560)
.layers.0.self_attention.w_project_bias (3, 2560)
.layers.0.self_attention.w_out (2560, 2560)
.layers.0.self_attention.w_out_bias (2560,)
.layers.0.layer_nrom_before_ff.weight (2560,)
.layers.0.layer_nrom_before_ff.bias (2560,)
.layers.0.dense_gelu_dense.wi.weight (10240, 2560)
.layers.0.dense_gelu_dense.wi.weight_bias (10240,)
.layers.0.dense_gelu_dense.wo.weight (2560, 10240)
.layers.0.dense_gelu_dense.wo.weight_bias (2560,)
.layers.1.layer_nrom_before_self_attn.weight (2560,)
.layers.1.layer_nrom_before_self_attn.bias (2560,)
.layers.1.self_attention.w_project_qkv (3, 2560, 2560)
.layers.1.self_attention.w_project_bias (3, 2560)
.layers.1.self_attention.w_out (2560, 2560)
.layers.1.self_attention.w_out_bias (2560,)
.layers.1.layer_nrom_before_ff.weig

In [7]:
def get_layer(n, params=None):
    """Return the 16 weight tensors of transformer layer ``n`` as a flat list.

    Output order:
      0-5   slices 0, 1, 2 of ``self_attention.w_project_qkv`` (each
            transposed), each followed by the matching row of
            ``self_attention.w_project_bias``
      6-7   ``self_attention.w_out`` (transposed) and ``w_out_bias``
      8-9   ``layer_nrom_before_self_attn`` weight and bias
      10-11 ``layer_nrom_before_ff`` weight and bias
      12-13 ``dense_gelu_dense.wi.weight`` (transposed) and its bias
      14-15 ``dense_gelu_dense.wo.weight`` (transposed) and its bias

    Args:
        n: layer index used in the ``.layers.{n}.`` key prefix.
        params: mapping from dotted parameter name to array. Defaults to the
            notebook-level ``npara`` dict, so ``get_layer(n)`` behaves as before.

    Returns:
        A list of 16 arrays in the order above.

    Raises:
        KeyError: if any expected parameter name is absent from ``params``.
    """
    if params is None:
        params = npara
    prefix = f'.layers.{n}.'

    def tensor(suffix):
        # Direct keyed lookup; no need to scan the whole mapping per layer.
        return params[prefix + suffix]

    qkv = tensor('self_attention.w_project_qkv')
    qkv_bias = tensor('self_attention.w_project_bias')

    # NOTE(review): the tf2onnx log in this notebook lists key_layer, value_layer,
    # query_layer in that order — confirm that slices 0/1/2 below line up with the
    # variable order gpt.set_weights() expects.
    weights = [
        np.transpose(qkv[0]),
        qkv_bias[0],
        np.transpose(qkv[1]),
        qkv_bias[1],
        np.transpose(qkv[2]),
        qkv_bias[2],
        np.transpose(tensor('self_attention.w_out')),
        tensor('self_attention.w_out_bias'),
        tensor('layer_nrom_before_self_attn.weight'),
        tensor('layer_nrom_before_self_attn.bias'),
        tensor('layer_nrom_before_ff.weight'),
        tensor('layer_nrom_before_ff.bias'),
        np.transpose(tensor('dense_gelu_dense.wi.weight')),
        tensor('dense_gelu_dense.wi.weight_bias'),
        np.transpose(tensor('dense_gelu_dense.wo.weight')),
        tensor('dense_gelu_dense.wo.weight_bias'),
    ]

    return weights

In [8]:
# Flat weight list in the order handed to gpt.set_weights():
# the two embedding tables, 32 layers' tensors, then the final layer norm.
weights = [
    npara['.input_embedding.weight'],
    npara['.position_embedding.weight'],
]
for layer_index in range(32):
    weights.extend(get_layer(layer_index))
weights.extend([
    npara['.encoder_final_layer_nrom.weight'],
    npara['.encoder_final_layer_nrom.bias'],
])

In [9]:
# Sanity check on the tensor count: 2 embedding tables + 32 layers x 16 tensors
# + 2 final layer-norm tensors = 516.
len(weights)

516

In [11]:
import re

import torch
import numpy as np
import tensorflow as tf

from tf2gpt.model import GPT

from gpt2_tokenizer import GPT2Tokenizer
# Tokenizer built from the vocab, merges and chinese_vocab.model files under
# CPM-Generate/bpe_3w_new (paths are relative to the notebook's working directory).
cbpe = GPT2Tokenizer(
    'CPM-Generate/bpe_3w_new/vocab.json',
    'CPM-Generate/bpe_3w_new/merges.txt',
    model_file='CPM-Generate/bpe_3w_new/chinese_vocab.model')

In [58]:
# %%time
# ids = cbpe.encode('今天天气不错')

# kv_cache = None
# for i in range(20):
#     output = gpt(tf.constant([ids]), kv_cache)
#     output, new_kv_cache = output
#     if kv_cache is None:
#         kv_cache = new_kv_cache
#     else:
#         kv_cache = tf.concat([
#             kv_cache,
#             new_kv_cache
#         ], axis=-2)
    
#     nid = np.argmax(output[0, -1])
#     ids = [nid]
#     print(i, cbpe.decode(ids))

In [59]:
# %%time
# ids = cbpe.encode('今天天气不错')

# for i in range(20):
#     output = gpt(tf.constant([ids]))[0]

#     nid = np.argmax(output[0, -1])
#     ids += [nid]
#     print(i, cbpe.decode(ids))

In [60]:
# Build the model without a kv-cache input and export it as a SavedModel.
gpt_config = dict(
    vocab_size=30_000,
    layer_size=32,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=2560,
    num_attention_heads=32,
    attention_dropout=0.0,
    residual_dropout=0.0,
    use_cache=True,
)
gpt = GPT(**gpt_config)

# Single dynamic-shaped int64 input named "input_ids".
input_ids_spec = tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids")
gpt._set_inputs(input_ids_spec)

# One forward pass before loading weights (presumably to create the model's
# variables — confirm). `out` is kept: out[1] is reused further down.
out = gpt(tf.constant([[1]]))
gpt.set_weights(weights)
gpt.save('cpm_saved_model')



INFO:tensorflow:Assets written to: cpm_saved_model/assets


INFO:tensorflow:Assets written to: cpm_saved_model/assets


In [69]:
# Second element of the model output from the warm-up call; it is passed back in
# as the second positional argument of gpt(...) when the kv-cache variant is built.
out[1]

<tf.Tensor: shape=(32, 1, 2, 32, 1, 80), dtype=float32, numpy=
array([[[[[[ 3.22329879e-01,  4.90205526e-01,  3.10286999e-01, ...,
             3.39013398e-01,  2.28684947e-01, -6.23727083e-01]],

          [[ 1.67696393e+00,  8.82031769e-03, -5.94259560e-01, ...,
             6.17827654e-01,  7.00745225e-01,  8.16019654e-01]],

          [[-1.44008726e-01,  1.03477716e+00,  6.53821044e-03, ...,
            -9.18062627e-01, -5.55618048e-01, -1.20327711e+00]],

          ...,

          [[ 5.26708841e-01, -1.57522810e+00,  1.21947922e-01, ...,
            -6.22768939e-01,  1.04231811e+00, -1.65548638e-01]],

          [[ 8.74828935e-01,  1.49547327e+00, -7.50782013e-01, ...,
             2.77377510e+00,  1.05143869e+00, -1.11437336e-01]],

          [[-2.21316171e+00, -7.64421582e-01,  8.60675752e-01, ...,
             8.10929596e-01,  2.09386677e-01,  1.83064258e+00]]],


         [[[ 9.31456268e-01,  1.34661651e+00,  1.99885219e-01, ...,
            -7.19597816e-01, -2.24874735e+00,  

In [79]:
# Hyper-parameters shared by the model constructor and the kv-cache tensor shape.
embedding_size = 2560
attention_head = 32
layer_size = 32

# Rebuild the model, this time declaring a second input for the kv cache.
gpt = GPT(
    vocab_size=30_000,
    layer_size=layer_size,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=embedding_size,
    num_attention_heads=attention_head,
    attention_dropout=0.0,
    residual_dropout=0.0,
    use_cache=True
)

# The same two specs describe the Keras inputs and the exported serving signature.
input_ids_spec = tf.TensorSpec(shape=[None, None], dtype=tf.int64, name="input_ids")
kv_cache_spec = tf.TensorSpec(
    shape=[
        layer_size, None, 2, attention_head,
        None, embedding_size // attention_head
    ],
    dtype=tf.float32,
    name="kv_cache",
)
gpt._set_inputs(input_ids_spec, kv_cache_spec)

# Forward pass with the cache returned by the earlier warm-up call (out[1]),
# made before the checkpoint weights are loaded.
out2 = gpt(tf.constant([[1]]), out[1])
gpt.set_weights(weights)


@tf.function
def sample(input_ids, kv_cache):
    """Run the model on ``input_ids`` with ``kv_cache``; traced below for export."""
    return gpt(input_ids, kv_cache)


gpt.save(
    './cpm_saved_model_with_kv_cache',
    include_optimizer=False,
    signatures={
        'serving_default': sample.get_concrete_function(input_ids_spec, kv_cache_spec)
    },
)



INFO:tensorflow:Assets written to: ./cpm_saved_model_with_kv_cache/assets


INFO:tensorflow:Assets written to: ./cpm_saved_model_with_kv_cache/assets


In [62]:
# Start from an empty onnx/ directory, then convert the SavedModel written to
# 'cpm_saved_model'. With --large_model tf2onnx writes a zip archive (model proto
# plus externally stored tensors), hence the .zip output name.
!rm -rf onnx
!mkdir -p onnx
!python -m tf2onnx.convert --saved-model cpm_saved_model --output onnx/cpm.zip --large_model --opset=13

2021-10-13 16:35:10.305733: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-13 16:35:10.305761: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-10-13 16:35:11.733973: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-13 16:35:11.734000: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-13 16:35:11.734022: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-10-13 16:35:11.734267: I tensorflow/core/platfor

2021-10-13 16:38:59,029 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 16:38:59,029 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 16:38:59,065 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 16:38:59,065 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 16:38:59,101 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 16:38:59,101 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer00/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 16:38:59,

2021-10-13 16:39:01,545 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer03/intermediate/BiasAdd/ReadVariableOp
2021-10-13 16:39:01,545 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer03/output/Tensordot/ReadVariableOp
2021-10-13 16:39:01,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer03/output/BiasAdd/ReadVariableOp
2021-10-13 16:39:01,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer04/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:01,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer04/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 16:39:01,823 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer04/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:01,865 - INFO - folding node using tf type=Identity, name=StatefulPartit

2021-10-13 16:39:04,005 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:04,005 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:04,040 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:04,040 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:04,081 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:04,082 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer07/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:04,116 - INFO - fo

2021-10-13 16:39:06,526 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer10/intermediate/BiasAdd/ReadVariableOp
2021-10-13 16:39:06,527 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer10/output/Tensordot/ReadVariableOp
2021-10-13 16:39:06,807 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer10/output/BiasAdd/ReadVariableOp
2021-10-13 16:39:06,808 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer11/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:06,808 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer11/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 16:39:06,808 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer11/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:06,844 - INFO - folding node using tf type=Identity, name=StatefulPartit

2021-10-13 16:39:09,022 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:09,022 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:09,064 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:09,064 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:09,105 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:09,106 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer14/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:09,

2021-10-13 16:39:11,546 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer17/intermediate/BiasAdd/ReadVariableOp
2021-10-13 16:39:11,546 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer17/output/Tensordot/ReadVariableOp
2021-10-13 16:39:11,822 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer17/output/BiasAdd/ReadVariableOp
2021-10-13 16:39:11,822 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer18/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:11,822 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer18/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 16:39:11,822 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer18/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:11,859 - INFO - folding node using tf type=Identity, name=StatefulPartit

2021-10-13 16:39:14,000 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:14,001 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:14,043 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:14,043 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:14,085 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:14,085 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer21/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:14,126 - INFO - fo

2021-10-13 16:39:16,275 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:16,275 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:16,276 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-13 16:39:16,276 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/intermediate/Tensordot/ReadVariableOp
2021-10-13 16:39:16,555 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/intermediate/BiasAdd/ReadVariableOp
2021-10-13 16:39:16,555 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer24/output/Tensordot/ReadVariableOp
2021-10-13 16:39:16,833 - INFO - folding node using tf type=Identity

2021-10-13 16:39:18,964 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer27/output/BiasAdd/ReadVariableOp
2021-10-13 16:39:18,965 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer28/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:18,965 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer28/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 16:39:18,965 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer28/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:19,001 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer28/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:19,001 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer28/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:19,043 - INFO - folding node using tf type=Identit

2021-10-13 16:39:21,222 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:21,222 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 16:39:21,255 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 16:39:21,255 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 16:39:21,255 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-13 16:39:21,256 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_1/layer31/intermediate/Tensordot/ReadVariableOp
2021-10-13 16:39:21,531 - INFO 

In [None]:
# Unpack the converter archive in place; quantization reads ./onnx/__MODEL_PROTO.onnx.
!cd onnx && unzip -q cpm.zip

In [82]:
# Same conversion for the kv-cache SavedModel, written to a fresh onnx_kv/ directory.
!rm -rf onnx_kv
!mkdir -p onnx_kv
!python -m tf2onnx.convert \
    --saved-model cpm_saved_model_with_kv_cache \
    --output onnx_kv/cpm.zip --large_model --opset=13

2021-10-13 17:27:27.591847: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-13 17:27:27.591878: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-10-13 17:27:30.162299: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-13 17:27:30.162329: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-13 17:27:30.162356: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-10-13 17:27:30.162617: I tensorflow/core/platfor

2021-10-13 17:31:47,640 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 17:31:47,640 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 17:31:47,641 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:47,663 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:47,663 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:47,699 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer00/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:47,699 - INFO - folding node using 

2021-10-13 17:31:49,549 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:49,549 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:49,585 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:49,585 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:49,620 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:49,620 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer03/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 17:31:49,

2021-10-13 17:31:52,022 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer06/intermediate/BiasAdd/ReadVariableOp
2021-10-13 17:31:52,022 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer06/output/Tensordot/ReadVariableOp
2021-10-13 17:31:52,318 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer06/output/BiasAdd/ReadVariableOp
2021-10-13 17:31:52,318 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer07/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 17:31:52,318 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer07/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 17:31:52,318 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer07/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:52,377 - INFO - folding node using tf type=Identity, name=StatefulPartit

2021-10-13 17:31:54,261 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:54,262 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:54,296 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:54,297 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:54,332 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:54,332 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer10/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:54,367 - INFO - fo

2021-10-13 17:31:56,385 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer13/intermediate/BiasAdd/ReadVariableOp
2021-10-13 17:31:56,386 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer13/output/Tensordot/ReadVariableOp
2021-10-13 17:31:56,615 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer13/output/BiasAdd/ReadVariableOp
2021-10-13 17:31:56,616 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer14/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 17:31:56,616 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer14/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 17:31:56,616 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer14/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:56,651 - INFO - folding node using tf type=Identity, name=StatefulPartit

2021-10-13 17:31:58,437 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:58,464 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:58,465 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:58,492 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:58,493 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 17:31:58,527 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer17/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 17:31:58,528 - INFO - folding node usin

2021-10-13 17:32:00,373 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 17:32:00,374 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 17:32:00,374 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/LayerNorm_mlp_ln1/batchnorm/ReadVariableOp
2021-10-13 17:32:00,374 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/intermediate/Tensordot/ReadVariableOp
2021-10-13 17:32:00,567 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/intermediate/BiasAdd/ReadVariableOp
2021-10-13 17:32:00,568 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer20/output/Tensordot/ReadVariableOp
2021-10-13 17:32:00,763 - INFO - folding node using tf type=Identity

2021-10-13 17:32:02,293 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer23/output/BiasAdd/ReadVariableOp
2021-10-13 17:32:02,293 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer24/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 17:32:02,294 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer24/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 17:32:02,294 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer24/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:32:02,330 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer24/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-13 17:32:02,330 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer24/attention/value_layer/Tensordot/ReadVariableOp
2021-10-13 17:32:02,359 - INFO - folding node using tf type=Identit

2021-10-13 17:32:03,900 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-13 17:32:03,900 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/attention/query_layer/Tensordot/ReadVariableOp
2021-10-13 17:32:03,928 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-13 17:32:03,928 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-13 17:32:03,956 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-13 17:32:03,956 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer27/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-13 17:32:03,

2021-10-13 17:32:05,700 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer30/intermediate/BiasAdd/ReadVariableOp
2021-10-13 17:32:05,700 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer30/output/Tensordot/ReadVariableOp
2021-10-13 17:32:05,895 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer30/output/BiasAdd/ReadVariableOp
2021-10-13 17:32:05,895 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer31/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-13 17:32:05,895 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer31/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-13 17:32:05,896 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt_5/layer31/attention/key_layer/Tensordot/ReadVariableOp
2021-10-13 17:32:05,922 - INFO - folding node using tf type=Identity, name=StatefulPartit

In [83]:
# Unpack the kv-cache archive in place; quantization reads ./onnx_kv/__MODEL_PROTO.onnx.
!cd onnx_kv && unzip -q cpm.zip

In [84]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

In [85]:
# Recreate an empty output directory for the quantized model (no kv-cache input).
!rm -rf onnx_q && mkdir -p onnx_q

In [87]:
# Dynamically quantize the unpacked ONNX model (no kv-cache input) with unsigned
# 8-bit weights, writing the result to ./onnx_q/cpm.onnx.
# use_external_data_format=True: presumably required because the model is too
# large for a single protobuf file — confirm against the onnxruntime docs.
# NOTE(review): quantized_model is not read again in the visible cells.
quantized_model = quantize_dynamic(
    './onnx/__MODEL_PROTO.onnx',
    './onnx_q/cpm.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True
)

In [88]:
# Recreate an empty output directory for the quantized kv-cache model.
!rm -rf onnx_kv_q && mkdir -p onnx_kv_q

In [89]:
# Dynamically quantize the unpacked kv-cache ONNX model with unsigned 8-bit
# weights, writing the result to ./onnx_kv_q/cpm.onnx.
# use_external_data_format=True: presumably required because the model is too
# large for a single protobuf file — confirm against the onnxruntime docs.
# NOTE(review): this rebinds quantized_model, replacing any earlier value.
quantized_model = quantize_dynamic(
    './onnx_kv/__MODEL_PROTO.onnx',
    './onnx_kv_q/cpm.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True
)