In [5]:
from tqdm import tqdm

In [1]:
import struct
import numpy as np
from collections import OrderedDict


def load_dtype(fp):
    """Read a one-byte dtype tag from ``fp`` and return the NumPy scalar type.

    Tag mapping: 0 -> ``np.int8``, 1 -> ``np.float16``, 2 -> ``np.float32``.

    Args:
        fp: Binary file-like object positioned at the tag byte.

    Returns:
        The NumPy type class that corresponds to the tag.

    Raises:
        TypeError: If the tag is not one of the known codes.
        struct.error: If no byte could be read (end of stream).
    """
    (code,) = struct.unpack("B", fp.read(1))
    known_types = {0: np.int8, 1: np.float16, 2: np.float32}
    if code not in known_types:
        raise TypeError("Unknown dtype %d" % code)
    return known_types[code]

def load_string(fp):
    """Read a length-prefixed UTF-8 string from ``fp``.

    The prefix is an unsigned 32-bit integer read with struct format ``"I"``
    (no byte-order prefix, i.e. the platform's native order), followed by
    that many bytes of UTF-8 text.

    Args:
        fp: Binary file-like object positioned at the length prefix.

    Returns:
        The decoded ``str``.
    """
    (byte_count,) = struct.unpack("I", fp.read(4))
    return fp.read(byte_count).decode("utf-8")

def load_tuple(fp):
    """Read a tuple of unsigned integers (used for tensor shapes) from ``fp``.

    Layout: one unsigned byte giving the number of entries, then that many
    unsigned 32-bit integers (struct format ``"I"``, native byte order).

    Args:
        fp: Binary file-like object positioned at the entry count.

    Returns:
        A tuple of ints; empty when the count byte is zero.
    """
    (entry_count,) = struct.unpack("B", fp.read(1))
    # Entries are read one at a time, in stream order.
    return tuple(
        struct.unpack("I", fp.read(4))[0] for _ in range(entry_count)
    )

def load_parameter(fp):
    """Read one serialized tensor record from ``fp``.

    Fields are consumed in stream order: shape tuple (``load_tuple``),
    payload size in bytes (unsigned 32-bit, struct format ``"I"``), dtype
    tag (``load_dtype``), then the raw payload itself.

    Args:
        fp: Binary file-like object positioned at the start of the record.

    Returns:
        A ``(shape, value, dtype)`` triple where ``value`` is the undecoded
        payload as returned by ``fp.read``.
    """
    dims = load_tuple(fp)
    (payload_size,) = struct.unpack("I", fp.read(4))
    element_type = load_dtype(fp)
    payload = fp.read(payload_size)
    return dims, payload, element_type

def load(fp, parent_name=''):
    """Recursively read a layer and all of its sub-layers from ``fp``.

    A layer starts with two unsigned 32-bit counters (parameter count and
    sub-layer count). Each parameter is a name (``load_string``) followed by
    a tensor record (``load_parameter``); each sub-layer is a name followed
    by a nested layer in the same format.

    Args:
        fp: Binary file-like object positioned at the start of a layer.
        parent_name: Dotted name of the enclosing layer. Every name produced
            here is ``parent_name + '.' + name``, so with the default ``''``
            the top-level names begin with a dot.

    Returns:
        A flat list of ``(qualified_name, ndarray)`` tuples, own parameters
        first and then those of each sub-layer in stream order. The arrays
        are built with ``np.frombuffer`` on the raw payload and reshaped to
        the stored shape.
    """
    param_count, child_count = struct.unpack("II", fp.read(8))
    collected = []

    for _ in range(param_count):
        leaf = load_string(fp)
        dims, raw, element_type = load_parameter(fp)
        tensor = np.frombuffer(raw, element_type).reshape(dims)
        collected.append(('.'.join((parent_name, leaf)), tensor))

    for _ in range(child_count):
        child = load_string(fp)
        collected.extend(load(fp, '.'.join((parent_name, child))))

    return collected

In [2]:
# Parse the whole binary checkpoint into a flat list of (name, ndarray) pairs.
# The path is relative to the notebook's working directory.
with open('../zhiyuan/cpm/checkpoint.pt', 'rb') as fp:
    parameters = load(fp)

In [3]:
# Name -> array lookup table built from the (name, array) pairs.
pindex = dict(parameters)

In [6]:
# Build the final name -> array mapping. Entries whose name contains '_scale'
# are not stored themselves; a tensor that has a companion '<name>_scale'
# entry is cast to float16 and multiplied by that scale.
npara = {}
for name, value in tqdm(parameters):
    if '_scale' in name:
        continue
    has_scale = name + '_scale'
    if has_scale in pindex:
        scale = pindex[has_scale]
        value = value.astype(np.float16) * scale
    npara[name] = value

100%|██████████| 516/516 [00:42<00:00, 12.03it/s]


In [8]:
# Show every converted tensor's name and shape for comparison with the
# variables of the TensorFlow model.
for k, v in npara.items():
    print(k, v.shape)

.input_embedding.weight (30000, 2560)
.position_embedding.weight (1024, 2560)
.layers.0.layer_nrom_before_self_attn.weight (2560,)
.layers.0.layer_nrom_before_self_attn.bias (2560,)
.layers.0.self_attention.w_project_qkv (3, 2560, 2560)
.layers.0.self_attention.w_project_bias (3, 2560)
.layers.0.self_attention.w_out (2560, 2560)
.layers.0.self_attention.w_out_bias (2560,)
.layers.0.layer_nrom_before_ff.weight (2560,)
.layers.0.layer_nrom_before_ff.bias (2560,)
.layers.0.dense_gelu_dense.wi.weight (10240, 2560)
.layers.0.dense_gelu_dense.wi.weight_bias (10240,)
.layers.0.dense_gelu_dense.wo.weight (2560, 10240)
.layers.0.dense_gelu_dense.wo.weight_bias (2560,)
.layers.1.layer_nrom_before_self_attn.weight (2560,)
.layers.1.layer_nrom_before_self_attn.bias (2560,)
.layers.1.self_attention.w_project_qkv (3, 2560, 2560)
.layers.1.self_attention.w_project_bias (3, 2560)
.layers.1.self_attention.w_out (2560, 2560)
.layers.1.self_attention.w_out_bias (2560,)
.layers.1.layer_nrom_before_ff.weig

In [35]:
def get_layer(n):
    """Collect the 16 tensors of transformer block ``n`` as an ordered list.

    Reads from the module-level ``npara`` mapping. The order is:
    three (transposed projection slice, bias slice) pairs taken from indices
    0, 1, 2 of ``w_project_qkv`` / ``w_project_bias`` (presumably query, key,
    value -- confirm against the target model), the transposed attention
    output projection and its bias, weight/bias of the layer norm before
    self-attention, weight/bias of the layer norm before the feed-forward
    part, then the transposed ``wi`` and ``wo`` dense weights, each followed
    by its bias.

    Args:
        n: Index of the block; keys are matched on the ``.layers.{n}.``
            substring.

    Returns:
        A list of 16 arrays in the order described above.

    Raises:
        KeyError: If an expected tensor for this block is missing.
    """
    prefix = f'.layers.{n}.'
    layer = {key: tensor for key, tensor in npara.items() if prefix in key}

    def fetch(suffix):
        # Look up one tensor of this block by its name relative to the block.
        return layer[prefix + suffix]

    weights = []

    # Fused projection: one 2-D kernel (transposed) and one bias per slice.
    for part in range(3):
        weights.append(np.transpose(fetch('self_attention.w_project_qkv')[part]))
        weights.append(fetch('self_attention.w_project_bias')[part])

    weights.append(np.transpose(fetch('self_attention.w_out')))
    weights.append(fetch('self_attention.w_out_bias'))

    # 'nrom' is the spelling used by the checkpoint keys; keep it as is.
    for norm in ('layer_nrom_before_self_attn', 'layer_nrom_before_ff'):
        weights.append(fetch(norm + '.weight'))
        weights.append(fetch(norm + '.bias'))

    for dense in ('wi', 'wo'):
        weights.append(np.transpose(fetch(f'dense_gelu_dense.{dense}.weight')))
        weights.append(fetch(f'dense_gelu_dense.{dense}.weight_bias'))

    return weights

In [36]:
# Flatten the checkpoint into one positional list: the two embedding tables,
# then the 16 tensors of each of the 32 blocks, then the final layer norm.
# This list is later handed to gpt.set_weights, so its order matters.
weights = [npara['.input_embedding.weight'], npara['.position_embedding.weight']]
for i in range(32):
    weights.extend(get_layer(i))
weights.extend((
    npara['.encoder_final_layer_nrom.weight'],
    npara['.encoder_final_layer_nrom.bias'],
))

In [37]:
# Sanity check: 2 embeddings + 32 blocks * 16 tensors + 2 final-norm tensors.
len(weights)

516

In [9]:
import re

import torch
import numpy as np
import tensorflow as tf

from tf2gpt.model import GPT

In [10]:
# Instantiate the TensorFlow model with sizes that match the checkpoint
# tensors listed earlier: input embedding (30000, 2560), position embedding
# (1024, 2560) and 32 transformer blocks. All dropout rates are set to zero.
gpt = GPT(
    vocab_size=30_000,
    layer_size=32,
    block_size=1024,
    embedding_dropout=0.0,
    embedding_size=2560,
    num_attention_heads=32,
    attention_dropout=0.0,
    residual_dropout=0.0)

In [14]:
# Declare the input signature: a 2-D int32 tensor named "input_ids" with both
# dimensions left dynamic.
# NOTE(review): _set_inputs is a private Keras method (leading underscore);
# presumably needed so the subclassed model can be saved -- confirm it exists
# in the TensorFlow version in use.
gpt._set_inputs(
    tf.TensorSpec(shape=[None, None],
        dtype=tf.int32,
        name="input_ids"
    ))

In [15]:
# Run a single token through the model and print the output shape; the last
# dimension should equal vocab_size (30000).
# NOTE(review): presumably this first call also creates the model variables
# that are listed and overwritten in later cells -- confirm.
print(gpt(tf.constant([[1]])).shape)

(1, 1, 30000)


In [16]:
# List the model variables (name and shape) so their order can be compared
# with the checkpoint tensors. The original cell read `for x in :`, which is a
# syntax error; the iterable is restored as gpt.weights, the same collection
# that is counted and overwritten by set_weights in later cells.
# Per-layer variables are printed for layer00 only to keep the output short;
# variables outside 'gpt/layer*' are always printed.
for x in gpt.weights:
    if 'gpt/layer' not in x.name or 'gpt/layer00' in x.name:
        print(x.name, x.shape)

gpt/embedding/embeddings:0 (30000, 2560)
position_embeddings:0 (1024, 2560)
gpt/layer00/attention/query_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/query_layer/bias:0 (2560,)
gpt/layer00/attention/key_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/key_layer/bias:0 (2560,)
gpt/layer00/attention/value_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/value_layer/bias:0 (2560,)
gpt/layer00/attention/context_projection_layer/kernel:0 (2560, 2560)
gpt/layer00/attention/context_projection_layer/bias:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln0/beta:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/gamma:0 (2560,)
gpt/layer00/LayerNorm_mlp_ln1/beta:0 (2560,)
gpt/layer00/intermediate/kernel:0 (2560, 10240)
gpt/layer00/intermediate/bias:0 (10240,)
gpt/layer00/output/kernel:0 (10240, 2560)
gpt/layer00/output/bias:0 (2560,)
gpt/LayerNorm_final_norm/gamma:0 (2560,)
gpt/LayerNorm_final_norm/beta:0 (2560,)


In [30]:
# Must equal len(weights) from the checkpoint side before calling set_weights.
len(gpt.weights)

516

In [38]:
# Copy the converted checkpoint tensors into the model. The assignment is
# positional, so `weights` must follow the order of gpt.weights.
gpt.set_weights(weights)

In [32]:
from gpt2_tokenizer import GPT2Tokenizer

In [33]:
# Tokenizer built from the CPM-Generate vocabulary files (paths are relative
# to the notebook's working directory).
cbpe = GPT2Tokenizer(
    'CPM-Generate/bpe_3w_new/vocab.json',
    'CPM-Generate/bpe_3w_new/merges.txt',
    model_file='CPM-Generate/bpe_3w_new/chinese_vocab.model')

In [39]:
# Greedy decoding check of the converted weights: encode a prompt, then for
# 10 steps feed the whole sequence back in, take the arg-max of the logits at
# the last position and append it.
ids = cbpe.encode('今天天气不错')

for i in range(10):
    # The full sequence is re-evaluated on every step.
    output = gpt(tf.constant([ids]))
    # Logits of the last position of the single batch element.
    nid = np.argmax(output[0, -1])
    # assumes ids is a Python list so that += appends -- TODO confirm
    ids += [nid]
    print(i, cbpe.decode(ids))

0 今天天气 不错 
1 今天天气 不错 ,
2 今天天气 不错 , 我
3 今天天气 不错 , 我 带
4 今天天气 不错 , 我 带你
5 今天天气 不错 , 我 带你去
6 今天天气 不错 , 我 带你去 
7 今天天气 不错 , 我 带你去 个
8 今天天气 不错 , 我 带你去 个 地方
9 今天天气 不错 , 我 带你去 个 地方 


In [40]:
# Write the model to the 'cpm_saved_model' directory; this directory is the
# input of the tf2onnx conversion in the next step.
gpt.save('cpm_saved_model')



INFO:tensorflow:Assets written to: cpm_saved_model/assets


INFO:tensorflow:Assets written to: cpm_saved_model/assets


In [50]:
# Convert the SavedModel to ONNX (opset 13). The output is written to
# onnx/cpm.zip.
# NOTE(review): --large_model presumably makes tf2onnx store the tensors as
# external data inside a zip archive -- confirm against the tf2onnx docs.
!mkdir -p onnx
!python -m tf2onnx.convert --saved-model cpm_saved_model --output onnx/cpm.zip --large_model --opset=13

2021-10-12 16:31:26.635547: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-12 16:31:26.635574: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-10-12 16:31:27.831891: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-10-12 16:31:27.831915: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-10-12 16:31:27.831942: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-10-12 16:31:27.832198: I tensorflow/core/platfor

2021-10-12 16:34:16,820 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer19/intermediate/Tensordot/concat
2021-10-12 16:34:16,820 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer19/output/Tensordot/concat
2021-10-12 16:34:16,820 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer20/attention/key_layer/Tensordot/concat
2021-10-12 16:34:16,820 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer20/attention/value_layer/Tensordot/concat
2021-10-12 16:34:16,820 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer20/attention/query_layer/Tensordot/concat
2021-10-12 16:34:16,821 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/layer20/attention/context_projection_layer/Tensordot/concat
2021-10-12 16:34:16,821 - INFO - folding node using tf type=ConcatV2, name=StatefulPartitionedCall/gpt/laye

2021-10-12 16:34:16,881 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:16,881 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/query_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:16,897 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:16,897 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:16,913 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:16,913 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer00/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:16,914 - INFO -

2021-10-12 16:34:18,568 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/intermediate/BiasAdd/ReadVariableOp
2021-10-12 16:34:18,568 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/output/Tensordot/ReadVariableOp
2021-10-12 16:34:18,756 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer03/output/BiasAdd/ReadVariableOp
2021-10-12 16:34:18,756 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer04/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:18,756 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer04/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-12 16:34:18,756 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer04/attention/key_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:18,783 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-12 16:34:20,276 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:20,277 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/query_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:20,303 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:20,303 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:20,330 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:20,330 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer07/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:20,330 - INFO -

2021-10-12 16:34:21,986 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/intermediate/BiasAdd/ReadVariableOp
2021-10-12 16:34:21,986 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/output/Tensordot/ReadVariableOp
2021-10-12 16:34:22,175 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer10/output/BiasAdd/ReadVariableOp
2021-10-12 16:34:22,176 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer11/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:22,176 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer11/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-12 16:34:22,176 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer11/attention/key_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:22,203 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-12 16:34:23,695 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:23,695 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/query_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:23,723 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:23,723 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:23,751 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:23,751 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer14/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:23,752 - INFO -

2021-10-12 16:34:25,416 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/intermediate/BiasAdd/ReadVariableOp
2021-10-12 16:34:25,417 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/output/Tensordot/ReadVariableOp
2021-10-12 16:34:25,606 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer17/output/BiasAdd/ReadVariableOp
2021-10-12 16:34:25,607 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer18/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:25,607 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer18/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-12 16:34:25,607 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer18/attention/key_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:25,634 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-12 16:34:27,125 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:27,125 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/attention/query_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:27,152 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:27,152 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:27,179 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/attention/context_projection_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:27,179 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer21/LayerNorm_mlp_ln1/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:27,179 - INFO -

2021-10-12 16:34:28,830 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/intermediate/BiasAdd/ReadVariableOp
2021-10-12 16:34:28,831 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/output/Tensordot/ReadVariableOp
2021-10-12 16:34:29,020 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer24/output/BiasAdd/ReadVariableOp
2021-10-12 16:34:29,020 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer25/LayerNorm_mlp_ln0/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:29,020 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer25/LayerNorm_mlp_ln0/batchnorm/ReadVariableOp
2021-10-12 16:34:29,021 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer25/attention/key_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:29,048 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gp

2021-10-12 16:34:30,513 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/key_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:30,514 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/value_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:30,542 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/value_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:30,542 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/query_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:30,568 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/query_layer/BiasAdd/ReadVariableOp
2021-10-12 16:34:30,569 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer28/attention/context_projection_layer/Tensordot/ReadVariableOp
2021-10-12 16:34:30,595 - INFO - folding node u

2021-10-12 16:34:32,252 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer31/intermediate/BiasAdd/ReadVariableOp
2021-10-12 16:34:32,252 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer31/output/Tensordot/ReadVariableOp
2021-10-12 16:34:32,442 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/layer31/output/BiasAdd/ReadVariableOp
2021-10-12 16:34:32,442 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/LayerNorm_final_norm/batchnorm/mul/ReadVariableOp
2021-10-12 16:34:32,442 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/LayerNorm_final_norm/batchnorm/ReadVariableOp
2021-10-12 16:34:32,442 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/gpt/Identity
2021-10-12 16:34:37,117 - INFO - Optimizing ONNX model
2021-10-12 16:36:30,567 - INFO - After optimization: Cast -514 (901->387), Concat -320 (545->225), C

In [None]:
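# TODO: extract onnx/cpm.zip into ./onnx (e.g. `!unzip -o onnx/cpm.zip -d onnx`) so that ./onnx/__MODEL_PROTO.onnx exists before the quantization step below.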

In [51]:
import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

In [52]:
# Output directory for the quantized model.
!mkdir -p onnx_q

In [55]:
# Dynamically quantize the exported model to unsigned 8-bit weights, reading
# ./onnx/__MODEL_PROTO.onnx and writing ./onnx_q/cpm.onnx with the external
# data format enabled.
# NOTE(review): quantize_dynamic appears to write its result to the output
# path and return None, so `quantized_model` is likely not a model object --
# confirm against the installed onnxruntime version.
quantized_model = quantize_dynamic(
    './onnx/__MODEL_PROTO.onnx',
    './onnx_q/cpm.onnx',
    weight_type=QuantType.QUInt8,
    use_external_data_format=True
)