In [1]:
import torch
import tensorflow as tf
import numpy as np

from transformers import TFT5EncoderModel, TFT5Model
from transformers import T5Config

In [2]:
# Tokenizer comes from the `tokenization_enc_dec` module (presumably project-local — confirm)
# and is built from the vocabulary file in the current working directory.
from tokenization_enc_dec import EncDecTokenizer
tokenizer = EncDecTokenizer('./vocab.txt')

In [3]:
# Model dimensions that more than one config field depends on; naming them
# keeps d_model and d_kv from drifting apart.
D_MODEL = 4096
NUM_HEADS = 64

# T5 configuration used to instantiate the TensorFlow model that receives the
# converted checkpoint weights.
config = T5Config(
    vocab_size=26240,
    d_model=D_MODEL,
    d_ff=10240,
    d_kv=D_MODEL // NUM_HEADS,  # per-head size, so that num_heads * d_kv == d_model
    num_layers=24,
    num_heads=NUM_HEADS,
    relative_attention_num_buckets=32,
    dropout_rate=0.0,
    initializer_factor=1.0,
    # Special-token ids are taken from the tokenizer; pad doubles as BOS and
    # as the decoder start token.
    eos_token_id=tokenizer.eod_id,
    bos_token_id=tokenizer.pad_id,
    pad_token_id=tokenizer.pad_id,
    decoder_start_token_id=tokenizer.pad_id,
    feed_forward_proj='gated-gelu',
    tie_word_embeddings=False
)

In [4]:
# Instantiate the encoder-only TF T5 model from the configuration above.
model = TFT5EncoderModel(config)

In [5]:
# Dummy forward pass on a single token id; the result is discarded.
# Presumably this is what makes Keras create the model's variables so they can
# be counted and overwritten in later cells — confirm.
_ = model(input_ids=tf.constant([[1]]))

In [6]:
# Number of variables the TF model expects; the converted weight list must have the same length.
len(model.variables)

219

In [7]:
def get_weight(name):
    """Return the checkpoint entry stored under ``name`` as a NumPy array.

    The entry is looked up in the notebook-level ``state_dict`` at call time,
    so ``state_dict`` must be loaded before this function is called.
    """
    tensor = state_dict[name]
    return tensor.numpy()


def get_block_weight(n, t='encoder', name=False, dim=4096, source=None):
    """Collect the weights of one transformer block in TensorFlow layout.

    Entries whose key contains both ``t`` and ``blocks.{n}.`` are selected, in
    the iteration order of the checkpoint mapping.

    Args:
        n: Index of the block to extract.
        t: Substring selecting the stack, e.g. ``'encoder'``.
        name: When true, return ``(key, array)`` pairs; otherwise return only
            the arrays.
        dim: Number of rows of each slice inside a fused projection matrix.
        source: Mapping from parameter name to a tensor exposing ``.numpy()``.
            Defaults to the notebook-level ``state_dict``, resolved at call
            time.

    Returns:
        A list of arrays (or ``(key, array)`` pairs). The list is empty when no
        entry matches the requested block.
    """
    if source is None:
        # Fall back to the checkpoint loaded at notebook level.
        source = state_dict

    block_tag = f'blocks.{n}.'
    weights = []
    for k, v in source.items():
        if t not in k or block_tag not in k:
            continue
        # The PyTorch and TensorFlow versions of the weight matrices are
        # transposes of each other.
        w = v.numpy()
        if 'self_attn.project' in k:
            # Fused projection: split into three row slices (presumably q, k, v
            # in that order — matches the TF variable listing; confirm).
            for part in (w[:dim, :], w[dim:dim * 2, :], w[dim * 2:, :]):
                weights.append((k, np.transpose(part)))
        elif 'cross_attn.project_q' in k:
            weights.append((k, np.transpose(w)))
        elif 'cross_attn.project_kv' in k:
            # Fused projection: split into two row slices (presumably k, v).
            for part in (w[:dim, :], w[dim:, :]):
                weights.append((k, np.transpose(part)))
        else:
            if 'dense' in k:
                w = np.transpose(w)
            weights.append((k, w))

    # When the relative attention bias follows the three self-attention slices,
    # move it to the front, which is where the TF model lists that variable.
    # The length guard avoids an IndexError for blocks with fewer than four
    # matching entries (including no match at all).
    if len(weights) > 3 and 'relative_attention_bias' in weights[3][0]:
        weights = weights[3:4] + weights[:3] + weights[4:]
    if not name:
        weights = [x[1] for x in weights]
    return weights

In [8]:
# Load the converted PyTorch checkpoint onto the CPU: the weights are turned
# into NumPy arrays below, which requires CPU tensors, and this also lets the
# notebook run on a host without CUDA.
# NOTE(review): torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load('../converted.zip', map_location='cpu')

In [9]:
# Assemble the converted weights in the order the TF model lists its variables:
# shared embedding, then every encoder block, then the final layer norm.
model_new_weights = [get_weight('word_embeds.weight')]
# Take the block count from the config rather than repeating the literal 24.
for i in range(config.num_layers):
    model_new_weights += get_block_weight(i, t='encoder')
model_new_weights += [get_weight('encoder.final_layernorm.weight')]

In [10]:
# Number of converted arrays; should equal the number of model variables.
len(model_new_weights)

219

In [11]:
# Number of TF model variables, shown next to the converted-weight count for comparison.
len(model.variables)

219

In [12]:
# List every checkpoint entry together with its tensor shape.
for param_name, tensor in state_dict.items():
    print(param_name, tensor.shape)

word_embeds.weight torch.Size([26240, 4096])
lm_head.weight torch.Size([26240, 4096])
encoder.word_embeds.weight torch.Size([26240, 4096])
encoder.final_layernorm.weight torch.Size([4096])
encoder.blocks.0.self_attn.self_attn.project.weight torch.Size([12288, 4096])
encoder.blocks.0.self_attn.self_attn.relative_attention_bias.weight torch.Size([32, 64])
encoder.blocks.0.self_attn.self_attn.dense.weight torch.Size([4096, 4096])
encoder.blocks.0.self_attn.layer_norm.weight torch.Size([4096])
encoder.blocks.0.ff.dense_relu_dense.wi_0.weight torch.Size([10240, 4096])
encoder.blocks.0.ff.dense_relu_dense.wi_1.weight torch.Size([10240, 4096])
encoder.blocks.0.ff.dense_relu_dense.wo.weight torch.Size([4096, 10240])
encoder.blocks.0.ff.layer_norm.weight torch.Size([4096])
encoder.blocks.1.self_attn.self_attn.project.weight torch.Size([12288, 4096])
encoder.blocks.1.self_attn.self_attn.dense.weight torch.Size([4096, 4096])
encoder.blocks.1.self_attn.layer_norm.weight torch.Size([4096])
encoder.

In [13]:
# Show the shape of each converted array, in assignment order.
for array in model_new_weights:
    print(array.shape)

(26240, 4096)
(32, 64)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4096)
(4096,)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096, 4096)
(4096,)
(4096, 10240)
(4096, 10240)
(10240, 4

In [14]:
# Show the name and shape of each TF variable, for comparison with the converted arrays.
for variable in model.variables:
    print(variable.name, variable.shape)

shared/shared/weight:0 (26240, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/relative_attention_bias/embeddings:0 (32, 64)
tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/q/kernel:0 (4096, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/k/kernel:0 (4096, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/v/kernel:0 (4096, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/o/kernel:0 (4096, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._0/layer_norm/weight:0 (4096,)
tf_t5encoder_model/encoder/block_._0/layer_._1/DenseReluDense/wi_0/kernel:0 (4096, 10240)
tf_t5encoder_model/encoder/block_._0/layer_._1/DenseReluDense/wi_1/kernel:0 (4096, 10240)
tf_t5encoder_model/encoder/block_._0/layer_._1/DenseReluDense/wo/kernel:0 (10240, 4096)
tf_t5encoder_model/encoder/block_._0/layer_._1/layer_norm/weight:0 (4096,)
tf_t5encoder_model/encoder/block_._1/layer_._0/SelfAttention/q/kernel:0 (4096, 4096)
tf_t5encoder_

In [15]:
# Weights are assigned to variables purely by position, so check both the count
# and every shape before overwriting the model.
assert len(model_new_weights) == len(model.variables), (
    f'expected {len(model.variables)} weights, got {len(model_new_weights)}'
)
for position, (weight, variable) in enumerate(zip(model_new_weights, model.variables)):
    expected_shape = tuple(variable.shape.as_list())
    assert tuple(weight.shape) == expected_shape, (
        f'weight {position} for {variable.name}: '
        f'shape {tuple(weight.shape)} != {expected_shape}'
    )

In [16]:
# Overwrite the model's weights with the converted checkpoint arrays (matched by position).
model.set_weights(model_new_weights)

In [17]:
# Sample Chinese news paragraph used as input for a test forward pass.
input_text = '''当地时间9月6日是美国劳工节，但就在这一天，上千万美国劳动者却陷入新的困境。因为美国政府为疫情期间失业者提供的主要救助同日到期，而且白宫表示没有进一步延长救助的计划。
在德尔塔变异株已把美国推入新一轮疫情的背景下，失业救济的突然“断供”意味着有上千万美国人将全部或部分失去他们的生活来源。'''
# Wrap the encoded text in a list so the tensor has a leading batch dimension of 1
# (assumes tokenizer.encode returns a flat sequence of ids — confirm).
input_ids = tf.constant([tokenizer.encode(input_text)])

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.541 seconds.
Prefix dict has been built successfully.


In [18]:
# Run the encoder on the tokenized sample text.
out = model(input_ids)

In [19]:
# Inspect which fields the model output provides.
out.keys()

odict_keys(['last_hidden_state'])

In [20]:
# Export the model, with the converted weights loaded, to the directory 'cpm_2_0_tf_encoder'.
# NOTE(review): the recorded output mentions 'cpm_2_0_tf/assets', which does not match
# this path — the output looks stale; re-run to refresh it.
model.save('cpm_2_0_tf_encoder')

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method




INFO:tensorflow:Assets written to: cpm_2_0_tf/assets


INFO:tensorflow:Assets written to: cpm_2_0_tf/assets


In [21]:
# Report the on-disk size of the exported model directory.
!du -sh 'cpm_2_0_tf_encoder'

18G	cpm_2_0_tf


In [23]:
# Prerequisite (run once per environment): the conversion below needs tf2onnx.
# %pip install -U tf2onnx

In [24]:
# Create the 'onnx_tf' directory if it does not exist.
# NOTE(review): the conversion command in this notebook writes to 'cpm_2_0_encoder.zip'
# and does not reference this directory — confirm it is still needed.
!mkdir -p onnx_tf

In [26]:
# Convert the exported SavedModel to ONNX (opset 13) with tf2onnx, using its
# --large_model mode and writing the result to 'cpm_2_0_encoder.zip'.
!python -m tf2onnx.convert --large_model --saved-model 'cpm_2_0_tf_encoder' --opset 13 --output cpm_2_0_encoder.zip


2021-09-19 22:57:33.165119: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-19 22:57:33.165150: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2021-09-19 22:57:36.270205: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2021-09-19 22:57:36.270232: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2021-09-19 22:57:36.270257: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (iZuf6fokcl2k1pwfopz0n4Z): /proc/driver/nvidia/version does not exist
2021-09-19 22:57:36.270555: I tensorflow/core/platfor

2021-09-19 23:07:58,608 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/k/Tensordot/ReadVariableOp
2021-09-19 23:07:58,787 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/v/Tensordot/ReadVariableOp
2021-09-19 23:07:58,965 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._0/layer_._0/SelfAttention/o/Tensordot/ReadVariableOp
2021-09-19 23:07:59,141 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._0/layer_._1/layer_norm/ReadVariableOp
2021-09-19 23:07:59,141 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._0/layer_._1/DenseReluDense/wi_0/Tensordot/ReadVariableOp
2021-09-19 23:07:59,623 - INFO - folding node using tf type=Identity, name=S

2021-09-19 23:08:09,017 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._5/layer_._0/SelfAttention/k/Tensordot/ReadVariableOp
2021-09-19 23:08:09,170 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._5/layer_._0/SelfAttention/v/Tensordot/ReadVariableOp
2021-09-19 23:08:09,327 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._5/layer_._0/SelfAttention/o/Tensordot/ReadVariableOp
2021-09-19 23:08:09,480 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._5/layer_._1/layer_norm/ReadVariableOp
2021-09-19 23:08:09,481 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._5/layer_._1/DenseReluDense/wi_0/Tensordot/ReadVariableOp
2021-09-19 23:08:09,910 - INFO - folding node using tf type=Identity, name=S

2021-09-19 23:08:18,480 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._0/SelfAttention/k/Tensordot/ReadVariableOp
2021-09-19 23:08:18,634 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._0/SelfAttention/v/Tensordot/ReadVariableOp
2021-09-19 23:08:18,786 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._0/SelfAttention/o/Tensordot/ReadVariableOp
2021-09-19 23:08:18,945 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._1/layer_norm/ReadVariableOp
2021-09-19 23:08:18,945 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._1/DenseReluDense/wi_0/Tensordot/ReadVariableOp
2021-09-19 23:08:19,363 - INFO - folding node using tf type=Identity, n

2021-09-19 23:08:28,422 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._0/SelfAttention/k/Tensordot/ReadVariableOp
2021-09-19 23:08:28,595 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._0/SelfAttention/v/Tensordot/ReadVariableOp
2021-09-19 23:08:28,765 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._0/SelfAttention/o/Tensordot/ReadVariableOp
2021-09-19 23:08:28,938 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._1/layer_norm/ReadVariableOp
2021-09-19 23:08:28,938 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._1/DenseReluDense/wi_0/Tensordot/ReadVariableOp
2021-09-19 23:08:29,411 - INFO - folding node using tf type=Identity, n

2021-09-19 23:08:38,915 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._20/layer_._0/SelfAttention/k/Tensordot/ReadVariableOp
2021-09-19 23:08:39,086 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._20/layer_._0/SelfAttention/v/Tensordot/ReadVariableOp
2021-09-19 23:08:39,258 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._20/layer_._0/SelfAttention/o/Tensordot/ReadVariableOp
2021-09-19 23:08:39,429 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._20/layer_._1/layer_norm/ReadVariableOp
2021-09-19 23:08:39,429 - INFO - folding node using tf type=Identity, name=StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._20/layer_._1/DenseReluDense/wi_0/Tensordot/ReadVariableOp
2021-09-19 23:08:39,896 - INFO - folding node using tf type=Identity, n

2021-09-19 23:13:53,344 - INFO - replacing einsum node 'StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._10/layer_._0/SelfAttention/einsum/Einsum' by its decomposed version, name of the last node 'Identity__3728'.
2021-09-19 23:13:53,352 - INFO - replacing einsum node 'StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._6/layer_._0/SelfAttention/einsum/Einsum' by its decomposed version, name of the last node 'Identity__3779'.
2021-09-19 23:13:53,360 - INFO - replacing einsum node 'StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._15/layer_._0/SelfAttention/einsum/Einsum' by its decomposed version, name of the last node 'Identity__3830'.
2021-09-19 23:13:53,368 - INFO - replacing einsum node 'StatefulPartitionedCall/tf_t5encoder_model/encoder/block_._12/layer_._0/SelfAttention/einsum/Einsum' by its decomposed version, name of the last node 'Identity__3881'.
2021-09-19 23:13:53,376 - INFO - replacing einsum node 'StatefulPartitionedCall/tf_t5encoder_model/encode