In [2]:
# Load conpono weights into BERT
import tensorflow as tf
from transformers import TFBertModel


In [44]:
import os, re, logging
import numpy as np
import tensorflow as tf
import torch
from transformers import BertConfig, BertModel, BertForNextSentencePrediction
def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path, save_dir):
    """Copy the weights of a TensorFlow BERT checkpoint into a PyTorch ``BertModel`` and save it.

    Every non-optimizer variable of the checkpoint is read, its ``/``-separated
    name is walked attribute by attribute through a ``BertModel`` built from
    ``bert_config_file``, and the array is written into the parameter found at
    the end of that walk. Variables under a ``cls`` scope are ignored, and
    variables whose path does not lead to a tensor of the model are reported
    and skipped.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.
        bert_config_file: JSON file used to build the ``BertConfig``.
        pytorch_dump_path: Not used; kept so existing callers keep working.
            The model is written with ``save_pretrained(save_dir)`` only.
        save_dir: Directory handed to ``model.save_pretrained``.

    Raises:
        ValueError: If a checkpoint array and the parameter it maps to have
            different shapes (after transposing kernels).
    """
    config_path = os.path.abspath(bert_config_file)
    tf_path = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {} with config at {}".format(tf_path, config_path))

    # Load weights from TF model, leaving out optimizer state and bookkeeping variables.
    excluded = ("BERTAdam", "_power", "global_step", "_CHECKPOINTABLE_OBJECT_GRAPH", "adam_v", "adam_m")
    names = []
    arrays = []
    for name, shape in tf.train.list_variables(tf_path):
        if any(marker in name for marker in excluded):
            continue
        print("Loading TF weight {} with shape {}".format(name, shape))
        names.append(name)
        arrays.append(tf.train.load_variable(tf_path, name))

    config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(config)))
    # Initialise PyTorch model
    model = BertModel(config)

    # Path components that mark optimizer / training-loop state rather than model weights.
    optimizer_scopes = {
        "adam_v", "adam_m", "global_step", "bad_steps", "good_steps", "loss_scale",
        "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "save_counter", ".OPTIMIZER_SLOT",
    }
    # TensorFlow scope name -> attribute name on the PyTorch side.
    attribute_aliases = {
        "kernel": "weight",
        "gamma": "weight",
        "output_weights": "weight",
        "output_bias": "bias",
        "beta": "bias",
        "squad": "classifier",
        "dense_output": "output",
        "bert_output": "output",
        "self_attention": "self",
        "predictions": "seq_relationship",
    }

    for name, array in zip(names, arrays):
        name = name.split("/")
        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
        # which are not required for using the pretrained model
        if name[0] == "optimizer" or any(n in optimizer_scopes for n in name):
            print("Skipping {}".format("/".join(name)))
            continue
        if "cls" in name[0]:
            continue
        if ".ATTRIBUTES" in name:
            name = name[:name.index(".ATTRIBUTES")]
        print(name)

        pointer = model
        resolved = True
        m_name = ""  # stays empty when the truncated path has no components
        for m_name in name:
            # "layer_3" -> ["layer", "3", ""]: attribute name followed by a list index.
            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
                scope_names = re.split(r"_(\d+)", m_name)
            else:
                scope_names = [m_name]
            print("\t", scope_names)

            attr_name = attribute_aliases.get(scope_names[0], scope_names[0])
            try:
                pointer = getattr(pointer, attr_name)
            except AttributeError:
                if scope_names[0] == 'bert':
                    # The target is a bare BertModel, so the "bert" scope has no counterpart.
                    continue
                print("Possibly ok - Skipping {}".format("/".join(name)), scope_names[0])
                resolved = False
                break
            if len(scope_names) >= 2:
                pointer = pointer[int(scope_names[1])]

        if not resolved:
            # Do not write an unmapped variable onto whatever object the walk stopped at.
            continue

        if m_name[-11:] == "_embeddings":
            pointer = getattr(pointer, "weight")
        elif m_name in ("kernel", "gamma", "output_weights"):
            array = np.transpose(array)

        if not isinstance(pointer, torch.Tensor):
            # The path ended on a module (or the model itself), not on a parameter.
            print("Possibly ok - Skipping {}".format("/".join(name)), m_name)
            continue
        if tuple(pointer.shape) != tuple(array.shape):
            raise ValueError(
                "Shape mismatch for {}: PyTorch parameter has shape {}, checkpoint array has shape {}".format(
                    "/".join(name), tuple(pointer.shape), tuple(array.shape)
                )
            )
        pointer.data = torch.from_numpy(array)
        print("## Updating weights for ", name)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(save_dir))
    model.save_pretrained(save_dir)


# Relative locations of the conversion inputs and outputs; later cells read
# `tf_path` and `config_path`, so these names stay at notebook scope.
save_dir = "../weights/transformers/"
pytorch_dump_path = "../weights/pytorch_model.bin"
config_path = "../weights/config.json"
tf_path = "../weights/model.ckpt"

convert_tf_checkpoint_to_pytorch(
    tf_checkpoint_path=tf_path,
    bert_config_file=config_path,
    pytorch_dump_path=pytorch_dump_path,
    save_dir=save_dir,
)

Converting TensorFlow checkpoint from /Users/daniter/Documents/jurafsky/conpono/weights/model.ckpt with config at /Users/daniter/Documents/jurafsky/conpono/weights/config.json
Loading TF weight bert/embeddings/LayerNorm/beta with shape [768]
Loading TF weight bert/embeddings/LayerNorm/gamma with shape [768]
Loading TF weight bert/embeddings/position_embeddings with shape [512, 768]
Loading TF weight bert/embeddings/token_type_embeddings with shape [2, 768]
Loading TF weight bert/embeddings/word_embeddings with shape [30522, 768]
Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_0/attention/output/LayerNorm/gamma with shape [768]
Loading TF weight bert/encoder/layer_0/attention/output/dense/bias with shape [768]
Loading TF weight bert/encoder/layer_0/attention/output/dense/kernel with shape [768, 768]
Loading TF weight bert/encoder/layer_0/attention/self/key/bias with shape [768]
Loading TF weight bert/encoder/la

Loading TF weight bert/encoder/layer_3/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_3/output/LayerNorm/gamma with shape [768]
Loading TF weight bert/encoder/layer_3/output/dense/bias with shape [768]
Loading TF weight bert/encoder/layer_3/output/dense/kernel with shape [3072, 768]
Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/beta with shape [768]
Loading TF weight bert/encoder/layer_4/attention/output/LayerNorm/gamma with shape [768]
Loading TF weight bert/encoder/layer_4/attention/output/dense/bias with shape [768]
Loading TF weight bert/encoder/layer_4/attention/output/dense/kernel with shape [768, 768]
Loading TF weight bert/encoder/layer_4/attention/self/key/bias with shape [768]
Loading TF weight bert/encoder/layer_4/attention/self/key/kernel with shape [768, 768]
Loading TF weight bert/encoder/layer_4/attention/self/query/bias with shape [768]
Loading TF weight bert/encoder/layer_4/attention/self/query/kernel with shape [768, 76

['bert', 'embeddings', 'LayerNorm', 'beta']
	 ['bert']
	 ['embeddings']
	 ['LayerNorm']
	 ['beta']
## Updating weights for  ['bert', 'embeddings', 'LayerNorm', 'beta']
['bert', 'embeddings', 'LayerNorm', 'gamma']
	 ['bert']
	 ['embeddings']
	 ['LayerNorm']
	 ['gamma']
## Updating weights for  ['bert', 'embeddings', 'LayerNorm', 'gamma']
['bert', 'embeddings', 'position_embeddings']
	 ['bert']
	 ['embeddings']
	 ['position_embeddings']
## Updating weights for  ['bert', 'embeddings', 'position_embeddings']
['bert', 'embeddings', 'token_type_embeddings']
	 ['bert']
	 ['embeddings']
	 ['token_type_embeddings']
## Updating weights for  ['bert', 'embeddings', 'token_type_embeddings']
['bert', 'embeddings', 'word_embeddings']
	 ['bert']
	 ['embeddings']
	 ['word_embeddings']
## Updating weights for  ['bert', 'embeddings', 'word_embeddings']
['bert', 'encoder', 'layer_0', 'attention', 'output', 'LayerNorm', 'beta']
	 ['bert']
	 ['encoder']
	 ['layer', '0', '']
	 ['attention']
	 ['output']
	 ['

## Updating weights for  ['bert', 'encoder', 'layer_6', 'attention', 'self', 'key', 'kernel']
['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias']
	 ['bert']
	 ['encoder']
	 ['layer', '6', '']
	 ['attention']
	 ['self']
	 ['query']
	 ['bias']
## Updating weights for  ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'bias']
['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel']
	 ['bert']
	 ['encoder']
	 ['layer', '6', '']
	 ['attention']
	 ['self']
	 ['query']
	 ['kernel']
## Updating weights for  ['bert', 'encoder', 'layer_6', 'attention', 'self', 'query', 'kernel']
['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias']
	 ['bert']
	 ['encoder']
	 ['layer', '6', '']
	 ['attention']
	 ['self']
	 ['value']
	 ['bias']
## Updating weights for  ['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'bias']
['bert', 'encoder', 'layer_6', 'attention', 'self', 'value', 'kernel']
	 ['bert']
	 ['encoder']
	 ['layer', '6', '']
	 ['at

In [41]:
# Build a BertModel straight from the JSON config; no checkpoint is loaded in this cell.
# `config_path` is assigned in the conversion cell, so that cell has to run first.
config = BertConfig.from_json_file(config_path)
model = BertModel(config)


In [42]:
# Display the module tree of `model` to see the attribute names the converter walks.
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [32]:
# Print the dotted name of every parameter of `model`.
# NOTE(review): the recorded output has a `bert.` prefix, which the bare BertModel
# shown in the repr cell does not have; presumably `model` was a wrapper class
# (e.g. BertForNextSentencePrediction) when this cell last ran. Confirm and re-run.
for name, p in model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [45]:
# Load the converted model back; this is the same directory literal the conversion
# cell passes as `save_dir`, so the conversion has to have been run beforehand.
conpono = BertModel.from_pretrained("../weights/transformers/")

In [48]:
def get_cpc_weights(tf_checkpoint_path):
    """Return the first variable in a TensorFlow checkpoint whose name contains "cpc".

    Variables whose names contain an optimizer/bookkeeping marker are ignored.
    Only the first match, in the order reported by ``tf.train.list_variables``,
    is loaded and returned; ``None`` is returned when nothing matches.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read.

    Returns:
        The value loaded by ``tf.train.load_variable`` for the first matching
        variable, or ``None``.
    """
    checkpoint = os.path.abspath(tf_checkpoint_path)
    print("Converting TensorFlow checkpoint from {}".format(checkpoint))

    # Markers of variables that are never model weights.
    skip_markers = ("BERTAdam", "_power", "global_step", "_CHECKPOINTABLE_OBJECT_GRAPH")

    for var_name, _var_shape in tf.train.list_variables(checkpoint):
        if any(marker in var_name for marker in skip_markers):
            continue
        if "cpc" in var_name:
            return tf.train.load_variable(checkpoint, var_name)
    return None

In [49]:
# Fetch the first "cpc" variable from the same checkpoint (`tf_path` is set in the
# conversion cell). The result is None if the checkpoint has no such variable.
cpc_weights= get_cpc_weights(tf_path)

Converting TensorFlow checkpoint from /Users/daniter/Documents/jurafsky/conpono/weights/model.ckpt
