In [7]:
import os

import numpy as np
import tensorflow as tf

import modeling
import tokenization

In [26]:
def model_builder(bert_config, init_checkpoint):
    """Build the masked-LM inference graph and return its input/output tensors.

    Creates int32 placeholders for the inputs, a `modeling.BertModel` with
    `is_training=False`, and the masked-LM head from `get_masked_lm_output`.

    Args:
        bert_config: configuration object handed to `modeling.BertModel`;
            its `vocab_size` is read here to shape the output.
        init_checkpoint: checkpoint path used to initialise the trainable
            variables. When falsy, no checkpoint initialisation is set up.

    Returns:
        A pair `(inputs, outputs)`:
          * `inputs` is the 4-tuple `(input_ids, input_mask,
            masked_lm_positions, masked_lm_positions)` of placeholders.
            NOTE(review): `masked_lm_positions` is listed twice; the fourth
            slot looks unintentional but is kept so the tuple length stays
            the same for existing callers.
          * `outputs` is the log-probability tensor reshaped to
            `[dim0, dim1, vocab_size]`, where `dim0`/`dim1` are the two
            dimensions of `masked_lm_positions`.
    """
    # Every placeholder is int32 with two unknown dimensions
    # (presumably [batch, sequence/positions] -- confirm against callers).
    input_ids = tf.placeholder(tf.int32, [None, None], name='input_ids')
    input_mask = tf.placeholder(tf.int32, [None, None], name='input_mask')
    masked_lm_positions = tf.placeholder(
        tf.int32, [None, None], name='masked_lm_positions')
    # All-zero token type ids with the same shape as the mask.
    segment_ids = tf.zeros_like(input_mask)

    bert = modeling.BertModel(
        config=bert_config,
        is_training=False,
        input_ids=input_ids,
        input_mask=input_mask,
        token_type_ids=segment_ids)

    flat_log_probs = get_masked_lm_output(
        bert_config,
        bert.get_sequence_output(),
        bert.get_embedding_table(),
        masked_lm_positions)

    # Restore the two leading dimensions of `masked_lm_positions` that the
    # masked-LM head flattened away.
    positions_shape = tf.shape(masked_lm_positions)
    outputs = tf.reshape(
        flat_log_probs,
        [positions_shape[0], positions_shape[1], bert_config.vocab_size])

    if init_checkpoint:
        # Only the assignment map is needed; the second return value
        # (initialised variable names) is not used here.
        assignment_map, _ = modeling.get_assignment_map_from_checkpoint(
            tf.trainable_variables(), init_checkpoint)
        tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    inputs = (input_ids, input_mask, masked_lm_positions, masked_lm_positions)
    return inputs, outputs


def get_masked_lm_output(bert_config, input_tensor, output_weights, positions):
    """Compute masked-LM log-probabilities at the requested positions.

    Args:
        bert_config: configuration providing `hidden_size`, `hidden_act`,
            `initializer_range` and `vocab_size`.
        input_tensor: rank-3 sequence output to gather from (the rank is
            enforced inside `gather_indexes`).
        output_weights: matrix multiplied (transposed) with the transformed
            hidden vectors to produce the logits.
        positions: positions to gather, forwarded to `gather_indexes`.

    Returns:
        The `tf.nn.log_softmax` of the logits over the last axis, one row per
        gathered position. No loss is computed.
    """
    hidden = gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions"):
        # One extra non-linear transformation precedes the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            activation_fn = modeling.get_activation(bert_config.hidden_act)
            kernel_init = modeling.create_initializer(
                bert_config.initializer_range)
            hidden = tf.layers.dense(
                hidden,
                units=bert_config.hidden_size,
                activation=activation_fn,
                kernel_initializer=kernel_init)
            hidden = modeling.layer_norm(hidden)

        # Per-vocabulary-entry bias added on top of the projection.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[bert_config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.nn.bias_add(
            tf.matmul(hidden, output_weights, transpose_b=True),
            output_bias)
        return tf.nn.log_softmax(logits, axis=-1)


def gather_indexes(sequence_tensor, positions):
    """Pick the vectors at `positions` out of a rank-3 `sequence_tensor`.

    The tensor is flattened over its first two dimensions and each position
    is shifted by `row_index * seq_length` so that it addresses the right row
    of the flattened tensor.

    Args:
        sequence_tensor: rank-3 tensor `[batch_size, seq_length, width]`.
        positions: integer tensor of per-row positions; it must broadcast
            against a `[batch_size, 1]` offset column.

    Returns:
        A tensor with one `width`-sized vector per entry of `positions`,
        in flattened (row-major) order.
    """
    batch_size, seq_length, width = modeling.get_shape_list(
        sequence_tensor, expected_rank=3)

    # Start offset of every batch row inside the flattened tensor.
    row_starts = tf.reshape(
        tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
    absolute_positions = tf.reshape(positions + row_starts, [-1])

    flattened = tf.reshape(sequence_tensor, [batch_size * seq_length, width])
    return tf.gather(flattened, absolute_positions)

In [9]:
# Directory holding the pre-trained BERT files read below (vocab.txt,
# bert_config.json, bert_model.ckpt). Set the BERT_BASE_DIR environment
# variable to point somewhere else; the default is the original local path.
bert_path = os.environ.get(
    'BERT_BASE_DIR', '/Users/easton/Data/pre-train/chinese_L-12_H-768_A-12/')
# os.path.join keeps the file paths valid whether or not `bert_path` ends
# with a path separator.
tokenizer = tokenization.FullTokenizer(
    vocab_file=os.path.join(bert_path, 'vocab.txt'), do_lower_case=True)
MASKED_TOKEN = "[MASK]"
# Vocabulary id of the mask token, used when building masked inputs.
MASKED_ID = tokenizer.convert_tokens_to_ids([MASKED_TOKEN])[0]

tf.logging.set_verbosity(tf.logging.INFO)
tf.logging.info("***** Running Fixing *****")

bert_config = modeling.BertConfig.from_json_file(
    os.path.join(bert_path, 'bert_config.json'))
# input_pl: tuple of input placeholders; log_prob_op: masked-LM log-probs.
input_pl, log_prob_op = model_builder(
    bert_config, os.path.join(bert_path, 'bert_model.ckpt'))

INFO:tensorflow:***** Running Fixing *****
Instructions for updating:
Colocations handled automatically by placer.

For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
Use keras.layers.dense instead.


In [28]:
# Build the initialiser op for all global variables, then open the
# interactive session and run it so the variables are ready for inference.
init_op = tf.global_variables_initializer()
sess = tf.InteractiveSession()
sess.run(init_op)



In [14]:
def create_masked_lm_prediction(input_ids, mask_position, mask_id=None):
    """Return a copy of `input_ids` with one position replaced by the mask id.

    Args:
        input_ids: sequence of token ids. It is copied, never modified.
        mask_position: index of the token to mask; must satisfy
            `0 <= mask_position < len(input_ids)`.
        mask_id: id written at `mask_position`. Defaults to the module-level
            `MASKED_ID` when omitted.

    Returns:
        `(new_input_ids, masked_lm_positions)` where `new_input_ids` is the
        masked copy (a list) and `masked_lm_positions` is `[mask_position]`.

    Raises:
        IndexError: if `mask_position` is outside `input_ids`. Negative
            positions are rejected too: they would otherwise mask a token
            counted from the end and return a negative position.
    """
    new_input_ids = list(input_ids)
    if not 0 <= mask_position < len(new_input_ids):
        raise IndexError(
            "mask_position %r is out of range for %d input ids"
            % (mask_position, len(new_input_ids)))

    if mask_id is None:
        # Resolved at call time so the tokenizer cell can be re-run freely.
        mask_id = MASKED_ID

    new_input_ids[mask_position] = mask_id
    masked_lm_positions = [mask_position]
    return new_input_ids, masked_lm_positions

In [65]:
max_seq_length = 30
tokens = list('在中国经济新强台的大背景下')
# tokens = list('床前白月光,疑是地上霜')
index = 6  # 0-based index into `tokens` of the character to mask

input_tokens = ["[CLS]"] + tokens + ["[SEP]"]
# Fail loudly instead of silently skipping the padding (a negative pad length
# yields an empty pad) or masking the wrong token.
if len(input_tokens) > max_seq_length:
    raise ValueError(
        "input has %d tokens (with [CLS]/[SEP]) but max_seq_length is %d"
        % (len(input_tokens), max_seq_length))
if not 0 <= index < len(tokens):
    raise ValueError(
        "index %d is out of range for %d tokens" % (index, len(tokens)))

len_pad = max_seq_length - len(input_tokens)
input_ids = tokenizer.convert_tokens_to_ids(input_tokens) + [0] * len_pad
input_mask = [1] * len(input_tokens) + [0] * len_pad
# +1 skips the leading [CLS] token that was prepended to `tokens`.
new_input_ids, masked_lm_positions = create_masked_lm_prediction(input_ids, index+1)

# The position list is padded with 0 up to max_seq_length. The padded slots
# point at sequence position 0, so only log_probs[0][0] refers to the masked
# token.
pad_len = max_seq_length - len(masked_lm_positions)
masked_lm_positions += [0] * pad_len

print(new_input_ids, input_mask, masked_lm_positions)
# Each value is wrapped in a list to add the leading batch dimension of 1.
dict_feed = {input_pl[0]: [new_input_ids],
             input_pl[1]: [input_mask],
             input_pl[2]: [masked_lm_positions]}
log_probs = sess.run(log_prob_op, feed_dict=dict_feed)

[101, 1762, 704, 1744, 5307, 3845, 3173, 103, 1378, 4638, 1920, 5520, 3250, 678, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [66]:
# Probability assigned to each candidate character at the masked position
# (batch row 0, first entry of masked_lm_positions).
list(np.exp(log_probs[0, 0, tokenizer.convert_tokens_to_ids(['强', '常'])]))

[3.2652958e-06, 1.8304976e-05]

In [44]:
# Vocabulary id with the highest score at the masked position. exp() is
# monotonic, so the argmax is taken on the log-probabilities directly; this
# avoids a redundant pass and the ties exp() can create between near-equal
# values.
np.argmax(log_probs[0][0])

3209