In [7]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from typing import *

In [8]:
from src.fewshot_ner_viz_component.data_processing import *
from src.fewshot_ner_viz_component.utils import *

In [9]:
from deeppavlov.core.layers.tf_layers import cudnn_bi_lstm, cudnn_bi_gru, bi_rnn, stacked_cnn, INITIALIZER

In [10]:
# Flags controlling ELMo fine-tuning.
# TRAIN_ELMO is passed as `trainable` to hub.Module when the ELMo module is created.
TRAIN_ELMO = False
# When truthy, the notebook looks up two ELMo LSTM kernels and the training loop
# records their relative change after every optimisation step.
TRAIN_ALL_ELMO_PARAMS = 0

In [11]:
# Load the dataset and derive a version restricted to a single entity type.
# NOTE(review): read_data and filter_dataset_by_ne_types come from the project's star
# imports; presumably the filter keeps only `ne_type` annotations — confirm in src/.
dataset_orig = read_data()
ne_type = 'PERSON'
dataset = filter_dataset_by_ne_types(dataset_orig, ne_type)

Num of train sentences: 75187
Num of valid sentences: 9603
Num of test sentences: 9479
[(['Actions', 'had', 'to', 'be', 'taken', 'to', 'break', 'through', 'the', 'blockade', '.'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['On', 'a', 'night', 'in', 'late', 'July', '1940', ',', 'the', 'atmosphere', 'in', 'Zhuanbi', 'Village', 'in', 'Shaanxi', 'was', 'unusual', '.'], ['O', 'B-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'I-DATE', 'O', 'O', 'O', 'O', 'B-GPE', 'I-GPE', 'O', 'B-GPE', 'O', 'O', 'O']), (['Villager', 'Xiao', 'Jianghe', 'has', 'a', 'vivid', 'memory', 'of', 'this', 'piece', 'of', 'history', '.'], ['O', 'B-PERSON', 'I-PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']), (['On', 'that', 'dark', 'night', ',', 'everyone', 'was', 'sleeping', 'when', 'human', 'voices', 'and', 'neighing', 'horses', 'were', 'heard', 'within', 'the', 'village', '.'], ['O', 'B-TIME', 'I-TIME', 'I-TIME', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '

In [12]:
# Rebinds INITIALIZER (also imported from deeppavlov above) to the Xavier initializer factory;
# build_top calls INITIALIZER() to create the kernel initializers of its dense layers.
INITIALIZER = tf.contrib.layers.xavier_initializer
def build_cudnn_rnn(units, mask, n_hidden_list:Tuple[int]=(128,), cell_type:str='lstm', intra_layer_dropout:bool=False, dropout_ph=None):
    """Stack bidirectional CuDNN recurrent layers on top of `units`.

    Args:
        units: input tensor fed to the first recurrent layer.
        mask: tensor whose sum over axis 1 gives the length of each sequence.
        n_hidden_list: hidden size of every stacked layer, in order.
        cell_type: 'lstm' or 'gru' (case-insensitive).
        intra_layer_dropout: apply variational dropout between layers
            (never after the last one).
        dropout_ph: value/placeholder forwarded to `variational_dropout`.

    Returns:
        Output of the last layer, both directions concatenated on the last axis.

    Raises:
        RuntimeError: if `cell_type` is neither 'lstm' nor 'gru'.
    """
    seq_lens = tf.to_int32(tf.reduce_sum(mask, axis=1))
    for layer_idx, hidden_size in enumerate(n_hidden_list):
        scope_name = '{}_{}'.format(cell_type.upper(), layer_idx)
        with tf.variable_scope(scope_name):
            kind = cell_type.lower()
            if kind not in ('lstm', 'gru'):
                raise RuntimeError('Wrong cell type "{}"! Only "gru" and "lstm"!'.format(cell_type))
            layer_fn = cudnn_bi_lstm if kind == 'lstm' else cudnn_bi_gru
            directions, _ = layer_fn(units, hidden_size, seq_lens)
            units = tf.concat(directions, -1)
            # Dropout goes only between layers, so the last layer is skipped.
            if intra_layer_dropout and layer_idx + 1 != len(n_hidden_list):
                units = variational_dropout(units, dropout_ph)
    return units

def build_rnn(units, n_hidden_list:Tuple[int]=(128,), cell_type:str='lstm', intra_layer_dropout:bool=False, dropout_ph=None):
    """Stack bidirectional (non-CuDNN) recurrent layers on top of `units`.

    Args:
        units: input tensor fed to the first recurrent layer.
        n_hidden_list: hidden size of every stacked layer, in order.
        cell_type: cell type forwarded to `bi_rnn`.
        intra_layer_dropout: apply variational dropout between layers
            (never after the last one).
        dropout_ph: value/placeholder forwarded to `variational_dropout`.

    Returns:
        Output of the last layer, both directions concatenated on the last axis.
    """
    for layer_idx, hidden_size in enumerate(n_hidden_list):
        layer_name = 'Layer_{}'.format(layer_idx)
        directions, _ = bi_rnn(units, hidden_size, cell_type=cell_type, name=layer_name)
        units = tf.concat(directions, -1)
        # Dropout goes only between layers, so the last layer is skipped.
        if intra_layer_dropout and layer_idx + 1 != len(n_hidden_list):
            units = variational_dropout(units, dropout_ph)
    return units

def build_top(units, n_tags=1, top_dropout:bool=False, two_dense_on_top:bool=False, n_hidden=128, dropout_ph=None):
    """Build the dense projection that turns recurrent features into tag logits.

    Args:
        units: feature tensor produced by the recurrent layers.
        n_tags: size of the output (logit) dimension.
        top_dropout: apply variational dropout to `units` before the dense layers.
        two_dense_on_top: insert an extra ReLU dense layer of size `n_hidden`
            before the output projection.
        n_hidden: width of the optional intermediate dense layer.
        dropout_ph: value/placeholder forwarded to `variational_dropout` when
            `top_dropout` is set. If omitted, a module-level `dropout_ph` is used
            (the function originally relied on that global implicitly).

    Returns:
        Logits tensor whose last dimension is `n_tags`.

    Raises:
        ValueError: if `top_dropout` is set but no dropout value is available.
    """
    if top_dropout:
        if dropout_ph is None:
            # Backward compatibility: earlier callers relied on a global `dropout_ph`.
            dropout_ph = globals().get('dropout_ph')
        if dropout_ph is None:
            raise ValueError('top_dropout=True requires dropout_ph to be provided')
        units = variational_dropout(units, dropout_ph)
    if two_dense_on_top:
        units = tf.layers.dense(units, n_hidden, activation=tf.nn.relu,
                                kernel_initializer=INITIALIZER(),
                                kernel_regularizer=tf.nn.l2_loss)
    # The L2 penalties registered here are collected via REGULARIZATION_LOSSES
    # by build_train_predict.
    logits = tf.layers.dense(units, n_tags, activation=None,
                             kernel_initializer=INITIALIZER(),
                             kernel_regularizer=tf.nn.l2_loss)
    return logits

def build_train_predict(logits, n_tags, mask, sequence_lengths, y_ph, use_crf, learning_rate_ph, clip_grad_norm, l2_reg):
    """Attach the loss, the prediction tensors and the train op to the graph.

    Args:
        logits: per-token tag scores.
        n_tags: number of tags (used for one-hot labels in the softmax branch).
        mask: padding mask; summed over axis 1 to get lengths in the CRF branch
            and multiplied into the per-token loss in the softmax branch.
        sequence_lengths: accepted for interface compatibility; the CRF branch
            recomputes the lengths from `mask` and the softmax branch ignores it.
        y_ph: integer tag indices.
        use_crf: choose the CRF log-likelihood loss instead of softmax cross-entropy.
        learning_rate_ph: learning rate forwarded to `get_train_op`.
        clip_grad_norm: gradient clipping norm forwarded to `get_train_op`.
        l2_reg: weight of the summed REGULARIZATION_LOSSES term (skipped when <= 0).

    Returns:
        dict with 'loss' and 'train_op', plus 'transition_params' (CRF branch)
        or 'y_pred' (softmax branch).
    """
    outputs = {}
    if not use_crf:
        one_hot_labels = tf.one_hot(y_ph, n_tags)
        raw_loss = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_labels, logits=logits)
        # Zero out the contribution of padded positions.
        raw_loss = raw_loss * mask
        outputs['y_pred'] = tf.argmax(logits, axis=-1)
    else:
        sequence_lengths = tf.reduce_sum(mask, axis=1)
        log_likelihood, transitions = tf.contrib.crf.crf_log_likelihood(logits, y_ph, sequence_lengths)
        raw_loss = -log_likelihood
        outputs['transition_params'] = transitions

    loss = tf.reduce_mean(raw_loss)

    # L2 regularization
    if l2_reg > 0:
        reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
        loss = loss + l2_reg * tf.reduce_sum(reg_losses)
    outputs['loss'] = loss

    # Adam is used; a Nesterov momentum optimizer was tried as an alternative:
    # optimizer = partial(tf.train.MomentumOptimizer, momentum=0.9, use_nesterov=True)
    outputs['train_op'] = get_train_op(loss, learning_rate_ph, tf.train.AdamOptimizer, clip_norm=clip_grad_norm)
    return outputs

def predict_no_crf(y_pred, mask, feed_dict):
    """Evaluate argmax predictions and strip the padded tail of every sequence.

    Uses the module-level session `sess`.

    Args:
        y_pred: tensor of predicted tag indices.
        mask: padding mask tensor; its row sums give the sequence lengths.
        feed_dict: feed dictionary for `sess.run`.

    Returns:
        List with one array of predicted indices per sequence, cut to its length.
    """
    pred_values, mask_values = sess.run([y_pred, mask], feed_dict)

    # Filter by sequence length
    lengths = np.sum(mask_values, axis=1).astype(np.int32)
    return [row[:length] for row, length in zip(pred_values, lengths)]

def predict_crf(logits, transition_params, mask, feed_dict):
    """Evaluate logits and decode the best tag sequence per sentence with Viterbi.

    Uses the module-level session `sess`.

    Args:
        logits: tensor of per-token tag scores.
        transition_params: CRF transition matrix tensor.
        mask: padding mask tensor; its row sums give the sequence lengths.
        feed_dict: feed dictionary for `sess.run`.

    Returns:
        List with one decoded tag-index sequence per sentence.
    """
    fetches = [logits, transition_params, mask]
    scores, transitions, mask_values = sess.run(fetches, feed_dict=feed_dict)
    # Every sequence is decoded over at least one step, even if its mask row sums to 0.
    lengths = np.maximum(np.sum(mask_values, axis=1).astype(np.int32), 1)
    # iterate over the sentences because no batching in viterbi_decode
    decoded = []
    for sentence_scores, length in zip(scores, lengths):
        valid_scores = sentence_scores[:int(length)]  # keep only the valid steps
        best_path, _ = tf.contrib.crf.viterbi_decode(valid_scores, transitions)
        decoded.append(best_path)
    return decoded

def get_train_op(loss,
                 learning_rate,
                 optimizer=None,
                 clip_norm=None,
                 learnable_scopes=None,
                 optimizer_scope_name=None):
    """ Get train operation for given loss

    Args:
        loss: loss, tf tensor or scalar
        learning_rate: scalar or placeholder
        optimizer: optimizer factory called as `optimizer(learning_rate)`
            (e.g. the class tf.train.AdamOptimizer, not an instance);
            defaults to tf.train.AdamOptimizer
        clip_norm: clip each gradient's norm to clip_norm (None disables clipping)
        learnable_scopes: substrings of variable names to train (None for all
            global variables)
        optimizer_scope_name: variable scope for the optimizer ops
            (defaults to 'Optimizer')

    Returns:
        train_op
    """
    scope_name = 'Optimizer' if optimizer_scope_name is None else optimizer_scope_name
    with tf.variable_scope(scope_name):
        # NOTE(review): tf.global_variables() also contains variables created with
        # trainable=False, so they are optimised as well — confirm this is intended.
        if learnable_scopes is None:
            variables_to_train = tf.global_variables()
        else:
            # One pass over the variables so that a variable matching several
            # scopes is added only once.
            variables_to_train = [var for var in tf.global_variables()
                                  if any(scope in var.name for scope in learnable_scopes)]

        if optimizer is None:
            optimizer = tf.train.AdamOptimizer

        # For batch norm it is necessary to update running averages
        extra_update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(extra_update_ops):
            opt = optimizer(learning_rate)
            grads_and_vars = opt.compute_gradients(loss, var_list=variables_to_train)
            if clip_norm is not None:
                # compute_gradients yields None for variables the loss does not
                # depend on; clip_by_norm cannot handle None, so pass it through.
                grads_and_vars = [(tf.clip_by_norm(grad, clip_norm) if grad is not None else None, var)
                                  for grad, var in grads_and_vars]
            train_op = opt.apply_gradients(grads_and_vars)
    return train_op

In [13]:
def predict_labels(prob: np.ndarray, threshold=0.5):
    """Binarise probabilities: 1.0 where `prob` is strictly above `threshold`, else 0.0.

    Returns a float array with the same shape as `prob`.
    """
    return np.where(prob > threshold, 1.0, 0.0)
def flat_array(a: np.ndarray):
    """Return a one-dimensional copy of `a` with elements in row-major order."""
    return a.reshape(-1).copy()
def calc_f1(y, pred_prob):
    """F1 score between flattened labels `y` and thresholded `pred_prob`.

    Probabilities are binarised with `predict_labels` (default threshold)
    before both arrays are flattened and passed to sklearn's `f1_score`.
    """
    y_true = flat_array(y)
    y_hat = flat_array(predict_labels(pred_prob))
    return f1_score(y_true, y_hat)
def tags2binaryPadded(tags:list):
    """Convert tag sequences into a zero-padded binary matrix.

    Args:
        tags: list of tag sequences, or a single sequence of tag strings
            (which is treated as a batch of one).

    Returns:
        Float array of shape (n_sentences, max_len) holding 1 where the tag is
        not 'O' and 0 elsewhere. Padded positions are also 0, so they are
        indistinguishable from 'O'. Empty input gives an array of shape (0, 0).
    """
    if len(tags) == 0:
        # Nothing to encode; the original code failed on tags[0] here.
        return np.zeros((0, 0))
    if isinstance(tags[0], str):
        tags = [tags]
    n_sentences = len(tags)
    # get_tokens_len is a project helper; presumably it returns the length of
    # every sequence — verify against src/.
    max_len = np.max(get_tokens_len(tags))
    y = np.zeros((n_sentences, max_len))
    for i, sen in enumerate(tags):
        for j, tag in enumerate(sen):
            if tag != 'O':
                y[i][j] = 1
    return y
def get_batch(dataset, batch_size=None):
    """Sample a batch and prepare the inputs the graph expects.

    Args:
        dataset: data split passed to `get_data_sample`.
        batch_size: number of sentences; a falsy value means `len(dataset)`.

    Returns:
        Tuple (padded tokens, original tags, token lengths, binary label matrix).
    """
    sample_size = batch_size if batch_size else len(dataset)
    raw_tokens, tags = get_data_sample(dataset, sample_size)
    # Lengths are taken before padding is added.
    tokens_length = get_tokens_len(raw_tokens)
    padded_tokens = add_padding(raw_tokens)
    labels = tags2binaryPadded(tags)
    return padded_tokens, tags, tokens_length, labels

In [14]:
# Start from an empty default graph and open the session shared by the rest of the
# notebook; predict_no_crf and predict_crf read `sess` as a global.
tf.reset_default_graph()
sess = tf.Session()

In [15]:
# Load the ELMo module from TF-Hub; TRAIN_ELMO decides whether it is created as trainable.
elmo = hub.Module("https://tfhub.dev/google/elmo/1", trainable=TRAIN_ELMO)

INFO:tensorflow:Using /tmp/tfhub_modules to cache modules.


2018-08-24 19:05:58.998 INFO in 'tensorflow'['tf_logging'] at line 159: Using /tmp/tfhub_modules to cache modules.


In [16]:
# Show which variables are trainable right after the ELMo module has been created
# (nothing else has been added to the freshly reset graph yet).
print(tf.trainable_variables())
if(TRAIN_ELMO):
    # Assumes the last two trainable variables are ELMo's layer-mixing coefficients
    # and its output scale, in that order — TODO confirm against the printed list.
    elmo_params = {'layer_coefficients': tf.trainable_variables()[-2], 'scaling': tf.trainable_variables()[-1]}
    print(elmo_params)
# Snapshot of the trainable variables before the tagger is built; used later to tell
# the tagger's own variables apart from ELMo's.
elmo_vars = tf.trainable_variables()
vars_dict = {v.name:v for v in tf.trainable_variables()}
if TRAIN_ALL_ELMO_PARAMS:
    # NOTE(review): vars_dict holds only *trainable* variables. With TRAIN_ELMO = False
    # the recorded output of this cell is [], so these lookups would raise KeyError.
    cell0_kernel = vars_dict['module/bilm/RNN_0/RNN/MultiRNNCell/Cell0/rnn/lstm_cell/kernel:0']
    cell1_kernel = vars_dict['module/bilm/RNN_0/RNN/MultiRNNCell/Cell1/rnn/lstm_cell/kernel:0']

[]


In [17]:
# Configuration
# Choose build_cudnn_rnn (True) or build_rnn (False) for the recurrent layers.
use_cudnn_rnn = False
# Weight of the summed regularization losses added to the loss in build_train_predict.
l2_reg = 0.01
# Hidden size of each stacked recurrent layer (one entry per layer).
n_hidden_list = (128,)
cell_type = 'lstm'
# Two classes: tags2binaryPadded encodes 'O' as 0 and every other tag as 1.
n_tags = 2
# Use the CRF log-likelihood loss and Viterbi decoding instead of softmax/argmax.
use_crf = True
# Passed to get_train_op as clip_norm.
clip_grad_norm = 5.0
# Default value of learning_rate_ph.
learning_rate = 1e-3
# Default value of dropout_ph. NOTE(review): the builder calls in this notebook enable
# neither intra_layer_dropout nor top_dropout, so this value is currently unused.
dropout_keep_prob = 0.5

### Build computational graph

In [18]:
# Placeholders
# Padded batch of tokens and the number of tokens per sentence (inputs to ELMo).
tokens_input_ph = tf.placeholder(shape=[None, None], dtype=tf.string)
tokens_length_ph = tf.placeholder(shape=[None], dtype=tf.int32)
# Tag indices per token; fed with the matrix produced by tags2binaryPadded.
y_ph = tf.placeholder(shape=[None, None], dtype=tf.int32, name='y_ph')
# Scalars with defaults taken from the configuration cell, so they need not be fed.
learning_rate_ph = tf.placeholder_with_default(learning_rate, shape=[], name='learning_rate')
dropout_ph = tf.placeholder_with_default(dropout_keep_prob, shape=[], name='dropout')
# NOTE(review): training_ph is not referenced anywhere else in the visible notebook.
training_ph = tf.placeholder_with_default(False, shape=[], name='is_training')

In [19]:
# ELMo embeddings for the padded token batch (the 'elmo' output of the module).
emb = elmo(inputs={"tokens": tokens_input_ph,
                    "sequence_len": tokens_length_ph},
                  signature="tokens",
                  as_dict=True)['elmo']
# Float mask derived from the sentence lengths: 1.0 for real tokens, 0.0 for padding.
mask = tf.sequence_mask(lengths=tokens_length_ph, dtype=tf.float32)
features = emb
# Recurrent encoder; dropout arguments are left at their defaults (disabled).
if use_cudnn_rnn:
    units = build_cudnn_rnn(features, mask, n_hidden_list, cell_type)
else:
    units = build_rnn(features, n_hidden_list, cell_type)

# Dense projection to n_tags logits per token.
logits = build_top(units, n_tags=n_tags)

# Loss, train op and the branch-specific prediction tensors.
out_dict = build_train_predict(logits, n_tags, mask, tokens_length_ph, y_ph, use_crf, learning_rate_ph, clip_grad_norm, l2_reg)
train_op = out_dict['train_op']
loss = out_dict['loss']
if use_crf:
    transition_params = out_dict['transition_params']
else:
    y_pred = out_dict['y_pred']

# NOTE(review): predict_crf takes (logits, transition_params, mask, feed_dict) while
# predict_no_crf takes (y_pred, mask, feed_dict), so callers of `predict` must pass
# different arguments depending on use_crf.
predict = predict_crf if use_crf else predict_no_crf

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


2018-08-24 19:05:59.854 INFO in 'tensorflow'['tf_logging'] at line 115: Saver not created because there are no variables in the graph to restore
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [20]:
# Trainable variables added after the ELMo snapshot, i.e. the tagger's own parameters.
all_vars = tf.trainable_variables()
model_vars = [v for v in all_vars if v not in elmo_vars]
print(model_vars)
# Refresh the name -> variable lookup now that the whole graph has been built.
vars_dict = {v.name:v for v in tf.trainable_variables()}

[<tf.Variable 'Layer_0_LSTM/bidirectional_rnn/fw/lstm_cell/kernel:0' shape=(1152, 512) dtype=float32_ref>, <tf.Variable 'Layer_0_LSTM/bidirectional_rnn/fw/lstm_cell/bias:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'Layer_0_LSTM/bidirectional_rnn/bw/lstm_cell/kernel:0' shape=(1152, 512) dtype=float32_ref>, <tf.Variable 'Layer_0_LSTM/bidirectional_rnn/bw/lstm_cell/bias:0' shape=(512,) dtype=float32_ref>, <tf.Variable 'dense/kernel:0' shape=(256, 2) dtype=float32_ref>, <tf.Variable 'dense/bias:0' shape=(2,) dtype=float32_ref>, <tf.Variable 'transitions:0' shape=(2, 2) dtype=float32_ref>]


### Train model

In [21]:
# Initialise every global variable in the graph (the op itself returns None, hence [None]).
initialize_op = tf.global_variables_initializer()
sess.run([initialize_op])

[None]

In [22]:
# Draw one fixed validation batch up front; the same feed is reused at every training
# step so that validation losses are comparable.
valid_sen_size = 100
tokens_valid, tags_valid, tokens_len_valid, y_valid = get_batch(dataset['valid'], valid_sen_size)
feed_valid = {tokens_input_ph: tokens_valid, tokens_length_ph: tokens_len_valid, y_ph: y_valid}

In [None]:
# Training loop: one sampled batch per step; the validation loss is computed at every
# step (so both curves have one point per step) but printed only every `valid_step` steps.
num_steps = 200
batch_size = 64
display_step = 5
valid_step = 10
losses = {'train': [], 'valid': []}
# NOTE(review): f1_scores and best_valid_f1 are initialised but never updated in this cell.
f1_scores = {'train': [], 'valid': []}
best_valid_f1 = 0
# Relative change of the two tracked ELMo kernels after each step (filled only when
# TRAIN_ALL_ELMO_PARAMS is truthy).
d_elmo_cells_list = {'cell0':[], 'cell1':[]}
for step in range(1, num_steps+1):
    print('Step {}/{}'.format(step, num_steps))
    tokens_batch, tags_batch, tokens_len_batch, y_batch = get_batch(dataset['train'], batch_size)
    feed = {tokens_input_ph: tokens_batch, tokens_length_ph: tokens_len_batch, y_ph: y_batch}
    if TRAIN_ALL_ELMO_PARAMS:
        # Kernel values before the update.
        cell0_kernel_val1 = cell0_kernel.eval(session=sess)
        cell1_kernel_val1 = cell1_kernel.eval(session=sess)
    # Train
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        loss_cur, _ = sess.run([loss, train_op], feed_dict=feed)
    losses['train'].append(loss_cur)
    if TRAIN_ALL_ELMO_PARAMS:
        # Kernel values after the update; store the relative L2 change per kernel.
        cell0_kernel_val2 = cell0_kernel.eval(session=sess)
        cell1_kernel_val2 = cell1_kernel.eval(session=sess)
        d_cell0_kernel = np.linalg.norm(cell0_kernel_val2 - cell0_kernel_val1)/np.linalg.norm(cell0_kernel_val1)
        d_cell1_kernel = np.linalg.norm(cell1_kernel_val2 - cell1_kernel_val1)/np.linalg.norm(cell1_kernel_val1)
        d_elmo_cells_list['cell0'].append(d_cell0_kernel)
        d_elmo_cells_list['cell1'].append(d_cell1_kernel)
#     print('ELMo cells change per step: cell0: {}, cell1: {}'.format(d_cell0_kernel, d_cell1_kernel))
    # Validate
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        loss_valid = sess.run([loss], feed_dict=feed_valid)[0]
    # Get elmo params
    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
        if TRAIN_ELMO:
            layer_coeff, scale = sess.run([elmo_params['layer_coefficients'], elmo_params['scaling']])
    losses['valid'].append(loss_valid)
    # Periodic reporting of the most recent values.
    if step % display_step == 0 or step == 1:
        print('Train loss = {}'.format(losses['train'][-1]))
#         print('Train F1 score = {}'.format(f1_scores['train'][-1]))
        if TRAIN_ELMO:
            with tf.variable_scope('', reuse=tf.AUTO_REUSE):
                print('ELMo weights:')
                print('Coefficients = {}, scale = {}'.format(layer_coeff, scale))
        if TRAIN_ALL_ELMO_PARAMS:
            print('ELMo cells change per step: cell0: {:.2f}%, cell1: {:.2f}%'.format(d_cell0_kernel*100, d_cell1_kernel*100))
        
    if step % valid_step == 0 or step == 1:
        print('Valid loss = {}'.format(losses['valid'][-1]))
#         print('Valid F1 score = {}'.format(f1_scores['valid'][-1]))

Step 1/200
Train loss = 9.197015762329102
Valid loss = 6.311370372772217
Step 2/200
Step 3/200
Step 4/200
Step 5/200
Train loss = 4.33183479309082
Step 6/200
Step 7/200
Step 8/200
Step 9/200
Step 10/200
Train loss = 1.7397642135620117
Valid loss = 1.5812721252441406
Step 11/200
Step 12/200
Step 13/200
Step 14/200
Step 15/200
Train loss = 1.2727947235107422
Step 16/200
Step 17/200
Step 18/200
Step 19/200
Step 20/200
Train loss = 0.4056224822998047
Valid loss = 1.8678202629089355
Step 21/200
Step 22/200
Step 23/200
Step 24/200
Step 25/200
Train loss = 2.3028202056884766
Step 26/200
Step 27/200
Step 28/200
Step 29/200
Step 30/200
Train loss = 0.41156673431396484
Valid loss = 1.4713494777679443
Step 31/200
Step 32/200
Step 33/200
Step 34/200
Step 35/200
Train loss = 0.9195282459259033
Step 36/200
Step 37/200
Step 38/200
Step 39/200
Step 40/200
Train loss = 1.0717930793762207
Valid loss = 1.463246464729309
Step 41/200
Step 42/200
Step 43/200
Step 44/200
Step 45/200
Train loss = 2.0763401985

In [None]:
# Plot learning curve
# The x-axis is derived from the number of recorded losses rather than from num_steps,
# so the plot still works when the training loop was interrupted before finishing
# (plt.plot raises if x and y have different lengths).
plt.figure()
steps = np.arange(1, len(losses['train']) + 1, 1)
valid_steps = np.arange(1, len(losses['valid']) + 1, 1)
plt.plot(steps, losses['train'], c='b', label='train')
plt.plot(valid_steps, losses['valid'], c='r', label='valid')
plt.xlabel('iter')
plt.ylabel('loss')
plt.grid()
plt.legend()