In [1]:
#!pip install -q git+https://github.com/deepmipt/bert.git@feat/keras deeppavlov

In [2]:
import tensorflow as tf
# Gate on the TensorFlow release: any 2.x passes, 1.x needs a minor version of at least 13.
tf_version_parts = str(tf.__version__).split('.')[:2]
is_tf2_or_newer = int(tf_version_parts[0]) >= 2
assert is_tf2_or_newer or int(tf_version_parts[1]) >= 13, 'this notebook was tested with TF v1.14.0 and v2.0.0-beta1'
# Uncomment the next line to switch to eager execution.
# tf.compat.v1.enable_eager_execution()
# Remember the execution mode for later cells, and display it as the cell output.
is_eager = tf.compat.v1.executing_eagerly()
is_eager

False

In [3]:
from bert_dp import bert
try:
    from tensorflow_addons.metrics import F1Score
except ImportError:
    from bert_dp.metrics import F1Score

# Paraphrases data

In [4]:
# Data and batching configuration used by the cells below.
BATCH_SIZE = 64  # both for training and evaluation for now
MAX_LEN = 52  # the longest sequence in the paraphrase dataset; a workaround that makes the shape of all batch tensors equal
NUM_TRAIN_SAMPLES = 6656  # total of 7202 samples minus ~500 (actually 546) samples for validation
DYNAMIC_SEQ_LEN = True  # whether to pad each batch to the maximum length in the batch (True) or in the whole dataset (False)
NUM_CLASSES = 2  # binary classification as multiclass
DROP_REMAINDER = True  # whether to form batches of fixed size

In [5]:
# Download the paraphraser dataset archive and unpack it (extract=True).
# cache_dir='.' together with cache_subdir='dataset' keeps the files under ./dataset.
tf.keras.utils.get_file(
    fname='paraphraser.zip',
    origin='http://files.deeppavlov.ai/datasets/paraphraser.zip',
    cache_subdir='dataset',
    extract=True,
    cache_dir='.'
)
# Second dataset archive ("gold"); the archive format is stated explicitly here.
tf.keras.utils.get_file(
    fname='paraphraser_gold.zip',
    origin='http://files.deeppavlov.ai/datasets/paraphraser_gold.zip',
    cache_subdir='dataset',
    extract=True,
    archive_format='zip',
    cache_dir='.'
)
# Disabled download of the base RuBERT archive.
# tf.keras.utils.get_file(
#     fname='rubert_cased_L-12_H-768_A-12_v1.tar.gz',
#     origin='http://files.deeppavlov.ai/deeppavlov_data/bert/rubert_cased_L-12_H-768_A-12_v1.tar.gz',
#     cache_subdir='models',
#     extract=True,
#     cache_dir='.'
# )
# Paraphraser RuBERT model archive, unpacked under ./models.
# NOTE(review): presumably this provides the checkpoint restored later in the notebook — confirm the extracted layout.
# This is the last expression of the cell, so the returned local path is displayed as the cell output.
tf.keras.utils.get_file(
    fname='paraphraser_rubert_v0.tar.gz',
    origin='http://files.deeppavlov.ai/deeppavlov_data/bert/paraphraser_rubert_v0.tar.gz',
    cache_subdir='models',
    extract=True,
    cache_dir='.'
)

'./models/paraphraser_rubert_v0.tar.gz'

In [6]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def data_gen(train_data=True, max_seq_len=512):
    """Yield ``(token_ids, label)`` examples of the paraphraser dataset.

    Both sentences of a pair are tokenized and joined as
    ``[CLS] sentence_1 [SEP] sentence_2 [SEP]`` before being mapped to ids.

    Args:
        train_data: when truthy the ``'train'`` split is read, otherwise ``'test'``.
        max_seq_len: upper bound on the length of the yielded id sequence,
            special tokens included. Defaults to 512.

    Yields:
        Tuples of (list of token ids, label).
    """
    from deeppavlov.dataset_readers.paraphraser_reader import ParaphraserReader
    from bert_dp.tokenization import FullTokenizer
    ds = ParaphraserReader().read(data_path='dataset', do_lower_case=False)
    tokenizer = FullTokenizer(vocab_file='models/rubert_cased_L-12_H-768_A-12_v1_converted/vocab.txt', do_lower_case=False)
    tds = ds['train'] if train_data else ds['test']
    # Reserve room for [CLS] and the two [SEP] tokens added below, so that the
    # final sequence (not only the two sentences) respects max_seq_len.
    max_pair_tokens = max_seq_len - 3
    for pair, label in tds:
        t1 = tokenizer.tokenize(pair[0])
        t2 = tokenizer.tokenize(pair[1])
        _truncate_seq_pair(t1, t2, max_pair_tokens)
        s = ['[CLS]'] + t1 + ['[SEP]'] + t2 + ['[SEP]']
        yield tokenizer.convert_tokens_to_ids(s), label

In [7]:
import numpy as np

# Python-list accumulators; the trailing-underscore names are kept because
# their lengths are displayed at the end of the cell.
x_train_, y_train_, x_test_, y_test_ = [], [], [], []

# Fill the train split first, then the test split, right-padding every id
# sequence with zeros up to MAX_LEN so all rows share one length.
for use_train_split, padded_rows, labels in ((True, x_train_, y_train_),
                                             (False, x_test_, y_test_)):
    for token_ids, label in data_gen(train_data=use_train_split):
        n_missing = MAX_LEN - len(token_ids)
        padded_rows.append(np.pad(token_ids, pad_width=(0, n_missing), mode='constant'))
        labels.append(label)

# Materialise the lists as numpy arrays for Keras.
x_train, y_train = np.array(x_train_), np.array(y_train_)
x_test, y_test = np.array(x_test_), np.array(y_test_)
len(x_train_), len(x_test_)

(7202, 1899)

In [8]:
# train_ds_ = tf.data.Dataset.from_generator(data_gen,
#                                            output_types=(tf.int32, tf.int32),
#                                            output_shapes=(tf.TensorShape([None]),
#                                                           tf.TensorShape([])))
# train_ds = train_ds_.take(NUM_TRAIN_SAMPLES)
# valid_ds = train_ds_.skip(NUM_TRAIN_SAMPLES)
# batched_train_ds = train_ds.padded_batch(batch_size=BATCH_SIZE,
#                                          padded_shapes=([-1] if DYNAMIC_SEQ_LEN else MAX_LEN,
#                                                         ()),
#                                          drop_remainder=DROP_REMAINDER)
# batched_valid_ds = valid_ds.padded_batch(batch_size=BATCH_SIZE,
#                                          padded_shapes=([-1] if DYNAMIC_SEQ_LEN else MAX_LEN,
#                                                         ()),
#                                          drop_remainder=DROP_REMAINDER)
# test_ds = tf.data.Dataset.from_generator(data_gen,
#                                          output_types=(tf.int32, tf.int32),
#                                          output_shapes=(tf.TensorShape([None]),
#                                                         tf.TensorShape([])),
#                                          args=(False,))
# batched_test_ds = test_ds.padded_batch(batch_size=BATCH_SIZE,
#                                        padded_shapes=([-1] if DYNAMIC_SEQ_LEN else MAX_LEN,
#                                                       ()),
#                                        drop_remainder=DROP_REMAINDER)

# Model construction and compilation for training

In [9]:
from bert_dp import bert

class BERTClassifierCompat(tf.keras.Model):
    """Subclassed model for classification compatible with official checkpoints from Google.

    A ``bert.BERT`` encoder is followed by dropout and a linear classification
    head whose parameters are held in the explicit ``output_weights`` and
    ``output_bias`` variables.

    Args:
        num_classes: number of output classes.
        **kwargs: forwarded to ``tf.keras.Model``. Any ``name`` entry is
            replaced by ``'bert'``.
    """
    def __init__(self, num_classes=2, **kwargs):
        # The model name is always forced to 'bert'.
        # NOTE(review): presumably so that variable names line up with the
        # checkpoint being restored — confirm before making it configurable.
        kwargs['name'] = 'bert'
        super().__init__(**kwargs)
        self.num_classes = num_classes
        self.bert = bert.BERT(name='bert')
        self.dropout = tf.keras.layers.Dropout(rate=0.5)
        self.softmax = tf.keras.layers.Activation(activation='softmax')

    def build(self, batch_input_shape):
        """Create the classification-head variables, then build the model.

        Args:
            batch_input_shape: shape of an input batch, passed on to
                ``tf.keras.Model.build``.
        """
        # The second dimension (768) must equal the last dimension of the
        # encoder output that `call` multiplies with these weights.
        self.output_weights = self.add_weight(shape=(self.num_classes, 768),
                                              dtype=tf.float32,
                                              initializer=tf.keras.initializers.TruncatedNormal(stddev=0.02),
                                              name="output_weights")
        # A real 1-tuple: `(self.num_classes)` without the trailing comma is
        # just an int, not a shape tuple.
        self.output_bias = self.add_weight(shape=(self.num_classes,),
                                           dtype=tf.float32,
                                           initializer=tf.keras.initializers.Zeros(),
                                           name='output_bias')
        super().build(batch_input_shape)

    def call(self, token_ids, training=None, mask=None, **kwargs):
        """Run the encoder and the classification head.

        Args:
            token_ids: input batch handed to the BERT encoder.
            training: forwarded to the encoder and to dropout; also selects
                the form of the output (see Returns).
            mask: forwarded to the encoder.
            **kwargs: accepted but not used.

        Returns:
            Raw logits when ``training`` is truthy, softmax probabilities
            otherwise.

        NOTE(review): a loss configured with ``from_logits=True`` receives
        probabilities rather than logits whenever ``training`` is falsy
        (evaluation / validation) — confirm this is intended.
        """
        po = self.bert(token_ids, training=training, mask=mask)
        po = self.dropout(po, training=training)
        # Linear head: po @ output_weights^T + output_bias.
        int_logits = tf.matmul(po, self.output_weights, transpose_b=True)
        logits = tf.nn.bias_add(int_logits, self.output_bias)
        if training:
            out = logits
        else:
            out = self.softmax(logits)
        return out

In [10]:
# Instantiate the classifier and create its variables for batches of
# BATCH_SIZE sequences of unspecified length.
paraphrase_detector = BERTClassifierCompat()
paraphrase_detector.build(batch_input_shape=(BATCH_SIZE, None))
# Predictions made before any checkpoint is restored; kept so they can be
# compared with the predictions obtained after the restore.
r1 = paraphrase_detector.predict(x_train[:BATCH_SIZE])

W0620 18:33:33.448317 140220619470656 deprecation_wrapper.py:119] From /home/nab/PycharmProjects/work/bert/bert_dp/embeddings.py:63: The name tf.keras.initializers.TruncatedNormal is deprecated. Please use tf.compat.v1.keras.initializers.TruncatedNormal instead.

W0620 18:33:33.449323 140220619470656 deprecation.py:506] From /home/nab/PycharmProjects/work/venv/lib/python3.6/site-packages/tensorflow/python/keras/initializers.py:94: calling TruncatedNormal.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
W0620 18:33:33.460418 140220619470656 deprecation.py:506] From /home/nab/PycharmProjects/work/venv/lib/python3.6/site-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instru

In [11]:
# Display the model's variables (name, shape, dtype) for inspection.
paraphrase_detector.variables

[<tf.Variable 'bert/embeddings/word_embeddings:0' shape=(119547, 768) dtype=float32>,
 <tf.Variable 'bert/embeddings/token_type_embeddings:0' shape=(2, 768) dtype=float32>,
 <tf.Variable 'bert/embeddings/position_embeddings:0' shape=(512, 768) dtype=float32>,
 <tf.Variable 'bert/embeddings/LayerNorm/gamma:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert/embeddings/LayerNorm/beta:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/query/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/key/bias:0' shape=(768,) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/kernel:0' shape=(768, 768) dtype=float32>,
 <tf.Variable 'bert/encoder/layer_0/attention/self/value/bias:0' shape=(768,) dtype=float32>,
 <tf.

In [12]:
# try:
#     del paraphrase_detector
#     tf.keras.backend.clear_session()
#     print('Session cleared')
# except NameError:
#     pass
# paraphrase_detector = tf.keras.Sequential([bert.BERT(name='bert'),
#                                            tf.keras.layers.Dropout(rate=0.5),
#                                            tf.keras.layers.Dense(NUM_CLASSES)])

In [13]:
# # some data to pass through model in order to construct it
# try:
#     for s in batched_train_ds:
#         r = paraphrase_detector(s[0])
#         print('Model constructed by means of passing one batch from tf.data')
#         break
# except NameError:
#     r = paraphrase_detector.predict(x_train)
#     print('Model constructed by means of passing one batch from array slice')
# except RuntimeError:
#     paraphrase_detector.build(input_shape=(BATCH_SIZE, None if DYNAMIC_SEQ_LEN else MAX_LEN))
#     print('Model constructed using .build() method')

In [14]:
# Training hyper-parameters.
NUM_EPOCHS = 5
# VAL_EVERY_N_BATCHES = 52  # ~50, but better suited for paraphraser dataset
# TRAIN_STEPS_PER_EPOCH = NUM_TRAIN_SAMPLES // BATCH_SIZE
# VALIDATION_STEPS = 512 // BATCH_SIZE  # calculated by hand for now (~546 samples with dropping remainder)
LEARNING_RATE = 2e-05
# Disabled alternative: polynomial learning-rate decay and the AdamW optimizer from bert_dp.
# learning_rate = tf.keras.optimizers.schedules.PolynomialDecay(initial_learning_rate=LEARNING_RATE,
#                                                               # n. of batches per epoch * ~NUM_EPOCHS for convergence 
#                                                               decay_steps=TRAIN_STEPS_PER_EPOCH * NUM_EPOCHS,
#                                                               end_learning_rate=1e-06)
# from bert_dp.weight_decay_optimizers import AdamW
# Compile with plain Adam, sparse categorical cross-entropy and two metrics
# (accuracy, F1); evaluate() therefore reports [loss, accuracy, F1] in this order.
# NOTE(review): from_logits=True assumes raw logits, but BERTClassifierCompat.call
# applies softmax whenever `training` is falsy, so evaluation/validation loss is
# presumably computed on probabilities — confirm.
# NOTE(review): the evaluate() outputs in this notebook show the F1 metric as a
# two-element array (presumably per class); confirm this is compatible with code
# that expects scalar metric values during fit().
paraphrase_detector.compile(#optimizer=AdamW(weight_decay=0.01,
#                                             learning_rate=LEARNING_RATE,
#                                             epsilon=1e-6),  # as in bert_dp
                            optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,
                                                               epsilon=1e-6),
                            loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                            metrics=[tf.keras.metrics.SparseCategoricalAccuracy(),
                                     F1Score(num_classes=2)
                                    ])

W0620 18:33:46.871460 140220619470656 deprecation.py:506] From /home/nab/PycharmProjects/work/venv/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py:507: calling count_nonzero (from tensorflow.python.ops.math_ops) with axis is deprecated and will be removed in a future version.
Instructions for updating:
reduction_indices is deprecated, use axis instead


In [15]:
# baseline to ensure something is learning
# Evaluated on the test arrays before any checkpoint is restored; the result
# is [loss, accuracy, F1] as configured in compile().
# paraphrase_detector.evaluate(batched_test_ds)
paraphrase_detector.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)



[0.6833566011975727,
 0.59452343,
 array([0.08434711, 0.7382734 ], dtype=float32)]

In [16]:
# Restore the fine-tuned paraphraser weights into the model's variables.
paraphraser_checkpoint_path = 'models/paraphraser_rubert/model_rubert'
paraphraser_saver = tf.compat.v1.train.Saver(var_list=paraphrase_detector.variables)
# In graph mode the restore must run in the session Keras uses for the model.
# The session is fetched through tf.compat.v1 (as the rest of the notebook does)
# because tf.keras.backend.get_session is not part of the TF 2.x API.
# In eager mode no session is needed.
paraphraser_saver.restore(sess=None if is_eager else tf.compat.v1.keras.backend.get_session(),
                          save_path=paraphraser_checkpoint_path)

W0620 18:34:22.652697 140220619470656 deprecation.py:323] From /home/nab/PycharmProjects/work/venv/lib/python3.6/site-packages/tensorflow/python/training/saver.py:1276: checkpoint_exists (from tensorflow.python.training.checkpoint_management) is deprecated and will be removed in a future version.
Instructions for updating:
Use standard file APIs to check for files with this prefix.


In [17]:
# Predict on the same slice as r1, now that the checkpoint has been restored.
r2 = paraphrase_detector.predict(x_train[:BATCH_SIZE])
# assert_allclose raises AssertionError when r1 and r2 differ; that case is
# reported with a message instead of failing the cell. Nothing is printed
# when the two outputs match.
try:
    np.testing.assert_allclose(r1, r2)
except AssertionError:
    print('Outputs are changed')

Outputs are changed


In [18]:
# Re-evaluate on the test arrays after the checkpoint restore; returns [loss, accuracy, F1].
paraphrase_detector.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)



[0.47024373447286133,
 0.84886783,
 array([0.09805857, 0.8009791 ], dtype=float32)]

# Training

In [19]:
# baseline to ensure something is learning
# Test-set [loss, accuracy, F1] right before fit() is called.
# paraphrase_detector.evaluate(batched_test_ds)
paraphrase_detector.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)



[0.47024373447286133,
 0.84886783,
 array([0.10154799, 0.825     ], dtype=float32)]

In [20]:
# Fine-tune on the training arrays; validation_split holds out a fraction of
# x_train / y_train for validation.
history = paraphrase_detector.fit(x=x_train,
                                  y=y_train,
                                  # Use the notebook-wide batch size; without it
                                  # fit() falls back to the Keras default of 32.
                                  batch_size=BATCH_SIZE,
                                  epochs=NUM_EPOCHS,
                                  callbacks=[tf.keras.callbacks.TensorBoard(batch_size=BATCH_SIZE),
                                             tf.keras.callbacks.ReduceLROnPlateau(monitor='val_f1_score',
                                                                                  # F1 is better when higher. The default
                                                                                  # mode='auto' treats every metric whose
                                                                                  # name lacks 'acc' as one to minimise.
                                                                                  mode='max',
                                                                                  factor=0.5,
                                                                                  patience=3,
                                                                                  min_lr=1e-06)],
                                  validation_split=0.071)  # ~512 samples
# history = paraphrase_detector.fit(x=batched_train_ds.repeat(),
#                                   y=None,
#                                   epochs=NUM_EPOCHS * TRAIN_STEPS_PER_EPOCH // VAL_EVERY_N_BATCHES,  # workaround for more frequent evaluation
#                                   callbacks=[tf.keras.callbacks.TensorBoard(batch_size=BATCH_SIZE),
#                                              tf.keras.callbacks.ReduceLROnPlateau(monitor='val_sparse_binary_f1_score',
#                                                                                   factor=0.5,
#                                                                                   patience=3,
#                                                                                   min_lr=1e-06)],
#                                   validation_data=batched_valid_ds,
#                                   steps_per_epoch=VAL_EVERY_N_BATCHES,
#                                   validation_steps=VALIDATION_STEPS)

Train on 6690 samples, validate on 512 samples
Epoch 1/5

ValueError: can only convert an array of size 1 to a Python scalar

In [19]:
# baseline to ensure something is learning
# NOTE(review): this cell duplicates the evaluation cell placed before fit() and
# carries the same execution count; its recorded output presumably comes from a
# different run — confirm and re-run the notebook top to bottom.
# paraphrase_detector.evaluate(batched_test_ds)
paraphrase_detector.evaluate(x=x_test, y=y_test, batch_size=BATCH_SIZE)



[1.1100106747166743, 0.6113744, 0.7494238]