In [55]:
import re
import json
from transformers import DistilBertTokenizerFast, TFDistilBertModel, DistilBertConfig
import tensorflow as tf
import tensorflow_io as tfio

import numpy as np
import h5py
from tensorflow.keras.utils import Sequence

from pathlib import Path
# Filesystem layout, relative to the notebook's working directory:
# encoded dataset lives under ../data/span_model_oie, checkpoints under ../models.
data_path = Path('../data', 'span_model_oie')
model_path = Path('../models')
# Kept as a plain string because it is handed to h5py / tensorflow-io below.
path_hdf5 = str(data_path.joinpath('encoded_span_oie.hdf5'))

In [56]:
# Let's quickly get the shapes from HDF5 for bookkeeping.
# The context manager closes the file even if one of the dataset lookups
# raises, so no stale handle survives a failed run of this cell (the previous
# `'fp' in locals()` guard existed only to clean up after that situation).
with h5py.File(path_hdf5, "r") as fp:
    x_train_shape = fp['x_train'].shape
    y_train_shape = fp['y_train'].shape
    x_test_shape = fp['x_test'].shape
    y_test_shape = fp['y_test'].shape
print("data sizes: x_train %s, y_train %s, x_test %s, y_test %s ." %
      (x_train_shape, y_train_shape, x_test_shape, y_test_shape))
# First row index past the leading ~90% of the training rows.
# NOTE(review): presumably the train/validation split point; it is not used
# in the cells visible here -- confirm before relying on it.
validation_index = 1 + int(0.9 * x_train_shape[0])

# Re-open each array as a tensorflow-io HDF5 dataset.
x_test = tfio.IODataset.from_hdf5(path_hdf5, dataset='/x_test')
y_test = tfio.IODataset.from_hdf5(path_hdf5, dataset='/y_test')
x_train = tfio.IODataset.from_hdf5(path_hdf5, dataset='/x_train')
y_train = tfio.IODataset.from_hdf5(path_hdf5, dataset='/y_train')


data sizes: x_train (802100, 128), y_train (802100, 64), x_test (200526, 128), y_test (200526, 64) .


In [57]:
# Sequence lengths used throughout the notebook; they match the second
# dimension of the x_* (128) and y_* (64) arrays reported above.
max_input_size = 128
max_target_size = 64

# Pretrained checkpoint shared by the tokenizer and the embedding model.
# Setup follows the excellent tutorial at:
# https://towardsdatascience.com/working-with-hugging-face-transformers-and-tf-2-0-89bf35e3555a
distil_bert = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(distil_bert)

# Only the last hidden state is consumed downstream, so hidden-state output
# for every layer is switched off directly in the config.
config = DistilBertConfig(
    dropout=0.2,
    attention_dropout=0.2,
    trainable=False,
    output_hidden_states=False,
)
transformer_model = TFDistilBertModel.from_pretrained(distil_bert, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_layer_norm', 'vocab_transform', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [58]:
# Instantiate the full model.
# The crucial part for our setup is "teacher forcing", so as to properly teach the full triple generation
# Great starting example at: https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py#L159
vocab_size = tokenizer.vocab_size
num_encoder_tokens = num_decoder_tokens = vocab_size
# The LSTM state size simply reuses max_input_size; it must stay in sync with
# the checkpoint that is loaded further down.
latent_dim = int(max_input_size)

# Encoder inputs: token ids plus the matching attention mask.
encoder_inputs = tf.keras.layers.Input(shape=(max_input_size,), name='input_token', dtype='int32')
encoder_masks  = tf.keras.layers.Input(shape=(max_input_size,), name='masked_token', dtype='int32')

# Element [0] of the transformer output is the per-token embedding sequence.
lm_embedding = transformer_model(encoder_inputs, attention_mask=encoder_masks)[0]
encoder = tf.keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(lm_embedding)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
# The decoder consumes one-hot vectors over the vocabulary (teacher forcing).
decoder_inputs = tf.keras.layers.Input(shape=(None, num_decoder_tokens))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = tf.keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                      initial_state=encoder_states)
decoder_dense = tf.keras.layers.Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = tf.keras.Model([encoder_inputs, encoder_masks, decoder_inputs], decoder_outputs)

# Freeze the pretrained transformer by reference instead of by position in
# `model.layers`, so the freeze cannot silently hit the wrong layers if the
# graph (and therefore the layer ordering) changes.
transformer_model.trainable = False

# `summary()` prints the table itself and returns None; wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti ((None, 128, 768),)  66362880    input_token[0][0]                
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, None, 30522) 0                                            
____________________________________________________________________________________________

In [59]:
# Vocabulary size of the tokenizer, kept under the name used for the label dimension.
# NOTE(review): the original header here spoke of preparing a tf.data training
# stream, but no stream is built in this cell and `label_dim` is not used in
# the cells visible here -- presumably left over from the training notebook; confirm.
label_dim = tokenizer.vocab_size

In [60]:
# Restore previously saved weights into `model` from the 'checkpoint_sample_oie'
# checkpoint under model_path. The call returns a load-status object, which the
# notebook echoes as this cell's output.
model.load_weights(str(model_path / 'checkpoint_sample_oie'))

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f06c0139e80>

In [61]:

# Next: inference mode (sampling).
# Here's the drill:
# 1) encode input and retrieve initial decoder state
# 2) run one step of decoder with this initial state
# and a "start of sequence" token as target.
# Output will be the next target token
# 3) Repeat with the current target token and current states

# Define sampling models.
# The encoder model reuses the layers of the training graph and maps
# (token ids, attention mask) to the encoder LSTM's final [h, c] states.
# (An earlier version of this cell re-ran the transformer and created a second,
# untrained LSTM here; it fed neither sampling model and only clobbered the
# `encoder` / `lm_embedding` / `encoder_outputs` names, so it was removed.)
encoder_model = tf.keras.Model([encoder_inputs, encoder_masks], encoder_states)
# `summary()` prints the table itself and returns None, so no print() wrapper.
encoder_model.summary()

# The decoder model runs the shared decoder LSTM + softmax layer from
# explicitly supplied states, so it can be stepped one token at a time.
decoder_state_input_h = tf.keras.layers.Input(shape=(latent_dim,))
decoder_state_input_c = tf.keras.layers.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_outputs, state_h, state_c = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)

# Inputs: [one-hot previous token, h, c]; outputs: [token distribution, h, c].
decoder_model = tf.keras.Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)
decoder_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_distil_bert_model_1 (TFDisti ((None, 128, 768),)  66362880    input_token[0][0]                
__________________________________________________________________________________________________
lstm_3 (LSTM)                   [(None, 128), (None, 459264      tf_distil_bert_model_1[0][0]     
Total params: 66,822,144
Trainable params: 459,264
Non-trainable params: 66,362,880
________

In [109]:
def decode_sequence(input_seq):
    """Greedily decode an extraction for one encoded input sentence.

    The decoder may only emit tokens that occur in the input, and each token
    at most as many times as it occurs there. Two extra [SEP] and one extra
    [PAD] are added to that budget so the output can contain its separators
    and an end marker.

    Args:
        input_seq: integer token ids for a batch of size 1, i.e. an array of
            shape (1, sequence_length), padded with the tokenizer's pad id.

    Returns:
        The string produced by `tokenizer.decode` for the sampled ids. When
        decoding stops on the pad token, that token is part of the ids that
        get decoded.
    """
    input_seq = np.asarray(input_seq)
    # Special-token ids come from the tokenizer rather than hard-coded numbers
    # (101 / 102 / 0 for distilbert-base-uncased).
    cls_id = tokenizer.cls_token_id
    sep_id = tokenizer.sep_token_id
    pad_id = tokenizer.pad_token_id

    # Encode the input as state vectors. The attention mask is 1 on real
    # tokens and 0 on padding, built for whatever length the input has.
    input_mask = (input_seq != pad_id).astype(np.int32)
    states_value = list(encoder_model.predict([input_seq, input_mask]))

    # Remaining "uses" per vocabulary id: one per occurrence in the input,
    # plus two more [SEP] and one more [PAD]. A count of 0 means the token is
    # not eligible. Without this restriction the model drifts from extraction
    # towards paraphrase.
    remaining_uses = np.bincount(
        np.asarray(input_seq[0], dtype=np.int64), minlength=num_decoder_tokens)
    remaining_uses[sep_id] += 2
    remaining_uses[pad_id] += 1

    # Start from a one-hot [CLS] as the "start of sequence" target.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    target_seq[0, 0, cls_id] = 1.

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    # Stops at max_target_size tokens, or early if no token is eligible any
    # more (which would otherwise make the masked argmax meaningless).
    sentence_ids = []
    while len(sentence_ids) < max_target_size and remaining_uses.any():
        output_tokens, h, c = decoder_model.predict([target_seq] + states_value)

        # Pick the most likely token among the still-eligible ones.
        word_likelihoods = output_tokens[0, -1, :]
        eligible_likelihoods = np.ma.masked_array(
            word_likelihoods, mask=(remaining_uses == 0))
        sampled_token_index = int(eligible_likelihoods.argmax())
        sentence_ids.append(sampled_token_index)
        # Each predicted word consumes one occurrence from the input budget.
        remaining_uses[sampled_token_index] -= 1

        # The pad token doubles as the stop symbol.
        if sampled_token_index == pad_id:
            break

        # Feed the sampled token and the new states into the next step.
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.
        states_value = [h, c]

    return tokenizer.decode(sentence_ids)


In [116]:
def string_to_triples(text):
    """Extract a triple string from raw text, print it and return it.

    Args:
        text: the input sentence as a plain string.

    Returns:
        The decoded extraction with ' [PAD]' removed.
    """
    # truncation=True is required for max_length to actually cap the input;
    # without it an over-long sentence yields more than max_input_size ids and
    # no longer fits the fixed-length encoder input.
    text_ids = tokenizer.encode(text, max_length=max_input_size,
                                padding='max_length', truncation=True)
    triples = decode_sequence(np.array([text_ids])).replace(' [PAD]','')
    print("---")
    print("Input: %s"%text)
    print("")
    print("Extracted: %s"% triples)
    return triples


# Trailing semicolon keeps the notebook from echoing the returned string,
# which is already printed above.
string_to_triples('The meeting was scheduled to start at half past six .');

---
Input: The meeting was scheduled to start at half past six .

Extracted: the meeting [SEP] was [SEP] to start at six [SEP]


### Discussion

## Raw model output (6 epochs)

Here is a sample of using the **raw** model after **6 epochs** of training on the full data:

1. Cuisine
> Input: a cake has multiple layers .
>
> Extracted: a bread [SEP] has [SEP] two pieces [SEP]

2. Mathematics
> Input: In mathematics, a monomial is, roughly speaking, a polynomial which has only one term.
>
> Extracted: a broad number [SEP] has [SEP] an infinite number [SEP]

3. Biology
>Input: A tiger is a cat with stripes .
>
>Extracted: a brother [SEP] is [SEP] a bear [SEP]

4. Social
> Input: The meeting was scheduled to start at half past six .
> 
> Extracted: the event [SEP] was [SEP] to be answered on 12 january [SEP]

## Sentence-only dictionary

Here is the same checkpoint (after 6 epochs) on these examples when we first mask the predictions so that only words that appeared in the original input are visible. Much better!

1. Cuisine
> Input: a cake has multiple layers .
>
> Extracted: a cake [SEP] has [SEP] multiple layers [SEP]

2. Mathematics
> Input: In mathematics, a monomial is, roughly speaking, a polynomial which has only one term.
>
> Extracted: a monol [SEP] has [SEP] one term [SEP]

3. Biology
>Input: A tiger is a cat with stripes .
>
>Extracted: a tiger [SEP] is [SEP] a tiger [SEP]

4. Social
> Input: The meeting was scheduled to start at half past six .
> 
> Extracted: the meeting [SEP] was [SEP] to start to the six [SEP]

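## Sentence-only with one-use-per-input-occurrence

Same checkpoint again, but now each input token can be emitted at most as many times as it occurs in the input (each prediction "uses up" one occurrence), which is what `decode_sequence` does.
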
1. Cuisine
> Input: a cake has multiple layers .
>
> Extracted: a cake [SEP] has [SEP] multiple layers [SEP]

2. Mathematics
> Input: In mathematics, a monomial is, roughly speaking, a polynomial which has only one term.
>
> Extracted: a monol [SEP] has [SEP] one term [SEP]

3. Biology
>Input: A tiger is a cat with stripes .
>
>Extracted: a tiger [SEP] is [SEP] a [SEP]

4. Social
> Input: The meeting was scheduled to start at half past six .
> 
> Extracted: the meeting [SEP] was [SEP] to start at six [SEP]
