### BERT NLI

In [1]:
!pip install tf-models-official

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting tf-models-official
  Downloading tf_models_official-2.5.1-py2.py3-none-any.whl (1.6 MB)
[K     |████████████████████████████████| 1.6 MB 4.2 MB/s eta 0:00:01
[?25hCollecting google-api-python-client>=1.6.7
  Downloading google_api_python_client-2.15.0-py2.py3-none-any.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 20.2 MB/s eta 0:00:01
Collecting gin-config
  Downloading gin_config-0.4.0-py2.py3-none-any.whl (46 kB)
[K     |████████████████████████████████| 46 kB 8.3 MB/s  eta 0:00:01
Collecting tf-slim>=1.1.0
  Downloading tf_slim-1.1.0-py2.py3-none-any.whl (352 kB)
[K     |████████████████████████████████| 352 kB 95.7 MB/s eta 0:00:01
[?25hCollecting sacrebleu
  Downloading sacrebleu-1.5.1-py3-none-any.whl (54 kB)
[K     |████████████████████████████████| 54 kB 5.5 MB/s s eta 0:00:01
Collecting tensorflow-model-optimization>=0.4.1
  Downloading tensorflow_model_optimiz

In [2]:
import tensorflow as tf
import tensorflow_hub as hub
from official.nlp import bert
import official.nlp.bert.tokenization
import os

In [3]:
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/keras_bert/uncased_L-12_H-768_A-12"
tokenizer = bert.tokenization.FullTokenizer(vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
     do_lower_case=True)

In [4]:
def sentence_encoder(s, tokenizer, max_tokens=25):
    """
    This turns each sentence into a list of tokens, adds '[SEP]' token to end of the list, then turns tokens
    into ids and returns list of ids.

    INPUTS:
        s: input sentence
        tokenizer: an instance of BERT tokenizer

    OUTPUTS:
        (python list): list of ids of the words of input sentence 
    """
    tokens = list(tokenizer.tokenize(str(s)))
    if len(tokens) > max_tokens:
        tokens = tokens[:max_tokens]
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

Source: https://hamedhelali.github.io/project/Fine-tuning-BERT-For-NLI/

In [5]:
### Ref: https://hamedhelali.github.io/project/Fine-tuning-BERT-For-NLI/
def bert_input_encoder_backup(train_corpus, tokenizer):
    """
    gets a dataframe of input sentences and returns required inputs of BERT encoder in a dictionary

    INPUTS:
    train_corpus (pandas.Dataframe): data frame of input sentences
    tokenizer: an instance of BERT tokenizer

    OUTPUTS:
    (python dictionary): A dictionary with 3 keys which has required inputs of BERT encoder
    """
    sentence1 = tf.ragged.constant([sentence_encoder(s, tokenizer, 25) for s in np.array(train_corpus['sentence1'])])
    print("sentence1=", sentence1.shape)
    sentence2 = tf.ragged.constant([sentence_encoder(s, tokenizer, 40) for s in np.array(train_corpus['sentence2'])])
    print("sentence2=", sentence2.shape)

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    print("len(cls)=", len(cls))
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor()

    segment_cls = tf.zeros_like(cls)
    segment_s1 = tf.zeros_like(sentence1)
    segment_s2 = tf.ones_like(sentence2)
    input_segment_ids = tf.concat([segment_cls, segment_s1, segment_s2], axis=-1).to_tensor()

    inputs_dic = {
    'input_word_ids': input_word_ids.to_tensor(),
    'input_mask': input_mask,
    'input_segment_ids': input_segment_ids
    }

    return inputs_dic

def bert_input_encoder(data, tokenizer):
    """
    gets a dataframe of input sentences and returns required inputs of BERT encoder in a dictionary

    INPUTS:
    train_corpus (pandas.Dataframe): data frame of input sentences
    tokenizer: an instance of BERT tokenizer

    OUTPUTS:
    (python dictionary): A dictionary with 3 keys which has required inputs of BERT encoder
    """
    s1_tokens = 50
    s2_tokens = 75

    sentence1 = tf.ragged.constant([sentence_encoder(data[0], tokenizer, s1_tokens)])
    
    sentence2 = tf.ragged.constant([sentence_encoder(data[1], tokenizer, s2_tokens)])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])] * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
    
    input_mask = tf.ones(s1_tokens+1+s2_tokens+1+1)

    segment_cls = tf.zeros_like(cls)
    segment_s1 = tf.zeros_like(sentence1)
    segment_s2 = tf.ones_like(sentence2)

    input_segment_ids = tf.concat([segment_cls, segment_s1, segment_s2], axis=-1).to_tensor()
    
    pad_len = (s1_tokens + 1 + s2_tokens + 1 + 1) - tf.shape(input_segment_ids[0])[0]
    paddings = [[0, pad_len]]
    input_segment_ids = tf.pad(input_segment_ids[0], paddings, "CONSTANT")
    
    input_word_ids = tf.pad(input_word_ids[0], paddings, "CONSTANT")


    return input_word_ids, input_mask, input_segment_ids

In [6]:
import tensorflow_datasets as tfds

In [7]:
glue, info = tfds.load('glue/mrpc', with_info=True,
                       # It's small, load the whole dataset
                       batch_size=-1)

In [8]:
list(glue.keys())

['train', 'validation', 'test']

In [9]:
info.features

FeaturesDict({
    'idx': tf.int32,
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'sentence1': Text(shape=(), dtype=tf.string),
    'sentence2': Text(shape=(), dtype=tf.string),
})

In [10]:
glue_train = glue['train']
for key, value in glue_train.items():
    print(f"{key:9s}: {value[0].numpy()}")

idx      : 1680
label    : 0
sentence1: b'The identical rovers will act as robotic geologists , searching for evidence of past water .'
sentence2: b'The rovers act as robotic geologists , moving on six wheels .'


In [10]:
import unicodedata
import re
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')
def preprocess(w):
    w = unicode_to_ascii(w.lower().strip())
    # creating a space between a word and the punctuation following it
    # eg: "he is a boy." => "he is a boy ."
    # Reference:- https://stackoverflow.com/questions/3645931/python-padding-punctuation-with-white-spaces-keeping-punctuation
    w = re.sub(r"([?.!,¿])", r" \1 ", w)
    w = re.sub(r'[" "]+', " ", w)
    # replacing everything with space except (a-z, A-Z, ".", "?", "!", ",")
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w = w.strip()
    # adding a start and an end token to the sentence
    # so that the model know when to start and stop predicting.
    w = '[START] ' + w + ' [END]'
    return w

In [11]:
from sklearn import preprocessing
import numpy as np

def labels_to_tensors(labels):
    lbls = tf.reshape(tf.convert_to_tensor(labels, dtype=tf.int32), (labels.shape))
    lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
    return lbls_ds

labels = [label for label in glue_train['label']]
# labels = [d['label_text'] for d in self.data]
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(labels)
labels_enc = labelencoder.transform(labels)
test_labels = np.zeros(shape=(len(labels_enc), 3))
for idx, val in enumerate(labels_enc):
    test_labels[idx][val] = 1
labels_tnsr = labels_to_tensors(test_labels)
labels_tnsr

<TensorSliceDataset shapes: (3,), types: tf.int32>

In [12]:
def gen_glue():
    for s1, s2, l in zip(glue_train['sentence1'], glue_train['sentence2'], glue_train['label']):
        s1p = preprocess(s1.numpy().decode('utf-8'))
        s2p = preprocess(s2.numpy().decode('utf-8'))
        yield (s1p, s2p), l

In [13]:
ds_glue_train = tf.data.Dataset.from_generator(
            gen_glue, output_signature=(
                tf.TensorSpec(shape=(2,), dtype=(tf.string)),
                tf.TensorSpec(shape=( ), dtype=(tf.int32))))

In [22]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 320

res = ds_glue_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)


f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, labels_tnsr))
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: (((64, None), (64, 128), (64, None)), (64, 3)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=(64, None), dtype=tf.int32, name=None), TensorSpec(shape=(64, 128), dtype=tf.float32, name=None), TensorSpec(shape=(64, None), dtype=tf.int32, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


In [139]:
for d in dataset_train.take(1):
    print(d)

((<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[  101, 23435,  1006,  1000, 18045,  2094,  1035, 14704,  1024,
         1014,  1000,  1010,  4338,  1027,  1006,  1007,  1010, 26718,
        18863,  1027,  5164,  1007,   102, 23435,  1006,  1000, 18045,
         2094,  1035, 14704,  1035,  1015,  1024,  1014,  1000,  1010,
         4338,  1027,  1006,  1007,  1010, 26718, 18863,  1027,  5164,
         1007,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0

In [15]:
def inference_glue_model_builder():
    """
    Builds and returns a model instance of Keras with the functional API
    """
    max_seq_length = None

    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
    input_segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_segment_ids")

    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)

    pooled_output, _ = bert_layer([input_word_ids, input_mask, input_segment_ids])
    # pooled output is the embedding output for the '[CLS]' token that is dependant on all words of two sentences
    # and can be used for classfication purposes

    output_class = tf.keras.layers.Dense(units=3, activation='softmax')(pooled_output)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_segment_ids], outputs=output_class)

    optimizer = tf.keras.optimizers.Adam(lr=1e-5)
    model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#     model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-5),
#               loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#               metrics=['accuracy'])
    
    return model

In [16]:
model = inference_glue_model_builder()
model.fit(dataset_train,
          epochs=5,
          verbose=1)

  "The `lr` argument is deprecated, use `learning_rate` instead.")


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f5d73af4a90>

#### Try a different model on same GLUE dataset

In [11]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [12]:
import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [13]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


#### Dataset

In [17]:
len(labels_tnsr)

3668

In [14]:
def encode_sentence(s):
    tokens = list(tokenizer.tokenize(s.numpy()))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

sentence1 = tf.ragged.constant([
    encode_sentence(s) for s in glue_train["sentence1"]])
sentence2 = tf.ragged.constant([
    encode_sentence(s) for s in glue_train["sentence2"]])

In [16]:
print("Sentence1 shape:", sentence1.shape.as_list())
print("Sentence2 shape:", sentence2.shape.as_list())

Sentence1 shape: [3668, None]
Sentence2 shape: [3668, None]


In [17]:
cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

In [18]:
input_mask = tf.ones_like(input_word_ids).to_tensor()

In [19]:
type_cls = tf.zeros_like(cls)
type_s1 = tf.zeros_like(sentence1)
type_s2 = tf.ones_like(sentence2)
input_type_ids = tf.concat([type_cls, type_s1, type_s2], axis=-1).to_tensor()

In [20]:
def encode_sentence(s, tokenizer):
    tokens = list(tokenizer.tokenize(s))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

def bert_encode(glue_dict, tokenizer):
    num_examples = len(glue_dict["sentence1"])

    sentence1 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence1"])])
    sentence2 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
       for s in np.array(glue_dict["sentence2"])])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

    return inputs

In [21]:
glue_train = bert_encode(glue['train'], tokenizer)
glue_train_labels = glue['train']['label']

In [23]:
for key, value in glue_train.items():
    print(f'{key:15s} shape: {value.shape}')

print(f'glue_train_labels shape: {glue_train_labels.shape}')

input_word_ids  shape: (3668, 103)
input_mask      shape: (3668, 103)
input_type_ids  shape: (3668, 103)
glue_train_labels shape: (3668,)


In [18]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 320

res = ds_glue_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)

lbls = ds_glue_train.map(lambda x, y: y)

f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, lbls))
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: (((64, None), (64, 128), (64, None)), (64,)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=(64, None), dtype=tf.int32, name=None), TensorSpec(shape=(64, 128), dtype=tf.float32, name=None), TensorSpec(shape=(64, None), dtype=tf.int32, name=None)), TensorSpec(shape=(64,), dtype=tf.int32, name=None))


In [24]:
### special
# optimizer = tf.keras.optimizers.Adam(lr=1e-5)
####

# Set up epochs and steps
epochs = 5
batch_size = 64
#eval_batch_size = 32

#train_data_size = 3668 # len(glue_train_labels), we are using tf.data, do not know the size of the data from the datset, hardcode it since we know it already
train_data_size = len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)


metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      glue_train, glue_train_labels,
      epochs=epochs)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f73d326b0d0>

#### Try pre-trained BERT NLI task on FEVER dataset

In [12]:
from mda.src.dataset.DatasetReader import DatasetReader

In [13]:
#load train dataset
infile = 'working/data/training/train.ns.pages.p5.jsonl'
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=None, database_path='data/data/fever/fever.db')
raw, data = dsreader.read()
ds_train = dsreader.get_dataset()
print(ds_train.element_spec)

#load dev dataset
infile = 'working/data/training/paper_dev.ns.pages.p5.jsonl'
label_checkpoint_file = 'working/data/training/label_encoder_train.pkl'
#note, use type = 'train' since formatting would be like the train examples
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=label_checkpoint_file, database_path='data/data/fever/fever.db', type='train')
raw_dev, data_dev = dsreader.read()
ds_dev = dsreader.get_dataset()
print(ds_dev.element_spec)

100%|██████████| 145449/145449 [00:01<00:00, 89768.16it/s] 
100%|██████████| 145449/145449 [00:01<00:00, 140599.64it/s]
  0%|          | 0/9999 [00:00<?, ?it/s]

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


100%|██████████| 9999/9999 [00:00<00:00, 23756.67it/s]
100%|██████████| 9999/9999 [00:00<00:00, 197156.09it/s]

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))





In [14]:
for d in ds_train.take(1):
    print(d)

(<tf.Tensor: shape=(2,), dtype=string, numpy=
array([b'[START] nikolaj coster waldau worked with the fox broadcasting company . [END]',
       b'He then played Detective John Amsterdam in the short-lived Fox television series New Amsterdam -LRB- 2008 -RRB- , as well as appearing as Frank Pike in the 2009 Fox television film Virtuality , originally intended as a pilot . The Fox Broadcasting Company -LRB- often shortened to Fox and stylized as FOX -RRB- is an American English language commercial broadcast television network that is owned by the Fox Entertainment Group subsidiary of 21st Century Fox .'],
      dtype=object)>, <tf.Tensor: shape=(3,), dtype=int32, numpy=array([0, 0, 1], dtype=int32)>)


In [19]:
for f, l in ds_train.take(3):
    input_word_ids, input_mask, input_segment_ids = bert_input_encoder(f, tokenizer)

In [20]:
for f, l in ds_train.take(3):
    inputs = bert_input_encoder(f, tokenizer)
    print(inputs)
    #print(l)

(<tf.Tensor: shape=(128,), dtype=int32, numpy=
array([  101,  1056,  2546,  1012, 23435,  1006,  1038,  1005,  1031,
        2707,  1033, 24794,  3501,  3465,  2121, 24547,  2850,  2226,
        2499,  2007,  1996,  4419,  5062,  2194,  1012,  1031,  2203,
        1033,  1005,  1010,  4338,  1027,  1006,  1007,  1010, 26718,
       18863,  1027,  5164,  1007,   102,  1056,  2546,  1012, 23435,
        1006,  1038,  1005,  2002,  2059,  2209,  6317,  2198,  7598,
        1999,  1996,  2460,  1011,  2973,  4419,  2547,  2186,  2047,
        7598,  1011,  1048, 15185,  1011,  2263,  1011, 25269,  2497,
        1011,  1010,  2004,  2092,  2004,  6037,  2004,  3581, 12694,
        1999,  1996,  2268,  4419,  2547,  2143,  7484,  3012,  1010,
        2761,  3832,  2004,  1037,  4405,  1012,  1996,  4419,  5062,
        2194,  1011,  1048, 15185,  1011,  2411, 12641,  2000,  4419,
        1998, 19551,  2004,  4419,  1011, 25269,  2497,  1011,   102,
           0,     0,     0,     0,     0,  

In [21]:
BATCH_SIZE = 64
MAX_SEQ_LEN = 60
BUFFER_SIZE = 3200
# def tokenize_and_pad(text, max_len):
#     segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
#     inp = segment.to_tensor(shape=[None, max_len])
#     return inp[0]


#f = ds_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)
#print(f)
# h = ds_train.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
# e = ds_train.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = ds_train.map(lambda x, y: y)
f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, l))
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: (((64, None), (64, 128), (64, None)), (64, 3)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=(64, None), dtype=tf.int32, name=None), TensorSpec(shape=(64, 128), dtype=tf.float32, name=None), TensorSpec(shape=(64, None), dtype=tf.int32, name=None)), TensorSpec(shape=(64, 3), dtype=tf.int32, name=None))


In [466]:
for d, l in dataset_train.take(3):
    print(d)
    print(l)

(<tf.Tensor: shape=(2, 128), dtype=int32, numpy=
array([[  101, 23435,  1006,  1000, 18045,  2094,  1035, 14704,  1024,
         1014,  1000,  1010,  4338,  1027,  1006,  1007,  1010, 26718,
        18863,  1027,  5164,  1007,   102, 23435,  1006,  1000, 18045,
         2094,  1035, 14704,  1035,  1015,  1024,  1014,  1000,  1010,
         4338,  1027,  1006,  1007,  1010, 26718, 18863,  1027,  5164,
         1007,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,

In [117]:
def inference_model_builder():
    """
    Builds and returns a model instance of Keras with the functional API
    """
    max_seq_length = None

    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_mask")
    input_segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name="input_segment_ids")

    bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2", trainable=True)

    pooled_output, _ = bert_layer([input_word_ids, input_mask, input_segment_ids])
    # pooled output is the embedding output for the '[CLS]' token that is dependant on all words of two sentences
    # and can be used for classfication purposes

    output_class = tf.keras.layers.Dense(units=3, activation='softmax')(pooled_output)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_segment_ids], outputs=output_class)

    optimizer = tf.keras.optimizers.Adam(lr=1e-5)
    model.compile(optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
#     model.compile(optimizer=tf.keras.optimizers.Adam(lr=1e-5),
#               loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
#               metrics=['accuracy'])
    
    return model

In [118]:
model = inference_model_builder()
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, None)]       0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, None)]       0                                            
__________________________________________________________________________________________________
input_segment_ids (InputLayer)  [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

  "The `lr` argument is deprecated, use `learning_rate` instead.")


In [469]:
model.evaluate(dataset_train)

    149/Unknown - 9s 18ms/step - loss: 1.4469 - accuracy: 0.2651

KeyboardInterrupt: 

In [446]:
for f, l in dataset_train.take(1):
    print(f)

(<tf.Tensor: shape=(64, 47), dtype=int32, numpy=
array([[  101, 23435,  1006, ...,  5164,  1007,   102],
       [  101, 23435,  1006, ...,  5164,  1007,   102],
       [  101, 23435,  1006, ...,  5164,  1007,   102],
       ...,
       [  101, 23435,  1006, ...,  5164,  1007,   102],
       [  101, 23435,  1006, ...,  5164,  1007,   102],
       [  101, 23435,  1006, ...,  5164,  1007,   102]], dtype=int32)>, <tf.Tensor: shape=(64, 128), dtype=float32, numpy=
array([[1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.],
       [1., 1., 1., ..., 1., 1., 1.]], dtype=float32)>, <tf.Tensor: shape=(64, 128), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>)


In [11]:
model.fit(dataset_train,
          epochs=5,
          verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7fdd647b8f90>

In [6]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [7]:
import json
import os

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [9]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=3)