In [2]:
!pip install tf-models-official

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
You should consider upgrading via the '/home/ubuntu/anaconda3/envs/tensorflow2_latest_p37/bin/python -m pip install --upgrade pip' command.[0m


In [1]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf

import tensorflow_hub as hub
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

from official.modeling import tf_utils
from official import nlp
from official.nlp import bert

# Load the required submodules
import official.nlp.optimization
import official.nlp.bert.bert_models
import official.nlp.bert.configs
import official.nlp.bert.run_classifier
import official.nlp.bert.tokenization
import official.nlp.data.classifier_data_lib
import official.nlp.modeling.losses
import official.nlp.modeling.models
import official.nlp.modeling.networks

In [2]:
gs_folder_bert = "gs://cloud-tpu-checkpoints/bert/v3/uncased_L-12_H-768_A-12"
tf.io.gfile.listdir(gs_folder_bert)

['bert_config.json',
 'bert_model.ckpt.data-00000-of-00001',
 'bert_model.ckpt.index',
 'vocab.txt']

In [3]:
hub_url_bert = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3"

In [4]:
glue, info = tfds.load('glue/mrpc', with_info=True,
                       # It's small, load the whole dataset
                       batch_size=-1)

In [5]:
list(glue.keys())

['train', 'validation', 'test']

In [6]:
info.features

FeaturesDict({
    'idx': tf.int32,
    'label': ClassLabel(shape=(), dtype=tf.int64, num_classes=2),
    'sentence1': Text(shape=(), dtype=tf.string),
    'sentence2': Text(shape=(), dtype=tf.string),
})

In [7]:
glue_train = glue['train']

for key, value in glue_train.items():
    print(f"{key:9s}: {value[0].numpy()}")

idx      : 1680
label    : 0
sentence1: b'The identical rovers will act as robotic geologists , searching for evidence of past water .'
sentence2: b'The rovers act as robotic geologists , moving on six wheels .'


In [4]:
# Set up tokenizer to generate Tensorflow dataset
tokenizer = bert.tokenization.FullTokenizer(
    vocab_file=os.path.join(gs_folder_bert, "vocab.txt"),
     do_lower_case=True)

print("Vocab size:", len(tokenizer.vocab))

Vocab size: 30522


In [5]:
tokens = tokenizer.tokenize("Hello TensorFlow!")
print(tokens)
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

['hello', 'tensor', '##flow', '!']
[7592, 23435, 12314, 999]


In [60]:
tf.ragged.constant([encode_sentence(s, tokenizer)
      for s in np.array(glue['train']["sentence1"])])[0]

<tf.Tensor: shape=(18,), dtype=int32, numpy=
array([ 1996,  7235,  9819,  2097,  2552,  2004, 20478, 21334,  2015,
        1010,  6575,  2005,  3350,  1997,  2627,  2300,  1012,   102],
      dtype=int32)>

In [81]:
tf.ragged.constant([encode_sentence(s, tokenizer)
      for s in np.array(glue['train']["sentence2"])])[0]

<tf.Tensor: shape=(14,), dtype=int32, numpy=
array([ 1996,  9819,  2552,  2004, 20478, 21334,  2015,  1010,  3048,
        2006,  2416,  7787,  1012,   102], dtype=int32)>

In [12]:
def encode_sentence(s, tokenizer):
    tokens = list(tokenizer.tokenize(s))
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

def bert_encode(glue_dict, tokenizer):
    num_examples = len(glue_dict["sentence1"])

    sentence1 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
      for s in np.array(glue_dict["sentence1"])])
    sentence2 = tf.ragged.constant([
      encode_sentence(s, tokenizer)
       for s in np.array(glue_dict["sentence2"])])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]*sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)

    input_mask = tf.ones_like(input_word_ids).to_tensor()

    type_cls = tf.zeros_like(cls)
    type_s1 = tf.zeros_like(sentence1)
    type_s2 = tf.ones_like(sentence2)
    input_type_ids = tf.concat(
      [type_cls, type_s1, type_s2], axis=-1).to_tensor()

    inputs = {
      'input_word_ids': input_word_ids.to_tensor(),
      'input_mask': input_mask,
      'input_type_ids': input_type_ids}

    return inputs

In [16]:
#### Let's find out the max token size for sentence1
sentence1 = tf.ragged.constant([
  encode_sentence(s, tokenizer)
  for s in np.array(glue_train["sentence1"])])

In [14]:
sentence1.shape

TensorShape([3668, None])

In [43]:
glue_train_data = bert_encode(glue['train'], tokenizer)
glue_train_labels = glue['train']['label']

glue_validation_data = bert_encode(glue['validation'], tokenizer)
glue_validation_labels = glue['validation']['label']

glue_test_data = bert_encode(glue['test'], tokenizer)
glue_test_labels  = glue['test']['label']

In [47]:
glue_train_data['input_mask'][0]

<tf.Tensor: shape=(103,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>

In [25]:
for key, value in glue_train.items():
    print(f'{key:15s} shape: {value.shape}')

print(f'glue_train_labels shape: {glue_train_labels.shape}')

input_word_ids  shape: (3668, 103)
input_mask      shape: (3668, 103)
input_type_ids  shape: (3668, 103)
glue_train_labels shape: (3668,)


In [14]:
import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [15]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [16]:
# Set up epochs and steps
epochs = 7
batch_size = 32
eval_batch_size = 32

train_data_size = len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [23]:
glue_train.keys()

dict_keys(['input_word_ids', 'input_mask', 'input_type_ids'])

In [25]:
glue_train['input_type_ids'].shape

TensorShape([3668, 103])

In [17]:
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      glue_train_data, glue_train_labels,
      validation_data=(glue_validation_data, glue_validation_labels),
      batch_size=32,
      epochs=epochs)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7fc6fc0ca650>

#### Use tf.data

Try this one https://colab.research.google.com/github/dlmacedo/starter-academic/blob/master/content/courses/deeplearning/notebooks/tensorflow/fine_tuning_bert.ipynb#scrollTo=XQeDFOzYR9Z9

In [6]:
import tensorflow_text as tf_text
def sentence_encoder(s, tokenizer, max_tokens=25):
    """
    This turns each sentence into a list of tokens, adds '[SEP]' token to end of the list, then turns tokens
    into ids and returns list of ids.

    INPUTS:
        s: input sentence
        tokenizer: an instance of BERT tokenizer

    OUTPUTS:
        (python list): list of ids of the words of input sentence 
    """
    tokens = tokenizer.tokenize(s.numpy())
#     if len(tokens) > max_tokens:
#         tokens = tokens[:max_tokens]
    tokens.append('[SEP]')
    return tokenizer.convert_tokens_to_ids(tokens)

In [48]:

def bert_input_encoder(data):
    """
    gets a dataframe of input sentences and returns required inputs of BERT encoder in a dictionary

    INPUTS:
    train_corpus (pandas.Dataframe): data frame of input sentences
    tokenizer: an instance of BERT tokenizer

    OUTPUTS:
    (python dictionary): A dictionary with 3 keys which has required inputs of BERT encoder
    """
#     s1_tokens = 50
#     s2_tokens = 50
   
#     sentence1 = tf.ragged.constant([sentence_encoder(data[0], tokenizer, s1_tokens)])
#     sentence2 = tf.ragged.constant([sentence_encoder(data[1], tokenizer, s2_tokens)])
    sentence1 = tf.ragged.constant([sentence_encoder(data[0], tokenizer)])
    sentence2 = tf.ragged.constant([sentence_encoder(data[1], tokenizer)])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]# * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
    
#     print("tf.shape(sentence1)=", tf.shape(sentence1))
#     print("tf.shape(sentence2)=", tf.shape(sentence2))
    #print(sentence1.to_tensor().shape[1])
    input_mask_1 = tf.ones(sentence1.to_tensor().shape[1]+sentence2.to_tensor().shape[1]+1)
    #we know that the tensor shape is 103, based on the max number of tokens in sentence1 and sentence2
    max_total_token_length = 103
    pad_len_1 = (max_total_token_length - 1) - (sentence1.to_tensor().shape[1]+sentence2.to_tensor().shape[1])
    paddings1 = [[0, pad_len_1]]
    input_mask = tf.pad(input_mask_1, paddings1, "CONSTANT")
    #input_mask = tf.concat([tf.ones(sentence1.to_tensor().shape[1]), tf.zeros(sentence2.to_tensor().shape[1])], axis=-1)
    segment_cls = tf.zeros_like(cls)
    segment_s1 = tf.zeros_like(sentence1)
    segment_s2 = tf.ones_like(sentence2)

    input_segment_ids = tf.concat([segment_cls, segment_s1, segment_s2], axis=-1).to_tensor()
    
    pad_len = max_total_token_length - tf.shape(input_segment_ids[0])[0]
    paddings = [[0, pad_len]]
    input_segment_ids = tf.pad(input_segment_ids[0], paddings, "CONSTANT")
    
    input_word_ids = tf.pad(input_word_ids[0], paddings, "CONSTANT")


    return input_word_ids, input_mask, input_segment_ids

In [49]:
def gen_glue():
    for s1, s2, l in zip(glue_train['sentence1'], glue_train['sentence2'], glue_train['label']):
        s1p = s1.numpy().decode('utf-8')
        s2p = s2.numpy().decode('utf-8')
        yield (s1p, s2p), l

In [50]:
ds_glue_train = tf.data.Dataset.from_generator(
            gen_glue, output_signature=(
                tf.TensorSpec(shape=(2,), dtype=(tf.string)),
                tf.TensorSpec(shape=( ), dtype=(tf.int32))))

In [51]:
for data, l in ds_glue_train.take(3):
#     s1_tokens = 25
#     s2_tokens = 25
    sentence1 = tf.ragged.constant([sentence_encoder(data[0], tokenizer)])
    sentence2 = tf.ragged.constant([sentence_encoder(data[1], tokenizer)])
    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]# * sentence1.shape[0]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
    enc_bert = bert_input_encoder(data)
    print(enc_bert)

(<tf.Tensor: shape=(103,), dtype=int32, numpy=
array([  101,  1996,  7235,  9819,  2097,  2552,  2004, 20478, 21334,
        2015,  1010,  6575,  2005,  3350,  1997,  2627,  2300,  1012,
         102,  1996,  9819,  2552,  2004, 20478, 21334,  2015,  1010,
        3048,  2006,  2416,  7787,  1012,   102,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0], dtype=int32)>, <tf.Tensor: shape=(103,), dtype=float32, numpy=
array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1.

In [103]:
print(np.array(glue['train']["sentence1"]))
print(tf.ragged.constant([encode_sentence(s, tokenizer)
      for s in np.array(glue['train']["sentence1"])])[0])
tf.ragged.constant([encode_sentence(s, tokenizer)
      for s in np.array(glue['train']["sentence2"])])[0]

b'The identical rovers will act as robotic geologists , searching for evidence of past water .'
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type(s)= <class 'bytes'>
type

<tf.Tensor: shape=(14,), dtype=int32, numpy=
array([ 1996,  9819,  2552,  2004, 20478, 21334,  2015,  1010,  3048,
        2006,  2416,  7787,  1012,   102], dtype=int32)>

In [32]:
glue_train_data['input_word_ids'][0]

NameError: name 'glue_train_data' is not defined

In [276]:
glue_train_data['input_mask'][0]
#input_type_ids

<tf.Tensor: shape=(103,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>

In [247]:
glue_train_data['input_type_ids'][0]

<tf.Tensor: shape=(103,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)>

In [52]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 60
BUFFER_SIZE = 320

#res = ds_glue_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_glue_train.map(lambda x, y: tf.py_function(bert_input_encoder, [x], Tout=[tf.int32, tf.float32, tf.int32]))
#print(res)
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)

lbls = ds_glue_train.map(lambda x, y: y)

f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, lbls))
#dataset_train = d.batch(BATCH_SIZE, drop_remainder=True)
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: ((<unknown>, <unknown>, <unknown>), (32,)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None)), TensorSpec(shape=(32,), dtype=tf.int32, name=None))


In [34]:
for d in dataset_train.take(1):
    print(d)

KeyboardInterrupt: 

In [53]:
import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [54]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

In [62]:
from sklearn import preprocessing
import numpy as np

def labels_to_tensors(labels):
    lbls = tf.reshape(tf.convert_to_tensor(labels, dtype=tf.int32), (labels.shape))
    lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
    return lbls_ds

labels = [label for label in glue_train['label']]
# labels = [d['label_text'] for d in self.data]
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(labels)
labels_enc = labelencoder.transform(labels)
test_labels = np.zeros(shape=(len(labels_enc), 2))
for idx, val in enumerate(labels_enc):
    test_labels[idx][val] = 1
labels_tnsr = labels_to_tensors(test_labels)
labels_tnsr

<TensorSliceDataset shapes: (2,), types: tf.int32>

In [63]:
len(labels_tnsr)

3668

In [55]:
# Set up epochs and steps
epochs = 7
batch_size = 32
eval_batch_size = 32

train_data_size = len(labels_tnsr)#len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [41]:
for d, l in dataset_train.take(1):
    print(d)
    print("****")
    print(l)

(<tf.Tensor: shape=(32, 103), dtype=int32, numpy=
array([[  101, 13688,  2038, ...,     0,     0,     0],
       [  101,  4826,  1005, ...,     0,     0,     0],
       [  101,  2006,  1996, ...,     0,     0,     0],
       ...,
       [  101,  1996,  5662, ...,     0,     0,     0],
       [  101,  6661,  1997, ...,     0,     0,     0],
       [  101,  1999,  2060, ...,     0,     0,     0]], dtype=int32)>, <tf.Tensor: shape=(32, 104), dtype=float32, numpy=
array([[1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       ...,
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.],
       [1., 1., 1., ..., 0., 0., 0.]], dtype=float32)>, <tf.Tensor: shape=(32, 103), dtype=int32, numpy=
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)>)
****
tf.

In [56]:
epochs = 7
metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      dataset_train,
      #validation_data=(glue_validation, glue_validation_labels),
      #batch_size=32,
      epochs=epochs)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<tensorflow.python.keras.callbacks.History at 0x7f1a746b8cd0>

#### We achieved good performance with tf.dataset structures

We will try using one hot encoded labels now

In [64]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 60
BUFFER_SIZE = 320

#res = ds_glue_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_glue_train.map(lambda x, y: tf.py_function(bert_input_encoder, [x], Tout=[tf.int32, tf.float32, tf.int32]))
#print(res)
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)

lbls = ds_glue_train.map(lambda x, y: y)

f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, labels_tnsr))
#dataset_train = d.batch(BATCH_SIZE, drop_remainder=True)
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: ((<unknown>, <unknown>, <unknown>), (32, 2)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None)), TensorSpec(shape=(32, 2), dtype=tf.int32, name=None))


In [68]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

In [69]:
# Set up epochs and steps
epochs = 7
batch_size = 32
eval_batch_size = 32

train_data_size = len(labels_tnsr)#len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [70]:
epochs = 7
metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      dataset_train,
      #validation_data=(glue_validation, glue_validation_labels),
      #batch_size=32,
      epochs=epochs)

Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7

KeyboardInterrupt: 

In [9]:
def bert_input_encoder_fever(data):
    """
    gets a dataframe of input sentences and returns required inputs of BERT encoder in a dictionary

    INPUTS:
    data: Tensor of strings
    tokenizer: an instance of BERT tokenizer (hardcoded for now)

    OUTPUTS:
    (python dictionary): A dictionary with 3 keys which has required inputs of BERT encoder
    """
    #limit input to a max length
    #np.quantile(evi_token_count, [0.5, 0.75, 0.9, 0.95, 0.98]) = array([ 47.,  86., 175., 287., 492.])
    #np.quantile(hyp_token_count, [0.5, 0.75, 0.9, 0.95, 0.98]) = array([17., 20., 23., 26., 29.])
    max_total_token_length = 200
    
    sentence1 = tf.ragged.constant([sentence_encoder(data[0], tokenizer)])
    sentence2 = tf.ragged.constant([sentence_encoder(data[1], tokenizer)])

    cls = [tokenizer.convert_tokens_to_ids(['[CLS]'])]
    input_word_ids = tf.concat([cls, sentence1, sentence2], axis=-1)
    
    segment_cls = tf.zeros_like(cls)
    segment_s1 = tf.zeros_like(sentence1)
    segment_s2 = tf.ones_like(sentence2)
    input_segment_ids = tf.concat([segment_cls, segment_s1, segment_s2], axis=-1).to_tensor()
    
    #is padding needed?
    total_concat_length = sentence1.to_tensor().shape[1]+sentence2.to_tensor().shape[1]+1 #sent1 + sent2 + separator
    
    if total_concat_length < max_total_token_length: #padding needed
        pad_length_mask = max_total_token_length - total_concat_length
        inp_mask_paddings = [[0, pad_length_mask]]
        input_mask_wo_padding = tf.ones(total_concat_length)
        input_mask = tf.pad(input_mask_wo_padding, inp_mask_paddings, "CONSTANT")
        pad_length_input_ids = max_total_token_length - tf.shape(input_segment_ids[0])[0]
        input_id_paddings = [[0, pad_length_input_ids]]
        input_segment_ids = tf.pad(input_segment_ids[0], input_id_paddings, "CONSTANT")
        input_word_ids = tf.pad(input_word_ids[0], input_id_paddings, "CONSTANT")
    else:
        #TODO, trim this to a fixed length
        input_mask = tf.ones(total_concat_length)
        input_segment_ids = input_segment_ids[0]
        input_word_ids = input_word_ids[0]
        
        #slice
        
        input_mask = input_mask[:max_total_token_length]
        input_segment_ids = input_segment_ids[:max_total_token_length]
        input_word_ids = input_word_ids[:max_total_token_length]
        
#     pad_len = max_total_token_length - tf.shape(input_segment_ids[0])[0]
#     paddings = [[0, pad_len]]
#     input_segment_ids = tf.pad(input_segment_ids[0], paddings, "CONSTANT")
    
#     input_word_ids = tf.pad(input_word_ids[0], paddings, "CONSTANT")


    return input_word_ids, input_mask, input_segment_ids

In [17]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 60
BUFFER_SIZE = 320

#res = ds_glue_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_glue_train.map(lambda x, y: tf.py_function(bert_input_encoder_fever, [x], Tout=[tf.int32, tf.float32, tf.int32]))
#print(res)
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)

lbls = ds_glue_train.map(lambda x, y: y)

f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, labels_tnsr))
#dataset_train = d.batch(BATCH_SIZE, drop_remainder=True)
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

NameError: name 'ds_glue_train' is not defined

In [16]:
for d, l in dataset_train.take(1):
    print(d)
    print("****")
    print(l)

NameError: name 'dataset_train' is not defined

In [75]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=2)

In [35]:
# Set up epochs and steps
epochs = 7
batch_size = 32
eval_batch_size = 32

train_data_size = len(labels_tnsr)#len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [None]:
epochs = 7
metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      dataset_train,
      #validation_data=(glue_validation, glue_validation_labels),
      #batch_size=32,
      epochs=epochs)

Epoch 1/7
   2001/Unknown - 3860s 2s/step - loss: 0.3413 - accuracy: 0.8682

#### BERT on FEVER

In [7]:
from mda.src.dataset.DatasetReader import DatasetReader

In [8]:
#load train dataset
infile = 'working/data/training/train.ns.pages.p5.jsonl'
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=None, database_path='data/data/fever/fever.db')
raw, data = dsreader.read()
ds_train = dsreader.get_dataset()
print(ds_train.element_spec)

#load dev dataset
infile = 'working/data/training/paper_dev.ns.pages.p5.jsonl'
label_checkpoint_file = 'working/data/training/label_encoder_train.pkl'
#note, use type = 'train' since formatting would be like the train examples
dsreader = DatasetReader(in_file=infile,label_checkpoint_file=label_checkpoint_file, database_path='data/data/fever/fever.db', type='train')
raw_dev, data_dev = dsreader.read()
ds_dev = dsreader.get_dataset()
print(ds_dev.element_spec)

100%|██████████| 145449/145449 [00:01<00:00, 81527.88it/s] 
100%|██████████| 145449/145449 [00:01<00:00, 139745.66it/s]
  0%|          | 0/9999 [00:00<?, ?it/s]

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))


100%|██████████| 9999/9999 [00:00<00:00, 23552.15it/s]
100%|██████████| 9999/9999 [00:00<00:00, 196361.30it/s]

(TensorSpec(shape=(2,), dtype=tf.string, name=None), TensorSpec(shape=(3,), dtype=tf.int32, name=None))





In [10]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 60
BUFFER_SIZE = 3200
# def tokenize_and_pad(text, max_len):
#     segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
#     inp = segment.to_tensor(shape=[None, max_len])
#     return inp[0]


#f = ds_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_train.map(lambda x, y: tf.py_function(bert_input_encoder_fever, [x], Tout=[tf.int32, tf.float32, tf.int32]))
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)
#print(f)
# h = ds_train.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
# e = ds_train.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = ds_train.map(lambda x, y: y)
f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, l))
dataset_train = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_train)
print("********")
print(dataset_train.element_spec)

<BatchDataset shapes: ((<unknown>, <unknown>, <unknown>), (32, 3)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None)), TensorSpec(shape=(32, 3), dtype=tf.int32, name=None))


In [11]:
BATCH_SIZE = 32
MAX_SEQ_LEN = 60
BUFFER_SIZE = 3200
# def tokenize_and_pad(text, max_len):
#     segment = pt_tokenizer.tokenize(text).merge_dims(1, -1)
#     inp = segment.to_tensor(shape=[None, max_len])
#     return inp[0]


#f = ds_train.map(lambda x, y: bert_input_encoder(x, tokenizer))
res = ds_dev.map(lambda x, y: tf.py_function(bert_input_encoder_fever, [x], Tout=[tf.int32, tf.float32, tf.int32]))
a = res.map(lambda x, y, z: x)
b = res.map(lambda x, y, z: y)
c = res.map(lambda x, y, z: z)
#print(f)
# h = ds_train.map(lambda x, y: tokenize_and_pad(x[0], MAX_SEQ_LEN))
# e = ds_train.map(lambda x, y: tokenize_and_pad(x[1], MAX_SEQ_LEN))
l = ds_dev.map(lambda x, y: y)
f = tf.data.Dataset.zip((a,b,c))
d = tf.data.Dataset.zip((f, l))
dataset_test = d.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset_test)
print("********")
print(dataset_test.element_spec)

<BatchDataset shapes: ((<unknown>, <unknown>, <unknown>), (32, 3)), types: ((tf.int32, tf.float32, tf.int32), tf.int32)>
********
((TensorSpec(shape=<unknown>, dtype=tf.int32, name=None), TensorSpec(shape=<unknown>, dtype=tf.float32, name=None), TensorSpec(shape=<unknown>, dtype=tf.int32, name=None)), TensorSpec(shape=(32, 3), dtype=tf.int32, name=None))


In [12]:
from sklearn import preprocessing
import numpy as np

def labels_to_tensors(labels):
    lbls = tf.reshape(tf.convert_to_tensor(labels, dtype=tf.int32), (labels.shape))
    lbls_ds = tf.data.Dataset.from_tensor_slices(lbls)
    return lbls_ds

labels = [d['label_text'] for d in data]
# labels = [d['label_text'] for d in self.data]
labelencoder = preprocessing.LabelEncoder()
labelencoder.fit(labels)
labels_enc = labelencoder.transform(labels)
test_labels = np.zeros(shape=(len(labels_enc), 3))
for idx, val in enumerate(labels_enc):
    test_labels[idx][val] = 1
labels_tnsr = labels_to_tensors(test_labels)
labels_tnsr

<TensorSliceDataset shapes: (3,), types: tf.int32>

In [13]:
len(labels_tnsr)

145449

In [14]:
import json

bert_config_file = os.path.join(gs_folder_bert, "bert_config.json")
config_dict = json.loads(tf.io.gfile.GFile(bert_config_file).read())

bert_config = bert.configs.BertConfig.from_dict(config_dict)

config_dict

{'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'max_position_embeddings': 512,
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'type_vocab_size': 2,
 'vocab_size': 30522}

In [15]:
bert_classifier, bert_encoder = bert.bert_models.classifier_model(
    bert_config, num_labels=3)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.


In [16]:
# Set up epochs and steps
epochs = 7
batch_size = 32
#eval_batch_size = 32

train_data_size = len(labels_tnsr)#len(glue_train_labels)
steps_per_epoch = int(train_data_size / batch_size)
num_train_steps = steps_per_epoch * epochs
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp.optimization.create_optimizer(
    2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [36]:
!mkdir -p tmp/bert_fever

In [17]:
!rm -rf tmp/bert_fever/checkpoint_bert_nli

In [18]:
checkpoint_filepath = 'tmp/bert_fever/checkpoint_bert_nli'
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8)

In [19]:
epochs = 30
metrics = [tf.keras.metrics.CategoricalAccuracy('accuracy', dtype=tf.float32)]
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=True)

bert_classifier.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=metrics)

bert_classifier.fit(
      dataset_train,
      validation_data=dataset_test,
      #batch_size=32,
      epochs=epochs,
      callbacks=[stop_early, model_checkpoint_callback])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


<tensorflow.python.keras.callbacks.History at 0x7f7814564990>

In [21]:
checkpoint_filepath = 'tmp/bert_fever/checkpoint_bert_nli'
bert_classifier.load_weights(checkpoint_filepath)
bert_classifier.evaluate(dataset_test)



[0.4495616853237152, 0.7913661599159241]