In [None]:
# Install the notebook's dependencies into the current environment.
!pip install -q --upgrade pip
# TensorFlow is pinned to an exact (release-candidate) version.
!pip install -q tensorflow==2.2.0-rc2
# NOTE(review): transformers is not pinned, so the installed version can differ
# between runs. A later cell imports `transformers.configuration_bert`, which
# presumably is not available in every release -- TODO pin a known-good version.
!pip install -q transformers

In [None]:
import os

import pandas as pd
import tensorflow as tf
from tensorflow.data import TFRecordDataset
from tensorflow.io import FixedLenFeature
from tensorflow.io import TFRecordWriter

In [None]:
# Fixed length of the input_ids / input_mask / segment_ids vectors stored in
# every TFRecord example.
MAX_SEQ_LENGTH = 128

# Examples per batch produced by the dataset builder.
BATCH_SIZE = 32
# Twice the training batch size (not referenced by the visible cells).
EVAL_BATCH_SIZE = 2 * BATCH_SIZE

# Training schedule passed to model.fit().
EPOCHS = 1
STEPS_PER_EPOCH = 30
VALIDATION_STEPS = 30

# Label values; len(CLASSES) sets the number of output labels of the model.
# Presumably these are review star ratings -- TODO confirm.
CLASSES = list(range(1, 6))

# XLA is an optimizing compiler for TensorFlow graphs.
USE_XLA = True

# Mixed precision can shorten training time.
USE_AMP = True

In [None]:
# # TFRecords encode and store data
# train_dataset = TFRecordDataset("./data-tfrecord/train/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord")
# validation_dataset = TFRecordDataset("./data-tfrecord/validation/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord")

In [None]:
# # The tensors you pull into the model MUST have the same name 
# # as what was encoded in the TFRecord

# # FixedLenFeature declares a feature whose shape is fixed and known in
# # advance, so every example stores the same number of values for it.

# # For example, input_ids always holds exactly MAX_SEQ_LENGTH ids per
# # example, and as a result, input_ids is a FixedLenFeature.

# feature_spec = {
#       "input_ids": FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
#       "input_mask": FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
#       "segment_ids": FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
#       "label_ids": FixedLenFeature([], tf.int64),
#       "is_real_example": FixedLenFeature([], tf.int64),
# }

# def parse_example(example_proto):
#   # Parse the input tf.Example proto using the dictionary above.
#   return tf.io.parse_single_example(example_proto, feature_spec)

# train_parse_dataset = train_dataset.map(parse_example)
# validation_parse_dataset = validation_dataset.map(parse_example)

In [None]:
def select_data_and_label_from_record(record):
    """Split a decoded record into a ``(features, label)`` pair.

    Args:
        record: Mapping that contains at least the keys ``'input_ids'``,
            ``'input_mask'``, ``'segment_ids'`` and ``'label_ids'``.

    Returns:
        A tuple ``(x, y)`` where ``x`` is a dict holding the three input
        entries under their original keys and ``y`` is ``record['label_ids']``.
        Any other key of ``record`` is dropped.

    NOTE(review): Hugging Face TF BERT models appear to look up dict inputs
    under ``'attention_mask'`` and ``'token_type_ids'``; confirm that the
    ``'input_mask'`` / ``'segment_ids'`` keys returned here are actually
    consumed by the model rather than silently ignored.
    """
    feature_keys = ('input_ids', 'input_mask', 'segment_ids')
    features = {key: record[key] for key in feature_keys}
    return features, record['label_ids']

In [None]:
# Parsing schema for one serialized tf.Example: feature name -> expected
# shape and dtype. The names must match what was written to the TFRecord.
name_to_features = {
    # Per-token vectors, each exactly MAX_SEQ_LENGTH long.
    "input_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    "input_mask": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    "segment_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
    # Scalar (shape []) values, one per example.
    "label_ids": tf.io.FixedLenFeature([], tf.int64),
    "is_real_example": tf.io.FixedLenFeature([], tf.int64),
}

def decode_record(record, name_to_features):
    """Parse one serialized tf.Example and downcast its int64 tensors.

    Args:
        record: A serialized tf.Example, as read from a TFRecord file.
        name_to_features: Feature spec handed to
            ``tf.io.parse_single_example``.

    Returns:
        A dict with the parsed features in which every ``tf.int64`` tensor
        has been cast to ``tf.int32``; tensors of other dtypes are unchanged.
    """
    parsed = tf.io.parse_single_example(record, name_to_features)

    # tf.Example can only store integers as tf.int64, while the TPU only
    # supports tf.int32 -- hence the downcast of every int64 tensor.
    return {
        name: tf.cast(tensor, tf.int32) if tensor.dtype == tf.int64 else tensor
        for name, tensor in parsed.items()
    }

In [None]:
def file_based_input_dataset_builder(input_file,
                                     is_training,
                                     drop_remainder,
                                     batch_size=None,
                                     shuffle_buffer_size=10):
    """Build a batched ``tf.data.Dataset`` of decoded TFRecord examples.

    Args:
        input_file: A TFRecord file name or a list of file names.
        is_training: When true, the dataset repeats indefinitely and is
            shuffled; otherwise records are read once, in file order.
        drop_remainder: Whether to drop the final batch when it holds fewer
            than ``batch_size`` examples.
        batch_size: Examples per batch. Defaults to the notebook-level
            ``BATCH_SIZE`` (looked up at call time) when ``None``.
        shuffle_buffer_size: Size of the shuffle buffer used for training.
            The default of 10 only mixes neighbouring records; pass a larger
            value for a more thorough shuffle.

    Returns:
        A dataset whose elements are dicts of batched tensors as produced by
        ``decode_record`` with the ``name_to_features`` spec.
    """
    if batch_size is None:
        batch_size = BATCH_SIZE

    # Files are read sequentially; only the decoding step below runs in
    # parallel.
    dataset = tf.data.TFRecordDataset(input_file)
    if is_training:
        dataset = dataset.repeat()
        dataset = dataset.shuffle(buffer_size=shuffle_buffer_size)

    # `tf.data.experimental.map_and_batch` is deprecated; a plain `map`
    # followed by `batch` is the supported replacement.
    dataset = dataset.map(
        lambda record: decode_record(record, name_to_features),
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)

    return dataset

In [None]:
from glob import glob

train_data = './data-tfrecord/train'
# glob() returns matches in arbitrary order; sort so the file order (and hence
# the record order fed to training) is reproducible between runs.
train_data_filenames = sorted(glob('{}/*.tfrecord'.format(train_data)))

# Fail fast: an empty file list would otherwise build an empty dataset and
# only surface later as a confusing error inside model.fit().
if not train_data_filenames:
    raise FileNotFoundError(
        'No .tfrecord files found in {}'.format(train_data))

print(train_data_filenames)

# Repeating, shuffled training batches mapped to (features, label) pairs.
train_dataset = file_based_input_dataset_builder(
    train_data_filenames,
    is_training=True,
    drop_remainder=False).map(select_data_and_label_from_record)

In [None]:
from glob import glob

validation_data = './data-tfrecord/validation'
# glob() returns matches in arbitrary order; sort so evaluation always sees
# the files in the same order.
validation_data_filenames = sorted(glob('{}/*.tfrecord'.format(validation_data)))

# Fail fast: an empty file list would otherwise build an empty dataset and
# only surface later as a confusing error during validation in model.fit().
if not validation_data_filenames:
    raise FileNotFoundError(
        'No .tfrecord files found in {}'.format(validation_data))

print(validation_data_filenames)

# Single-pass, unshuffled validation batches mapped to (features, label) pairs.
validation_dataset = file_based_input_dataset_builder(
    validation_data_filenames,
    is_training=False,
    drop_remainder=False).map(select_data_and_label_from_record)

In [None]:
from glob import glob

# Collect every TFRecord file of the held-out test split.
test_data = './data-tfrecord/test'
test_data_filenames = glob('{}/*.tfrecord'.format(test_data))

print(test_data_filenames)

# Single-pass, unshuffled batches mapped to (features, label) pairs.
test_dataset = (
    file_based_input_dataset_builder(
        input_file=test_data_filenames,
        is_training=False,
        drop_remainder=False,
    )
    .map(select_data_and_label_from_record)
)

In [None]:
#from transformers import *
from transformers import BertTokenizer, TFBertForSequenceClassification

In [None]:
# Switch XLA JIT compilation on or off according to USE_XLA.
tf.config.optimizer.set_jit(USE_XLA)
# Switch the graph optimizer's "auto_mixed_precision" option according to USE_AMP.
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

In [None]:
# Load the tokenizer published with the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Import BertConfig from the package root: `transformers.configuration_bert`
# is an internal module path that newer transformers releases no longer
# provide, while the top-level name is the public, stable import.
from transformers import BertConfig

# One output label per entry in CLASSES; every other setting keeps the
# BertConfig defaults.
config = BertConfig(num_labels=len(CLASSES))

In [None]:
# Build the sequence-classification model from the 'bert-base-uncased'
# checkpoint, using `config` so the output size matches len(CLASSES).
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    config=config,
)

In [None]:
#train_parse_dataset = train_parse_dataset.shuffle(train_parse_dataset).batch(BATCH_SIZE).repeat(-1)
#validation_dataset = validation_dataset.batch(EVAL_BATCH_SIZE)

In [None]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)

# Always bind `optimizer`; previously it was only defined when USE_AMP was
# true, so model.compile() raised NameError with mixed precision disabled.
optimizer = opt
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

# The loss is configured with from_logits=True, i.e. it expects raw
# (un-softmaxed) scores and integer class labels.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Freeze the model's first layer (presumably the pretrained BERT encoder --
# confirm with model.summary()) so that its weights are not updated.
model.layers[0].trainable = False

# Keras expects compile() to be called again after `trainable` changes so the
# new setting is taken into account during training.
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [None]:
# Print the model's layers and parameter counts before training.
model.summary()


In [None]:
# Train the model. The training dataset repeats indefinitely, so
# STEPS_PER_EPOCH decides how many batches make up one epoch; validation runs
# for VALIDATION_STEPS batches.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VALIDATION_STEPS,
)

# Save the Model

In [None]:
# Create the output directory from Python instead of shelling out to
# `mkdir -p`, so the cell does not depend on a POSIX shell; exist_ok=True
# keeps the cell re-runnable.
os.makedirs('./custom_pretrained', exist_ok=True)

# Write the model's weights and configuration into that directory.
model.save_pretrained('./custom_pretrained')

In [None]:
# List the contents of the directory the model was saved to.
!ls -al ./custom_pretrained

In [None]:
cat ./custom_pretrained/config.json

In [None]:
import json

# Reload the saved model and attach the label maps. Both maps are derived
# from CLASSES instead of being typed out by hand, so they cannot drift from
# the label list: the class index i corresponds to the label CLASSES[i].
loaded_model = TFBertForSequenceClassification.from_pretrained(
    './custom_pretrained/',
    id2label=dict(enumerate(CLASSES)),
    label2id={label: index for index, label in enumerate(CLASSES)})

In [None]:
from transformers import BertTokenizer, TextClassificationPipeline

# Tokenizer for the same 'bert-base-uncased' checkpoint the model started from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap the reloaded model and tokenizer into a text-classification pipeline.
# device: -1 is CPU, >= 0 is GPU
inference_pipeline = TextClassificationPipeline(
    model=loaded_model,
    tokenizer=tokenizer,
    framework='tf',
    device=-1,
)

In [None]:
# Smoke test: classify a clearly positive sentence.
inference_pipeline('This is great!')

In [None]:
# Smoke test: classify a second positive sentence.
inference_pipeline('This is wonderful!')

In [None]:
# Smoke test: classify a lukewarm sentence.
inference_pipeline('This is OK.')

In [None]:
# Smoke test: classify a clearly negative sentence.
inference_pipeline('This sucks!')