In [3]:
!pip install -q --upgrade pip
!pip install -q tensorflow==2.2.0-rc2
!pip install -q transformers

[31mERROR: tensorflow-transform 0.21.2 has requirement tensorflow<2.2,>=1.15, but you'll have tensorflow 2.2.0rc2 which is incompatible.[0m
[31mERROR: tensorflow-text 2.1.1 has requirement tensorflow<2.2,>=2.1.0, but you'll have tensorflow 2.2.0rc2 which is incompatible.[0m
[31mERROR: tensorflow-serving-api 2.1.0 has requirement tensorflow~=2.1.0, but you'll have tensorflow 2.2.0rc2 which is incompatible.[0m
[31mERROR: tensorflow-model-analysis 0.21.6 has requirement pandas<2,>=0.24, but you'll have pandas 0.23.0 which is incompatible.[0m
[31mERROR: tensorflow-data-validation 0.21.5 has requirement pandas<1,>=0.24, but you'll have pandas 0.23.0 which is incompatible.[0m
[31mERROR: apache-beam 2.17.0 has requirement avro-python3<2.0.0,>=1.8.1; python_version >= "3.0", but you'll have avro-python3 file-.avro-VERSION.txt which is incompatible.[0m


In [4]:
import time
import random
import pandas as pd
from glob import glob
import argparse
import json
import subprocess
import sys
import os
import tensorflow as tf
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers.configuration_distilbert import DistilBertConfig

In [5]:
MAX_SEQ_LENGTH=128
BATCH_SIZE=8
EVAL_BATCH_SIZE=BATCH_SIZE * 2
EPOCHS=1
STEPS_PER_EPOCH=30
VALIDATION_STEPS=30
CLASSES = [1, 2, 3, 4, 5]
# XLA is an optimization compiler for tensorflow
USE_XLA = True
# Mixed precision can help to speed up training time
USE_AMP = True

In [6]:
def select_data_and_label_from_record(record):
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids']
    }
    y = record['label_ids']

    return (x, y)

In [7]:
def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder):

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.

    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel,
                                  record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(EPOCHS * STEPS_PER_EPOCH)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
      "input_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
      "input_mask": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
      "segment_ids": tf.io.FixedLenFeature([MAX_SEQ_LENGTH], tf.int64),
      "label_ids": tf.io.FixedLenFeature([], tf.int64),
      "is_real_example": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        example = tf.io.parse_single_example(record, name_to_features)

#         for name in list(example.keys()):
#             t = example[name]
#             if t.dtype == tf.int64:
#                 t = tf.cast(t, tf.int32)
#             example[name] = t

        return example
        
    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=BATCH_SIZE,
          drop_remainder=drop_remainder,
          num_parallel_calls=tf.data.experimental.AUTOTUNE))

    dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=1000,
                                  reshuffle_each_iteration=True)

    return dataset

In [8]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                          num_labels=len(CLASSES))
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', 
                                                              config=config)
successful_download = True

In [9]:
train_data = './data-tfrecord/train'

In [12]:
train_data_filenames = glob('{}/*.tfrecord'.format(train_data))
print('train_data_filenames {}'.format(train_data_filenames))
train_dataset = file_based_input_dataset_builder(
    channel='train',
    input_filenames=train_data_filenames,
    pipe_mode=False,
    is_training=True,
    drop_remainder=False).map(select_data_and_label_from_record)

train_data_filenames ['./data-tfrecord/train/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/train/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']
Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.


In [13]:
validation_data = './data-tfrecord/validation'

In [16]:
validation_data_filenames = glob('{}/*.tfrecord'.format(validation_data))

print('validation_data_filenames {}'.format(validation_data_filenames))
validation_dataset = file_based_input_dataset_builder(
    channel='validation',
    input_filenames=validation_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False).map(select_data_and_label_from_record)

validation_data_filenames ['./data-tfrecord/validation/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/validation/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']


In [15]:
test_data = './data-tfrecord/test'

In [17]:
test_data_filenames = glob('{}/*.tfrecord'.format(test_data))

print(test_data_filenames)

test_dataset = file_based_input_dataset_builder(
    channel='test',
    input_filenames=test_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False).map(select_data_and_label_from_record)

['./data-tfrecord/test/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/test/part-algo-1-amazon_reviews_us_Software_v1_00.tfrecord']


# Setup the fine-tuning here

In [18]:
tf.config.optimizer.set_jit(USE_XLA)
tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})

In [19]:
opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
if USE_AMP:
    # loss scaling is currently required when using mixed precision
    optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
#model.layers[0].trainable = False
model.summary()

Model: "tf_distil_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  66362880  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  3845      
_________________________________________________________________
dropout_19 (Dropout)         multiple                  0         
Total params: 66,957,317
Trainable params: 66,957,317
Non-trainable params: 0
_________________________________________________________________


In [20]:
log_dir = './tensorboard/'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)

In [21]:
history = model.fit(train_dataset,
                    shuffle=True,
                    epochs=EPOCHS,
                    steps_per_epoch=STEPS_PER_EPOCH,
#                    validation_data=validation_dataset,
#                    validation_steps=VALIDATION_STEPS,
                    callbacks=[tensorboard_callback])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "




In [None]:
print('Trained model {}'.format(model))

# Save the Model

In [None]:
model_dir = './custom_pretrained'

In [None]:
!mkdir -p $model_dir
model.save_pretrained(model_dir)

In [None]:
!ls -al $model_dir

In [None]:
cat $model_dir/config.json

In [None]:
import json
loaded_model = TFBertForSequenceClassification.from_pretrained('./custom_pretrained/',
                                                              id2label={
                                                               0: 1,
                                                               1: 2,
                                                               2: 3,
                                                               3: 4,
                                                               4: 5
                                                              },
                                                              label2id={
                                                               1: 0,
                                                               2: 1,
                                                               3: 2,
                                                               4: 3,
                                                               5: 4
                                                              })

In [None]:
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

inference_pipeline = TextClassificationPipeline(model=loaded_model, 
                                                tokenizer=tokenizer,
                                                framework='tf',
                                                device=-1) # -1 is CPU, >= 0 is GPU

print("""I loved it!  I will recommend this to everyone.""", inference_pipeline("""I loved it!  I will recommend this to everyone."""))
print("""Really bad.  I hope they don't make this anymore.""", inference_pipeline("""Really bad.  I hope they don't make this anymore."""))
print("""It's OK.""", inference_pipeline("""It's OK."""))


In [None]:
!tensorboard --port 6006 --logdir ./tensorboard/ # <== MAKE SURE YOU INCLUDE THE TRAILING `/`


While Tensorboard is running locally on your SageMaker Notebook instance, it is reading the training logs from Amazon S3.

Navigate to https://workshop.notebook.us-west-2.sagemaker.aws/proxy/6006/

_Note:  Make sure you copy the trailing `/` in the link above.  If you see no data, you are likely not using the correct S3 bucket above._

![Tensorboard](img/tensorboard.png)

Once you are done, hit Kernel => Stop to stop the running `Tensorboard` process in this notebook.