# Fine-Tuning a BERT Model and Create a Text Classifier

In the previous section, we've already performed the Feature Engineering to create BERT embeddings from the `reviews_body` text using the pre-trained BERT model, and split the dataset into train, validation and test files. To optimize for Tensorflow training, we saved the files in TFRecord format. 

Now, let’s fine-tune the BERT model to our Customer Reviews Dataset and add a new classification layer to predict the `star_rating` for a given `review_body`.

![BERT Training](img/bert_training.png)

As mentioned earlier, BERT’s attention mechanism is called a Transformer. This is, not coincidentally, the name of the popular BERT Python library, “Transformers,” maintained by a company called HuggingFace. 

We will use a variant of BERT called [**DistilBert**](https://arxiv.org/pdf/1910.01108.pdf) which requires less memory and compute, but maintains very good accuracy on our dataset.

In [1]:
import time
import random
import pandas as pd
from glob import glob
import argparse
import json
import subprocess
import sys
import os
import tensorflow as tf
from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification
from transformers import TextClassificationPipeline
from transformers import DistilBertConfig

In [2]:
%store -r max_seq_length

In [3]:
try:
    max_seq_length
except NameError:
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')
    print('[ERROR] Please run the notebooks in the PREPARE section before you continue.')
    print('++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++')

In [4]:
print(max_seq_length)

64


In [5]:
def select_data_and_label_from_record(record):
    x = {
        'input_ids': record['input_ids'],
        'input_mask': record['input_mask'],
        'segment_ids': record['segment_ids']
    }
    y = record['label_ids']

    return (x, y)

In [6]:
def file_based_input_dataset_builder(channel,
                                     input_filenames,
                                     pipe_mode,
                                     is_training,
                                     drop_remainder):

    # For training, we want a lot of parallel reading and shuffling.
    # For eval, we want no shuffling and parallel reading doesn't matter.

    if pipe_mode:
        print('***** Using pipe_mode with channel {}'.format(channel))
        from sagemaker_tensorflow import PipeModeDataset
        dataset = PipeModeDataset(channel=channel,
                                  record_format='TFRecord')
    else:
        print('***** Using input_filenames {}'.format(input_filenames))
        dataset = tf.data.TFRecordDataset(input_filenames)

    dataset = dataset.repeat(100)
    dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

    name_to_features = {
      "input_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "input_mask": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "segment_ids": tf.io.FixedLenFeature([max_seq_length], tf.int64),
      "label_ids": tf.io.FixedLenFeature([], tf.int64),
    }

    def _decode_record(record, name_to_features):
        """Decodes a record to a TensorFlow example."""
        return tf.io.parse_single_example(record, name_to_features)
        
    dataset = dataset.apply(
        tf.data.experimental.map_and_batch(
          lambda record: _decode_record(record, name_to_features),
          batch_size=8,
          drop_remainder=drop_remainder,
          num_parallel_calls=tf.data.experimental.AUTOTUNE))

    dataset.cache()

    if is_training:
        dataset = dataset.shuffle(seed=42,
                                  buffer_size=10,
                                  reshuffle_each_iteration=True)

    return dataset

In [7]:
train_data = './data-tfrecord/bert-train'
train_data_filenames = glob('{}/*.tfrecord'.format(train_data))
print('train_data_filenames {}'.format(train_data_filenames))

train_dataset = file_based_input_dataset_builder(
    channel='train',
    input_filenames=train_data_filenames,
    pipe_mode=False,
    is_training=True,
    drop_remainder=False).map(select_data_and_label_from_record)

train_data_filenames ['./data-tfrecord/bert-train/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-train/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/bert-train/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-train/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']
Instructions for updating:
Use `tf.data.Dataset.map(map_func, num_parallel_calls)` followed by `tf.data.Dataset.batch(batch_size, drop_remainder)`. Static tf.data optimizations will take care of using the fused implementation.


In [8]:
validation_data = './data-tfrecord/bert-validation'
validation_data_filenames = glob('{}/*.tfrecord'.format(validation_data))
print('validation_data_filenames {}'.format(validation_data_filenames))

validation_dataset = file_based_input_dataset_builder(
    channel='validation',
    input_filenames=validation_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False).map(select_data_and_label_from_record)

validation_data_filenames ['./data-tfrecord/bert-validation/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-validation/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/bert-validation/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-validation/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']


In [9]:
test_data = './data-tfrecord/bert-test'
test_data_filenames = glob('{}/*.tfrecord'.format(test_data))
print(test_data_filenames)

test_dataset = file_based_input_dataset_builder(
    channel='test',
    input_filenames=test_data_filenames,
    pipe_mode=False,
    is_training=False,
    drop_remainder=False).map(select_data_and_label_from_record)

['./data-tfrecord/bert-test/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-test/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']
***** Using input_filenames ['./data-tfrecord/bert-test/part-algo-1-amazon_reviews_us_Digital_Software_v1_00.tfrecord', './data-tfrecord/bert-test/part-algo-2-amazon_reviews_us_Digital_Video_Games_v1_00.tfrecord']


# Specify Manual Hyper-Parameters

In [10]:
epochs=1
steps_per_epoch=50
validation_steps=50
test_steps=150
freeze_bert_layer=True
learning_rate=3e-5
epsilon=1e-08

# Load Pretrained BERT Model 
https://huggingface.co/transformers/pretrained_models.html 

In [11]:
CLASSES=[1, 2, 3, 4, 5]

config = DistilBertConfig.from_pretrained('distilbert-base-uncased',
                                          num_labels=len(CLASSES),
                                          id2label={
                                            0: 1,
                                            1: 2,
                                            2: 3,
                                            3: 4,
                                            4: 5
                                          },
                                          label2id={
                                            1: 0,
                                            2: 1,
                                            3: 2,
                                            4: 3,
                                            5: 4
                                          })
print(config)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=442.0, style=ProgressStyle(description_…


DistilBertConfig {
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": 1,
    "1": 2,
    "2": 3,
    "3": 4,
    "4": 5
  },
  "initializer_range": 0.02,
  "label2id": {
    "1": 0,
    "2": 1,
    "3": 2,
    "4": 3,
    "5": 4
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "vocab_size": 30522
}



In [12]:
from transformers import TFDistilBertModel

transformer_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased', 
                                                                          config=config)

input_ids = tf.keras.layers.Input(shape=(max_seq_length,), name='input_ids', dtype='int32')
input_mask = tf.keras.layers.Input(shape=(max_seq_length,), name='input_mask', dtype='int32') 

embedding_layer = transformer_model.distilbert(input_ids, attention_mask=input_mask)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(len(CLASSES), activation='sigmoid')(X)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs = X)

for layer in model.layers[:3]:
    layer.trainable = not freeze_bert_layer

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=363423424.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['activation_13', 'vocab_transform', 'vocab_layer_norm', 'vocab_projector']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


# Setup the Custom Classifier Model Here

In [13]:
loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric=tf.keras.metrics.SparseCategoricalAccuracy('accuracy')

optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon)

model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 64)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 64)]         0                                            
__________________________________________________________________________________________________
distilbert (TFDistilBertMainLay ((None, 64, 768),)   66362880    input_ids[0][0]                  
                                                                 input_mask[0][0]                 
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 64, 100)      327600      distilbert[0][0]      

In [14]:
callbacks = []

log_dir = './tmp/tensorboard/'
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir)
callbacks.append(tensorboard_callback)

In [15]:
history = model.fit(train_dataset,
                    shuffle=True,
                    epochs=epochs,
                    steps_per_epoch=steps_per_epoch,
                    validation_data=validation_dataset,
                    validation_steps=validation_steps,
                    callbacks=callbacks)

  [n for n in tensors.keys() if n not in ref_input_names])


Instructions for updating:
use `tf.profiler.experimental.stop` instead.


In [16]:
print('Trained model {}'.format(model))

Trained model <tensorflow.python.keras.engine.functional.Functional object at 0x7fd591955210>


# Evaluate on Holdout Test Dataset

In [17]:
test_history = model.evaluate(test_dataset,
                              steps=test_steps,                            
                              callbacks=callbacks)
print(test_history)

[1.5883307456970215, 0.5]


# Save the Model

In [18]:
# model_dir = './tmp/fine-tuned'

In [19]:
# !mkdir -p $model_dir

In [20]:
# model.save_pretrained(model_dir)

In [21]:
!ls -al $model_dir

total 536
drwxr-xr-x 14 root root   6144 Jan 16 00:21 .
drwxr-xr-x 17 root root   6144 Jan 14 00:56 ..
-rw-r--r--  1 root root    189 Dec 21 20:46 .gitignore
drwxr-xr-x  2 root root   6144 Jan 16 00:17 .ipynb_checkpoints
-rw-r--r--  1 root root   6268 Jan 16 00:16 00_Overview.ipynb
-rw-r--r--  1 root root  25239 Jan 16 00:21 01_Train_Reviews_BERT_Transformers_TensorFlow_AdHoc.ipynb
-rw-r--r--  1 root root  28358 Jan 16 00:17 02_Train_Reviews_BERT_Transformers_TensorFlow_ScriptMode.ipynb
-rw-r--r--  1 root root  10956 Jan 16 00:19 03_Convert_BERT_Transformers_TensorFlow_To_PyTorch.ipynb
-rw-r--r--  1 root root  20834 Jan 16 00:20 04_Evaluate_Model_Metrics.ipynb
-rw-r--r--  1 root root 368598 Jan  2 19:11 99_generated_profiler_report.html
drwxr-xr-x  6 root root   6144 Dec 21 20:46 container-demo
drwxr-xr-x  2 root root   6144 Dec 21 20:46 data
drwxr-xr-x  2 root root   6144 Dec 21 20:46 data-jumpstart
drwxr-xr-x  2 root root   6144 Dec 21 20:46 data-pipeline
drwxr-xr-x  5 root root   61

In [22]:
# cat $model_dir/config.json

In [23]:
tensorflow_model_dir = './tmp/tensorflow/'

In [24]:
!mkdir -p $tensorflow_model_dir

In [None]:
model.save(tensorflow_model_dir, include_optimizer=False, overwrite=True)

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


In [None]:
!ls -al $tensorflow_model_dir

In [None]:
!saved_model_cli show --all --dir $tensorflow_model_dir

In [None]:
!saved_model_cli run --dir $tensorflow_model_dir --tag_set serve --signature_def serving_default \
    --input_exprs 'input_ids=np.zeros((1,64));input_mask=np.zeros((1,64))'

# Predict with Model

In [None]:
import pandas as pd
import numpy as np

from transformers import DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

def predict(text):
    encode_plus_tokens = tokenizer.encode_plus(text,
                                               pad_to_max_length=True,
                                               max_length=max_seq_length,
                                               truncation=True,
                                               return_tensors='tf')
    # The id from the pre-trained BERT vocabulary that represents the token.  (Padding of 0 will be used if the # of tokens is less than `max_seq_length`)
    input_ids = encode_plus_tokens['input_ids']
    
    # Specifies which tokens BERT should pay attention to (0 or 1).  Padded `input_ids` will have 0 in each of these vector elements.    
    input_mask = encode_plus_tokens['attention_mask']

    outputs = model.predict(x=(input_ids, input_mask))

    scores = np.exp(outputs) / np.exp(outputs).sum(-1, keepdims=True)

    prediction = [{"label": config.id2label[item.argmax()], "score": item.max().item()} for item in scores]

    return prediction[0]['label']

In [None]:
sample_review_body = 'This product is terrible.'
predict(sample_review_body)

# Test Model

In [None]:
import csv

df_sample_reviews = pd.read_csv('./data/amazon_reviews_us_Digital_Software_v1_00.tsv.gz', 
                                delimiter='\t', 
                                quoting=csv.QUOTE_NONE,
                                compression='gzip')[['review_body', 'star_rating']].sample(n=100)
df_sample_reviews.shape

In [None]:
df_sample_reviews.head()

In [None]:
# import pandas as pd

# def predict(review_body):
#     prediction_map = inference_pipeline(review_body)
#     return prediction_map[0]['label']

In [None]:
y_pred = df_sample_reviews['review_body'].map(predict)

y_pred

In [None]:
y_true = df_sample_reviews['star_rating']

# Classification Report

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_true=y_true, y_pred=y_pred))

# Accuracy

In [None]:
from sklearn.metrics import accuracy_score

print('Test Accuracy: ', accuracy_score(y_pred=y_pred, y_true=y_true))

# Confusion Matrix

In [None]:
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

def plot_conf_mat(cm, classes, title, cmap = plt.cm.Greens):
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
        horizontalalignment="center",
        color="black" if cm[i, j] > thresh else "black")

        plt.tight_layout()
        plt.ylabel('True label')
        plt.xlabel('Predicted label')

In [None]:
import itertools
import numpy as np
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

cm = confusion_matrix(y_true=y_true, y_pred=y_pred)

plt.figure()
fig, ax = plt.subplots(figsize=(10,5))
plot_conf_mat(cm, 
              classes=['1', '2', '3', '4', '5'], 
              title='Confusion Matrix')
plt.show()

In [None]:
%%javascript

try {
    Jupyter.notebook.save_checkpoint();
    Jupyter.notebook.session.delete();
}
catch(err) {
    // NoOp
}