In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, plus the execution role and default bucket used by later cells.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# Region of the ambient boto3 session; the low-level SageMaker client is
# pinned to that same region.
region = boto3.Session().region_name
sm = boto3.Session().client('sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [None]:
# Restore `scikit_processing_job_s3_output_prefix` from the IPython store
# (presumably saved with %store by the earlier Scikit processing notebook).
%store -r scikit_processing_job_s3_output_prefix

In [None]:
# Echo the value restored from the IPython store.
# NOTE(review): the label says "Job Name" while the variable is an S3 output
# prefix -- confirm the two coincide.
print('Previous Scikit Processing Job Name: {}'.format(scikit_processing_job_s3_output_prefix))

In [None]:
# Bucket-relative key prefixes of the train / validation / test feature splits.
prefix_train, prefix_validation, prefix_test = (
    template.format(scikit_processing_job_s3_output_prefix)
    for template in ('{}/output/bert-train',
                     '{}/output/bert-validation',
                     '{}/output/bert-test')
)

# The same prefixes expressed as paths relative to the working directory.
path_train, path_validation, path_test = (
    './{}'.format(split_prefix)
    for split_prefix in (prefix_train, prefix_validation, prefix_test)
)

# Fully qualified S3 URIs of each split inside the default bucket.
train_s3_uri, validation_s3_uri, test_s3_uri = (
    's3://{}/{}'.format(bucket, split_prefix)
    for split_prefix in (prefix_train, prefix_validation, prefix_test)
)

# Input channel objects handed to the estimator (no content_type is set).
s3_input_train_data, s3_input_validation_data, s3_input_test_data = (
    sagemaker.s3_input(s3_data=split_uri)
    for split_uri in (train_s3_uri, validation_s3_uri, test_s3_uri)
)

# Show the channel configuration of each split, one per line.
print(s3_input_train_data.config,
      s3_input_validation_data.config,
      s3_input_test_data.config,
      sep='\n')

In [None]:
# Print the training entry-point script (tf_bert_reviews.py in src_bert_tf).
!cat src_bert_tf/tf_bert_reviews.py

In [None]:
from sagemaker.tensorflow import TensorFlow

# S3 location passed to the estimator as its output_path.
model_output_path = 's3://{}/models/tf-bert'.format(bucket)

# TensorFlow estimator that runs src_bert_tf/tf_bert_reviews.py on a single
# ml.c5.18xlarge instance. No hyperparameters are passed to the entry point.
bert_estimator = TensorFlow(
    entry_point='tf_bert_reviews.py',
    source_dir='src_bert_tf',
    role=role,
    # 1 is actually faster due to communication overhead with >1
    train_instance_count=1,
    train_instance_type='ml.c5.18xlarge',
    py_version='py3',
    framework_version='1.15.2',
    output_path=model_output_path,
    enable_cloudwatch_metrics=True,
)

# Train the model

In [None]:
# Start the training job on the train and validation channels; wait=False
# means this call does not block until the job completes.
bert_estimator.fit(
    inputs=dict(train=s3_input_train_data,
                validation=s3_input_validation_data),
    wait=False,
)

In [None]:
# Name of the most recent training job started by the estimator; later cells
# build the console links and the S3 download path from it.
training_job_name = bert_estimator.latest_training_job.name
print('training_job_name:  {}'.format(training_job_name))

In [None]:
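# Optional: re-create the estimator from an existing training job instead of
# launching a new one. Uncomment the lines below to use it.
# from sagemaker.tensorflow import TensorFlow

# bert_estimator = TensorFlow.attach(training_job_name=training_job_name)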

In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Link to this training job's page in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Link to the CloudWatch log streams whose names start with this training job's name.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [None]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# The estimator's output_path is s3://<bucket>/models/tf-bert and the model
# archive is downloaded from <output_path>/<training job name>/output/, so the
# job's key prefix inside the bucket is models/tf-bert/<training job name>.
training_job_s3_output_prefix = 'models/tf-bert/{}'.format(training_job_name)

# Link to that prefix in the S3 console.
display(HTML('<b>Review <a href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_s3_output_prefix, region)))


# Download and Load the Trained Model

In [None]:
# Download the model archive for this training job from S3 into ./models/bert-tf/.
# IPython substitutes $model_output_path and $training_job_name with the
# values of the Python variables of the same names.
!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/bert-tf/

# Equivalent command with explicit values (placeholders, fill in your own):
#!aws s3 cp s3://<bucket>/models/tf-bert/<training-job-name>/output/model.tar.gz ./models/bert-tf/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded model archive into ./models/bert-tf-model. The context
# manager closes the archive even if extraction raises.
with tarfile.open('./models/bert-tf/model.tar.gz') as tar:
    tar.extractall(path='./models/bert-tf-model')

In [None]:
# List the contents extracted from the model archive.
!ls -al ./models/bert-tf-model

In [None]:
# List the SavedModel export directory.
# NOTE(review): the numeric directory name (1585283912) looks like an export
# timestamp from one specific training run -- presumably it differs after
# retraining; confirm against the listing of ./models/bert-tf-model.
!ls -al ./models/bert-tf-model/tf-bert-model-oh-yeah/1585283912

In [None]:
# Must upgrade wrapt before installing TF
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==1.15.2
!pip install -q tensorflow-hub==0.7.0
!pip install -q bert-tensorflow==1.0.1

# Load the model

In [None]:
# TensorFlow is imported here, after the pip cell that installs the pinned
# version; without this import every later use of `tf` raises NameError on a
# fresh kernel.
import tensorflow as tf

# Load the exported SavedModel from the extracted archive.
# NOTE(review): the export directory name (1585283912) is hard-coded and is
# presumably specific to one training run -- confirm it matches the extracted
# directory before running.
saved_model = tf.saved_model.load_v2(
    './models/bert-tf-model/tf-bert-model-oh-yeah/1585283912',
    tags=None
)

In [None]:
# Callable for the SavedModel's "serving_default" signature; print what it
# takes as inputs and the structure of what it returns.
inference = saved_model.signatures["serving_default"]
print(inference.inputs, inference.structured_outputs, sep='\n')

# Predict 


In [None]:
from bert import tokenization
import tensorflow_hub as hub

# TF Hub handle of the BERT module whose tokenization assets are used.
BERT_MODEL_HUB = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"


def create_tokenizer_from_hub_module():
    """Build a FullTokenizer from the Hub module's vocab file and casing flag.

    The module's "tokenization_info" signature is evaluated in a throwaway
    graph and session to obtain `vocab_file` and `do_lower_case`.

    Returns:
        A `tokenization.FullTokenizer` configured with those two values.
    """
    with tf.Graph().as_default():
        module = hub.Module(BERT_MODEL_HUB)
        info = module(signature="tokenization_info", as_dict=True)
        fetches = [info["vocab_file"], info["do_lower_case"]]
        with tf.Session() as session:
            vocab_path, lower_case = session.run(fetches)

        return tokenization.FullTokenizer(vocab_file=vocab_path,
                                          do_lower_case=lower_case)
   

In [None]:
def get_predict_features(features, seq_length):
    """Pack per-example BERT features into batched int32 tensors.

    All examples are materialised as in-memory constants, so this is meant
    for small demo inputs and does not scale to large data sets.

    Args:
        features: Sized sequence of objects exposing `input_ids`,
            `input_mask`, `segment_ids` and `label_id` attributes.
        seq_length: Second dimension of the id/mask/segment tensors; each
            per-example list is expected to have this many entries.

    Returns:
        Tuple `(input_ids, input_mask, segment_ids, label_ids)` of `tf.int32`
        constants. The first three have shape `[num_examples, seq_length]`
        and `label_ids` has shape `[num_examples]`.
    """
    num_examples = len(features)

    def _int32_constant(values, shape):
        # Every output shares the dtype; only the values and shape vary.
        return tf.constant(values, shape=shape, dtype=tf.int32)

    matrix_shape = [num_examples, seq_length]
    input_ids = _int32_constant([f.input_ids for f in features], matrix_shape)
    input_mask = _int32_constant([f.input_mask for f in features], matrix_shape)
    segment_ids = _int32_constant([f.segment_ids for f in features], matrix_shape)

    # Must be a bare tensor: a trailing comma after this call would turn
    # label_ids into a 1-tuple and break callers that expect a tensor.
    label_ids = _int32_constant([f.label_id for f in features], [num_examples])

    return input_ids, input_mask, segment_ids, label_ids

In [None]:
from src_bert_tf import amazon_run_classifier

# Sequence length used when converting examples to features.
MAX_SEQ_LENGTH = 128
# Label values; a predicted index is mapped back through this list.
LABEL_VALUES = [1, 2, 3, 4, 5]


def predict(in_sentences):
    """Classify sentences with the loaded SavedModel serving signature.

    Args:
        in_sentences: Iterable of sentences to classify.

    Returns:
        List of `(sentence, probabilities, label)` tuples, one per input
        sentence, where `label` is the `LABEL_VALUES` entry at the predicted
        index.
    """
    # Materialise once: the sentences are walked twice (features, results).
    in_sentences = list(in_sentences)

    tokenizer = create_tokenizer_from_hub_module()
    print('**** TOKENIZER {}****'.format(tokenizer))

    # label=-1 is a placeholder for inference.
    # NOTE(review): confirm convert_examples_to_features accepts a label that
    # is not in LABEL_VALUES.
    input_examples = [
        amazon_run_classifier.InputExample(guid="", text_a=sentence, text_b=None, label=-1)
        for sentence in in_sentences
    ]
    input_features = amazon_run_classifier.convert_examples_to_features(
        input_examples, LABEL_VALUES, MAX_SEQ_LENGTH, tokenizer)

    input_ids, input_mask, segment_ids, label_ids = get_predict_features(
        input_features, MAX_SEQ_LENGTH)
    print(type(input_ids))

    # Tolerate a helper that hands back label_ids wrapped in a 1-tuple.
    if isinstance(label_ids, tuple) and len(label_ids) == 1:
        label_ids = label_ids[0]

    # Keep the signature's result; without it there is nothing to build the
    # return value from.
    # NOTE(review): assumes the result is a dict with 'probabilities' and
    # 'labels' entries holding one row per example, and that those tensors
    # can be iterated (i.e. eager execution) -- confirm against the printed
    # `inference.structured_outputs`.
    outputs = inference(input_ids=input_ids,
                        input_mask=input_mask,
                        segment_ids=segment_ids,
                        label_ids=label_ids)

    return [
        (sentence, probabilities, LABEL_VALUES[int(label_index)])
        for sentence, probabilities, label_index
        in zip(in_sentences, outputs['probabilities'], outputs['labels'])
    ]


In [None]:
import numpy as np

# Sample sentences to classify.
pred_sentences = [
  "That movie was absolutely awful",
  "The acting was a bit lacking",
  "The film was creative and surprising",
  "Absolutely fantastic!"
]

# predict() iterates the sentences and passes each one to the tokenizer as
# text, so pass the Python strings directly rather than a string tensor.
predictions = predict(pred_sentences)