In [34]:
import boto3
import sagemaker
import pandas as pd

# SageMaker-level handles: session, its default S3 bucket, and the IAM role
# that training jobs launched from this notebook will run under.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region is taken from the ambient boto3 configuration; the same boto3 session
# also provides the low-level SageMaker API client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [35]:
# Restore `scikit_processing_job_s3_output_prefix` from IPython's %store database.
# NOTE(review): presumably written by the earlier feature-processing notebook via
# `%store` — this cell fails on a fresh environment if that notebook has not run.
%store -r scikit_processing_job_s3_output_prefix

In [36]:
# Echo the restored value so the reader can confirm which processing run is used.
print('Previous Scikit Processing Job Name:', scikit_processing_job_s3_output_prefix)

Previous Scikit Processing Job Name: sagemaker-scikit-learn-2020-03-30-03-34-18-188


In [37]:
def _split_locations(split):
    """Return (S3 key prefix, local relative path, S3 URI) for one data split.

    `split` is the suffix of the processing job's output folder, i.e. the
    folder is named `bert-<split>` under `<processing prefix>/output/`.
    """
    key_prefix = '{}/output/bert-{}'.format(scikit_processing_job_s3_output_prefix, split)
    local_path = './{}'.format(key_prefix)
    s3_uri = 's3://{}/{}'.format(bucket, key_prefix)
    return key_prefix, local_path, s3_uri


prefix_train, path_train, train_s3_uri = _split_locations('train')
prefix_validation, path_validation, validation_s3_uri = _split_locations('validation')
prefix_test, path_test, test_s3_uri = _split_locations('test')

In [38]:
# Every channel uses the same S3 distribution strategy.
shard_by_key = 'ShardedByS3Key'

s3_input_train_data = sagemaker.s3_input(s3_data=train_s3_uri, distribution=shard_by_key)
s3_input_validation_data = sagemaker.s3_input(s3_data=validation_s3_uri, distribution=shard_by_key)
# NOTE(review): only the test channel declares content_type='text/csv'; the train and
# validation channels leave it unset — confirm whether this difference is intended.
s3_input_test_data = sagemaker.s3_input(s3_data=test_s3_uri, distribution=shard_by_key, content_type='text/csv')

# Show the channel definitions exactly as they will be sent to SageMaker.
for channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-30-03-34-18-188/output/bert-train', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-30-03-34-18-188/output/bert-validation', 'S3DataDistributionType': 'ShardedByS3Key'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-30-03-34-18-188/output/bert-test', 'S3DataDistributionType': 'ShardedByS3Key'}}, 'ContentType': 'text/csv'}


In [39]:
# Print the training script for reference.
# NOTE(review): this displays src_bert_tf2/tf_bert_reviews.py, but the estimator
# below is configured with source_dir='new_src_bert_tf2' — confirm that the script
# shown here is the one that actually gets trained.
!cat src_bert_tf2/tf_bert_reviews.py

import sys
import subprocess
import argparse
import json
#subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'tensorflow-gpu==2.2.0-rc2'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'bert-for-tf2'])
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'sentencepiece'])

import tensorflow as tf
print(tf.__version__)

import boto3
import pandas as pd

import os
import math
import datetime

from tqdm import tqdm

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow import keras
from glob import glob 

from bert.model import BertModelLayer
from bert.loader import StockBertConfig, map_stock_config_to_params, load_stock_weights
from bert.tokenization.bert_tokenization import FullTokenizer

from sklearn.metrics import confusion_matrix, classification_report

import os

os.system('rm uncased_L-12_H-768_A-12.zip')
os.system('rm -rf uncased_L-12_H-768_A-12')

os.system('wget -q https://stor

In [40]:
from sagemaker.tensorflow import TensorFlow

# Training jobs write their artifacts under this S3 location; a later cell downloads
# <model_output_path>/<training job name>/output/model.tar.gz from it.
model_output_path = 's3://{}/models/tf2-bert'.format(bucket)

# NOTE(review): source_dir is 'new_src_bert_tf2', while the preview cell earlier in
# the notebook cats src_bert_tf2/tf_bert_reviews.py — confirm both are the same script.
#
# `enable_cloudwatch_metrics` is intentionally not passed: the SDK treats it as
# deprecated and ignores it, so setting it only produces a DeprecationWarning.
bert_estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                            source_dir='new_src_bert_tf2',
                            role=role,
                            train_instance_count=1, # 1 is actually faster due to communication overhead with >1
                            train_instance_type='ml.p3.8xlarge',
                            py_version='py3',
                            framework_version='2.0.0',
                            output_path=model_output_path,
                            distributions={'parameter_server': {'enabled': True}},
                            input_mode='Pipe')

# Train the model

In [41]:
# Channels handed to the training job; the test channel is not used for training.
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
}

# wait=False: start the job and return without blocking the notebook on completion.
bert_estimator.fit(inputs=training_channels, wait=False)

In [42]:
# Remember the name of the job just launched; later cells build console links
# and the S3 download path from it.
latest_job = bert_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name: ', training_job_name)

training_job_name:  tensorflow-training-2020-04-02-07-21-25-367


In [43]:
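# Optional: uncomment the lines below to re-create `bert_estimator` from an
# existing training job (identified by `training_job_name`) instead of
# launching a new one.
# from sagemaker.tensorflow import TensorFlow

# bert_estimator = TensorFlow.attach(training_job_name=training_job_name)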

In [44]:
# Import from the public IPython.display module; importing these names from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Console link to the training job launched above, in the notebook's region.
training_job_url = 'https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}'.format(region, training_job_name)
display(HTML('<b>Review <a href="{}">Training Job</a> After About 5 Minutes</b>'.format(training_job_url)))


In [45]:
# Import from the public IPython.display module; importing these names from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Console link to the CloudWatch log streams whose names start with the job name.
cloudwatch_logs_url = 'https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix'.format(region, training_job_name)
display(HTML('<b>Review <a href="{}">CloudWatch Logs</a> After About 5 Minutes</b>'.format(cloudwatch_logs_url)))


In [46]:
# Import from the public IPython.display module; importing these names from
# IPython.core.display is deprecated.
from IPython.display import display, HTML

# Key prefix (inside `bucket`) for this job's output: the 'models/tf2-bert' part
# matches the estimator's output_path, followed by the training job name.
training_job_s3_output_prefix = 'models/tf2-bert/{}'.format(training_job_name)

# Console link to that prefix; it only has content once the job has finished.
s3_console_url = 'https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview'.format(bucket, training_job_s3_output_prefix, region)
display(HTML('<b>Review <a href="{}">S3 Output Data</a> After The Training Job Has Completed</b>'.format(s3_console_url)))


# Download and Load the Trained Model

In [None]:
# download the model artifact from AWS S3

!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models

#!aws s3 cp s3://sagemaker-us-east-1-835319576252/models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-24-04-41-39-405/output/model.tar.gz ./models/tf2-bert/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded artifact next to the archive. The context manager closes
# the archive even if extraction raises part-way through.
with tarfile.open('./models/model.tar.gz') as tar:
    tar.extractall(path='./models')

In [None]:
# List ./models to check what the extraction step produced.
!ls -al ./models

In [None]:
# Must upgrade wrapt before installing TF
!pip install -q pip --upgrade
!pip install -q wrapt --upgrade --ignore-installed
!pip install -q tensorflow==2.0.0

# Load the model

In [None]:
# TODO: load the trained model from the files extracted into ./models