In [1]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, the execution role, and the session's default S3 bucket.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sess.default_bucket()

# The region is read from a default boto3 session; the low-level SageMaker
# client is then created against that same region.
region = boto3.Session().region_name
sm = boto3.Session().client('sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [8]:
# Reload `scikit_processing_job_s3_output_prefix` from IPython's %store persistence.
# It must have been saved with `%store` beforehand — presumably by the notebook that
# ran the Scikit processing job; confirm that notebook was executed first.
%store -r scikit_processing_job_s3_output_prefix

In [9]:
# Echo the restored value so the reader can see which processing run is being used.
previous_job_message = 'Previous Scikit Processing Job Name: {}'.format(scikit_processing_job_s3_output_prefix)
print(previous_job_message)

Previous Scikit Processing Job Name: sagemaker-scikit-learn-2020-03-24-01-28-42-203


In [10]:
# One S3 key prefix per data split, all rooted at the processing job's output prefix.
prefix_train, prefix_validation, prefix_test = [
    template.format(scikit_processing_job_s3_output_prefix)
    for template in (
        '{}/output/bert-labeled-split-balanced-header-train',
        '{}/output/bert-labeled-split-balanced-header-validation',
        '{}/output/bert-labeled-split-balanced-header-test',
    )
]

# The same prefixes expressed as paths relative to the current directory.
path_train, path_validation, path_test = [
    './{}'.format(prefix) for prefix in (prefix_train, prefix_validation, prefix_test)
]

# Full S3 URIs for each split inside `bucket`.
train_s3_uri, validation_s3_uri, test_s3_uri = [
    's3://{}/{}'.format(bucket, prefix) for prefix in (prefix_train, prefix_validation, prefix_test)
]

# Wrap each URI as a SageMaker input channel; content_type is not specified.
s3_input_train_data, s3_input_validation_data, s3_input_test_data = [
    sagemaker.s3_input(s3_data=uri) for uri in (train_s3_uri, validation_s3_uri, test_s3_uri)
]

# Print the three channel configs, one per line, in train / validation / test order.
print(
    *(channel.config for channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data)),
    sep='\n'
)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-24-01-28-42-203/output/bert-labeled-split-balanced-header-train', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-24-01-28-42-203/output/bert-labeled-split-balanced-header-validation', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-24-01-28-42-203/output/bert-labeled-split-balanced-header-test', 'S3DataDistributionType': 'FullyReplicated'}}}


In [11]:
!cat src_bert/bert_reviews.py

import os
import argparse
import csv
import pickle as pkl
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
import sklearn
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
import re
import glob
import json
import numpy as np
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'simpletransformers'])
import torch
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed

import simpletransformers
from simpletransformers.classification import ClassificationModel

def load_dataset(path, sep, header):
    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)

    labels = data.iloc[:,0]
    features = data.drop(data.columns[0], axis=1)

    if header==None:
        # Adjust the column names after dropped the 0th 

In [20]:
from sagemaker.tensorflow import TensorFlow

# S3 location passed to the estimator as `output_path` for training-run artifacts.
model_output_path = 's3://{}/models/tf-bert/script-mode/training-runs'.format(bucket)

# Estimator that runs src_bert_tf/tf_bert_reviews.py with TensorFlow 1.15.2 on Python 3.
# `enable_cloudwatch_metrics` is intentionally not passed: the SageMaker Python SDK
# documents it as deprecated and ignored (training jobs emit CloudWatch metrics regardless).
bert_estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                            source_dir='src_bert_tf',
                            role=role,
                            train_instance_count=1, # 1 is actually faster due to communication overhead with >1
                            train_instance_type='ml.c5.9xlarge',
                            py_version='py3',
                            framework_version='1.15.2',
                            output_path=model_output_path,
                            hyperparameters={'model_type': 'bert',
                                             'model_name': 'bert-base-cased'})

### Train the model

In [21]:
# Channels handed to the training job. With wait=False the call is asked not to
# block on the job, so the notebook can continue while training runs.
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
}
bert_estimator.fit(inputs=training_channels, wait=False)

In [22]:
# Keep the name of the most recently launched job; later cells use it to
# re-attach to the job and to locate its model artifact in S3.
latest_job = bert_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name:  {}'.format(training_job_name))

training_job_name:  tensorflow-training-2020-03-24-02-03-34-945


In [23]:
from sagemaker.tensorflow import TensorFlow

# Rebuild the estimator from the job name alone. TensorFlow is re-imported here so the
# cell only needs `training_job_name` to be defined.
bert_estimator = TensorFlow.attach(training_job_name)

2020-03-24 02:04:30 Starting - Starting the training job...
2020-03-24 02:04:32 Starting - Launching requested ML instances......
2020-03-24 02:05:39 Starting - Preparing the instances for training...
2020-03-24 02:06:25 Downloading - Downloading input data...
[0m
[34m2020-03-24 02:06:53,483 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2020-03-24 02:06:53,489 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-03-24 02:07:43,277 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-03-24 02:07:43,290 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-03-24 02:07:43,303 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-03-24 02:07:43,312 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_


2020-03-24 02:07:58 Uploading - Uploading generated training model
2020-03-24 02:07:58 Failed - Training job failed
[34mCollecting tensorflow-hub==0.7.0
  Downloading tensorflow_hub-0.7.0-py2.py3-none-any.whl (89 kB)[0m
[34mInstalling collected packages: tensorflow-hub[0m
[34mSuccessfully installed tensorflow-hub-0.7.0[0m
[34mCollecting bert-tensorflow==1.0.1
  Downloading bert_tensorflow-1.0.1-py2.py3-none-any.whl (67 kB)[0m
[34mInstalling collected packages: bert-tensorflow[0m
[34mSuccessfully installed bert-tensorflow-1.0.1[0m
[0m
[34m1.15.2[0m
[0m
[0m
[34m['/opt/ml/input/data/train/part-algo-1-amazon_reviews_us_Software_v1_00.csv'][0m
[34mBeginning Training![0m
[34m[2020-03-24 02:07:53.243 ip-10-0-68-245.ec2.internal:98 INFO json_config.py:90] Creating hook from json_config at /opt/ml/input/config/debughookconfig.json.[0m
[34m[2020-03-24 02:07:53.243 ip-10-0-68-245.ec2.internal:98 INFO hook.py:170] tensorboard_dir has not been set for the hook. SMDebug will

UnexpectedStatusException: Error for Training job tensorflow-training-2020-03-24-02-03-34-945: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/usr/bin/python3 tf_bert_reviews.py --model_dir s3://sagemaker-us-east-1-835319576252/models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-24-02-03-34-945/model --model_name bert-base-cased --model_type bert"

# Load the Model

In [None]:
# download the model artifact from AWS S3
!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/bert-tf/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded model archive into ./models/bert-tf-model.
# The context manager guarantees the archive is closed even if extraction raises.
with tarfile.open('./models/bert-tf/model.tar.gz') as tar:
    tar.extractall(path='./models/bert-tf-model')

In [None]:
# NOTE(review): this cell is entirely commented out, so it does not define `bert_model`,
# which the Predict cells call. The loader sketched below is the simpletransformers
# ClassificationModel with model_type='distilbert' and model_name='./models/bert-model',
# whereas this notebook extracts its archive to './models/bert-tf-model' and configures
# the estimator with model_type 'bert'. Confirm how the TensorFlow artifact is meant to
# be loaded before enabling this cell.

# from simpletransformers.classification import ClassificationModel

# args = {
#    'fp16': False,
#    'max_seq_length': 128
# }

# bert_model = ClassificationModel(model_type='distilbert', # bert, distilbert, etc, etc.
#                                  model_name='./models/bert-model',
#                                  args=args,
#                                  use_cuda=False)

# Predict 


In [None]:
# NOTE(review): `bert_model` is not assigned by any active cell visible in this part of
# the notebook — confirm it is defined before running this cell.
sample_review = """I really enjoyed this item.  I highly recommend it."""
predictions, raw_outputs = bert_model.predict([sample_review])

# Show both the predicted labels and the raw model outputs for the sample.
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# NOTE(review): `bert_model` is not assigned by any active cell visible in this part of
# the notebook — confirm it is defined before running this cell.
sample_review = """This item is awful and terrible."""
predictions, raw_outputs = bert_model.predict([sample_review])

# Show both the predicted labels and the raw model outputs for the sample.
print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))