In [2]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session plus the default bucket and execution role used by later cells.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# One boto3 session supplies both the region name and the low-level SageMaker client.
boto_session = boto3.Session()
region = boto_session.region_name
sm = boto_session.client(service_name='sagemaker', region_name=region)

# Specify the S3 Location of the Features

In [3]:
# Restore the variable persisted with %store by an earlier notebook; the cells below
# read it to build the S3 prefixes of the train/validation/test features.
%store -r scikit_processing_job_s3_output_prefix

In [4]:
# Echo the restored value so a missing or stale %store entry is noticed immediately.
print(
    'Previous Scikit Processing Job Name: {}'.format(
        scikit_processing_job_s3_output_prefix
    )
)

Previous Scikit Processing Job Name: sagemaker-scikit-learn-2020-03-25-05-52-04-408


In [5]:
# For each split: S3 key prefix, matching local relative path, full S3 URI, and the
# SageMaker input built from that URI. content_type is left unset on every input
# (the 'text/csv' setting was only ever present as a commented-out argument).

# --- train ---
prefix_train = '{}/output/bert-train'.format(scikit_processing_job_s3_output_prefix)
path_train = './{}'.format(prefix_train)
train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)
s3_input_train_data = sagemaker.s3_input(s3_data=train_s3_uri)

# --- validation ---
prefix_validation = '{}/output/bert-validation'.format(scikit_processing_job_s3_output_prefix)
path_validation = './{}'.format(prefix_validation)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)
s3_input_validation_data = sagemaker.s3_input(s3_data=validation_s3_uri)

# --- test ---
prefix_test = '{}/output/bert-test'.format(scikit_processing_job_s3_output_prefix)
path_test = './{}'.format(prefix_test)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)
s3_input_test_data = sagemaker.s3_input(s3_data=test_s3_uri)

# Print each input's channel configuration in train / validation / test order.
for channel_input in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel_input.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-25-05-52-04-408/output/bert-train', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-25-05-52-04-408/output/bert-validation', 'S3DataDistributionType': 'FullyReplicated'}}}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-25-05-52-04-408/output/bert-test', 'S3DataDistributionType': 'FullyReplicated'}}}


In [6]:
!cat src_bert/bert_reviews.py

import os
import argparse
import csv
import pickle as pkl
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
import sklearn
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
import re
import glob
import json
import numpy as np
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'simpletransformers'])
import torch
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed

import simpletransformers
from simpletransformers.classification import ClassificationModel

def load_dataset(path, sep, header):
    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)

    labels = data.iloc[:,0]
    features = data.drop(data.columns[0], axis=1)

    if header==None:
        # Adjust the column names after dropped the 0th 

In [7]:
from sagemaker.tensorflow import TensorFlow

# S3 location passed to the estimator as output_path; later cells build the
# model.tar.gz download path from this value and the training job name.
model_output_path = 's3://{}/models/tf-bert/script-mode/training-runs'.format(bucket)

# enable_cloudwatch_metrics is no longer passed: the SDK marks it deprecated and
# ignores it, so supplying it only produced a DeprecationWarning.
bert_estimator = TensorFlow(entry_point='tf_bert_reviews.py',
                         source_dir='src_bert_tf',
                         role=role,
                         train_instance_count=1, # 1 is actually faster due to communication overhead with >1
                         train_instance_type='ml.c5.18xlarge',
                         py_version='py3',
                         framework_version='1.15.2',
                         output_path=model_output_path,
#                         hyperparameters={'model_type':'bert',
#                                          'model_name': 'bert-base-cased'},
                         )

# Train the model

In [8]:
# Start training on the 'train' and 'validation' channels. wait=False is passed so
# the call does not block this cell; the job is followed via the links further down.
bert_estimator.fit(
    inputs=dict(
        train=s3_input_train_data,
        validation=s3_input_validation_data,
    ),
    wait=False,
)

In [9]:
# Capture the name of the job that fit() just launched; later cells use it to build
# the console links and the S3 path of the model artifact.
latest_job = bert_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name:  {}'.format(training_job_name))

training_job_name:  tensorflow-training-2020-03-25-07-01-00-916


In [10]:
# from sagemaker.tensorflow import TensorFlow

# bert_estimator = TensorFlow.attach(training_job_name=training_job_name)

In [11]:
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a clickable link to this training job in the SageMaker console.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [12]:
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Render a clickable link to the CloudWatch log streams whose prefix is the job name.
display(HTML('<b>Review <a href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a> After About 5 Minutes</b>'.format(region, training_job_name)))


In [13]:
# display/HTML are imported from the public IPython.display module; importing
# display from IPython.core.display is deprecated.
from IPython.display import display, HTML

# The training output is not keyed by the job name alone: it sits under the key
# prefix used in the estimator's output_path (models/tf-bert/script-mode/training-runs),
# followed by the job name. Keep this literal in sync with model_output_path.
training_job_s3_output_prefix = 'models/tf-bert/script-mode/training-runs/{}'.format(training_job_name)

# Render a clickable link to that prefix in the S3 console.
display(HTML('<b>Review <a href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'.format(bucket, training_job_s3_output_prefix, region)))


# Download and Load the Trained Model

In [43]:
# download the model artifact from AWS S3
!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/bert-tf/

#!aws s3 cp s3://sagemaker-us-east-1-835319576252/models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-24-04-41-39-405/output/model.tar.gz ./models/bert-tf/

fatal error: An error occurred (404) when calling the HeadObject operation: Key "models/tf-bert/script-mode/training-runs/tensorflow-training-2020-03-25-06-07-46-309/output/model.tar.gz" does not exist


In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded model artifact. The context manager closes the archive even
# if extraction raises, which the explicit open()/close() pair did not guarantee.
# NOTE(review): extractall() trusts the member paths stored in the archive; that is
# acceptable only because this file is the output of our own training job.
with tarfile.open('./models/bert-tf/model.tar.gz') as tar:
    tar.extractall(path='./models/bert-tf-model')

In [None]:
# List the directory the model archive was extracted into, to inspect its contents.
!ls -al ./models/bert-tf-model

In [None]:
# TODO:  We need to install tf

In [None]:
#!pip install tensorflow==1.15.2

In [None]:
# from bert import run_classifier


In [None]:
# #import tensorflow as tf

# with tf.Session() as sess:
#     model = tf.train.import_meta_graph('./models/bert-tf-model/model.ckpt-100.meta')
#     model.restore(sess, tf.train.latest_checkpoint('./'))
#     print(model)
#     #print(sess.run('w1:0'))

In [None]:
#saver = tf.train.import_meta_graph('./models/bert-tf-model/model.ckpt-100.meta')

In [None]:
# TODO:  Load the model

# Predict 


In [None]:
# def get_prediction(in_sentences):
#   labels = ["1", "2", "3", "4", "5"]
#   input_examples = [run_classifier.InputExample(guid="", text_a = x, text_b = None, label = 0) for x in in_sentences] # here, "" is just a dummy label
#   input_features = run_classifier.convert_examples_to_features(input_examples, label_list, MAX_SEQ_LENGTH, tokenizer)
#   predict_input_fn = run_classifier.input_fn_builder(features=input_features, seq_length=MAX_SEQ_LENGTH, is_training=False, drop_remainder=False)
#   predictions = estimator.predict(predict_input_fn)
#   return [(sentence, prediction['probabilities'], labels[prediction['labels']]) for sentence, prediction in zip(in_sentences, predictions)]

In [None]:
# pred_sentences = [
#   "That movie was absolutely awful",
#   "The acting was a bit lacking",
#   "The film was creative and surprising",
#   "Absolutely fantastic!"
# ]

In [None]:
# predictions = get_prediction(pred_sentences)
