In [None]:
!pip install -q boto3

In [None]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session plus the default bucket and IAM role used by the cells below.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region of the default boto3 session; the low-level SageMaker client is bound to it.
region = boto3.Session().region_name
sm = boto3.Session().client(
    service_name='sagemaker',
    region_name=region,
)

# Specify the S3 Location of the Features

In [None]:
# Restore a variable previously persisted with `%store` (presumably by the notebook
# that ran the Scikit processing job -- confirm). The cells below use it as the
# S3 key prefix under which the processed train/validation/test splits live.
%store -r scikit_processing_job_s3_output_prefix

In [None]:
# Echo the restored value so a missing/stale stored variable is noticed before it is
# used to build S3 paths. NOTE(review): the label says "Job Name" while the variable
# is named "..._s3_output_prefix" -- confirm the two are the same string.
print('Previous Scikit Processing Job Name: {}'.format(scikit_processing_job_s3_output_prefix))

In [28]:
# For each dataset split build, in order: the key prefix inside the bucket, the
# matching relative local path, the full S3 URI, and the CSV input channel.
job_prefix = scikit_processing_job_s3_output_prefix

# --- train split ---
prefix_train = '{}/output/raw-labeled-split-balanced-header-train'.format(job_prefix)
path_train = './{}'.format(prefix_train)
train_s3_uri = 's3://{}/{}'.format(bucket, prefix_train)
s3_input_train_data = sagemaker.s3_input(s3_data=train_s3_uri, content_type='text/csv')

# --- validation split ---
prefix_validation = '{}/output/raw-labeled-split-balanced-header-validation'.format(job_prefix)
path_validation = './{}'.format(prefix_validation)
validation_s3_uri = 's3://{}/{}'.format(bucket, prefix_validation)
s3_input_validation_data = sagemaker.s3_input(s3_data=validation_s3_uri, content_type='text/csv')

# --- test split ---
prefix_test = '{}/output/raw-labeled-split-balanced-header-test'.format(job_prefix)
path_test = './{}'.format(prefix_test)
test_s3_uri = 's3://{}/{}'.format(bucket, prefix_test)
s3_input_test_data = sagemaker.s3_input(s3_data=test_s3_uri, content_type='text/csv')

# Show the channel configuration that will be handed to the training job.
for channel in (s3_input_train_data, s3_input_validation_data, s3_input_test_data):
    print(channel.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-21-00-51-49-387/output/raw-labeled-split-balanced-header-train', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-21-00-51-49-387/output/raw-labeled-split-balanced-header-validation', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}
{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/sagemaker-scikit-learn-2020-03-21-00-51-49-387/output/raw-labeled-split-balanced-header-test', 'S3DataDistributionType': 'FullyReplicated'}}, 'ContentType': 'text/csv'}


In [29]:
# Print the training script; the estimator cell below references it via
# entry_point='bert_reviews.py' and source_dir='src_bert'.
!cat src_bert/bert_reviews.py

import os
import argparse
import csv
import pickle as pkl
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
import sklearn
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
import re
import glob
import json
import subprocess
import sys
subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'simpletransformers'])
import torch
import torch.distributed as dist
import torch.utils.data
import torch.utils.data.distributed

import simpletransformers
from simpletransformers.classification import ClassificationModel

# Note:  header=None
def load_dataset(path, sep, header):
    data = pd.concat([pd.read_csv(f, sep=sep, header=header) for f in glob.glob('{}/*.csv'.format(path))], ignore_index = True)

    labels = data.iloc[:,0]
    features = data.drop(data.columns[0], axis=1)
    
    if header==None:
        # Adjust the column names after dropped th

In [30]:
from sagemaker.pytorch import PyTorch

# S3 location under which the training job writes its output (model artifact).
model_output_path = 's3://{}/models/bert/script-mode/training-runs'.format(bucket)

# Hyperparameters handed to the entry-point script.
bert_hyperparameters = {
    'model_type': 'bert',
    'model_name': 'bert-base-cased',
    'backend': 'gloo',
}

# Script-mode PyTorch estimator: runs src_bert/bert_reviews.py on two
# ml.c5.9xlarge instances with the PyTorch 1.4.0 / Python 3 container.
bert_estimator = PyTorch(
    entry_point='bert_reviews.py',
    source_dir='src_bert',
    role=role,
    train_instance_count=2,
    train_instance_type='ml.c5.9xlarge',
    py_version='py3',
    framework_version='1.4.0',
    output_path=model_output_path,
    hyperparameters=bert_hyperparameters,
    enable_cloudwatch_metrics=True,
)

### Train the model

In [31]:
# Channels passed to the training job; only the train and validation splits are used.
training_channels = {
    'train': s3_input_train_data,
    'validation': s3_input_validation_data,
}

# wait=False: start the job and return immediately instead of blocking this cell.
bert_estimator.fit(inputs=training_channels, wait=False)

In [32]:
# Keep the name of the job that was just started; later cells use it to
# re-attach to the job and to locate its model artifact in S3.
latest_job = bert_estimator.latest_training_job
training_job_name = latest_job.name
print('training_job_name:  {}'.format(training_job_name))

training_job_name:  pytorch-training-2020-03-21-05-09-07-267


In [None]:
from sagemaker.pytorch import PyTorch

# Re-bind the estimator to the (already started) training job by name.
# The saved output of this cell is the job's status and log stream.
bert_estimator = PyTorch.attach(
    training_job_name=training_job_name,
)

2020-03-21 05:09:34 Starting - Starting the training job...
2020-03-21 05:09:36 Starting - Launching requested ML instances.........
2020-03-21 05:11:08 Starting - Preparing the instances for training...
2020-03-21 05:11:49 Downloading - Downloading input data
2020-03-21 05:11:49 Training - Downloading the training image...
2020-03-21 05:12:23 Training - Training image download completed. Training in progress.[35mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[35mbash: no job control in this shell[0m
[35m2020-03-21 05:12:23,516 sagemaker-containers INFO     Imported framework sagemaker_pytorch_container.training[0m
[35m2020-03-21 05:12:23,518 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[35m2020-03-21 05:12:23,527 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job con

[35m#015Downloading:   0%|          | 0.00/939 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 939/939 [00:00<00:00, 975kB/s]
  Downloading simpletransformers-0.22.1-py3-none-any.whl (144 kB)[0m
[34m#015Downloading:   0%|          | 0.00/939 [00:00<?, ?B/s]#015Downloading: 100%|██████████| 939/939 [00:00<00:00, 1.18MB/s]
  Downloading simpletransformers-0.22.1-py3-none-any.whl (144 kB)[0m
[35m#015Downloading:   0%|          | 0.00/263M [00:00<?, ?B/s]#015Downloading:   3%|▎         | 7.84M/263M [00:00<00:03, 78.4MB/s]#015Downloading:   6%|▌         | 16.0M/263M [00:00<00:03, 79.4MB/s]#015Downloading:   9%|▉         | 24.3M/263M [00:00<00:02, 80.5MB/s]#015Downloading:  12%|█▏        | 32.6M/263M [00:00<00:02, 81.1MB/s]#015Downloading:  16%|█▌        | 40.9M/263M [00:00<00:02, 81.8MB/s]#015Downloading:  19%|█▉        | 49.4M/263M [00:00<00:02, 82.6MB/s]#015Downloading:  22%|██▏       | 57.9M/263M [00:00<00:02, 83.2MB/s]#015Downloading:  25%|██▌       | 66.4M/263M [00:00<00:02, 83.7

[35m#015Current iteration:   3%|▎         | 7/225 [00:04<02:03,  1.77it/s]#033[A[0m
[35m#015Current iteration:   4%|▎         | 8/225 [00:04<02:00,  1.80it/s]#033[A[0m
[34m#015Current iteration:   3%|▎         | 7/225 [00:04<02:05,  1.74it/s]#033[A
  Downloading seqeval-0.0.12.tar.gz (21 kB)[0m
[34m#015Current iteration:   4%|▎         | 8/225 [00:04<02:02,  1.77it/s]#033[A[0m
[35m#015Current iteration:   4%|▍         | 9/225 [00:05<01:58,  1.82it/s]#033[A[0m
[35mCollecting seqeval[0m
[35m#015Current iteration:   4%|▍         | 10/225 [00:05<01:56,  1.84it/s]#033[A
  Downloading seqeval-0.0.12.tar.gz (21 kB)[0m
[34m#015Current iteration:   4%|▍         | 9/225 [00:05<01:59,  1.81it/s]#033[A[0m
[34m#015Current iteration:   4%|▍         | 10/225 [00:05<01:58,  1.81it/s]#033[A[0m
[35m#015Current iteration:   5%|▍         | 11/225 [00:06<01:55,  1.86it/s]#033[A[0m
[35mCollecting sentencepiece[0m
[35m#015Current iteration:   5%|▌         | 12/225 [00:06<01:54,  1.86it

[35m#015Current iteration:  20%|██        | 45/225 [00:24<01:37,  1.85it/s]#033[A
  Building wheel for seqeval (setup.py): finished with status 'done'[0m
[35m#015Current iteration:  20%|██        | 46/225 [00:24<01:36,  1.85it/s]#033[A
  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7423 sha256=c8d4293b72ce186c8402278cbcdbb1849c561cf69661382ff951db3033c13c28[0m
[34m#015Current iteration:  20%|█▉        | 44/225 [00:24<01:40,  1.81it/s]#033[A
  Building wheel for seqeval (setup.py): finished with status 'done'[0m
[34m#015Current iteration:  20%|██        | 45/225 [00:24<01:39,  1.81it/s]#033[A
  Created wheel for seqeval: filename=seqeval-0.0.12-py3-none-any.whl size=7423 sha256=c8d4293b72ce186c8402278cbcdbb1849c561cf69661382ff951db3033c13c28[0m
[35m#015Current iteration:  21%|██        | 47/225 [00:25<01:36,  1.84it/s]#033[A
  Stored in directory: /root/.cache/pip/wheels/1f/1b/a6/a808a7e4d1f7584e42f5e279664cd48bf24ed8392218ce6be4[0m
[35m#015Current

# Load the Model

In [None]:
# Download the training job's model artifact (model.tar.gz) from S3 into ./models/bert/.
# IPython expands $model_output_path and $training_job_name from the notebook's
# Python variables before the shell command runs.
!aws s3 cp $model_output_path/$training_job_name/output/model.tar.gz ./models/bert/

In [None]:
import tarfile
import pickle as pkl

# Unpack the downloaded model archive into ./models/bert-model.
# The context manager guarantees the archive handle is closed even when
# extraction raises (the original explicit close() was skipped on error).
# NOTE(review): extractall() trusts the member paths stored in the archive; that is
# acceptable for an artifact produced by our own training job, but do not reuse
# this cell for archives from untrusted sources.
with tarfile.open('./models/bert/model.tar.gz') as tar:
    tar.extractall(path='./models/bert-model')

In [None]:
from simpletransformers.classification import ClassificationModel

# Options forwarded to simpletransformers: no mixed precision, 128-token max sequence length.
args = {'fp16': False, 'max_seq_length': 128}

# Load the extracted model from the local directory for CPU-only inference.
# NOTE(review): the training cell passes 'model_type': 'bert' as a hyperparameter
# while this cell loads 'distilbert' -- confirm which architecture the training
# script actually saves before changing either value.
bert_model = ClassificationModel(
    model_type='distilbert',  # bert, distilbert, etc, etc.
    model_name='./models/bert-model',
    args=args,
    use_cuda=False,
)

# Predict 


In [None]:
# Smoke-test the loaded model on a single, clearly positive review.
positive_reviews = ['I really enjoyed this item.  I highly recommend it.']
predictions, raw_outputs = bert_model.predict(positive_reviews)

print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))

In [None]:
# Smoke-test the loaded model on a single, clearly negative review.
negative_reviews = ['This item is awful and terrible.']
predictions, raw_outputs = bert_model.predict(negative_reviews)

print('Predictions: {}'.format(predictions))
print('Raw outputs: {}'.format(raw_outputs))