In [1]:
import sagemaker
from pathlib import Path
from sagemaker.predictor import json_serializer
import json

In [2]:
# Session wrapper used for every S3 upload and for the estimator below.
sagemaker_session = sagemaker.Session()

# IAM role the training job / endpoint will run under.
role = sagemaker.get_execution_role()

## Setup Path 

In [3]:
# Local root of the sample data set. The cells below upload
# data/train_sample.csv, data/val_sample.csv and label/labels.csv from here.
DATA_PATH = Path("../sample_data/multi_label_toxic_comments/")

# Directory that will hold training_config.json (created if missing;
# DATA_PATH itself must already exist).
CONFIG_PATH = DATA_PATH.joinpath('config')
CONFIG_PATH.mkdir(exist_ok=True)

# Default S3 bucket of the current SageMaker session.
bucket = sagemaker_session.default_bucket()

# S3 key prefixes: training input channel and job output.
prefix = 'toxic_comments/input'
prefix_output = 'toxic_comments/output'

## Upload Data

In [4]:
# Upload the whole local data directory (sub-folders included) to the S3
# input prefix. The returned S3 URI is what estimator.fit() later receives
# as the training input.
#
# The path is passed as a plain string, like every other upload_data call in
# this notebook. NOTE(review): the SDK appears to compare this argument
# against os.walk()'s string paths when building S3 keys, so a pathlib.Path
# can leave top-level files under an extra "./" key segment - confirm
# against the installed SDK version.
s3_input = sagemaker_session.upload_data(str(DATA_PATH), bucket=bucket, key_prefix=prefix)

's3://sagemaker-us-east-1-835319576252/toxic_comments/input/labels.csv'

In [7]:
# Put labels.csv directly under the input prefix (no sub-folder), matching
# the bare file name used for "label_file" in training_config.
sagemaker_session.upload_data(
    str(DATA_PATH / 'label' / 'labels.csv'),
    bucket=bucket,
    key_prefix=prefix,
)

's3://sagemaker-us-east-1-835319576252/toxic_comments/input/labels.csv'

In [5]:
# Put train_sample.csv directly under the input prefix (no sub-folder),
# matching the bare file name used for "train_file" in training_config.
sagemaker_session.upload_data(
    str(DATA_PATH / 'data' / 'train_sample.csv'),
    bucket=bucket,
    key_prefix=prefix,
)

's3://sagemaker-us-east-1-835319576252/toxic_comments/input/train_sample.csv'

In [6]:
# Put val_sample.csv directly under the input prefix (no sub-folder),
# matching the bare file name used for "val_file" in training_config.
sagemaker_session.upload_data(
    str(DATA_PATH / 'data' / 'val_sample.csv'),
    bucket=bucket,
    key_prefix=prefix,
)

's3://sagemaker-us-east-1-835319576252/toxic_comments/input/val_sample.csv'

## Hyperparameters & Training Config

In [8]:
# Hyperparameters handed to the estimator (and from there to the training
# container) via its `hyperparameters` argument.
hyperparameters = dict(
    epochs=10,
    lr=8e-5,
    max_seq_length=512,
    train_batch_size=16,
    lr_schedule="warmup_cosine",
    warmup_steps=1000,
    optimizer_type="adamw",
)

In [9]:
# Settings read by the training container. The job log further down shows it
# loading them from /opt/ml/input/data/training/config/training_config.json.
#
# Most values are strings. The container's train script evaluates
# bool(training_config["do_lower_case"]) (see the traceback further down),
# so for that key any non-empty string - including "False" - is truthy.
training_config = {
    "run_text": "toxic comments",
    "finetuned_model": None,
    "do_lower_case": "True",
    "train_file": "train_sample.csv",
    "val_file": "val_sample.csv",
    "label_file": "labels.csv",
    "text_col": "comment_text",
    "label_col": '["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]',
    "multi_label": "True",
    "grad_accumulation_steps": "1",
    "fp16_opt_level": "O1",
    "fp16": "True",
    "model_type": "roberta",
    "model_name": "roberta-base",
    "logging_steps": "300"
}

config_file = CONFIG_PATH/'training_config.json'
with open(config_file, 'w') as f:
    json.dump(training_config, f)

# The directory upload that produced s3_input ran before this file was
# written, so on a fresh "Restart & Run All" S3 would hold no config (or a
# stale one from an earlier run). Upload the file now, to the key that maps
# to the config/training_config.json path the container reads.
sagemaker_session.upload_data(str(config_file), bucket=bucket, key_prefix=prefix + '/config')

## Create an Estimator and start training

In [15]:
# Build the ECR URI of the training image from the account and region of
# the boto session that backs the SageMaker session.
boto_session = sagemaker_session.boto_session
account = boto_session.client('sts').get_caller_identity()['Account']
region = boto_session.region_name

image = "{}.dkr.ecr.{}.amazonaws.com/fluent-sagemaker-fast-bert:1.0-gpu-py36".format(account, region)
print(image)

835319576252.dkr.ecr.us-east-1.amazonaws.com/fluent-sagemaker-fast-bert:1.0-gpu-py36


In [16]:
# S3 URI passed to the estimator as its output location.
output_path = "s3://{}/{}".format(
    bucket,
    prefix_output,
)

In [19]:
# Generic estimator around the custom training image built above.
# NOTE(review): train_instance_count / train_instance_type look like the
# SageMaker Python SDK v1 spellings - confirm before upgrading the SDK.
estimator = sagemaker.estimator.Estimator(
    image,
    role,
    train_instance_count=1,
    train_instance_type='ml.p3.8xlarge',
    output_path=output_path,
    base_job_name='toxic-comments',
    enable_sagemaker_metrics=True,
    hyperparameters=hyperparameters,
    sagemaker_session=sagemaker_session,
)

In [20]:
# Known failure (same with the py3.7 and py3.6 images): the job aborts with
# "'PosixPath' object has no attribute 'decode'". Per the traceback below,
# it is raised from the container's /opt/ml/code/train (line 138) in the
# tokenizer's from_pretrained(PRETRAINED_PATH, ...) call, when urlparse()
# receives the path. The fix presumably belongs in the training image
# (e.g. passing str(PRETRAINED_PATH)), not in this notebook - confirm.
estimator.fit(inputs=s3_input)

2020-01-31 18:49:05 Starting - Starting the training job...
2020-01-31 18:49:07 Starting - Launching requested ML instances......
2020-01-31 18:50:13 Starting - Preparing the instances for training......
2020-01-31 18:51:27 Downloading - Downloading input data
2020-01-31 18:51:27 Training - Downloading the training image...............
2020-01-31 18:54:03 Uploading - Uploading generated training model
2020-01-31 18:54:03 Failed - Training job failed
[34mStarting the training.[0m
[34m/opt/ml/input/data/training/config/training_config.json[0m
[34m{'run_text': 'toxic comments', 'finetuned_model': None, 'do_lower_case': 'True', 'train_file': 'train_sample.csv', 'val_file': 'val_sample.csv', 'label_file': 'labels.csv', 'text_col': 'comment_text', 'label_col': '["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]', 'multi_label': 'True', 'grad_accumulation_steps': '1', 'fp16_opt_level': 'O1', 'fp16': 'True', 'model_type': 'roberta', 'model_name': 'roberta-base', 'lo

UnexpectedStatusException: Error for Training job toxic-comments-2020-01-31-18-49-05-491: Failed. Reason: AlgorithmError: Exception during training: 'PosixPath' object has no attribute 'decode'
Traceback (most recent call last):
  File "/opt/ml/code/train", line 138, in train
    PRETRAINED_PATH, do_lower_case=bool(training_config["do_lower_case"])
  File "/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py", line 309, in from_pretrained
    return cls._from_pretrained(*inputs, **kwargs)
  File "/opt/conda/lib/python3.6/site-packages/transformers/tokenization_utils.py", line 339, in _from_pretrained
    if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
  File "/opt/conda/lib/python3.6/site-packages/transformers/file_utils.py", line 143, in is_remote_url
    parsed = urlparse(url_or_filename)
  File "/opt/conda/lib/python3.6/urllib/parse.py", line 367, in urlparse
    url, scheme, _coerce_result = _coerce_args(url, scheme)
  File "/opt/conda/lib/python3.6/urllib/parse.py", line 123, in _coerce_args
    return _decode_args(args) + (_encode_result

## Deploy the model to hosting service

In [None]:
# Host the trained model on a single ml.m5.large instance and get back a
# predictor that serializes requests with json_serializer.
# Requires a successful estimator.fit() run above.
# NOTE(review): update_endpoint=True presumably expects the endpoint
# 'bert-toxic-comments' to exist already - confirm for a first deployment.
predictor = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    endpoint_name='bert-toxic-comments',
    update_endpoint=True,
    serializer=json_serializer,
)