In [1]:
import json
import os

import boto3
import sagemaker
from sagemaker import get_execution_role

# Deployment-specific names are read from the environment; either may be None when unset.
bucket = os.getenv('BUCKET_NAME')
endpoint_name = os.getenv('ENDPOINT_NAME')

sess = sagemaker.Session(default_bucket=bucket)

# When BUCKET_NAME is unset the session falls back to its own default bucket, but the raw
# `bucket` variable would stay None and later be formatted into 's3://None/...' URIs.
# Resolve the real bucket name through the session in that case.
if not bucket:
    bucket = sess.default_bucket()

# IAM role the training job and endpoint will run under
role = get_execution_role()

prefix = 'blazingtext/supervised' #Replace with the prefix under which you want to store the data if needed

ModuleNotFoundError: No module named 'boto3'

# Import and Preprocess Data

In [None]:
import pandas as pd

# These S3 locations belong to one specific deployment -- point them at your own CSV files.
train_input = 's3://beularnotebookstack-beularsagemakerapibucket1198e-xck265jh9uop/training_output/train/train.csv'
validation_input = 's3://beularnotebookstack-beularsagemakerapibucket1198e-xck265jh9uop/training_output/test/test.csv'

# Read the training split first, then the validation split, with identical parser settings.
train_df, valid_df = (pd.read_csv(csv_uri) for csv_uri in (train_input, validation_input))

# Data Preprocessing
We need to preprocess the training data into the space-separated, tokenized text format that the BlazingText algorithm consumes. The class label(s) must be prefixed with `__label__` and must appear on the same line as the original sentence. We'll use the nltk library to tokenize the input sentences of the dataset loaded above.

Download the nltk tokenizer and other libraries

In [None]:
import nltk
# word_tokenize needs the 'punkt' tokenizer models; uncomment the next line on first run.
#nltk.download('punkt')


def to_blazingtext_lines(df):
    """Convert a two-column (text, label) frame into BlazingText supervised samples.

    Each row becomes ``__label__<label> <tokens>``, where ``<tokens>`` is the
    lower-cased text tokenized with ``nltk.word_tokenize`` and joined by single spaces.

    Args:
        df: DataFrame whose rows unpack into exactly ``(text, label)``, in that
            column order. ``text`` must be a string.

    Returns:
        list[str]: one formatted sample per row, in row order.

    Raises:
        ValueError: if a row does not have exactly two columns.
    """
    lines = []
    # itertuples yields plain tuples, avoiding the per-row Series that iterrows builds.
    for text, label in df.itertuples(index=False, name=None):
        tokens = ' '.join(nltk.word_tokenize(text.lower()))
        lines.append(f'__label__{label} {tokens}')
    return lines


# Same preprocessing for both splits, defined once instead of copy-pasted per split.
train = to_blazingtext_lines(train_df)
validation = to_blazingtext_lines(valid_df)

In [None]:
# Write one sample per line. The encoding is pinned so non-ASCII tokens are written the
# same way on every platform instead of depending on the locale's default encoding.
with open("data.train", 'w', encoding='utf-8') as f:
    f.write("\n".join(train))

with open("data.validation", 'w', encoding='utf-8') as f:
    f.write("\n".join(validation))

In [None]:
# S3 key prefixes for the two input channels
train_channel = f'{prefix}/train'
validation_channel = f'{prefix}/validation'

# Push the locally written files to the bucket, one call per channel
sess.upload_data(
    path='data.train',
    bucket=bucket,
    key_prefix=train_channel,
)
sess.upload_data(
    path='data.validation',
    bucket=bucket,
    key_prefix=validation_channel,
)

In [None]:
# Full S3 URIs of the uploaded channel prefixes
s3_train_data = f's3://{bucket}/{train_channel}'
s3_validation_data = f's3://{bucket}/{validation_channel}'

In [None]:
# S3 URI passed to the estimator as its output path; displayed for a quick sanity check
s3_output_location = f's3://{bucket}/{prefix}/output'
s3_output_location

# Training

Now that we are done with all the setup that is needed, we are ready to train our text classifier. To begin, let us create a `sagemaker.estimator.Estimator` object. This estimator will launch the training job.

In [None]:
# Region of a freshly created boto3 session; used to look up the algorithm image
boto_session = boto3.Session()
region_name = boto_session.region_name

In [None]:
# Look up the BlazingText algorithm image for this region ("latest" tag)
lookup_image_uri = sagemaker.amazon.amazon_estimator.get_image_uri
container = lookup_image_uri(region_name, "blazingtext", "latest")
print(f'Using SageMaker BlazingText container: {container} ({region_name})')

In [None]:
# Training-job settings, collected in one place so they are easy to tune
training_settings = dict(
    train_instance_count=1,
    train_instance_type='ml.c4.4xlarge',
    train_volume_size=30,
    train_max_run=360000,
    input_mode='File',
    output_path=s3_output_location,
    sagemaker_session=sess,
)

# Estimator that will launch the BlazingText training job
bt_model = sagemaker.estimator.Estimator(container, role, **training_settings)

In [None]:
# see https://docs.aws.amazon.com/sagemaker/latest/dg/blazingtext-tuning.html

# Hyperparameters for supervised (text classification) mode
hyperparameters = dict(
    mode="supervised",
    epochs=100,
    min_count=1,
    learning_rate=0.05,
    vector_dim=200,
    early_stopping=True,
    patience=15,
    min_epochs=50,
    word_ngrams=2,
)

bt_model.set_hyperparameters(**hyperparameters)

In [None]:
# Both channels share the same input settings; only the S3 location differs
channel_options = dict(
    distribution='FullyReplicated',
    content_type='text/plain',
    s3_data_type='S3Prefix',
)

train_data = sagemaker.session.s3_input(s3_train_data, **channel_options)
validation_data = sagemaker.session.s3_input(s3_validation_data, **channel_options)

# Channel name -> input definition, as passed to fit()
data_channels = dict(train=train_data, validation=validation_data)

In [None]:
# Start the training job on the 'train' and 'validation' channels defined above.
# logs=True presumably streams the job's logs into the cell output -- confirm against the SDK docs.
bt_model.fit(inputs=data_channels, logs=True)

# Deploy the Model 

In [None]:
# Hosting settings for the endpoint (name comes from the ENDPOINT_NAME environment variable)
endpoint_settings = dict(
    initial_instance_count=1,
    instance_type='ml.c4.xlarge',
    endpoint_name=endpoint_name,
)

# Deploy the trained model behind the endpoint
bt_model.deploy(**endpoint_settings)

# Inferences with the Deployed Model

In [None]:
from sklearn import metrics
from sagemaker.predictor import json_serializer, csv_serializer, json_deserializer, RealTimePredictor
from sagemaker.content_types import CONTENT_TYPE_CSV, CONTENT_TYPE_JSON

In [None]:
# Requests are sent as JSON and JSON is requested back
predictor_settings = dict(
    endpoint=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    content_type=CONTENT_TYPE_JSON,
    accept=CONTENT_TYPE_JSON,
)

# Client bound to the endpoint named by `endpoint_name`
predictor = RealTimePredictor(**predictor_settings)

In [None]:
# Apply the same preprocessing as the training data: lower-case first, then tokenize and
# join tokens with single spaces. Without lower-casing, capitalised tokens such as "This"
# would not match the lower-cased vocabulary the model was trained on.
clause = " ".join(nltk.word_tokenize("This is a test of the system".lower()))
payload = {"instances" : [clause]}

In [None]:
# Send the payload to the endpoint and keep the response for inspection
r = predictor.predict(payload)

In [None]:
# Display the response returned by predictor.predict
r