In [None]:
!pip install --disable-pip-version-check -q sagemaker==2.35.0
!pip install --disable-pip-version-check -q nltk==3.5

In [None]:
import json

import boto3
from IPython.core.display import display, HTML
import matplotlib.pyplot as plt
import nltk
import pandas as pd
import sagemaker
from sklearn.model_selection import train_test_split

# Render matplotlib figures inline in the notebook, using the 'retina'
# figure format.
%matplotlib inline
%config InlineBackend.figure_format='retina'

# Download NLTK's 'punkt' resource; nltk.word_tokenize is used in later cells.
nltk.download('punkt')

In [None]:
# Shared AWS handles reused by the cells below: the SageMaker session, its
# default S3 bucket, the execution role, and the region name of the default
# boto3 session.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# 1. Prepare dataset

In [None]:
# Copy the balanced reviews CSV from S3 into the current working directory.
!aws s3 cp 's3://dlai-practical-data-science/data/balanced/womens_clothing_ecommerce_reviews_balanced.csv' ./

In [None]:
# Load the downloaded CSV into a DataFrame and preview the first rows.
path = './womens_clothing_ecommerce_reviews_balanced.csv'
df = pd.read_csv(path, delimiter=',')
df.head()

In [None]:
# Quick demonstration of what nltk.word_tokenize returns for one sentence.
sentence = "I'm not a fan of this product!"
tokens = nltk.word_tokenize(sentence)
print(tokens)

In [None]:
def tokenize(review):
    """Normalize a review and return its tokens as one space-separated string.

    The review is converted to ``str``, commas and double quotes are removed,
    the text is lowercased, and the result is split with
    ``nltk.word_tokenize`` before being joined back with single spaces.
    """
    cleaned = str(review)
    for unwanted in (',', '"'):
        cleaned = cleaned.replace(unwanted, '')
    words = nltk.word_tokenize(cleaned.lower())
    return ' '.join(map(str, words))

In [None]:
def prepare_data(df):
    """Convert a reviews frame into the label/text form written for training.

    The frame is modified in place and also returned:

    * ``sentiment`` becomes ``'__label__<value>'``, where ``<value>`` is the
      original value converted to ``str``, with commas removed and lowercased.
    * ``review_body`` is replaced by the output of ``tokenize``.

    Args:
        df: DataFrame with ``sentiment`` and ``review_body`` columns.

    Returns:
        The same DataFrame object, with both columns rewritten.
    """
    # Convert to str *before* calling .replace so numeric labels such as
    # -1, 0 and 1 work as well as string labels.
    df['sentiment'] = df['sentiment'].map(
        lambda sentiment:
            f'__label__{str(sentiment).replace(",", "").lower()}')
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    # Replace all None
    df['review_body'] = df['review_body'].map(tokenize)
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    return df

In [None]:
# create a sample dataframe; each review text matches its sentiment value
# (-1 negative, 0 neutral, 1 positive)
df_example = pd.DataFrame({'sentiment':[-1, 0, 1], 
                           'review_body': [
                                "I don't like this product!", 
                                "this product is ok", 
                                "I do like this product!"]})
# test the prepare_data function
print(prepare_data(df_example))

In [None]:
# Keep only the label and text columns (with a fresh 0..n-1 index), run them
# through prepare_data, and preview the result.
df_blazingtext = df[['sentiment', 'review_body']].reset_index(drop=True)
df_blazingtext = prepare_data(df_blazingtext)
df_blazingtext.head()

In [None]:
# Split all data into 90% train and 10% holdout, stratified by sentiment.
# A fixed random_state keeps the split (and therefore the files written and
# uploaded in the next cells) identical across notebook re-runs.
df_train, df_validation = train_test_split(
    df_blazingtext,
    test_size=0.10,
    stratify=df_blazingtext['sentiment'],
    random_state=42)
labels = ['train', 'validation']
sizes = [len(df_train.index), len(df_validation.index)]

# Pie chart of the two split sizes, with the train slice pulled out slightly.
explode = (0.1, 0)
fig1, ax1 = plt.subplots()
ax1.pie(sizes,
        explode=explode,
        labels=labels,
        autopct='%1.1f%%',
        startangle=90)
# Equal aspect ratio ensures that pie is drawn as a circle.
ax1.axis('equal')
plt.show()

In [None]:
# Write the training split to disk: label column first, then the review text,
# space-separated, with no header row and no index column.
blazingtext_train_path = './train.csv'
df_train[['sentiment', 'review_body']].to_csv(
    blazingtext_train_path, index=False, header=False, sep=' ')

In [None]:
# Write the validation split to disk in the same layout as the training file:
# label column first, space-separated, no header row, no index column.
blazingtext_validation_path = './validation.csv'
df_validation[['sentiment', 'review_body']].to_csv(
    blazingtext_validation_path, index=False, header=False, sep=' ')

In [None]:
# Upload both local files to the session's default bucket under the
# 'blazingtext/data' key prefix, keeping the returned values for the
# training input channels.
train_s3_uri = sess.upload_data(bucket=bucket, 
                                key_prefix='blazingtext/data', 
                                path=blazingtext_train_path)
validation_s3_uri = sess.upload_data(bucket=bucket, 
                                     key_prefix='blazingtext/data', 
                                     path=blazingtext_validation_path)

# 2. Train the model

In [None]:
# Look up the container image URI for the 'blazingtext' framework in the
# current region.
image_uri = sagemaker.image_uris.retrieve(
    region=region,
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    framework='blazingtext') # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes

In [None]:
# Configure the training job: the image retrieved above, one ml.m5.large
# instance, volume_size=30 and max_run=7200 (presumably GB and seconds --
# confirm against the SageMaker Estimator documentation).
estimator = sagemaker.estimator.Estimator(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    image_uri=image_uri, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    role=role, 
    instance_count=1, 
    instance_type='ml.m5.large',
    volume_size=30,
    max_run=7200,
    sagemaker_session=sess)

In [None]:
# Hyperparameters for the training job; the ranges in the trailing comments
# are the suggested tuning ranges for each value.
estimator.set_hyperparameters(
    mode='supervised',   # supervised (text classification)
    epochs=10,           # number of complete passes through the dataset: 5 - 15
    learning_rate=0.01,  # step size for the numerical optimizer: 0.005 - 0.01
    min_count=2,         # discard words that appear less than this number: 0 - 100
    vector_dim=300,      # number of dimensions in vector space: 32 - 300
    word_ngrams=3)       # number of words in a word n-gram: 1 - 3

In [None]:
# Training input channel: the uploaded training file, declared as plain text
# under an S3 prefix and fully replicated.
train_data = sagemaker.inputs.TrainingInput(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    train_s3_uri, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    distribution='FullyReplicated', 
    content_type='text/plain', 
    s3_data_type='S3Prefix')

In [None]:
# Validation input channel: the uploaded validation file, configured the same
# way as the training channel.
validation_data = sagemaker.inputs.TrainingInput(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    validation_s3_uri, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    distribution='FullyReplicated', 
    content_type='text/plain', 
    s3_data_type='S3Prefix')

In [None]:
# Map channel names to their inputs; this dict is passed to estimator.fit.
data_channels = {
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    'train': train_data, # Replace None
    'validation': validation_data} # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes

In [None]:
# Start the training job without blocking the notebook (wait=False), then
# record and print the job name used by the console links below.
estimator.fit(
    ### BEGIN SOLUTION - DO NOT delete this comment for grading purposes
    inputs=data_channels, # Replace None
    ### END SOLUTION - DO NOT delete this comment for grading purposes
    wait=False)
training_job_name = estimator.latest_training_job.name
print(f'Training Job Name: {training_job_name}')

In [None]:
# Render a link to this training job's page in the SageMaker console.
display(
    HTML(
        f'<b>Review <a target="blank" href="https://console.aws.amazon'
        f'.com/sagemaker/home?region={region}#/jobs/{training_job_name}">'
        f'Training job</a></b>'))

In [None]:
# Render a link to the CloudWatch log streams whose prefix is the training
# job name.
display(
    HTML(
        f'<b>Review <a target="blank" href="https://console.aws.amazon'
        f'.com/cloudwatch/home?region={region}#logStream:group=/aws/'
        f'sagemaker/TrainingJobs;prefix={training_job_name};'
        f'streamFilter=typeLogStreamPrefix">CloudWatch logs</a> (after '
        f'about 5 minutes)</b>'))

In [None]:
%%time

# Block until the training job launched with wait=False has finished;
# logs=False is passed so the job's logs are not shown here.
estimator.latest_training_job.wait(logs=False)

In [None]:
# Show the training job's analytics as a DataFrame.
estimator.training_job_analytics.dataframe()

In [None]:
# Render a link to the training job's output folder in the S3 console.
display(
    HTML(
        f'<b>Review <a target="blank" href="https://s3.console.aws'
        f'.amazon.com/s3/buckets/{bucket}/{training_job_name}/output/'
        f'?region={region}&tab=overview">Trained model</a> in S3</b>'))

# 3. Deploy the model

In [None]:
%%time

text_classifier = estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.m5.large',
    serializer=sagemaker.serializers.JSONSerializer(),
    deserializer=sagemaker.deserializers.JSONDeserializer())
print()
print(f'Endpoint name: {text_classifier.endpoint_name}'

In [None]:
# Render a link to the deployed endpoint in the SageMaker console. The
# literals must be f-strings so that the region and endpoint name are
# substituted into the URL.
display(
    HTML(
        f'<b>Review <a target="blank" href="https://console.aws.amazon'
        f'.com/sagemaker/home?region={region}#/endpoints/'
        f'{text_classifier.endpoint_name}">SageMaker REST Endpoint</a></b>'
    ))

# 4. Test the model

In [None]:
# Sample reviews used to try out the deployed endpoint.
reviews = ['This product is great!',
           'OK, but not great',
           'This is not the right product.'] 

In [None]:
# Preprocess the sample reviews with the same tokenize() helper that was
# applied to the training data (comma/quote removal, lowercasing, NLTK
# tokenization), so the endpoint receives text in the form the model saw
# during training.
tokenized_reviews = [tokenize(review) for review in reviews]
payload = {"instances" : tokenized_reviews}
print(payload)

In [None]:
# Send the payload to the endpoint and print the first label of each
# prediction without its '__label__' prefix.
predictions = text_classifier.predict(data=payload)
label_prefix = '__label__'
for prediction in predictions:
    label = prediction['label'][0]
    # Remove the prefix explicitly: str.lstrip() treats its argument as a set
    # of characters and could also strip leading characters of the class name.
    if label.startswith(label_prefix):
        label = label[len(label_prefix):]
    print(f'Predicted class: {label}')