In [1]:
from sagemaker import get_execution_role

# --- Notebook-wide settings -------------------------------------------------
# S3 bucket and key prefix used by the upload and training cells.
bucket_name, bucket_key_prefix = 'jmijailovic-item-logs', 'sms-spam-classifier'

# Size argument passed to one_hot_encode / vectorize_sequences in later cells.
vocabulary_length = 9013

# Execution role of the current SageMaker session; passed to the estimator as `role`.
role = get_execution_role()
print(role)

arn:aws:iam::571632058847:role/ykumarbekov-sagemaker-role


In [2]:
!mkdir -p dataset
!curl https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip -o dataset/smsspamcollection.zip
!unzip -o dataset/smsspamcollection.zip -d dataset
!head -10 dataset/SMSSpamCollection

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  198k  100  198k    0     0   279k      0 --:--:-- --:--:-- --:--:--  279k
Archive:  dataset/smsspamcollection.zip
  inflating: dataset/SMSSpamCollection  
  inflating: dataset/readme          
ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat

In [2]:
!pip3 install import_ipynb

Collecting import_ipynb
  Downloading import-ipynb-0.1.3.tar.gz (4.0 kB)
Building wheels for collected packages: import-ipynb
  Building wheel for import-ipynb (setup.py) ... [?25ldone
[?25h  Created wheel for import-ipynb: filename=import_ipynb-0.1.3-py3-none-any.whl size=2975 sha256=4cc3cac257418bad922e111c9fbe44b666901eb21f4eedd9778573e6f6aacd79
  Stored in directory: /home/ec2-user/.cache/pip/wheels/fe/df/cf/1118a0b1acf475fb435463072bba11232db7e3592e7d7695d4
Successfully built import-ipynb
Installing collected packages: import-ipynb
Successfully installed import-ipynb-0.1.3
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [3]:
import csv
import pandas as pd
import numpy as np
import pickle
import import_ipynb
from utilities import one_hot_encode
from utilities import vectorize_sequences

# Load the tab-separated corpus: column 0 holds the ham/spam label, column 1 the message.
# The file is plain text rather than quoted CSV, so quote handling is switched off;
# with the default setting a message containing a double quote can be parsed as a
# quoted field and absorb the rows that follow it.
df = pd.read_csv('dataset/SMSSpamCollection', sep='\t', header=None,
                 quoting=csv.QUOTE_NONE)
# Recode the label column to integers: ham -> 0, spam -> 1.
df[df.columns[0]] = df[df.columns[0]].map({'ham': 0, 'spam': 1})

targets = df[df.columns[0]].values
messages = df[df.columns[1]].values

# one hot encoding for each SMS message
one_hot_data = one_hot_encode(messages, vocabulary_length)
encoded_messages = vectorize_sequences(one_hot_data, vocabulary_length)

# Label goes in the first column, followed by the encoded features.
df2 = pd.DataFrame(encoded_messages)
df2.insert(0, 'spam', targets)

# Split into training and validation sets (80%/20% split)
split_index = int(np.ceil(df.shape[0] * 0.8))
train_set = df2[:split_index]
val_set = df2[split_index:]

# Both splits are written gzip-compressed, without header row or index column.
train_set.to_csv('dataset/sms_train_set.gz', header=False, index=False, compression='gzip')
val_set.to_csv('dataset/sms_val_set.gz', header=False, index=False, compression='gzip')

importing Jupyter notebook from utilities.ipynb


In [4]:
import boto3

# Push both gzipped splits to the bucket, each under its own key below the prefix.
s3 = boto3.resource('s3')
target_bucket = s3.Bucket(bucket_name)

# (local file, key template) pairs; the template is filled with bucket_key_prefix.
uploads = (
    ('dataset/sms_train_set.gz', '{0}/train/sms_train_set.gz'),
    ('dataset/sms_val_set.gz', '{0}/val/sms_val_set.gz'),
)
for local_path, key_template in uploads:
    with open(local_path, 'rb') as data:
        target_bucket.upload_fileobj(data, key_template.format(bucket_key_prefix))

In [5]:
from sagemaker.mxnet import MXNet

# S3 URIs later passed to the estimator as `output_path` and `code_location`.
s3_location_parts = (bucket_name, bucket_key_prefix)
output_path = 's3://{0}/{1}/output'.format(*s3_location_parts)
code_location = 's3://{0}/{1}/code'.format(*s3_location_parts)



In [6]:
# Values forwarded to the training script through the estimator's `hyperparameters`.
training_hyperparameters = {
    'batch_size': 100,
    'epochs': 20,
    'learning_rate': 0.01,
}

# Estimator for mxnet_script_python.py, configured for one ml.c5.2xlarge instance.
m = MXNet(
    'mxnet_script_python.py',
    role=role,
    train_instance_type='ml.c5.2xlarge',
    train_instance_count=1,
    framework_version="1.2",
    py_version='py3',
    base_job_name='sms-spam-classifier-mxnet',
    output_path=output_path,
    code_location=code_location,
    hyperparameters=training_hyperparameters,
)



In [7]:
# One S3 prefix per input channel, matching the keys used for the uploaded splits.
train_channel = 's3://{0}/{1}/train/'.format(bucket_name, bucket_key_prefix)
val_channel = 's3://{0}/{1}/val/'.format(bucket_name, bucket_key_prefix)
inputs = {'train': train_channel, 'val': val_channel}

# Start the training job with both channels.
m.fit(inputs)

's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
's3_input' class will be renamed to 'TrainingInput' in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


2020-08-05 09:51:25 Starting - Starting the training job...
2020-08-05 09:51:27 Starting - Launching requested ML instances......
2020-08-05 09:52:29 Starting - Preparing the instances for training.
2020-08-05 09:52:29 Starting - Preparing the instances for training.....
2020-08-05 09:53:23 Downloading - Downloading input data
2020-08-05 09:53:23 Training - Training image download completed. Training in progress..
2020-08-05 09:53:23 Downloading - Downloading input data
2020-08-05 09:53:23 Training - Training image download completed. Training in progress..[34m2020-08-05 09:53:23,557 INFO - root - running container entrypoint[0m
[34m2020-08-05 09:53:23,557 INFO - root - starting train task[0m
[34m2020-08-05 09:53:23,561 INFO - container_support.training - Training starting[0m
[34m2020-08-05 09:53:30,521 INFO - mxnet_container.train - MXNetTrainingEnvironment: {'base_dir': '/opt/ml', 'input_dir': '/opt/ml/input', 'code_dir': '/opt/ml/code', 'resource_config': {'current_host': 'al

In [8]:
# Deploy the fitted estimator on one ml.m5.large instance; the returned predictor
# is what the prediction cells call.
mxnet_pred = m.deploy(instance_type='ml.m5.large', initial_instance_count=1)

Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
Parameter image will be renamed to image_uri in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.
'create_image_uri' will be deprecated in favor of 'ImageURIProvider' class in SageMaker Python SDK v2.


--------------------------!!

In [43]:
import pandas as pd

# Read one newline-delimited JSON log object of reviews from S3.
df = pd.read_json(r's3://jmijailovic-item-logs/reviewLogs/2020/06/10/11/jmijailovicLogsStream-1-2020-06-10-11-36-42-3691d933-23be-4814-8bcf-b50b926f3d22', lines=True)

# Title and text joined into a single string per review.
df['review_concat'] = df['review_title']+' '+df['review_text']

# Keep one string per review. str(Series) would instead yield a single string
# holding the Series' printed form (index, "...", dtype footer), which is not a
# list of messages. Rows where the concatenation is NaN are dropped so that only
# real strings reach the encoder.
review_concat = df['review_concat'].dropna().tolist()
print(df['review_concat'])

0           review2 great
1       review2 very good
2           review2 awful
3           review3 great
4             review3 bad
              ...        
1495      review1 awesome
1496        review3 great
1497        review8 awful
1498        review9 awful
1499                  NaN
Name: review_concat, Length: 1500, dtype: object


In [10]:
from sagemaker.mxnet.model import MXNetPredictor
from utilities import one_hot_encode
from utilities import vectorize_sequences

# To reuse an endpoint that is already running, uncomment the next line.
# mxnet_pred = MXNetPredictor('<endpoint_name>')

# One sample message, run through the same two helpers (and the same
# vocabulary_length) that were used to build the training set.
test_messages = [
    "FreeMsg: Txt: CALL to No: 86888 & claim your reward of 3 hours talk time to use from your phone now! ubscribe6GBP/ mnth inc 3hrs 16 stop?txtStop",
]
one_hot_test_messages = one_hot_encode(test_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)

# Send the encoded message to the endpoint and show the raw response.
result = mxnet_pred.predict(encoded_test_messages)
print(result)

{'predicted_probability': [[0.999864935874939]], 'predicted_label': [[1.0]]}


In [44]:
from sagemaker.mxnet.model import MXNetPredictor
from utilities import one_hot_encode
from utilities import vectorize_sequences

# Uncomment the following line to connect to an existing endpoint.
# mxnet_pred = MXNetPredictor('<endpoint_name>')

# Score the reviews, one message per review. one_hot_encode is called elsewhere
# in this notebook with a sequence of messages; a bare str passed in its place is
# a sequence of single characters, not of reviews. The list is therefore built
# from the DataFrame column, skipping rows whose concatenation is NaN.
review_messages = df['review_concat'].dropna().tolist()
one_hot_test_messages = one_hot_encode(review_messages, vocabulary_length)
encoded_test_messages = vectorize_sequences(one_hot_test_messages, vocabulary_length)

# NOTE(review): every review is sent in a single request; if the endpoint rejects
# the payload as too large, send `encoded_test_messages` in smaller slices.
result = mxnet_pred.predict(encoded_test_messages)
print(result)

{'predicted_label': [[0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0], [0.0],

In [15]:
# Delete the endpoint behind `mxnet_pred`; the recorded output below also reports
# its endpoint configuration being deleted.
mxnet_pred.delete_endpoint()

Deleting endpoint configuration with name: sms-spam-classifier-mxnet-2020-07-14-09-29-49-703
Deleting endpoint with name: sms-spam-classifier-mxnet-2020-07-14-09-29-49-703
