In [1]:
import sagemaker
from sagemaker import get_execution_role
import json
import boto3

sess = sagemaker.Session()
role = get_execution_role()

bucket = "mastering-ml-aws"

prefix = "chapter2/blazingtext"


In [3]:
from os.path import expanduser

SRC_PATH = expanduser("~") + '/SageMaker/mastering-ml-on-aws/chapter2/'

with open(SRC_PATH + 'dem.txt', 'r') as file:
    dem_text = ["__label__0 " + line.strip('\n') for line in file]

with open(SRC_PATH + 'gop.txt', 'r') as file:
    gop_text = ["__label__1 " + line.strip('\n') for line in file]
    
corpus = dem_text + gop_text
    
from sklearn.model_selection import train_test_split
corpus_train, corpus_test = train_test_split(corpus, test_size=0.25, random_state=42)    

In [7]:
corpus_train_txt = "\n".join(corpus_train)
corpus_test_txt = "\n".join(corpus_test)
with open('tweets.train', 'w') as file:
    file.write(corpus_train_txt)    
with open('tweets.test', 'w') as file:
    file.write(corpus_test_txt)    


In [8]:
print(corpus_train_txt[:300])

__label__1 “We are forever grateful for your service.” -@FLOTUS https://t.co/22vFTZguAQ
__label__0 RT @CecileRichards: When your strategy relies on fewer people being able to vote, you’re on the wrong side of history. https://t.co/ncthe2W…
__label__0 RT @AFLCIO: Scott Walker. Forever a national disg


In [10]:

train_path = prefix + '/train'
validation_path = prefix + '/validation'

sess.upload_data(path='tweets.train', bucket=bucket, key_prefix=train_path)
sess.upload_data(path='tweets.test', bucket=bucket, key_prefix=validation_path)

s3_train_data = 's3://{}/{}'.format(bucket, train_path)
s3_validation_data = 's3://{}/{}'.format(bucket, validation_path)

In [11]:
container = sagemaker.amazon.amazon_estimator.get_image_uri('us-east-1', "blazingtext", "latest")

s3_output_location = 's3://{}/{}/output'.format(bucket, prefix)


The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: latest.


In [12]:
bt_model = sagemaker.estimator.Estimator(container,
                                         role, 
                                         train_instance_count=1, 
                                         train_instance_type='ml.c4.4xlarge',
                                         train_volume_size = 30,
                                         train_max_run = 360000,
                                         input_mode= 'File',
                                         output_path=s3_output_location,
                                         sagemaker_session=sess)

train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_max_run has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_volume_size has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [13]:
bt_model.set_hyperparameters(mode="supervised",
                            epochs=10,
                            min_count=3,
                            learning_rate=0.05,
                            early_stopping=False,
                            patience=5,
                            min_epochs=5,
                            word_ngrams=2)

train_data = sagemaker.session.s3_input(s3_train_data, distribution='FullyReplicated', 
                        content_type='text/plain', s3_data_type='S3Prefix')
validation_data = sagemaker.session.s3_input(s3_validation_data, distribution='FullyReplicated', 
                             content_type='text/plain', s3_data_type='S3Prefix')
data_channels = {'train': train_data, 'validation': validation_data}

The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


In [15]:
bt_model.fit(inputs=data_channels, logs=True)


2022-02-26 20:01:49 Starting - Starting the training job...
2022-02-26 20:02:17 Starting - Preparing the instances for trainingProfilerReport-1645905709: InProgress
......
2022-02-26 20:03:15 Downloading - Downloading input data...
2022-02-26 20:03:43 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[02/26/2022 20:03:45 INFO 140133551605120] nvidia-smi took: 0.025173664093017578 secs to identify 0 gpus[0m
[34m[02/26/2022 20:03:45 INFO 140133551605120] Running single machine CPU BlazingText training using supervised mode.[0m
[34mNumber of CPU sockets found in instance is  1[0m
[34m[02/26/2022 20:03:45 INFO 140133551605120] Processing /opt/ml/input/data/train/tweets.train . File size: 0.04697608947753906 MB[0m
[34m[02/26/2022 20:03:45 INFO 140133551605120] Processing /opt/ml/input/data/validation/tweets.test . File size: 0.015285491943359375 MB[0m
[34mRead 0M words[0m
[34mNumber of words:  407[0m
[34m##### Alpha: -0.0002  Pr

In [16]:
predictor = bt_model.deploy(initial_instance_count = 1,instance_type = 'ml.m4.xlarge')


------!

In [17]:
!aws s3 ls --recursive s3://mastering-ml-aws/chapter2/blazingtext

2022-02-26 20:05:22  739923033 chapter2/blazingtext/output/blazingtext-2022-02-26-20-01-49-653/output/model.tar.gz
2022-02-26 20:05:28          0 chapter2/blazingtext/output/blazingtext-2022-02-26-20-01-49-653/profiler-output/framework/training_job_end.ts
2022-02-26 20:04:00     457692 chapter2/blazingtext/output/blazingtext-2022-02-26-20-01-49-653/profiler-output/system/incremental/2022022620/1645905780.algo-1.json
2022-02-26 20:05:28          0 chapter2/blazingtext/output/blazingtext-2022-02-26-20-01-49-653/profiler-output/system/training_job_end.ts
2022-02-26 19:56:19      49258 chapter2/blazingtext/train/tweets.train
2022-02-26 19:56:19      16028 chapter2/blazingtext/validation/tweets.test


In [23]:
json.dumps(payload)

'{"instances": ["\\u201cI want to shine a spotlight on the organizations and programs that support children.\\u201d -@FLOTUS https://t.co/hvJJQKu01q", "\\u201cThis new deal will be the most modern, up-to-date, and balanced trade agreement in the history of our country, with the most advanced protections for workers ever developed.\\u201d \\u2013 President @realDonaldTrump", "RT @SenSchumer: I want to be crystal clear on this: A Trump temper tantrum and his more than 20 shutdown threats are not going to change an\\u2026", "\\"We\\u2019re respected again.\\" -@realDonaldTrump https://t.co/qqVXlMKeRf", "These disgraceful power grabs have to stop. The people made their voices heard, and Republicans have a duty to listen. https://t.co/laW8sltk9K", "Climate change is real. We must act now and protect the world we leave to future generations. https://t.co/6exHaRW9iO", "\\u201cLet future generations understand the burden and the blessings of freedom. Let them say we stood where duty required u

In [21]:
corpus_test_no_labels = [x[11:] for x in corpus_test]

payload = {"instances" : corpus_test_no_labels}

response = predictor.predict(json.dumps(payload))

predictions = json.loads(response)
print(json.dumps(predictions, indent=2))

ModelError: An error occurred (ModelError) when calling the InvokeEndpoint operation: Received client error (415) from primary with message "content-type application/octet-stream not supported". See https://us-east-1.console.aws.amazon.com/cloudwatch/home?region=us-east-1#logEventViewer:group=/aws/sagemaker/Endpoints/blazingtext-2022-02-26-20-07-57-303 in account 258532878709 for more information.

In [24]:
predicted_labels = [prediction['label'][0] for prediction in predictions]


NameError: name 'predictions' is not defined

In [17]:
predicted_labels[:4]

['__label__1', '__label__1', '__label__1', '__label__1']

In [18]:
actual_labels = [x[:10] for x in corpus_test]
actual_labels[:4]

['__label__1', '__label__1', '__label__0', '__label__1']

In [19]:
matches = [(actual_label == predicted_label) for (actual_label, predicted_label) in zip(actual_labels, predicted_labels)]
matches[:4]

[True, True, False, True]

In [20]:
matches.count(True) / len(matches)

0.62