In [1]:
#Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#SPDX-License-Identifier: MIT-0

In [2]:
#install additional libraries
!pip install nltk
!pip install jsonlines
!pip install pandarallel



In [3]:
#import libraries
import os
import uuid
import datetime
import time
import logging
import glob

import boto3
import sagemaker

from search_utils import helpers

[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ec2-user/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
#Define common variables

#Creating a sagemaker session
sagemaker_session = sagemaker.Session()

#We'll be using the sagemaker default bucket
#Feel free to change this to another bucket name and make sure it's the same across all four notebooks
bucket_name = sagemaker_session.default_bucket()

In [5]:
def generate_unique_id():
    return str(uuid.uuid4())

# 1. Building the docker image 

First we'll build a custom docker container in order to use it with the SageMaker processing jobs.

Within the docker we'll install the libraries defined in the requirements.txt file.

We'll also upload the source code (helper functions, processing functions etc) under "/opt/source_code/" so they are accessible during runtime.

In [7]:
%%bash
cd ../
sh build_and_push.sh


Login Succeeded
Sending build context to Docker daemon  5.626GB
Step 1/7 : FROM amazonlinux:2
 ---> 7443854fbdb0
Step 2/7 : RUN yum install python3 -y
 ---> Running in c6e6bf218dbf
Loaded plugins: ovl, priorities
Resolving Dependencies
--> Running transaction check
---> Package python3.x86_64 0:3.7.10-1.amzn2.0.1 will be installed
--> Processing Dependency: python3-libs(x86-64) = 3.7.10-1.amzn2.0.1 for package: python3-3.7.10-1.amzn2.0.1.x86_64
--> Processing Dependency: python3-setuptools for package: python3-3.7.10-1.amzn2.0.1.x86_64
--> Processing Dependency: python3-pip for package: python3-3.7.10-1.amzn2.0.1.x86_64
--> Processing Dependency: libpython3.7m.so.1.0()(64bit) for package: python3-3.7.10-1.amzn2.0.1.x86_64
--> Running transaction check
---> Package python3-libs.x86_64 0:3.7.10-1.amzn2.0.1 will be installed
--> Processing Dependency: libtirpc.so.1()(64bit) for package: python3-libs-3.7.10-1.amzn2.0.1.x86_64
---> Package python3-pip.noarch 0:20.2.2-1.amzn2.0.3 will be i

https://docs.docker.com/engine/reference/commandline/login/#credentials-store



In [13]:
#Make sure you replace the following variable with your account id and region
#You can also copy past the ECR uri from the logs of the previous cell
ecr_uri = "062241367565.dkr.ecr.us-east-1.amazonaws.com/sm-search:latest"

In [14]:
#uploading the search_utils files so they are accessible during runtime
s3_client = boto3.client("s3")
for file_name in glob.glob("../src/search_utils/*.py"):
    s3_client.upload_file(file_name, bucket_name, f"search_knn_blog/code/{file_name.split('/')[-1]}" )

# 2. Preprocessing 

In [15]:
bucket_name="sagemaker-knn-benfelip"
from sagemaker.processing import ScriptProcessor
script_processor = ScriptProcessor(
            image_uri=ecr_uri,
                role=sagemaker.get_execution_role(),
                instance_count=1,
                instance_type='ml.m5.4xlarge',
                command=["python3"],
                volume_size_in_gb=50)


In [16]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

now = datetime.datetime.utcnow()
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")
preprocess_job_name = f"search-preprocess-{run_id}"

s3_input_path=f"s3://{bucket_name}/search_knn_blog/data/processed_data/"
s3_code_path=f"s3://{bucket_name}/search_knn_blog/code/"

script_processor.run(job_name=preprocess_job_name,
                     code='../src/preprocessing_main.py',
                      inputs=[ProcessingInput(
                        source=s3_input_path,
                        destination='/opt/ml/processing/input'),
                             ProcessingInput(
                        source=s3_code_path,
                        destination='/opt/ml/processing/input/code/search_utils/')],
                      outputs=[
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='train_textual',
                source='/opt/ml/processing/train_textual'),
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='test_textual',
                source='/opt/ml/processing/test_textual'),
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='train_numerical',
                source='/opt/ml/processing/train_numerical'),
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='test_numerical',
                source='/opt/ml/processing/test_numerical'),
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='vocab',
                source='/opt/ml/processing/vocab'),
ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                output_name='raw_vocab',
                source='/opt/ml/processing/raw_vocab')
                      ],
                      arguments=['--train-test-split-ratio', '0.2','--total-nb-of-records', '10000'],wait=False)



run id : 2106302056404918

Job Name:  search-preprocess-2106302056404918
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-knn-benfelip/search_knn_blog/data/processed_data/', 'LocalPath': '/opt/ml/processing/input', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-knn-benfelip/search_knn_blog/code/', 'LocalPath': '/opt/ml/processing/input/code/search_utils/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-us-east-1-062241367565/search-preprocess-2106302056404918/input/code/preprocessing_main.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated'

In [17]:
status =  boto3.client("sagemaker").describe_processing_job(ProcessingJobName=preprocess_job_name)["ProcessingJobStatus"]

while status == 'InProgress':  
    status =  boto3.client("sagemaker").describe_processing_job(ProcessingJobName=preprocess_job_name)["ProcessingJobStatus"]
    print(status)
    time.sleep(30)
    continue

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


# 3. Glove embedding

We will be using the glove embedding to initiate the values of the word tokens. The GloVe embeddings are downloaded from here : https://nlp.stanford.edu/projects/glove/

This data is made available under the Public Domain Dedication and License v1.0 whose full text can be found at: http://www.opendatacommons.org/licenses/pddl/1.0/.


Let's start by pulling the glove embeddings locally then pushing them to S3 using the following commands:

In [18]:
%%bash
# This might time a few minutes
mkdir /tmp/GloVe
curl -Lo /tmp/GloVe/glove.840B.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
unzip /tmp/GloVe/glove.840B.zip -d /tmp/GloVe/
rm /tmp/GloVe/glove.840B.zip

Archive:  /tmp/GloVe/glove.840B.zip
  inflating: /tmp/GloVe/glove.840B.300d.txt  


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0 2075M    0  221k    0     0   223k      0  2:38:48 --:--:--  2:38:48  223k  1 2075M    1 21.1M    0     0  10.7M      0  0:03:13  0:00:01  0:03:12 21.4M  1 2075M    1 30.3M    0     0  9922k      0  0:03:34  0:00:03  0:03:31 14.1M  1 2075M    1 35.0M    0     0  9006k      0  0:03:56  0:00:03  0:03:53 11.6M  2 2075M    2 42.2M    0     0  7844k      0  0:04:30  0:00:05  0:04:25 9521k  2 2075M    2 46.2M    0     0  7501k      0  0:04:43  0:00:06  0:04:37 8863k  2 2075M    2 50.1M    0     0  7235k      0  0:04:53  0:00:07  0:04:46 5788k  2 2075M    2 55.8M    0     0  7185k      0  0:

In [20]:
boto3.client("s3").upload_file("/tmp/GloVe/glove.840B.300d.txt",
                           bucket_name, "search_knn_blog/artefacts/glove.840B.300d.txt")
print(bucket_name)

sagemaker-knn-benfelip


We can now go ahead and craete a processing job name that will parse the vocabulary generated in the previous section and output a trimmed version of the glove embeddings based on our vocabulary.


In [21]:
from sagemaker.processing import ScriptProcessor
script_processor = ScriptProcessor(
            image_uri=ecr_uri,
                role=sagemaker.get_execution_role(),
                instance_count=1,
                instance_type='ml.m5.xlarge',
                command=["python3"])


In [22]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

now = datetime.datetime.utcnow()
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")
glove_job_name = f"search-glove-{run_id}"

s3_code_path=f"s3://{bucket_name}/search_knn_blog/code/"

script_processor.run(job_name=glove_job_name,
                     code='../src/glove_embeddings_main.py',
                      inputs=[ProcessingInput(
                        source=s3_code_path,
                        destination='/opt/ml/processing/input/code/search_utils/'),
                          ProcessingInput(
                        source=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{preprocess_job_name}/",
                        destination='/opt/ml/processing/input_vocabulary'),
                             ProcessingInput(
                        source=f"s3://{bucket_name}/search_knn_blog/artefacts/glove.840B.300d.txt",
                        destination='/opt/ml/processing/input_glove')
                             ],
                      outputs=[
                        ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{glove_job_name}/",
                                        output_name='trimmed_glove',
                                        source='/opt/ml/processing/trimmed_glove'),
                        ProcessingOutput(destination=f"s3://{bucket_name}/search_knn_blog/sagemaker-runs/{glove_job_name}/",
                                        output_name='vocab',
                                        source='/opt/ml/processing/vocab')
                                ],
                      arguments=['--train-test-split-ratio', '0.2'],
                     wait=False)

run id : 2106302153111456

Job Name:  search-glove-2106302153111456
Inputs:  [{'InputName': 'input-1', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-knn-benfelip/search_knn_blog/code/', 'LocalPath': '/opt/ml/processing/input/code/search_utils/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-2', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-knn-benfelip/search_knn_blog/sagemaker-runs/search-preprocess-2106302056404918/', 'LocalPath': '/opt/ml/processing/input_vocabulary', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'input-3', 'AppManaged': False, 'S3Input': {'S3Uri': 's3://sagemaker-knn-benfelip/search_knn_blog/artefacts/glove.840B.300d.txt', 'LocalPath': '/opt/ml/processing/input_glove', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyRep

In [23]:
status = boto3.client("sagemaker").describe_processing_job(ProcessingJobName=glove_job_name)["ProcessingJobStatus"]

while status == 'InProgress':  
    status =  boto3.client("sagemaker").describe_processing_job(ProcessingJobName=glove_job_name)["ProcessingJobStatus"]
    print(status)
 
    time.sleep(30)
    continue

InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [24]:
print(f"This is the processing job name you will need during inference : {glove_job_name}")

This is the processing job name you will need during inference : search-glove-2106302153111456


# 4. Training 

In [25]:
word_to_id = helpers.read_json_from_s3(bucket_name, f"search_knn_blog/sagemaker-runs/{glove_job_name}/vocab.json")

In [26]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.session import s3_input

now = datetime.datetime.utcnow()
now_string = now.strftime('%y%m%d%H%M%S%f')
run_id = now_string[:-2]
print(f"run id : {run_id}")

training_job_name = f"search-training-{run_id}"
output_path = os.path.join(f"s3://{bucket_name}/search_knn_blog/sagemaker-runs", training_job_name)

regressor = sagemaker.estimator.Estimator(get_image_uri(boto3.Session().region_name, 'object2vec'),
                                          sagemaker.get_execution_role(), 
                                          train_instance_count=1, 
                                          train_instance_type='ml.p3.8xlarge',
                                          output_path=output_path,
                                          sagemaker_session=sagemaker.Session())


hyperparameters = {
  "enc_dim": 512, #The dimension of the output of the embedding layer.
  "mlp_dim": 256, #The dimension of the output from MLP layers.
  "mlp_activation": "linear",
  "mlp_layers": 2,
    
  "output_layer" : "softmax",#classification task
  "num_classes": 2,#0 and 1

  "optimizer" : "adam",
  "learning_rate" : 0.0004,
  "mini_batch_size": 256,
  "epochs" : 20,

  "enc0_max_seq_len": 200,
  "enc1_max_seq_len": 200,

  "enc0_network": "bilstm", #The network model for the enc0 encoder.
  "enc1_network": "enc0", #same as enc0_network

  "enc0_token_embedding_dim": 300, #The output dimension of the enc0 token embedding layer.
  "enc1_token_embedding_dim": 300, #The output dimension of the enc1 token embedding layer.
    
  "enc0_vocab_file" : "vocab.json", #The vocabulary file for mapping pretrained enc0 token embedding vectors to numerical vocabulary IDs.
  "enc1_vocab_file" : "vocab.json", #same as enc0_vocab_file

  "enc0_vocab_size" : len(word_to_id),#The vocabulary size of enc0 tokens.
  "enc1_vocab_size" : len(word_to_id),#The vocabulary size of enc1 tokens.
    
  "enc0_pretrained_embedding_file" : "trimmed_glove.txt",
  "enc1_pretrained_embedding_file" : "trimmed_glove.txt"
    
}

input_channels = {}
s3_client = boto3.client('s3')

input_channels["train"] = s3_input(os.path.join(f"s3://{bucket_name}/search_knn_blog/sagemaker-runs",\
                                                preprocess_job_name,
                                                "numerical_train_data.jsonl"),
                         distribution='FullyReplicated', 
                         content_type='application/jsonlines')

input_channels["test"] = s3_input(os.path.join(f"s3://{bucket_name}/search_knn_blog/sagemaker-runs",\
                                                preprocess_job_name,
                                                "numerical_test_data.jsonl"),
                         distribution='FullyReplicated', 
                         content_type='application/jsonlines')

input_channels['auxiliary'] = s3_input(os.path.join(f"s3://{bucket_name}/search_knn_blog/sagemaker-runs",\
                                            glove_job_name), 
                             distribution='FullyReplicated', content_type='application/json')


The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
train_instance_count has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
train_instance_type has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
The class sagemaker.session.s3_input has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


run id : 2106302201087225


In [27]:
regressor.set_hyperparameters(**hyperparameters)
regressor.fit(input_channels, job_name=training_job_name, wait=False)

In [None]:
status = boto3.client("sagemaker").describe_training_job(TrainingJobName=training_job_name)["TrainingJobStatus"]

while status == 'InProgress':  
    status =  boto3.client("sagemaker").describe_training_job(TrainingJobName=training_job_name)["TrainingJobStatus"]
    print(status)
 
    time.sleep(30)
    continue


InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
InProgress
Completed


In [29]:
print(f"This is the training job name you will need during inference : {training_job_name}")

This is the training job name you will need during inference : search-training-2106302201087225


In [31]:
dict_metrics = [{metric["MetricName"]:metric['Value']} for metric in boto3.client("sagemaker").describe_training_job(TrainingJobName=training_job_name)["FinalMetricDataList"]]

In [32]:
dict_metrics

[{'train:accuracy': 1.0},
 {'train:progress': 100.0},
 {'test:cross_entropy': 0.39396604895591736},
 {'test:accuracy': 0.9140625},
 {'train:cross_entropy': 0.0004477451147977263},
 {'train:throughput': 777.2449340820312}]