In this tutorial, we build a simple matrix factorization model using the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TensorFlow Recommenders (TFRS). We can then use this model to recommend movies for a given user.

In [1]:
import boto3
import sagemaker
import pandas as pd

# SageMaker session, its default S3 bucket, and the execution role used later in the notebook.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Region comes from the ambient boto3 configuration.
region = boto3.Session().region_name

# Low-level SageMaker API client; also handed to the Experiments SDK calls below.
sm = boto3.Session().client(
    service_name='sagemaker',
    region_name=region,
)

# Create the Experiment

In [2]:
import time
from smexperiments.experiment import Experiment

# Suffix the experiment name with the current epoch second so each run gets its own name.
timestamp = int(time.time())

experiment = Experiment.create(
    experiment_name=f'Movie-Lens-Recommendations-{timestamp}',
    description='Movie Lens Recommendations',
    sagemaker_boto_client=sm,
)

# Keep the generated name around; later cells refer to the experiment by name.
experiment_name = experiment.experiment_name
print(f'Experiment name: {experiment_name}')

Experiment name: Movie-Lens-Recommendations-1602991013


# Create the `Trial`

In [3]:
import time
from smexperiments.trial import Trial

# Fresh timestamp so the trial name is unique per run.
timestamp = int(time.time())

# Register a new trial under the experiment created above.
trial = Trial.create(
    trial_name=f'trial-{timestamp}',
    experiment_name=experiment_name,
    sagemaker_boto_client=sm,
)

trial_name = trial.trial_name
print(f'Trial name: {trial_name}')

Trial name: trial-1602991014


In [4]:
# NOTE(review): disabled cell. `tracker_prepare` and `raw_input_data_s3_uri` are not
# defined in the visible part of this notebook; define them before re-enabling.
# tracker_prepare.log_input(name='raw_data_s3_uri', 
#                           media_type='s3/uri', 
#                           value=raw_input_data_s3_uri)

# # must save after logging
# tracker_prepare.trial_component.save()

In [5]:
# NOTE(review): disabled cell. `tracker_prepare` is not defined in the visible part of
# this notebook; define it before re-enabling.
# tracker_prepare.log_parameters({
#     'movie_lens_dataset': '100k',
# })

# # must save after logging
# tracker_prepare.trial_component.save()

# Specify S3 `Distribution Strategy`

In [6]:
# NOTE(review): disabled cell. The `processed_*_data_s3_uri` variables are not defined in
# the visible part of this notebook, and the matching `inputs=` argument of
# `estimator.fit(...)` is commented out as well; re-enable both together.
# from sagemaker.inputs import TrainingInput

# s3_input_train_data = TrainingInput(s3_data=processed_train_data_s3_uri, 
#                                          distribution='ShardedByS3Key') 
# s3_input_validation_data = TrainingInput(s3_data=processed_validation_data_s3_uri, 
#                                               distribution='ShardedByS3Key')
# s3_input_test_data = TrainingInput(s3_data=processed_test_data_s3_uri, 
#                                         distribution='ShardedByS3Key')

# print(s3_input_train_data.config)
# print(s3_input_validation_data.config)
# print(s3_input_test_data.config)

# Set Up Hyper-Parameters for Training

In [7]:
# Hyper-parameters forwarded to the training entry point.
epochs = 100
learning_rate = 0.5
use_xla = True
use_amp = True
enable_tensorboard = False

# Training infrastructure and data input mode for the estimator.
train_instance_count = 1
train_instance_type = 'ml.p3.16xlarge'
input_mode = 'Pipe'

# Set Up Metrics to Track Model Performance

These sample log lines...
```
45/50 [=====>..] - ETA: 3s - loss: 0.425 - accuracy: 0.881
50/50 [=======>] - ETA: 0s - val_loss: 0.407 - val_accuracy: 0.885
```
...will produce the following 4 metrics in CloudWatch:

`train:loss` = 0.425

`train:accuracy` = 0.881

`validation:loss` = 0.407

`validation:accuracy` = 0.885

In [8]:
# Regexes applied to the training job's log output to extract the metrics named below.
#
# Each pattern starts with a `\b` word boundary. Without it, `loss: ` is a substring of
# `val_loss: ` (and `accuracy: ` of `val_accuracy: `), so the `train:*` patterns would
# also capture the validation values. `_` is a word character, so `\bloss` cannot match
# inside `val_loss`.
# NOTE(review): this also stops matching any other `<prefix>_loss` / `<prefix>_accuracy`
# entries; confirm against the names src/train.py actually logs.
metrics_definitions = [
    {'Name': 'train:loss', 'Regex': r'\bloss: ([0-9\.]+)'},
    {'Name': 'train:accuracy', 'Regex': r'\baccuracy: ([0-9\.]+)'},
    {'Name': 'validation:loss', 'Regex': r'\bval_loss: ([0-9\.]+)'},
    {'Name': 'validation:accuracy', 'Regex': r'\bval_accuracy: ([0-9\.]+)'},
]

# Set Up Our TensorFlow Recommenders (TFRS) Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service

In [9]:
# Print the training entry point (src/train.py) with syntax highlighting.
!pygmentize src/train.py

[34mimport[39;49;00m [04m[36mtime[39;49;00m
[34mimport[39;49;00m [04m[36mrandom[39;49;00m
[34mimport[39;49;00m [04m[36mpandas[39;49;00m [34mas[39;49;00m [04m[36mpd[39;49;00m
[34mfrom[39;49;00m [04m[36mglob[39;49;00m [34mimport[39;49;00m glob
[34mimport[39;49;00m [04m[36mpprint[39;49;00m
[34mimport[39;49;00m [04m[36margparse[39;49;00m
[34mimport[39;49;00m [04m[36mjson[39;49;00m
[34mimport[39;49;00m [04m[36msubprocess[39;49;00m
[34mimport[39;49;00m [04m[36msys[39;49;00m
[34mimport[39;49;00m [04m[36mos[39;49;00m

subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33minstall[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mscikit-learn==0.23.1[39;49;00m[33m'[39;49;00m])
subprocess.check_call([sys.executable, [33m'[39;49;00m[33m-m[39;49;00m[33m'[39;49;00m, [33m'[39;49;00m[33mpip[39;49;00m[33m'[39;49;00m,

In [10]:
from sagemaker.tensorflow import TensorFlow

# TensorFlow estimator that runs src/train.py with the instance settings, hyper-parameters
# and metric regexes configured in the cells above.
estimator = TensorFlow(
    entry_point='train.py',
    source_dir='src',
    role=role,
    instance_count=train_instance_count,
    instance_type=train_instance_type,
    py_version='py37',
    framework_version='2.3.0',
    # Passed through to the entry point; how train.py consumes them is not shown here.
    hyperparameters=dict(
        epochs=epochs,
        learning_rate=learning_rate,
        use_xla=use_xla,
        use_amp=use_amp,
        enable_tensorboard=enable_tensorboard,
    ),
    input_mode=input_mode,
    metric_definitions=metrics_definitions,
)

# Create the `Experiment Config`

In [11]:
# Ties the training job to the experiment and trial created earlier; the job's trial
# component is given the display name 'train'.
experiment_config = dict(
    ExperimentName=experiment_name,
    TrialName=trial.trial_name,
    TrialComponentDisplayName='train',
)

# Train the Model on SageMaker

In [12]:
# Start the training job without blocking the notebook (wait=False).
# No `inputs` channels are passed; the earlier channel wiring is kept here for reference:
#   inputs={'train': s3_input_train_data,
#           'validation': s3_input_validation_data,
#           'test': s3_input_test_data}
estimator.fit(experiment_config=experiment_config, wait=False)

INFO:sagemaker:Creating training-job with name: tensorflow-training-2020-10-18-03-16-55-940


In [13]:
# Keep the generated job name; the console links and the S3 download below are built from it.
training_job_name = estimator.latest_training_job.name
print(f'Training Job Name:  {training_job_name}')

Training Job Name:  tensorflow-training-2020-10-18-03-16-55-940


In [14]:
from IPython.core.display import display, HTML

# Link to the training job in the SageMaker console.
# `target="_blank"` (with the underscore) opens a new tab on each click; the bare name
# "blank" refers to a single named window that every such link would reuse.
display(HTML(
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/jobs/{}">Training Job</a></b>'
    .format(region, training_job_name)
))


In [15]:
from IPython.core.display import display, HTML

# Link to the training job's CloudWatch log streams (filtered by the job-name prefix).
# `target="_blank"` (with the underscore) opens a new tab on each click; the bare name
# "blank" refers to a single named window that every such link would reuse.
display(HTML(
    '<b>Review <a target="_blank" href="https://console.aws.amazon.com/cloudwatch/home?region={}#logStream:group=/aws/sagemaker/TrainingJobs;prefix={};streamFilter=typeLogStreamPrefix">CloudWatch Logs</a></b>'
    .format(region, training_job_name)
))


In [16]:
from IPython.core.display import display, HTML

# Link to the job's prefix in the S3 console.
# `target="_blank"` (with the underscore) opens a new tab on each click; the bare name
# "blank" refers to a single named window that every such link would reuse.
display(HTML(
    '<b>Review <a target="_blank" href="https://s3.console.aws.amazon.com/s3/buckets/{}/{}/?region={}&tab=overview">S3 Output Data</a> After The Training Job Has Completed</b>'
    .format(bucket, training_job_name, region)
))


In [None]:
# Block here until the training job started above finishes (called with logs=False).
estimator.latest_training_job.wait(logs=False)


2020-10-18 03:16:57 Starting - Starting the training job
2020-10-18 03:16:59 Starting - Launching requested ML instances................
2020-10-18 03:18:23 Starting - Preparing the instances for training.................
2020-10-18 03:19:53 Downloading - Downloading input data
2020-10-18 03:20:00 Training - Downloading the training image...................
2020-10-18 03:21:39 Training - Training image download completed. Training in progress..................

# Wait Until the ^^ Training Job ^^ Completes Above!

In [None]:
# Copy the model archive from the job's output prefix in the bucket to the working directory.
!aws s3 cp s3://$bucket/$training_job_name/output/model.tar.gz ./model.tar.gz

In [None]:
# Unpack the downloaded archive into ./model/ (created if it does not exist yet).
!mkdir -p ./model/
!tar -xvzf ./model.tar.gz -C ./model/

In [None]:
# Inspect the extracted SavedModel with `saved_model_cli show --all`.
!saved_model_cli show --all --dir ./model/tensorflow/saved_model/0/

# Show the Experiment Tracking Lineage

In [None]:
from sagemaker.analytics import ExperimentAnalytics

# Query the trial components recorded under this experiment, ordered by creation time
# (ascending) and restricted to the `validation:accuracy` metric.
lineage_table = ExperimentAnalytics(
    experiment_name=experiment_name,
    sagemaker_session=sess,
    metric_names=['validation:accuracy'],
    sort_by='CreationTime',
    sort_order='Ascending',
)

# Materialize the result as a DataFrame and show its shape.
lineage_df = lineage_table.dataframe()
lineage_df.shape

In [None]:
# Display the lineage table built in the previous cell.
lineage_df

In [None]:
# Describe the trial component listed in the first row of the lineage table.
sm.describe_trial_component(
    TrialComponentName=lineage_df['TrialComponentName'][0],
)

# Analyze Debugger Rules

In [None]:
# Show the rule job summary for the latest training job.
# NOTE(review): the estimator above is created without an explicit `rules=` argument;
# confirm which rules (if any) are expected to appear here.
estimator.latest_training_job.rule_job_summary()

In [None]:
# Look up and print where the latest job's debugger artifacts are stored.
training_job_debugger_artifacts_path = estimator.latest_job_debugger_artifacts_path()
print(training_job_debugger_artifacts_path)

# Pass Variables to the Next Notebook(s)

In [None]:
%%javascript
// Save a checkpoint of this notebook, then delete the notebook's session.
// NOTE(review): deleting the session presumably shuts down the kernel to free resources; confirm.
Jupyter.notebook.save_checkpoint();
Jupyter.notebook.session.delete();