In this tutorial, we build a simple matrix factorization model on the [MovieLens 100K dataset](https://grouplens.org/datasets/movielens/100k/) with TensorFlow Recommenders (TFRS), and train and tune it on Amazon SageMaker.

We will use this model to recommend movies for a given user.

In [1]:
!pip install -q sagemaker==2.9.2
!pip install -q sagemaker-experiments==0.1.24
!pip install -q tensorflow==2.3.0
!pip install -q tensorflow-recommenders==0.2.0
!pip install -q tensorflow-datasets==4.0.0

In [2]:
import boto3
import sagemaker
import pandas as pd

# Region of the default boto3 session, and a low-level SageMaker client bound to it.
region = boto3.Session().region_name
sm = boto3.Session().client(service_name='sagemaker', region_name=region)

# SageMaker SDK session, its default S3 bucket, and the execution role reported by the SDK.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()

# Specify Input Data S3 URI and `Distribution Strategy`

In [3]:
from sagemaker.inputs import TrainingInput

# S3 prefix inside the session's default bucket that is used as the training input.
input_train_data_s3_uri = 's3://{}/tensorflow_datasets/train/'.format(bucket)

# Training channel over that prefix, configured with the 'ShardedByS3Key' distribution.
s3_input_train_data = TrainingInput(
    distribution='ShardedByS3Key',
    s3_data=input_train_data_s3_uri,
)

# Show the channel configuration that will be sent to SageMaker.
print(s3_input_train_data.config)

{'DataSource': {'S3DataSource': {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sagemaker-us-east-1-835319576252/tensorflow_datasets/train/', 'S3DataDistributionType': 'ShardedByS3Key'}}}


# Setup Metrics To Track Model Performance

These sample log lines...
```
499/500 [=====>..] - ETA: 3s - root_mean_squared_error: 1.194 - factorized_top_k/top_10_categorical_accuracy: 0.481 - factorized_top_k/top_50_categorical_accuracy: 0.607 - factorized_top_k/top_100_categorical_accuracy: 0.885
```
...will produce the following metrics in CloudWatch:

`root_mean_squared_error` = 1.194

`top_10_categorical_accuracy` = 0.481

`top_50_categorical_accuracy` = 0.607

`top_100_categorical_accuracy` = 0.885

In [4]:
# Capture group shared by every metric: a run of digits and dots following the metric label.
_METRIC_VALUE_PATTERN = '([0-9\\.]+)'

# One entry per metric scraped from the training log. 'Name' is the metric name,
# 'Regex' is the log label followed by the value capture group.
metrics_definitions = [
    {
        'Name': 'root_mean_squared_error',
        'Regex': 'root_mean_squared_error: ' + _METRIC_VALUE_PATTERN,
    },
] + [
    {
        'Name': 'top_{}_categorical_accuracy'.format(k),
        'Regex': 'factorized_top_k/top_{}_categorical_accuracy: '.format(k) + _METRIC_VALUE_PATTERN,
    }
    for k in (10, 50, 100)
]

# Setup Our TensorFlow Script to Run on SageMaker
Prepare our TensorFlow model to run on the managed SageMaker service

# Setup Manual Hyper-Parameters

In [5]:
# Training infrastructure: number and type of instances per training job.
train_instance_type = 'ml.p3.2xlarge'
train_instance_count = 1

# Manually set hyperparameters handed to the estimator.
dataset_variant = '100k'  # movielens 100k, 1m, 20m, 25m, etc
enable_tensorboard = True

In [6]:
from sagemaker.tensorflow import TensorFlow

# Manually set hyperparameters handed to the estimator.
static_hyperparameters = {
    'dataset_variant': dataset_variant,
    'enable_tensorboard': enable_tensorboard,
}

# TensorFlow 2.3.0 / Python 3.7 estimator whose entry point is src/train.py.
# The metric definitions tell SageMaker which values to scrape from the training log.
estimator = TensorFlow(
    entry_point='train.py',
    source_dir='src',
    role=role,
    instance_type=train_instance_type,
    instance_count=train_instance_count,
    framework_version='2.3.0',
    py_version='py37',
    hyperparameters=static_hyperparameters,
    metric_definitions=metrics_definitions,
    debugger_hook_config=False,
)

# Setup Hyper-Parameter Ranges to Explore

# Optimize the Model on SageMaker

In [7]:
from sagemaker.tuner import IntegerParameter
from sagemaker.tuner import ContinuousParameter
from sagemaker.tuner import CategoricalParameter
from sagemaker.tuner import HyperparameterTuner
                                                
# Search space for the tuner: two categorical choices and one linearly scaled continuous range.
epochs_choices = CategoricalParameter([1, 2])
learning_rate_range = ContinuousParameter(1e-5, 5e-5, scaling_type='Linear')
embedding_dimension_choices = CategoricalParameter([32, 64])

hyperparameter_ranges = {
    'epochs': epochs_choices,
    'learning_rate': learning_rate_range,
    'embedding_dimension': embedding_dimension_choices,
}

In [8]:
# Metric the tuner optimizes; it must be one of the names in `metrics_definitions`.
objective_metric_name = 'root_mean_squared_error'

# Bayesian search that minimizes the objective: 6 training jobs in total,
# at most 2 running at once, with early stopping disabled.
tuner = HyperparameterTuner(
    estimator=estimator,
    objective_metric_name=objective_metric_name,
    objective_type='Minimize',
    metric_definitions=metrics_definitions,
    hyperparameter_ranges=hyperparameter_ranges,
    strategy='Bayesian',
    max_jobs=6,
    max_parallel_jobs=2,
    early_stopping_type='Off',
)

In [9]:
# Start the tuning job on the 'train' channel; wait=False returns without blocking the notebook.
training_channels = {'train': s3_input_train_data}
tuner.fit(inputs=training_channels, include_cls_metadata=False, wait=False)

In [10]:
from pprint import pprint

tuning_job_name = tuner.latest_tuning_job.job_name

# Fetch the current description of the tuning job and print it in full.
job_description = sm.describe_hyper_parameter_tuning_job(
    HyperParameterTuningJobName=tuning_job_name
)
status = job_description['HyperParameterTuningJobStatus']
pprint(job_description)

# Unless the job has completed, report how many training jobs are done
# and which one is the best candidate so far (if any has reported a result).
if status != 'Completed':
    job_count = job_description['TrainingJobStatusCounters']['Completed']
    print('Not yet complete, but {} jobs have completed.'.format(job_count))

    best_job = job_description.get('BestTrainingJob')
    if not best_job:
        print("No training jobs have reported results yet.")
    else:
        print("Best candidate:")
        pprint(best_job['TrainingJobName'])
        pprint(best_job['FinalHyperParameterTuningJobObjectiveMetric'])

{'CreationTime': datetime.datetime(2020, 11, 29, 23, 2, 5, 672000, tzinfo=tzlocal()),
 'HyperParameterTuningJobArn': 'arn:aws:sagemaker:us-east-1:835319576252:hyper-parameter-tuning-job/tensorflow-training-201129-2302',
 'HyperParameterTuningJobConfig': {'HyperParameterTuningJobObjective': {'MetricName': 'root_mean_squared_error',
                                                                        'Type': 'Minimize'},
                                   'ParameterRanges': {'CategoricalParameterRanges': [{'Name': 'epochs',
                                                                                       'Values': ['"1"',
                                                                                                  '"2"']},
                                                                                      {'Name': 'embedding_dimension',
                                                                                       'Values': ['"32"',
                                 

In [11]:
# Import from the public `IPython.display` module; importing these names from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML

# Clickable link to this tuning job in the SageMaker console.
# target="_blank" is the reserved keyword that opens a new tab; a bare "blank" is
# only a window name, so it would reuse a single window for every click.
console_link = '<b>Review <a target="_blank" href="https://console.aws.amazon.com/sagemaker/home?region={}#/hyper-tuning-jobs/{}">Hyper-Parameter Tuning Job</a></b>'.format(region, tuning_job_name)
display(HTML(console_link))

In [12]:
%%time

# Block until the latest hyperparameter tuning job finishes; %%time reports how long the wait took.
tuner.wait()

.............................................................................................................................................................................................................................!
CPU times: user 1.02 s, sys: 122 ms, total: 1.14 s
Wall time: 18min 38s


In [13]:
from sagemaker.analytics import HyperparameterTuningJobAnalytics

# Load the results of the tuning job's training jobs into a DataFrame.
hp_results = HyperparameterTuningJobAnalytics(
    hyperparameter_tuning_job_name=tuning_job_name,
    sagemaker_session=sess,
)
df_results = hp_results.dataframe()

# (rows, columns) of the results table.
df_results.shape

(6, 9)

# All Candidates

In [14]:
# All candidates, best first: the objective is minimized, so the lowest value ranks highest.
df_results.sort_values(by='FinalObjectiveValue', ascending=True)

Unnamed: 0,embedding_dimension,epochs,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,"""64""","""2""",5e-05,tensorflow-training-201129-2302-005-aec2a92a,Completed,3.6797,2020-11-29 23:15:59+00:00,2020-11-29 23:19:43+00:00,224.0
0,"""64""","""2""",4.2e-05,tensorflow-training-201129-2302-006-e3e871b7,Completed,3.6804,2020-11-29 23:16:01+00:00,2020-11-29 23:18:41+00:00,160.0
3,"""64""","""1""",4.8e-05,tensorflow-training-201129-2302-003-4457b41d,Completed,3.6813,2020-11-29 23:10:50+00:00,2020-11-29 23:13:25+00:00,155.0
2,"""64""","""1""",4.3e-05,tensorflow-training-201129-2302-004-89000343,Completed,3.6815,2020-11-29 23:10:52+00:00,2020-11-29 23:13:23+00:00,151.0
4,"""64""","""1""",2.2e-05,tensorflow-training-201129-2302-002-621bcafa,Completed,3.6826,2020-11-29 23:05:04+00:00,2020-11-29 23:07:44+00:00,160.0
5,"""32""","""1""",1.4e-05,tensorflow-training-201129-2302-001-951e5119,Completed,3.6887,2020-11-29 23:04:58+00:00,2020-11-29 23:07:39+00:00,161.0


# Best Candidate

In [15]:
# Best candidate only: the top row after sorting by the (minimized) objective value.
df_results.sort_values(by='FinalObjectiveValue', ascending=True).head(1)

Unnamed: 0,embedding_dimension,epochs,learning_rate,TrainingJobName,TrainingJobStatus,FinalObjectiveValue,TrainingStartTime,TrainingEndTime,TrainingElapsedTimeSeconds
1,"""64""","""2""",5e-05,tensorflow-training-201129-2302-005-aec2a92a,Completed,3.6797,2020-11-29 23:15:59+00:00,2020-11-29 23:19:43+00:00,224.0


In [22]:
# Name of the training job with the lowest objective value.
# Select the top row by position (.iloc[0]), not by index label: the row labels of
# df_results are unrelated to the ranking, so a fixed label would either raise
# KeyError or silently depend on which label the best row happens to carry.
best_candidate = df_results.sort_values('FinalObjectiveValue', ascending=True).iloc[0]
recommender_training_job_name = best_candidate['TrainingJobName']

print(recommender_training_job_name)

tensorflow-training-201129-2302-005-aec2a92a


# Pass to Next Notebook(s)

In [23]:
# Persist the best training job's name with IPython's %store so that later notebooks can restore it with `%store -r`.
%store recommender_training_job_name

Stored 'recommender_training_job_name' (str)
