<h2>Factorization Machines - Movie Recommendation Model</h2>
Input Features: [userId, movieId] <br>
Target: rating <br>

In [1]:
import numpy as np
import pandas as pd

# Define IAM role
import boto3
import re
import sagemaker
from sagemaker import get_execution_role

# SageMaker SDK Documentation: http://sagemaker.readthedocs.io/en/latest/estimators.html

## Upload Data to S3

In [2]:
# Specify your bucket name and the object keys of the RecordIO data files
bucket_name = 'sagemaker-gopi'
training_file_key = 'movie/user_movie_train.recordio'
test_file_key = 'movie/user_movie_test.recordio'

# S3 URIs derived from the bucket name and keys above
s3_model_output_location = f's3://{bucket_name}/movie/model'
s3_training_file_location = f's3://{bucket_name}/{training_file_key}'
s3_test_file_location = f's3://{bucket_name}/{test_file_key}'

In [3]:
# Feature dimension: number of unique users + number of unique movies in our dataset.
# Start from 0, then replace it with the value stored in the dimension file
# (the file used for training).
dim_movie = 0

with open(r'ml-latest-small/movie_dimension.txt','r') as dimension_file:
    dimension_text = dimension_file.read()
    dim_movie = int(dimension_text)

In [4]:
# Display the feature dimension read from the dimension file
dim_movie

10334

In [5]:
# Print the model output, training data and test data locations, one per line
for s3_location in (s3_model_output_location,
                    s3_training_file_location,
                    s3_test_file_location):
    print(s3_location)

s3://sagemaker-gopi/movie/model
s3://sagemaker-gopi/movie/user_movie_train.recordio
s3://sagemaker-gopi/movie/user_movie_test.recordio


In [6]:
# Writing to and reading from S3 is just as easy
# Files are referred to as objects in S3.
# The file name is referred to as the key name in S3.
# Files stored in S3 are automatically replicated across 3 different availability zones 
# in the region where the bucket was created.

# http://boto3.readthedocs.io/en/latest/guide/s3.html
def write_to_s3(filename, bucket, key):
    """Upload a local file to an S3 object.

    Args:
        filename: path of the local file to upload.
        bucket: name of the destination S3 bucket.
        key: object key to write inside the bucket.

    Returns:
        Whatever ``upload_fileobj`` returns for the target object.
    """
    # Open in binary mode so the bytes are uploaded unchanged
    with open(filename,'rb') as source:
        target_object = boto3.Session().resource('s3').Bucket(bucket).Object(key)
        return target_object.upload_fileobj(source)

In [7]:
# Upload the local training RecordIO file to the training key in the bucket
write_to_s3(r'ml-latest-small/user_movie_train.recordio',bucket_name,training_file_key)

In [8]:
# Upload the local test RecordIO file to the test key in the bucket
write_to_s3(r'ml-latest-small/user_movie_test.recordio',bucket_name,test_file_key)

## Training Algorithm Docker Image
### AWS Maintains a separate image for every region and algorithm

In [9]:
# Use Spot Instance - Save up to 90% of training cost by using spot instances when compared to on-demand instances
# Reference: https://github.com/aws-samples/amazon-sagemaker-managed-spot-training/blob/main/xgboost_built_in_managed_spot_training_checkpointing/xgboost_built_in_managed_spot_training_checkpointing.ipynb

# if you are still on two-month free-tier you can use the on-demand instance by setting:
#   use_spot_instances = False

# Spot training is switched on for this run
use_spot_instances = True

# Both limits are expressed in seconds; max_wait is only set for spot training
max_run = 3600
max_wait = 3600 if use_spot_instances else None

job_name = 'fm-movie-v4'

# A checkpoint location is only configured for spot training; otherwise it stays None
checkpoint_s3_uri = (
    f's3://{bucket_name}/movie/checkpoints/{job_name}'
    if use_spot_instances
    else None
)

print (f'Checkpoint uri: {checkpoint_s3_uri}')

Checkpoint uri: s3://sagemaker-gopi/movie/checkpoints/fm-movie-v4


In [10]:
# SageMaker session; later used to look up the region and passed to the estimator
sess = sagemaker.Session()

In [11]:
# IAM execution role that is handed to the estimator
role = get_execution_role()

In [12]:
# This role contains the permissions needed to train and deploy models.
# The SageMaker service is trusted to assume this role.
# NOTE(review): the printed ARN includes the AWS account id - consider clearing
# this cell's output before sharing the notebook.
print(role)

arn:aws:iam::512185592969:role/onshore-sagemaker-role


In [13]:
# https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html#sagemaker.image_uris.retrieve

# SDK 2: image_uris.retrieve returns the container image location

# Look up the factorization-machines image for the region of the current session
container = sagemaker.image_uris.retrieve(
    framework="factorization-machines",
    region=sess.boto_region_name,
)

print (f'Using FM Container {container}')

Using FM Container 382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:1


In [14]:
# Display the resolved container image URI
container

'382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:1'

## Build Model

In [15]:
# Configure the training job:
#   - type and number of instances to use
#   - S3 location where the final artifacts need to be stored
#   - spot training settings defined earlier (time limits and checkpoint location)
#
#   Reference: http://sagemaker.readthedocs.io/en/latest/estimators.html
#
# SDK 2.x version does not require train prefix for instance count and type

estimator = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    output_path=s3_model_output_location,
    sagemaker_session=sess,
    base_job_name=job_name,
    use_spot_instances=use_spot_instances,
    max_run=max_run,
    max_wait=max_wait,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

### New Configuration after Model Tuning
### Refer to Hyperparameter Tuning Lecture on how to optimize hyperparameters

In [16]:
# Hyperparameters for the factorization-machines regressor.
# feature_dim comes from the dimension file; the remaining values are fixed here.
estimator.set_hyperparameters(**{
    'feature_dim': dim_movie,
    'num_factors': 8,
    'predictor_type': 'regressor',
    'mini_batch_size': 994,
    'epochs': 91,
    'bias_init_method': 'normal',
    'bias_lr': 0.21899531189430518,
    'factors_init_method': 'normal',
    'factors_lr': 5.357593337770278e-05,
    'linear_init_method': 'normal',
    'linear_lr': 0.00021524948053767607,
})

In [17]:
# Display the hyperparameters currently set on the estimator
estimator.hyperparameters()

{'feature_dim': 10334,
 'num_factors': 8,
 'predictor_type': 'regressor',
 'mini_batch_size': 994,
 'epochs': 91,
 'bias_init_method': 'normal',
 'bias_lr': 0.21899531189430518,
 'factors_init_method': 'normal',
 'factors_lr': 5.357593337770278e-05,
 'linear_init_method': 'normal',
 'linear_lr': 0.00021524948053767607}

### Train the model

In [18]:
# Train with the hyperparameters set above.
# Each channel name maps to the S3 location of its data.
# Reference: Supported channels by algorithm
#   https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-algo-docker-registry-paths.html
estimator.fit(
    inputs={
        'train': s3_training_file_location,
        'test': s3_test_file_location,
    }
)

INFO:sagemaker:Creating training-job with name: fm-movie-v4-2023-06-09-14-05-39-444


2023-06-09 14:05:40 Starting - Starting the training job......
2023-06-09 14:06:24 Starting - Preparing the instances for training......
2023-06-09 14:07:42 Downloading - Downloading input data
2023-06-09 14:07:42 Training - Downloading the training image...............
2023-06-09 14:10:08 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/09/2023 14:10:20 INFO 140199547356992] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'epochs': 1, 'mini_batch_size': '1000', 'use_bias': 'true', 'use_linear': 'true', 'bias_lr': '0.1', 'linear_lr': '0.001', 'factors_lr': '0.0001', 'bias_wd': '0.01', 'linear_wd': '0.001', 'factors_wd': '0.00001', 'bias_init_method': 'normal', 'bias_init_sigma': '0.01', 'linear_init_method': 'normal', 'linear_init_sigma': '0.01', 'factors_init_method': 'normal',

[34m[2023-06-09 14:10:30.252] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 82, "duration": 218, "num_examples": 72, "num_bytes": 4517440}[0m
[34m[06/09/2023 14:10:30 INFO 140199547356992] #quality_metric: host=algo-1, epoch=40, train rmse <loss>=1.0096837523553426[0m
[34m[06/09/2023 14:10:30 INFO 140199547356992] #quality_metric: host=algo-1, epoch=40, train mse <loss>=1.0194612797703648[0m
[34m[06/09/2023 14:10:30 INFO 140199547356992] #quality_metric: host=algo-1, epoch=40, train absolute_loss <loss>=0.775761929875968[0m
[34m#metrics {"StartTime": 1686319830.0313244, "EndTime": 1686319830.2530115, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 221.03333473205566, "count": 1, "min": 221.03333473205566, "max": 221.03333473205566}}}[0m
[34m[06/09/2023 14:10:30 INFO 140199547356992] #progress_metric: host=algo-1, completed 45.05494505494506 % of epochs


2023-06-09 14:10:43 Uploading - Uploading generated training model[34m[2023-06-09 14:10:40.198] [tensorio] [info] epoch_stats={"data_pipeline": "/opt/ml/input/data/train", "epoch": 172, "duration": 211, "num_examples": 72, "num_bytes": 4517440}[0m
[34m[06/09/2023 14:10:40 INFO 140199547356992] #quality_metric: host=algo-1, epoch=85, train rmse <loss>=0.9530328320170279[0m
[34m[06/09/2023 14:10:40 INFO 140199547356992] #quality_metric: host=algo-1, epoch=85, train mse <loss>=0.9082715789023966[0m
[34m[06/09/2023 14:10:40 INFO 140199547356992] #quality_metric: host=algo-1, epoch=85, train absolute_loss <loss>=0.7215251679551593[0m
[34m#metrics {"StartTime": 1686319839.9848177, "EndTime": 1686319840.1993034, "Dimensions": {"Algorithm": "factorization-machines", "Host": "algo-1", "Operation": "training"}, "Metrics": {"update.time": {"sum": 213.8357162475586, "count": 1, "min": 213.8357162475586, "max": 213.8357162475586}}}[0m
[34m[06/09/2023 14:10:40 INFO 140199547356992] #prog


2023-06-09 14:10:54 Completed - Training job completed
Training seconds: 213
Billable seconds: 75
Managed Spot Training savings: 64.8%


## Deploy Model

In [None]:
# Ref: http://sagemaker.readthedocs.io/en/latest/estimators.html
# Deploy the trained model to an endpoint named after the training job
predictor = estimator.deploy(
    endpoint_name=job_name,
    instance_type='ml.m5.xlarge',
    initial_instance_count=1,
)

## Run Predictions
### Dense and Sparse Formats
https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-inference.html


Define a custom serializer for the data.

This step defines a function fm_sparse_serializer(data), which converts data into a specific JSON format expected by the Factorization Machines model in SageMaker. This function iterates over a list of arrays, where each array contains feature values. For each array, it creates a dictionary in the format {'keys': column_list, 'shape': [dim_movie], 'values': value_list} and appends this dictionary to a list of instances. The final result is a JSON string which represents a list of instances, where each instance contains a sparse representation of its features. This custom serializer is necessary because the Factorization Machines model expects input in this specific format.

Set the custom serializer and a JSONDeserializer for the model.

After the fm_sparse_serializer function is defined, it's set as the serializer for the predictor object, which was created by the estimator.deploy() call. By doing this, you're specifying that whenever you pass data to the predictor.predict() method, the data should be serialized (i.e., converted into a format that can be transmitted over the network) using this function.

The content type for the serializer is then set to 'application/json', which indicates to the SageMaker endpoint that the data being sent is in JSON format.

Finally, the deserializer for the predictor is set to JSONDeserializer(). This means that when the predictor receives responses from the SageMaker endpoint, it should interpret them as JSON and convert them into Python objects.

Test the deployed model with some data.

In this step, the code reads from the testing data file, parses out user ID and movie ID pairs, and uses the deployed model to predict ratings for those pairs. The fm_sparse_serializer is used to convert the feature arrays into the required JSON format, and the predictor.predict() method is used to make the predictions. The results are then printed out, comparing the actual rating with the predicted rating from the model.

Here's a closer look at this process in the provided script:

```python
with open(r'ml-latest-small/user_movie_test.svm','r') as f:
    for i in range(3):
        rating = f.readline().split()
        print(f"Movie {rating}")
        userID = rating[1].split(':')[0]
        movieID = rating[2].split(':')[0]
        predicted_rating = predictor.predict([np.array([int(userID),int(movieID)])])
        print(f'  Actual Rating:\t{rating[0]}')
        print(f"  Predicted Rating:\t{predicted_rating['predictions'][0]['score']}")
        print()
```

This part of the code opens the test file, reads the first three lines (which each represent a movie rating), and splits each line into individual components. It then uses the predictor to make a prediction for the user and movie specified in each line. After making the prediction, it prints out the actual rating from the test data and the predicted rating from the model.






In [None]:
import json

def fm_sparse_serializer(data, feature_dim=None):
    """Serialize rows of active feature indices into a sparse JSON request body.

    Each row lists the indices of the features that are switched on; every
    listed index is sent with a value of 1.

    Args:
        data: iterable of rows. Each row is a 1-D sequence of integer feature
            indices (NumPy array, list or tuple).
        feature_dim: length of the sparse feature vector, emitted as ``shape``.
            When omitted, the notebook-level ``dim_movie`` is used.

    Returns:
        str: JSON text shaped as
        ``{"instances": [{"data": {"features": {"keys": [...], "shape": [n], "values": [...]}}}]}``.
    """
    if feature_dim is None:
        # Fall back to the dimension loaded from the training dimension file
        feature_dim = dim_movie

    instances = []
    for row in data:
        # np.asarray lets callers pass plain lists/tuples as well as NumPy arrays
        column_list = np.asarray(row).tolist()
        value_list = [1] * len(column_list)

        instances.append({'data': {'features': {'keys': column_list,
                                                'shape': [feature_dim],
                                                'values': value_list}}})
    return json.dumps({'instances': instances})

In [None]:
# SDK 2
from sagemaker.deserializers import JSONDeserializer

In [None]:
# https://github.com/aws/amazon-sagemaker-examples/blob/master/introduction_to_amazon_algorithms/factorization_machines_mnist/factorization_machines_mnist.ipynb

# Specify custom serializer: replace the serialize function on the predictor's
# existing serializer object and label the request body as JSON.
# NOTE(review): this patches attributes on an SDK object in place - confirm it
# still behaves as intended with the installed SDK version.
predictor.serializer.serialize = fm_sparse_serializer
predictor.serializer.content_type = 'application/json'

# Responses from the endpoint are handled by a JSON deserializer
predictor.deserializer = JSONDeserializer()

In [None]:
import numpy as np

In [None]:
# Sanity check: show the JSON produced for one sample pair of feature indices
fm_sparse_serializer([np.array([341,1416])])

In [None]:
# Let's test with few entries from test file
# Movie dataset is updated regularly...so, instead of hard coding userid and movie id, let's
# use actual values

# Each row is in this format: ['2.5', '426:1', '943:1']
# ActualRating, UserID, MovieID

with open(r'ml-latest-small/user_movie_test.svm','r') as f:
    # Pairing the file with range(3) reads at most three lines and simply stops
    # early when the file is shorter, instead of failing on the empty string
    # that readline() returns at end of file.
    for i, line in zip(range(3), f):
        rating = line.split()
        if len(rating) < 3:
            # Blank or malformed row: there is no user / movie token to parse
            print(f"Skipping malformed row: {rating}")
            print()
            continue
        print(f"Movie {rating}")
        # Tokens look like 'index:value'; keep only the index part
        userID = rating[1].split(':')[0]
        movieID = rating[2].split(':')[0]
        predicted_rating = predictor.predict([np.array([int(userID),int(movieID)])])
        print(f'  Actual Rating:\t{rating[0]}')
        print(f"  Predicted Rating:\t{predicted_rating['predictions'][0]['score']}")
        print()

## Summary

1. Ensure Training, Test and Validation data are in S3 Bucket
2. Select Algorithm Container Registry Path - Path varies by region
3. Configure Estimator for training - Specify Algorithm container, instance count, instance type, model output location
4. Specify algorithm specific hyper parameters
5. Train model
6. Deploy model - Specify instance count, instance type and endpoint name
7. Run Predictions