Get data

In [2]:
import pandas as pd
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
import mlflow
import numpy as np
from azureml.core import Workspace

# Load the ratings first so the rating scale can be taken from the data itself.
ratings = pd.read_csv('<location_of_ratings.csv>')

# Surprise's Reader defaults to rating_scale=(1, 5). The ratings in this file include
# half-star values (e.g. 2.5), so derive the bounds from the data instead of relying on
# the default; predictions are clipped to this scale.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))

# Last expression of the cell: preview the first rows.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


Connect to Azure ML and enable MLflow logging

In [3]:
# Workspace coordinates; the subscription and resource group are placeholders to fill in.
workspace_settings = dict(
    subscription_id="<your_subscription_key>",
    resource_group="<your_resource_group_name>",
    workspace_name="ml_workspace",
)
ws = Workspace(**workspace_settings)

experiment_name = "movies"

# Send MLflow tracking to the workspace, select the experiment, and turn on autologging.
tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)
mlflow.autolog()

2021/10/20 11:22:06 INFO mlflow.pyspark.ml: No SparkSession detected. Autologging will log pyspark.ml models contained in the default allowlist. To specify a custom allowlist, initialize a SparkSession prior to calling mlflow.pyspark.ml.autolog() and specify the path to your allowlist file via the spark.mlflow.pysparkml.autolog.logModelAllowlistFile conf.


Register the dataset

In [4]:
# Alias the Azure ML Dataset so it does not shadow the surprise.Dataset imported at the
# top of the notebook.
from azureml.core import Dataset as AzureDataset

# Write the frame with pandas so the CSV keeps its header row and plain numeric formatting.
# (np.savetxt drops the column names and writes every value in scientific notation, which
# leaves the tabular dataset without usable column names.)
ratings.to_csv('ratings.csv', index=False)

# Upload the CSV to the workspace's default datastore.
datastore = ws.get_default_datastore()
datastore.upload_files(files=["ratings.csv"],
                       target_path='movie_rating_data/',
                       overwrite=True)

# Build a tabular dataset on top of the uploaded file ...
input_dataset = AzureDataset.Tabular.from_delimited_files(
    path=[(datastore, 'movie_rating_data/ratings.csv')]
)

# ... and register it in the workspace so it can be retrieved by name later.
# create_new_version=True keeps the cell re-runnable when the name already exists.
input_dataset = input_dataset.register(workspace=ws,
                                       name="movie_ratings",
                                       create_new_version=True)

Uploading an estimated of 1 files
Uploading ratings.csv
Uploaded ratings.csv, 1 files out of an estimated total of 1
Uploaded 1 files


Prepare the data, configure the model, and cross-validate it

In [5]:
from surprise import Dataset
from surprise.model_selection import KFold

# Fixed seed so the SVD initialisation and the fold shuffling are reproducible on a
# fresh Restart & Run All.
SEED = 42

# Surprise only needs the (user, item, rating) triplets, in that column order.
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD(random_state=SEED)

# 5-fold cross-validation; the returned dict of per-fold metrics is the cell's output.
cross_validate(
    svd,
    data,
    measures=['RMSE', 'MAE'],
    cv=KFold(n_splits=5, random_state=SEED, shuffle=True),
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8949  0.8983  0.8943  0.9057  0.8896  0.8966  0.0054  
MAE (testset)     0.6912  0.6908  0.6864  0.6942  0.6888  0.6903  0.0026  
Fit time          5.28    5.36    5.29    5.31    5.30    5.31    0.03    
Test time         0.22    0.20    0.21    0.19    0.33    0.23    0.05    


{'test_rmse': array([0.89492927, 0.89833978, 0.89430233, 0.90573773, 0.88956423]),
 'test_mae': array([0.69116724, 0.69079591, 0.68637034, 0.69424811, 0.68883862]),
 'fit_time': (5.279882192611694,
  5.355646848678589,
  5.291794061660767,
  5.314360857009888,
  5.301142692565918),
 'test_time': (0.2207350730895996,
  0.2048029899597168,
  0.21285653114318848,
  0.193650484085083,
  0.3318459987640381)}

Train the model

In [6]:
# Build a single training set from all of `data` (no folds) for the final fit.
trainset = data.build_full_trainset()

# Fit the SVD model inside an MLflow run so the training is attached to the experiment
# selected earlier with mlflow.set_experiment.
# NOTE(review): mlflow.autolog() may not capture anything from a Surprise model; confirm
# whether parameters/metrics need to be logged explicitly inside this run.
with mlflow.start_run() as run:
    svd.fit(trainset)

#ratings[ratings['userId'] == 1]

Set up the environment and inference config for model deployment

In [11]:
# Build and register the Python environment used by the deployed scoring service.
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice

# Package requirements: numpy 1.17.0 from conda, surprise from pip.
conda_dep = CondaDependencies()
conda_dep.add_conda_package("numpy==1.17.0")
conda_dep.add_pip_package("surprise")

# Attach the dependencies to the Python section of a named environment.
movies_env = Environment(name="movies_env")
movies_env.python.conda_dependencies = conda_dep

# Register the environment in the workspace; as the last expression of the cell,
# the call's return value is displayed.
movies_env.register(workspace=ws)


{
    "databricks": {
        "eggLibraries": [],
        "jarLibraries": [],
        "mavenLibraries": [],
        "pypiLibraries": [],
        "rcranLibraries": []
    },
    "docker": {
        "arguments": [],
        "baseDockerfile": null,
        "baseImage": "mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210806.v1",
        "baseImageRegistry": {
            "address": null,
            "password": null,
            "registryIdentity": null,
            "username": null
        },
        "enabled": false,
        "platform": {
            "architecture": "amd64",
            "os": "Linux"
        },
        "sharedVolumes": true,
        "shmSize": null
    },
    "environmentVariables": {
        "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
    },
    "inferencingStackVersion": null,
    "name": "movies_env",
    "python": {
        "baseCondaEnvironment": null,
        "condaDependencies": {
            "channels": [
                "anaconda",
                "conda-forge"
   

In [35]:
from azureml.core import Environment
from azureml.core.model import InferenceConfig

# Tie the scoring code to the environment built above: the service runs `score.py`
# from `./source_dir` inside `movies_env`.
# NOTE(review): entry_script is presumably resolved relative to source_directory, and
# score.py is not part of this notebook; confirm it exists at ./source_dir/score.py.
inference_config = InferenceConfig(
    environment=movies_env,
    source_directory="./source_dir",
    entry_script="./score.py",
)

Dump and register the model

In [36]:
import joblib
from azureml.core.model import Model

# Serialize the fitted SVD model to a local file, then register that file in the
# workspace under the name "movie_model".
model_file = "movie.pkl"
joblib.dump(svd, filename=model_file)
model = Model.register(workspace=ws, model_path=model_file, model_name="movie_model")


Registering model movie_model


In [37]:
# Local sanity check before deployment: predicted rating for user 1 on movie 1061.
svd.predict(1, 1061)

Prediction(uid=1, iid=1061, r_ui=None, est=2.618327449743631, details={'was_impossible': False})

Container configuration

In [38]:
# create deployment config i.e. compute resources
aciconfig = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    tags={"data": "ratings", "method": "svd"},
    description="Predict movie ratings with svd",
)

In [40]:
# Deploy the registered model as the web service "movieservice2", replacing any
# existing service with the same name.
service = Model.deploy(
    workspace=ws,
    name="movieservice2",
    models=[model],
    inference_config=inference_config,
    deployment_config=aciconfig,
    overwrite=True,
)

# Block until the deployment finishes, streaming its progress into the cell output.
service.wait_for_deployment(show_output=True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-10-20 13:39:53+00:00 Creating Container Registry if not exists.
2021-10-20 13:39:53+00:00 Registering the environment.
2021-10-20 13:39:55+00:00 Use the existing image.
2021-10-20 13:39:55+00:00 Generating deployment configuration.
2021-10-20 13:39:55+00:00 Submitting deployment to compute..
2021-10-20 13:39:59+00:00 Checking the status of deployment movieservice2..
2021-10-20 13:46:12+00:00 Checking the status of inference endpoint movieservice2.
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [41]:
# Print the deployed service's logs to check that the scoring server started.
print(service.get_logs())

2021-10-20T13:45:49,286896900+00:00 - gunicorn/run 
Dynamic Python package installation is disabled.
Starting HTTP server
2021-10-20T13:45:49,285769100+00:00 - iot-server/run 
2021-10-20T13:45:49,306012100+00:00 - rsyslog/run 
2021-10-20T13:45:49,327915000+00:00 - nginx/run 
EdgeHubConnectionString and IOTEDGE_IOTHUBHOSTNAME are not set. Exiting...
2021-10-20T13:45:49,625280600+00:00 - iot-server/finish 1 0
2021-10-20T13:45:49,627234000+00:00 - Exit code 1 is normal. Not restarting iot-server.
Starting gunicorn 20.1.0
Listening at: http://127.0.0.1:31311 (73)
Using worker: sync
worker timeout is set to 300
Booting worker with pid: 98
SPARK_HOME not set. Skipping PySpark Initialization.
Initializing logger
2021-10-20 13:45:50,877 | root | INFO | Starting up app insights client
logging socket was found. logging is available.
logging socket was found. logging is available.
2021-10-20 13:45:50,878 | root | INFO | Starting up request id generator
2021-10-20 13:45:50,878 | root | INFO | Star

In [45]:
# Display the deployed web service object (name, state, scoring URI, tags).
service

AciWebservice(workspace=Workspace.create(name='ml_workspace', subscription_id='<your_subscription_key>', resource_group='<your_resource_group_name>'), name=movieservice2, image_id=None, compute_type=None, state=ACI, scoring_uri=Healthy, tags=http://61d5b46a-1a8b-4693-9698-6dd8856e398d.westeurope.azurecontainer.io/score, properties={'data': 'ratings', 'method': 'svd'}, created_by={'hasInferenceSchema': 'False', 'hasHttps': 'False'})

In [47]:
import json

# Request body for the scoring endpoint: the user and the movie to predict a rating for.
request_fields = {"user_id": "1", "movie_id": "1029"}
input_payload = json.dumps(request_fields)

# Call the deployed service and show its response.
output = service.run(input_payload)
print(output)

[1, 1029, None, 2.9065018007190515, {'was_impossible': False}]


In [48]:
# Show the exact JSON string that was sent to the service.
input_payload

'{"user_id": "1", "movie_id": "1029"}'