In [1]:
%pip install --quiet -U dotenv
%cd Users/djenk.ivanov/


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/anaconda/envs/azureml_py310_sdkv2/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
/mnt/batch/tasks/shared/LS_root/mounts/clusters/movielens-compute/code/Users/djenk.ivanov


In [8]:
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, Model
from azure.identity import DefaultAzureCredential
import uuid
from azure.ai.ml.entities import Environment
from azure.ai.ml import MLClient
from dotenv import load_dotenv
import os


dependencies_dir = "./dependencies"
os.makedirs(dependencies_dir, exist_ok=True)

In [46]:


custom_env_name = "movielens-recsys-env"

load_dotenv()

credential = DefaultAzureCredential()

custom_job_env = Environment(
    name=custom_env_name,
    description="Environment for MovieLens recommendation system",
    conda_file=os.path.join(dependencies_dir, "conda.yaml"),
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
)

ml_client = MLClient(
    credential=credential,
    subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
    resource_group_name=os.getenv("AZURE_RESOURCE_GROUP"),
    workspace_name=os.getenv("AZURE_ML_WORKSPACE"),
)

custom_job_env = ml_client.environments.create_or_update(custom_job_env)

print(
    f"Environment with name {custom_job_env.name} is registered to workspace, the environment version is {custom_job_env.version}"
)

Environment with name movielens-recsys-env is registered to workspace, the environment version is 8


# Run job

In [70]:
from azure.ai.ml import command, Input

model_name = "movielens_recommender"
data = Input(type="uri_folder", path="azureml:movielens32m:1")

job = command(
    inputs=dict(
        data=data,
        model_name=model_name,
    ),
    code="./src/",
    command="python main.py --data ${{inputs.data}} --model_name ${{inputs.model_name}}",
    environment="movielens-recsys-env@latest",
    display_name="movielens_recommender_training_job",
)
ml_client.jobs.create_or_update(job)

[32mUploading src (0.01 MBs): 100%|██████████| 8845/8845 [00:00<00:00, 191186.63it/s]
[39m



Experiment,Name,Type,Status,Details Page
djenk,modest_soccer_f4njqrbfmm,command,Starting,Link to Azure Machine Learning studio


# Create endpoint/deploy

In [71]:
endpoint_name = f'movielens-endpoint-{str(uuid.uuid4())[:8]}'

endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    auth_mode="key",
)

model_name = "movielens_recommender"

latest_model_version = max(
    [int(m.version) for m in ml_client.models.list(name=model_name)]
)

ml_client.online_endpoints.begin_create_or_update(endpoint).result()
model = ml_client.models.get(name=model_name, version=latest_model_version)
deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name=endpoint_name,
    model=model,
    code_path="./src",
    environment="movielens-recsys-env@latest",
    scoring_script="score.py",
    instance_type="Standard_E2s_v3",
    instance_count=1,
)

ml_client.online_deployments.begin_create_or_update(deployment).result()

endpoint.traffic = {"blue": 100}
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

KeyboardInterrupt: 

In [55]:
latest_endpoint_name = "movielens-endpoint-503789da"

response = ml_client.online_endpoints.invoke(
    endpoint_name=latest_endpoint_name,
    deployment_name="blue",
    request_file="./payload.json"
)

print(response)

"{\"error\": \"('movieId', 'title')\"}"


In [66]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OrdinalEncoder, normalize
from sklearn.neighbors import NearestNeighbors
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from azure.identity import DefaultAzureCredential
import argparse
import mlflow
import mlflow.sklearn
from azure.ai.ml import MLClient
from dotenv import load_dotenv
import os
import joblib, scipy.sparse as sp
from pathlib import Path





def evaluate(k):
    users_evaluated = 0
    users_with_hits = 0
    sum_precision = 0.0
    sum_recall = 0.0
    total_hits = 0
    total_liked = 0
    
    users = (u for u in liked_by_users.keys() if u in sample_users)
    
    for u in users:
        liked_movies = liked_by_users[u]
        print(f'Users evaluated: {users_evaluated}/{len(sample_users)}')
        recs = get_recommendations_for_user(u, top_k=k)
        if not recs.any():
            continue
        
        users_evaluated += 1
        num_hits = len(set(recs) & liked_movies)
        
        if num_hits > 0:
            users_with_hits += 1
            
        sum_precision += num_hits / k
        sum_recall += num_hits / len(liked_movies)
        
        total_hits += num_hits
        total_liked += len(liked_movies)
    
    precision = sum_precision / users_evaluated
    recall = sum_recall / users_evaluated
    hitrate = users_with_hits / users_evaluated
    
    
    return {
        'precision': precision,
        'recall': recall,
        'hitrate': hitrate,
        'total_hits': total_hits,
        'total_liked': total_liked
    }
    
    

data_asset = ml_client.data.get(name="movielens32m", version="1")

ratings = pd.read_csv(data_asset.path + "/ratings.csv")
movies = pd.read_csv(data_asset.path + "/movies.csv")

hot_encode_genres = pd.Series(movies['genres']).str.get_dummies(sep='|')
encoded_movies = movies.merge(hot_encode_genres, left_index=True, right_index=True)
encoded_movies.drop(columns=['genres'], inplace=True)

ratings = ratings.drop(columns=['timestamp'])
merged = ratings.merge(encoded_movies, on='movieId', how='left')

df = merged.copy()

max_users = 100_000
sampled_users = np.random.choice(df['userId'].unique(), size=max_users, replace=False)
df = df[df['userId'].isin(sampled_users)].copy()
    
df['liked'] = (df['rating'] >= 3).astype('int8')

row_hash = pd.util.hash_pandas_object(df[['userId', 'movieId', 'liked']], index=False).astype('uint64')
rand = (row_hash % (2**32)) / (2**32)

df['split'] = np.where(rand < 0.5, 'train', 'test')

train = df[df['split'] == 'train'].copy()
test = df[df['split'] == 'test'].copy()

user_encoder = OrdinalEncoder()
movie_encoder = OrdinalEncoder()

train['userId_enc'] = user_encoder.fit_transform(train[['userId']]).astype(np.int64)
train['movieId_enc'] = movie_encoder.fit_transform(train[['movieId']]).astype(np.int64)

known_users = train['userId'].unique()
known_movies = train['movieId'].unique()

test = test[test['userId'].isin(known_users) & test['movieId'].isin(known_movies)].copy()

test['userId_enc'] = user_encoder.transform(test[['userId']]).astype(np.int64)
test['movieId_enc'] = movie_encoder.transform(test[['movieId']]).astype(np.int64)

train_liked = train[train['liked'] == 1]
total_users = train['userId_enc'].max() + 1 
total_movies = train['movieId_enc'].max() + 1

X_train = csr_matrix(   
    (np.ones(train_liked.shape[0], dtype=np.float32), (train_liked['userId_enc'].to_numpy(), train_liked['movieId_enc'].to_numpy())),
    shape=(total_users, total_movies),
    dtype=np.float32
)

K = 100
X_movies = X_train.T.tocsr().astype(np.float32)
X_movies = normalize(X_movies, axis=1, copy=False)

nn = NearestNeighbors(n_neighbors=K+1, metric='cosine', algorithm='brute', n_jobs=-1)
nn.fit(X_movies)

dists, indices = nn.kneighbors(X_movies, return_distance=True, n_neighbors=K+1)
sims = 1.0 - dists

indices = indices[:, 1:]
sims = sims[:, 1:].astype(np.float32)

seen_by_user = [set() for _ in range(total_users)]
for u, m in zip(train['userId_enc'].to_numpy(), train['movieId_enc'].to_numpy()):
    seen_by_user[u].add(m)
        
liked_by_users = {}

for u, m, l in zip(test['userId_enc'].to_numpy(), test['movieId_enc'].to_numpy(), test['liked'].to_numpy()):
    if l == 1:
        liked_by_users.setdefault(u, set()).add(m)
        
all_eval_users = np.array(list(liked_by_users.keys()), dtype=np.int64)
h = pd.util.hash_array(all_eval_users)
sample_mask = (h % 1000) == 0
sample_users = set(all_eval_users[sample_mask])




    
    

In [69]:
def get_recommendations_for_user(user_id_enc, top_k=10):
    liked = [i for i in liked_by_users[user_id_enc]]

    scores = np.zeros(len(indices), dtype=np.float32)
    neighbors = indices[liked].ravel()
    weights = sims[liked].ravel()
    print(neighbors.shape, weights.shape)
    np.add.at(scores, neighbors, weights)
    
    scores[list(seen_by_user[user_id_enc])] = -np.inf
    
    top = np.argpartition(scores, -top_k)[-top_k:]
    recommended_indices = top[np.argsort(-scores[top])]
    return movies.iloc[recommended_indices][['movieId', 'title']].to_dict(orient='records')

recs = get_recommendations_for_user(user_id_enc=100, top_k=10)
print(recs)

(10700,) (10700,)
[{'movieId': 5431, 'title': "Goin' South (1978)"}, {'movieId': 4980, 'title': "Bill & Ted's Bogus Journey (1991)"}, {'movieId': 4950, 'title': 'Lone Wolf McQuade (1983)'}, {'movieId': 1269, 'title': 'Arsenic and Old Lace (1944)'}, {'movieId': 33415, 'title': 'Solino (2002)'}, {'movieId': 2956, 'title': 'Someone to Watch Over Me (1987)'}, {'movieId': 5404, 'title': '84 Charing Cross Road (1987)'}, {'movieId': 1238, 'title': 'Local Hero (1983)'}, {'movieId': 6352, 'title': 'Lambada (1990)'}, {'movieId': 1194, 'title': "Cheech and Chong's Up in Smoke (1978)"}]
