In [1]:
%pip install --quiet -U azureml-fsspec dotenv
%cd Users/djenk.ivanov/


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/anaconda/envs/azureml_py310_sdkv2/bin/python -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
/mnt/batch/tasks/shared/LS_root/mounts/clusters/movielens-compute/code/Users/djenk.ivanov


In [2]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes
from dotenv import load_dotenv
import os

load_dotenv()

# authenticate
credential = DefaultAzureCredential()

# Get a handle to the workspace
ml_client = MLClient(
    credential=credential,
    subscription_id=os.getenv("AZURE_SUBSCRIPTION_ID"),
    resource_group_name=os.getenv("AZURE_RESOURCE_GROUP"),
    workspace_name=os.getenv("AZURE_ML_WORKSPACE"),
)

In [3]:
import pandas as pd

data_asset = ml_client.data.get(name="movielens32m", version="1")
print(f'Data asset path: {data_asset.path}')

ratings = pd.read_csv(data_asset.path + "/ratings.csv")
movies = pd.read_csv(data_asset.path + "/movies.csv")

Data asset path: azureml://subscriptions/ba117730-305b-41b8-8447-bf6834f0a56f/resourcegroups/djenk-rg1/workspaces/djenk-ml/datastores/movielens32m_ds/paths/ml-32m/


  mlflow.mismatch._check_version_mismatch()


In [4]:
ratings.head() 

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
hot_encode_genres = pd.Series(movies['genres']).str.get_dummies(sep='|')
encoded_movies = movies.merge(hot_encode_genres, left_index=True, right_index=True)
encoded_movies.drop(columns=['genres'], inplace=True)
encoded_movies.head()

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
ratings = ratings.drop(columns=['timestamp'])
merged = ratings.merge(encoded_movies, on='movieId', how='left')

In [8]:
print(ratings.shape, movies.shape, merged.shape)
merged.head()

(32000204, 3) (87585, 3) (32000204, 24)


Unnamed: 0,userId,movieId,rating,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,17,4.0,Sense and Sensibility (1995),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,25,1.0,Leaving Las Vegas (1995),0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,29,2.0,"City of Lost Children, The (Cité des enfants p...",0,0,1,0,0,0,...,0,0,0,0,1,0,1,0,0,0
3,1,30,5.0,Shanghai Triad (Yao a yao yao dao waipo qiao) ...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,32,5.0,Twelve Monkeys (a.k.a. 12 Monkeys) (1995),0,0,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0


In [9]:
import numpy as np

df = merged.copy()

df['liked'] = (df['rating'] >= 3).astype('int8')

row_hash = pd.util.hash_pandas_object(df[['userId', 'movieId', 'liked']], index=False).astype('uint64')
rand = (row_hash % (2**32)) / (2**32)

df['split'] = np.where(rand < 0.5, 'train', 'test')

In [10]:
train = df[df['split'] == 'train']
test = df[df['split'] == 'test']

In [11]:
train.head()

Unnamed: 0,userId,movieId,rating,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,liked,split
2,1,29,2.0,"City of Lost Children, The (Cité des enfants p...",0,0,1,0,0,0,...,0,0,1,0,1,0,0,0,0,train
5,1,34,2.0,Babe (1995),0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,train
6,1,36,1.0,Dead Man Walking (1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,train
10,1,161,1.0,Crimson Tide (1995),0,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,train
11,1,166,5.0,"Doom Generation, The (1995)",0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,train


# Movie based movie recommendations

In [13]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OrdinalEncoder
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

user_encoder = OrdinalEncoder()
movie_encoder = OrdinalEncoder()

df['userId_enc'] = user_encoder.fit_transform(df[['userId']]).astype(np.int64)
df['movieId_enc'] = movie_encoder.fit_transform(df[['movieId']]).astype(np.int64)

X = csr_matrix((df['rating'].to_numpy(), (df['userId_enc'].to_numpy(), df['movieId_enc'].to_numpy())))
V = cosine_similarity(X.T, X.T, dense_output=False)

In [14]:
movies_by_id = movies.set_index("movieId")

def get_recommendations_from_movie(title: str, top_k: int = 10):
    match = movies.loc[movies["title"] == title, "movieId"].values
    movie_id = int(match[0])

    movie_id_enc = int(movie_encoder.transform(pd.DataFrame([[movie_id]], columns=["movieId"]))[0, 0])

    row = V[movie_id_enc]
    sims = row.toarray().ravel()

    sims[movie_id_enc] = -np.inf

    idx = np.argpartition(sims, -top_k)[-top_k:]
    idx = idx[np.argsort(-sims[idx])]

    rec_movie_ids = (movie_encoder.inverse_transform(idx.reshape(-1, 1)).ravel().astype(int))
    
    return movies_by_id.loc[rec_movie_ids]

In [15]:
recommended = get_recommendations_from_movie('Unfriended: Dark Web (2018)', top_k=10)
recommended

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
130636,Unfriended (2014),Horror|Mystery|Thriller
197343,Escape Room (2019),Action|Horror|Thriller
207890,Countdown (2019),Horror|Thriller
185989,Truth or Dare (2018),Horror|Thriller
194722,Look Away (2018),Horror|Thriller
197663,Happy Death Day 2U (2019),Horror|Mystery|Thriller
175649,Wish Upon (2017),Fantasy|Horror|Thriller
221396,Host (2020),Horror
168288,The Belko Experiment (2017),Action|Horror|Thriller
206272,Haunt (2019),Horror|Thriller


# User specific movie recommendations

In [12]:
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OrdinalEncoder
import numpy as np

train = df[df['split'] == 'train'].copy()
test = df[df['split'] == 'test'].copy()

user_encoder = OrdinalEncoder()
movie_encoder = OrdinalEncoder()

train['userId_enc'] = user_encoder.fit_transform(train[['userId']]).astype(np.int64)
train['movieId_enc'] = movie_encoder.fit_transform(train[['movieId']]).astype(np.int64)

In [13]:
known_users = train['userId'].unique()
known_movies = train['movieId'].unique()

test = test[test['userId'].isin(known_users) & test['movieId'].isin(known_movies)].copy()

test['userId_enc'] = user_encoder.transform(test[['userId']]).astype(np.int64)
test['movieId_enc'] = movie_encoder.transform(test[['movieId']]).astype(np.int64)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

train_liked = train[train['liked'] == 1]
total_users = train['userId_enc'].max() + 1 
total_movies = train['movieId_enc'].max() + 1

X_train = csr_matrix(   
    (np.ones(train_liked.shape[0]), (train_liked['userId_enc'].to_numpy(), train_liked['movieId_enc'].to_numpy())),
    shape=(total_users, total_movies)
)

V_train = cosine_similarity(X_train.T, X_train.T, dense_output=False)

In [15]:
seen_by_user = [set() for _ in range(total_users)]
for u, m in zip(train['userId_enc'].to_numpy(), train['movieId_enc'].to_numpy()):
    seen_by_user[u].add(m)

In [16]:
print(seen_by_user[1])

{534, 151, 279, 30, 545, 292, 38, 301, 47, 183, 314, 191, 577, 452, 580, 586, 461, 345, 219, 222, 351, 352, 503, 357, 359, 375, 767}


In [28]:
def get_recommendations_for_user(user_id_enc, top_k=10):
    liked = [i for i in seen_by_user[user_id_enc] if X_train[user_id_enc, i] > 0]
    scores = V_train[:, liked].sum(axis=1).A.ravel()
    if seen_by_user[user_id_enc]:
        scores[list(seen_by_user[user_id_enc])] = -np.inf
    top_indices = np.argpartition(scores, -top_k)[-top_k:]
    return top_indices[np.argsort(-scores[top_indices])]

movies_by_id = movies.set_index("movieId")
rec_movie_idxs = get_recommendations_for_user(user_id_enc=2014, top_k=10)
# print(rec_movie_idxs)
movies.iloc[rec_movie_idxs]


Unnamed: 0,movieId,title,genres
6253,6370,"Spanish Apartment, The (L'auberge espagnole) (...",Comedy|Drama|Romance
8238,8951,Vera Drake (2004),Drama
4777,4882,Ouch (Aïe) (2000),Comedy
14415,75341,Remember Me (2010),Drama|Romance
12396,59854,"Wings of Eagles, The (1957)",Drama|War
4787,4892,Maze (2000),Romance
13315,68574,"Django the Bastard (Strangers Gundown, The) (D...",Action|Thriller|Western
4199,4303,Sordid Lives (2000),Comedy
7021,7145,Prisoner of Paradise (2002),Documentary
6411,6533,"What's Up, Doc? (1972)",Comedy


In [41]:
liked_by_user = {}

for u, m, l in zip(test['userId_enc'].to_numpy(), test['movieId_enc'].to_numpy(), test['liked'].to_numpy()):
    if l == 1:
        liked_by_user.setdefault(u, set()).add(m)
        
all_eval_users = np.array(list(liked_by_user.keys()), dtype=np.int64)
h = pd.util.hash_array(all_eval_users)
sample_mask = (h % 1000) == 0
sample_users = set(all_eval_users[sample_mask])

def evaluate(k = 10):
    users_evaluated = 0
    users_with_hits = 0
    sum_precision = 0.0
    sum_recall = 0.0
    total_hits = 0
    total_liked = 0
    
    users = (u for u in liked_by_user.keys() if u in sample_users)
    
    for u in users:
        liked_movies = liked_by_user[u]
        print(f'Users evaluated: {users_evaluated}/{len(sample_users)}')
        recs = get_recommendations_for_user(u, top_k=k)
        if not recs.any():
            continue
        
        users_evaluated += 1
        num_hits = len(set(recs) & liked_movies)
        
        if num_hits > 0:
            users_with_hits += 1
            
        sum_precision += num_hits / k
        sum_recall += num_hits / len(liked_movies)
        
        total_hits += num_hits
        total_liked += len(liked_movies)
    
    precision = sum_precision / users_evaluated
    recall = sum_recall / users_evaluated
    hitrate = users_with_hits / users_evaluated
    
    
    return {
        'precision': precision,
        'recall': recall,
        'hitrate': hitrate,
        'total_hits': total_hits,
        'total_liked': total_liked
    }
    
eval_results = evaluate(k=10)
eval_results



Users evaluated: 0/191
Users evaluated: 1/191
Users evaluated: 2/191
Users evaluated: 3/191
Users evaluated: 4/191
Users evaluated: 5/191
Users evaluated: 6/191
Users evaluated: 7/191
Users evaluated: 8/191
Users evaluated: 9/191
Users evaluated: 10/191
Users evaluated: 11/191
Users evaluated: 12/191
Users evaluated: 13/191
Users evaluated: 14/191
Users evaluated: 15/191
Users evaluated: 16/191
Users evaluated: 17/191
Users evaluated: 18/191
Users evaluated: 19/191
Users evaluated: 20/191
Users evaluated: 21/191
Users evaluated: 22/191
Users evaluated: 23/191
Users evaluated: 24/191
Users evaluated: 25/191
Users evaluated: 26/191
Users evaluated: 27/191
Users evaluated: 28/191
Users evaluated: 29/191
Users evaluated: 30/191
Users evaluated: 31/191
Users evaluated: 32/191
Users evaluated: 33/191
Users evaluated: 34/191
Users evaluated: 35/191
Users evaluated: 36/191
Users evaluated: 37/191
Users evaluated: 38/191
Users evaluated: 39/191
Users evaluated: 40/191
Users evaluated: 41/191
Us

{'precision': 0.4565445026178012,
 'recall': 0.12653197958726026,
 'hitrate': 0.9109947643979057,
 'total_hits': 872,
 'total_liked': 12578}