In [19]:
# Standard library imports
import pickle
import os

# Third-party imports
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

## Load validation dataset

In [20]:
columns_name=['index','user_id','item_id','rating','timestamp']
val = pd.read_csv("../data/interim/val-1-core.csv", index_col=0, sep=",", names=columns_name, header=0)
val.head()

Unnamed: 0_level_0,user_id,item_id,rating,timestamp
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
27134,186,7,5,879465273
30669,534,191,5,879617663
42114,768,734,5,875971655
22438,639,236,5,874778274
2186,304,13,4,886322893


## Load models

In [21]:
model_path = '../models/'

In [22]:
with open(os.path.join(model_path, f'KNN/knnpickle_file.pickle'), 'rb') as knnPickle:
    knn_model = pickle.load(knnPickle) 

In [23]:
with open ('../data/interim/user2movie-20-core.pickle', 'rb') as file:
    user_to_movie_sparse_df = pickle.load(file)

In [24]:
movies_list = pd.read_csv("../data/interim/item-library.csv", index_col=0, sep=",", header=0)['movie title']
movies_list

0                                 Toy Story (1995)
1                                 GoldenEye (1995)
2                                Four Rooms (1995)
3                                Get Shorty (1995)
4                                   Copycat (1995)
                           ...                    
1225               Night Falls on Manhattan (1997)
1227          Under Siege 2: Dark Territory (1995)
1239    Ghost in the Shell (Kokaku kidotai) (1995)
1243                                  Metro (1997)
1266                               Clockers (1995)
Name: movie title, Length: 811, dtype: object

In [25]:
## function to find top n similar users of the given input user 
def get_similar_users(user, knn_size = 5, verbose = False):
  ## input to this function is the user and number of top similar users you want.

  knn_input = np.asarray([user_to_movie_sparse_df[user-1].toarray()]).reshape(1,-1)
  # knn_input = user_to_movie_df.iloc[0,:].values.reshape(1,-1)
  distances, indices = knn_model.kneighbors(knn_input, n_neighbors=knn_size+1)
  if verbose:
    print("Top",knn_size,"users who are very much similar to the User-",user, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
  return indices.flatten()[1:] + 1, distances.flatten()[1:]

In [26]:
from pprint import pprint
def recommend_movies(user_id, n, knn_size=5, verbose = False):
    similar_user_list, distance_list = get_similar_users(user_id, knn_size, verbose=verbose)
    weightage_list = distance_list/np.sum(distance_list)
    mov_rtngs_sim_users = user_to_movie_sparse_df[similar_user_list].toarray()
    weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
    new_rating_matrix = weightage_list*mov_rtngs_sim_users
    mean_rating_list = new_rating_matrix.sum(axis = 0)

    n = min(len(mean_rating_list),n)

    if verbose:
        print(" Few of movies seen by the User:")
        pprint(list(val[val['user_id'] == user_id]['item_id'])[:10])
        print(f'{similar_user_list=}', f'{distance_list=}')
        print(f'{weightage_list=}')
        print(f'{mov_rtngs_sim_users=}')
        print("Weightage list shape:", len(weightage_list))
        print("mov_rtngs_sim_users shape:", mov_rtngs_sim_users.shape)
        print("Number of movies:", len(movies_list))
        print(f'{weightage_list.shape=}')
        
    return list(np.argsort(mean_rating_list)[::-1][:n])

In [34]:
result = recommend_movies(val['user_id'].iloc[1], n=5)
print(result)

[90, 273, 160, 8, 104]


I don't get how to make recall/precision calculation for KNN