https://github.com/rposhala/Recommender-System-on-MovieLens-dataset/blob/main/Item_based_Collaborative_Recommender_System_using_KNN.ipynb

In [48]:
# Standard library imports
import pickle
import os

# Third-party imports
import numpy as np
import pandas as pd
pd.set_option('display.max_colwidth', None)

from tqdm.notebook import tqdm
from sklearn.neighbors import NearestNeighbors

In [49]:
model_path = '../models/'

In [50]:
columns_name=['index','user_id','item_id','rating','timestamp']
refined_dataset = pd.read_csv("../data/interim/train-20-core.csv", index_col=0, sep=",", names=columns_name, header=0)
with open ('../data/interim/user2movie-20-core.pickle', 'rb') as file:
    user_to_movie_sparse_df = pickle.load(file)

In [51]:
user_to_movie_sparse_df[2].toarray()

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        4., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 

In [52]:
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_to_movie_sparse_df)

In [53]:
with open(os.path.join(model_path, f'KNN/knnpickle_file.pickle'), 'wb') as knnPickle:
    pickle.dump(knn_model, knnPickle)

In [54]:
## function to find top n similar users of the given input user 
def get_similar_users(user, knn_size = 5, verbose = False):
  ## input to this function is the user and number of top similar users you want.

  knn_input = np.asarray([user_to_movie_sparse_df[user-1].toarray()]).reshape(1,-1)
  # knn_input = user_to_movie_df.iloc[0,:].values.reshape(1,-1)
  distances, indices = knn_model.kneighbors(knn_input, n_neighbors=knn_size+1)
  if verbose:
    print("Top",knn_size,"users who are very much similar to the User-",user, "are: ")
    print(" ")
    for i in range(1,len(distances[0])):
      print(i,". User:", indices[0][i]+1, "separated by distance of",distances[0][i])
  return indices.flatten()[1:] + 1, distances.flatten()[1:]

In [55]:
item_df = pd.read_csv("../data/interim/item-library.csv", index_col=0, sep=",", header=0)
item_df.head()

Unnamed: 0,index,movie title,release date,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,Toy Story (1995),01-Jan-1995,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1,GoldenEye (1995),01-Jan-1995,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,2,Four Rooms (1995),01-Jan-1995,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,3,Get Shorty (1995),01-Jan-1995,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,Copycat (1995),01-Jan-1995,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,0


In [56]:
movies_list = item_df['movie title']
movies_list

0                                 Toy Story (1995)
1                                 GoldenEye (1995)
2                                Four Rooms (1995)
3                                Get Shorty (1995)
4                                   Copycat (1995)
                           ...                    
1225               Night Falls on Manhattan (1997)
1227          Under Siege 2: Dark Territory (1995)
1239    Ghost in the Shell (Kokaku kidotai) (1995)
1243                                  Metro (1997)
1266                               Clockers (1995)
Name: movie title, Length: 811, dtype: object

In [60]:
from pprint import pprint
def recommend_movies(user_id, n, knn_size=5, verbose = False):
    similar_user_list, distance_list = get_similar_users(user_id, knn_size, verbose=verbose)
    weightage_list = distance_list/np.sum(distance_list)
    mov_rtngs_sim_users = user_to_movie_sparse_df[similar_user_list].toarray()
    weightage_list = weightage_list[:,np.newaxis] + np.zeros(len(movies_list))
    new_rating_matrix = weightage_list*mov_rtngs_sim_users
    mean_rating_list = new_rating_matrix.sum(axis = 0)

    n = min(len(mean_rating_list),n)

    if verbose:
        print(" Few of movies seen by the User:")
        pprint(list(refined_dataset[refined_dataset['user_id'] == user_id]['item_id'])[:10])
        print(f'{similar_user_list=}', f'{distance_list=}')
        print(f'{weightage_list=}')
        print(f'{mov_rtngs_sim_users=}')
        print("Weightage list shape:", len(weightage_list))
        print("mov_rtngs_sim_users shape:", mov_rtngs_sim_users.shape)
        print("Number of movies:", len(movies_list))
        print(f'{weightage_list.shape=}')
        
    return list(np.argsort(mean_rating_list)[::-1][:n])

In [64]:
print("Movies recommended based on similar users are: ")
pprint(movies_list[recommend_movies(510, 5, knn_size=5, verbose=False)])

Movies recommended based on similar users are: 
153    Monty Python's Life of Brian (1979)
160                         Top Gun (1986)
20           Muppet Treasure Island (1996)
285            English Patient, The (1996)
88                     Blade Runner (1982)
Name: movie title, dtype: object
