In [3]:
# load the pandas library
import pandas as pd 
# reading the data from my GitHub repository into a Pandas dataframe and saving it in 'reviews'
reviews = pd.read_csv('https://raw.githubusercontent.com/getfitwithapurv/recommendersys/main/review.csv')
df = pd.DataFrame(reviews) #adding the data to dataframe for further use 
df = df.set_index('Name') # setting the name of reviewers as index
df #sanity check

Unnamed: 0_level_0,Shubham,Varad,Mihir,Saurabh,Pooja
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thor Ragnarok,4.0,3.0,5.0,,4.0
The Batman (2022),,4.0,3.0,,
Spiderman: No way home,3.5,4.0,,4.0,3.5
Shershah,3.5,,4.0,,4.0
Gully Boy,,3.5,,3.5,
Andhadhun,4.0,,,3.0,


In [7]:
df = df.fillna(0) #filling the nan values in dataset with 0 for further process
df

Unnamed: 0_level_0,Shubham,Varad,Mihir,Saurabh,Pooja
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Thor Ragnarok,4.0,3.0,5.0,0.0,4.0
The Batman (2022),0.0,4.0,3.0,0.0,0.0
Spiderman: No way home,3.5,4.0,0.0,4.0,3.5
Shershah,3.5,0.0,4.0,0.0,4.0
Gully Boy,0.0,3.5,0.0,3.5,0.0
Andhadhun,4.0,0.0,0.0,3.0,0.0


### Using scikit-learn 

In [4]:
from sklearn.neighbors import NearestNeighbors
# store the original dataset in 'df', and create the copy of df, df1 = df.copy().
df1 = df.copy()
def movie_recommender(user, number_neighbors, num_recommendation):
  
  #finding ditances and indices using cosine similarity
  knn = NearestNeighbors(metric='cosine', algorithm='brute')
  knn.fit(df.values)
  distances, indices = knn.kneighbors(df.values, n_neighbors=number_neighbors) 
   # indices shows the indices of the nearest neighbors for each movie. 
   # distances shows the distance between movies. A smaller number means the movie is closer.
  
  user_index = df.columns.tolist().index(user) #converting dataframe to a list of movies for the specified user i.e. one who gave rating 
  
  # t: movie_title, m: the row number of t in dataframe
  for m,t in list(enumerate(df.index)): # find movies without ratings by user
    if df.iloc[m, user_index] == 0:
      sim_movies = indices[m].tolist()  # make list for similar movies and putting it in sim_movies
      movie_distances = distances[m].tolist() # the list for distances of similar movies    
      if m in sim_movies:
        id_movie = sim_movies.index(m)  # get the position of the movie itself in indices and distances
        sim_movies.remove(m)  # remove the movie itself in indices
        movie_distances.pop(id_movie) # remove the movie itself in distances
      else:
        sim_movies = sim_movies[:n_neighbors-1]
        movie_distances = movie_distances[:n_neighbors-1]           
      movie_similarity = [1-x for x in movie_distances]
      movie_similarity_copy = movie_similarity.copy()
      nominator = 0
      
      # for each similar movie
      for s in range(0, len(movie_similarity)):
        if df.iloc[sim_movies[s], user_index] == 0:   # check if the rating of a similar movie is zero
          if len(movie_similarity_copy) == (number_neighbors - 1): 
            movie_similarity_copy.pop(s) # if the rating is zero, ignore the rating and the similarity in calculating the predicted rating 
          else:   
            movie_similarity_copy.pop(s-(len(movie_similarity)-len(movie_similarity_copy)))    
        else: # if the rating is not zero, use the rating and similarity in the calculation
          nominator = nominator + movie_similarity[s]*df.iloc[sim_movies[s],user_index]
      
      # check if the number of the ratings with non-zero is positive
      if len(movie_similarity_copy) > 0:
        if sum(movie_similarity_copy) > 0: # check if the sum of the ratings of the similar movies is positive
          predicted_r = nominator/sum(movie_similarity_copy)
          # Even if there are some movies for which the ratings are positive, 
          # some movies have zero similarity even though they are selected as similar movies.
          # in this case, the predicted rating becomes zero as well 
        else: 
          predicted_r = 0

      else:  # if all the ratings of the similar movies are zero, then predicted rating should be zero
        predicted_r = 0
       
      df1.iloc[m,user_index] = predicted_r # place the predicted rating into the copy of the original dataset
  recommend_movies(user, num_recommendation) #calls the recommend_movies function

In [53]:
# The following function shows the recommended movies for a selected user using the updated dataset df1
def recommend_movies(user, num_recommendation): 
  print('The list of the movies {} has already watched \n'.format(user)) #to print the movies users has already watched
  for m in df[df[user] > 0][user].index.tolist():
    print(m)
  print('')  

  recommended_movies = []
  for m in df[df[user] == 0].index.tolist(): 
    index_df = df.index.tolist().index(m)
    predicted_rating = df1.iloc[index_df, df1.columns.tolist().index(user)]
    recommended_movies.append((m, predicted_rating))

  sorted_rm = sorted(recommended_movies, key=lambda x:x[1], reverse=True)  #to sort the list of recommended movies
  print('The list of the recommended movies for {} \n'.format(user))
  
  for recommended_movie in sorted_rm[:num_recommendation]:
    print('{} - predicted rating:{}'.format(recommended_movie[0], recommended_movie[1]))
    

In [54]:
movie_recommender('Pooja',3,3) #checking recommendor for user pooja

The list of the movies Pooja has already watched 

Thor Ragnarok
Spiderman: No way home
Shershah

The list of the recommended movies for Pooja 

The Batman (2022) - predicted rating:4.0
Gully Boy - predicted rating:3.5
Andhadhun - predicted rating:3.5


In [88]:
# References : 
# https://www.youtube.com/watch?v=z0dx-YckFko
# https://towardsdatascience.com/item-based-collaborative-filtering-in-python-91f747200fab
# https://github.com/yjeong5126/movie_recommender/blob/master/item_based_collaborative_filtering/item_based_collaborative_filtering.ipynb
        

### Using LightFM

In [55]:
!pip install lightfm #installing lightfm



In [29]:
import numpy as np
from lightfm import LightFM #importing from lightfm


Importing dataset again for using it for Lightfm and making changes to it

In [15]:
# load the pandas library
import pandas as pd 
# reading the data from my GitHub repository into a Pandas dataframe and saving it in 'reviews'
reviews = pd.read_csv('https://raw.githubusercontent.com/getfitwithapurv/recommendersys/main/review.csv')
dfl = pd.DataFrame(reviews) #adding the data to dataframe for further use 
dfl #sanity check

Unnamed: 0,Name,Shubham,Varad,Mihir,Saurabh,Pooja
0,Thor Ragnarok,4.0,3.0,5.0,,4.0
1,The Batman (2022),,4.0,3.0,,
2,Spiderman: No way home,3.5,4.0,,4.0,3.5
3,Shershah,3.5,,4.0,,4.0
4,Gully Boy,,3.5,,3.5,
5,Andhadhun,4.0,,,3.0,


In [20]:
dfl = pd.melt(dfl,id_vars= 'Name', value_vars=['Shubham', 'Varad', 'Mihir', 'Saurabh', 'Pooja'])
dfl #converting to long format 

Unnamed: 0,Name,variable,value
0,Thor Ragnarok,Shubham,4.0
1,The Batman (2022),Shubham,
2,Spiderman: No way home,Shubham,3.5
3,Shershah,Shubham,3.5
4,Gully Boy,Shubham,
5,Andhadhun,Shubham,4.0
6,Thor Ragnarok,Varad,3.0
7,The Batman (2022),Varad,4.0
8,Spiderman: No way home,Varad,4.0
9,Shershah,Varad,


Conversion of dataframe to sparse matrix 

In [21]:
from pandas.api.types import CategoricalDtype
from scipy import sparse

users = dfl["variable"].unique()
movies = dfl["Name"].unique()
shape = (len(users), len(movies))

# Create indices for users and movies
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
movie_cat = CategoricalDtype(categories=sorted(movies), ordered=True)
user_index = dfl["variable"].astype(user_cat).cat.codes
movie_index = dfl["Name"].astype(movie_cat).cat.codes

# Conversion via COO matrix
coo = sparse.coo_matrix((dfl["value"], (user_index, movie_index)), shape=shape)
csr = coo.tocsr()

In [24]:
csr

<5x6 sparse matrix of type '<class 'numpy.float64'>'
	with 30 stored elements in Compressed Sparse Row format>

In [26]:
# spliting dataset into test and train
from sklearn.model_selection import train_test_split 

train, test = train_test_split(dfl, test_size=0.2, random_state=25)

print(f"No. of training examples: {train.shape[0]}")
print(f"No. of testing examples: {test.shape[0]}")
 

No. of training examples: 24
No. of testing examples: 6


In [2]:
#References: 
#https://hippocampus-garden.com/pandas_sparse/
#http://localhost:8888/notebooks/Downloads/recsys3_LightFM_quickstart.ipynb

### Unable to complete using LightFM 