In [1]:
# Required libraries
import pandas as pd
import numpy as np
import scipy
import sklearn
from sklearn.neighbors import NearestNeighbors
import time

In [2]:
# Folder holding the input data, relative to the notebook
data_folder = '../Data/'
file_path = data_folder + 'df_filtered.csv'

# Read the filtered ratings; the first CSV column is used as the index
df = pd.read_csv(file_path, index_col=0)
# 'movieId' is removed up front because the final ratings dataframe of the
# project is not expected to contain it
df = df.drop(columns='movieId')

In [3]:
# Preview the first rows of the ratings table (the 'movieId' column has already been dropped)
df.head()

Unnamed: 0,userId,rating,imdbId
0,1,1.0,112573
1,1,4.5,112461
2,1,5.0,68646
3,1,5.0,71562
4,1,5.0,97165


In [4]:
def formating_imdbId(x):
    """Convert a numeric IMDb id into its 'tt'-prefixed string form.

    The value is rounded to the nearest integer, rendered as text and
    left-padded with zeros up to seven characters (longer ids are kept
    as they are), e.g. 112573 -> 'tt0112573'.
    """
    digits = str(round(x))
    return 'tt' + digits.rjust(7, '0')

In [5]:
# Give every distinct imdbId a compact 0-based index, in order of first appearance
movie_mapping = {imdb_id: position for position, imdb_id in enumerate(df['imdbId'].unique())}

# Store the compact movie index next to each rating
df['new_movieId'] = df['imdbId'].map(movie_mapping)

# Inverse lookup: compact movie index -> original imdbId
# (both unique() calls list values in order of first appearance, so they line up)
reverse_movie_mapping = dict(zip(df['new_movieId'].unique(), df['imdbId'].unique()))

# Same treatment for users: original userId -> compact 0-based index
user_mapping = {user_id: position for position, user_id in enumerate(df['userId'].unique())}

# Store the compact user index next to each rating
df['new_userId'] = df['userId'].map(user_mapping)

# Inverse lookup: compact user index -> original userId
reverse_user_mapping = dict(zip(df['new_userId'].unique(), df['userId'].unique()))
df.head()

Unnamed: 0,userId,rating,imdbId,new_movieId,new_userId
0,1,1.0,112573,0,0
1,1,4.5,112461,1,0
2,1,5.0,68646,2,0
3,1,5.0,71562,3,0
4,1,5.0,97165,4,0


In [6]:
# Load the precomputed sparse matrix from its .npz file.
# Presumably rows are users (new_userId) and columns are movies (new_movieId) -- TODO confirm
# against the notebook that produced the file.
# NOTE(review): this path is relative to the working directory, not to data_folder.
sparse_matrix_rep= scipy.sparse.load_npz("Sparse_hyperspace_user_movie.npz")

In [7]:
# Print the stored (row, column) -> value entries of the sparse matrix as a quick sanity check
print(sparse_matrix_rep)

  (0, 0)	-1.75
  (0, 1)	1.75
  (0, 2)	2.25
  (0, 3)	2.25
  (0, 4)	2.25
  (0, 5)	1.25
  (0, 6)	1.75
  (0, 7)	2.25
  (0, 8)	1.25
  (0, 9)	1.25
  (0, 10)	2.25
  (0, 11)	2.25
  (0, 12)	1.25
  (0, 13)	0.75
  (0, 14)	1.25
  (0, 15)	2.25
  (0, 16)	2.25
  (0, 17)	2.25
  (0, 18)	2.25
  (0, 19)	2.25
  (0, 20)	-0.25
  (0, 21)	2.25
  (0, 22)	2.25
  (0, 23)	2.25
  (0, 24)	1.25
  :	:
  (232049, 1571)	0.75
  (232049, 1587)	2.25
  (232049, 1590)	0.25
  (232049, 1595)	0.25
  (232049, 1596)	0.75
  (232049, 1607)	0.75
  (232049, 1618)	0.75
  (232049, 1625)	2.25
  (232049, 1652)	1.75
  (232049, 1678)	0.25
  (232049, 1765)	0.75
  (232049, 1770)	2.25
  (232049, 1779)	1.25
  (232049, 1839)	0.25
  (232049, 1956)	0.75
  (232049, 2015)	0.75
  (232049, 2119)	2.25
  (232049, 2237)	1.75
  (232049, 2239)	0.75
  (232049, 2280)	0.75
  (232049, 2395)	2.25
  (232049, 2512)	0.75
  (232049, 5848)	-0.25
  (232049, 7075)	-0.75
  (232049, 11567)	1.25


In [8]:
def generate_sparse_vector_from_ratings(list_of_imdbid_and_rating, total_nbr_of_movies, movie_id_mapping=None):
    """Build the sparse user-movie vector for the ratings given by a website user.

    Parameters
    ----------
    list_of_imdbid_and_rating : sequence of (imdbId, rating) pairs
        Movies rated by the website user.
    total_nbr_of_movies : int
        Number of columns of the resulting vector.
    movie_id_mapping : dict, optional
        Maps an imdbId to its column index. Defaults to the notebook-level
        ``movie_mapping`` dictionary.

    Returns
    -------
    (scipy.sparse.csr_matrix, pandas.Series)
        A 1 x ``total_nbr_of_movies`` sparse row holding the ratings, and the
        Series of every imdbId that was passed in (including ids that are
        absent from the mapping).

    Notes
    -----
    Ids that are absent from the mapping have no column in the vector, so
    their ratings are left out of it instead of producing NaN column indices.
    """
    if movie_id_mapping is None:
        # Fall back to the imdbId -> column index dictionary built earlier in the notebook
        movie_id_mapping = movie_mapping

    small_df = pd.DataFrame(data=list_of_imdbid_and_rating, columns=['imdbId', 'rating'])
    imdb_ids = small_df['imdbId']

    # Unknown ids come back as NaN from map(); keep only the rows that have a column
    column_ids = imdb_ids.map(movie_id_mapping)
    known = column_ids.notna().to_numpy()

    ratings = small_df['rating'].to_numpy(dtype=float)[known]
    cols = column_ids.to_numpy()[known].astype(np.int64)
    # Single-row vector: every entry lives in row 0 (integer indices, not floats)
    rows = np.zeros(len(cols), dtype=np.int64)

    sparse_vec = scipy.sparse.csr_matrix((ratings, (rows, cols)), shape=(1, total_nbr_of_movies))
    return sparse_vec, imdb_ids

In [9]:
def weighted_rating(R, v, m, C):
    """Weighted rating based on the IMDB formula.

    R is the item's average rating, v its number of votes, m the vote-count
    weight given to the prior, and C the prior (overall mean) rating.
    The result blends R and C: the more votes relative to m, the closer to R.
    """
    vote_share = v / (v + m)
    prior_share = m / (m + v)
    return (vote_share * R) + (prior_share * C)

def recommand_movies_for_website_user(list_, sparse_matrix_rep, n_neighbors=30, n_movies= 5) :
    """Recommend movies to a website user from the ratings of similar users.

    Parameters
    ----------
    list_ : sequence of (imdbId, rating) pairs
        Movies rated by the website user.
    sparse_matrix_rep : scipy sparse matrix
        Matrix the nearest-neighbour search is fitted on; its number of
        columns defines the length of the user's vector.
    n_neighbors : int, default 30
        Number of nearest neighbours whose ratings are used.
    n_movies : int, default 5
        Maximum number of movies to recommend.

    Returns
    -------
    numpy.ndarray
        'tt'-prefixed IMDb ids of the best-scoring movies the user has not
        rated, ordered from highest to lowest weighted score. Fewer than
        ``n_movies`` entries are returned when the neighbours did not rate
        enough movies the user has not already rated.
    """
    total_nbr_of_movies = sparse_matrix_rep.shape[1]
    sparse_vec, movies_watched = generate_sparse_vector_from_ratings(list_, total_nbr_of_movies)

    # Brute-force cosine nearest-neighbour search over the rows of the matrix
    knn_function = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    knn_function.fit(sparse_matrix_rep)
    _, indices = knn_function.kneighbors(sparse_vec, n_neighbors=n_neighbors)
    neighbor_ids = indices[0, :]

    # The row positions returned by kneighbors are used as new_userId values;
    # assumes row i of sparse_matrix_rep corresponds to new_userId i -- TODO confirm
    neighbor_ratings = df[df['new_userId'].isin(neighbor_ids)]
    per_movie = neighbor_ratings.groupby('imdbId')

    # IMDB-style weighted score computed over the neighbours' ratings only
    averages = per_movie['rating'].mean()
    C = averages.mean()
    number_of_votes = per_movie['new_userId'].count()
    m = number_of_votes.quantile(0.8)
    scores = weighted_rating(averages, number_of_votes, m, C)

    # Rank best-first, then drop what the user already rated. Filtering the
    # ranked index keeps the score order and always terminates, even when
    # fewer than n_movies unseen movies are available.
    ranked_ids = scores.sort_values(ascending=False).index
    unseen_ids = ranked_ids[~ranked_ids.isin(movies_watched.values)]
    top_ids = unseen_ids[:n_movies]

    return np.array([formating_imdbId(movie_id) for movie_id in top_ids], dtype=object)