In [1]:
# Required libraries
import pandas as pd
import numpy as np
import scipy
import sklearn
from sklearn.neighbors import NearestNeighbors

In [2]:
data_folder = '../Data/'
#paths to files
file_path = data_folder + 'df_filtered.csv'

# load the data
df= pd.read_csv(file_path, index_col=0)
# Dropping this column because hopefully the final dataframe of ratings in the project won't have it
df.drop('movieId', axis=1, inplace=True)

# For now, another dataframe is needed with the mapping between imdbId and names such that results can be interpreted on this jupyter notebook
df_names= pd.read_csv(data_folder  + 'df_final_dataset.csv', index_col= 0)

KeyboardInterrupt: 

In [5]:
df.head()

Unnamed: 0,userId,rating,imdbId
0,1,1.0,112573
1,1,4.5,112461
2,1,5.0,68646
3,1,5.0,71562
4,1,5.0,97165


In [6]:
df_names.head()

Unnamed: 0,imdb_id,title,budget,genres_cmu,languages,original_language,spoken_languages,countries,production_companies,plot_summary,...,popularity,vote_average,vote_count,cast,adult,belongs_to_collection,box_office_clean,release_date_clean,runtime_clean,director
0,tt0228333,Ghosts of Mars,28000000,"Thriller, Science Fiction, Horror, Adventure, ...",English Language,en,1,United States of America,"['Screen Gems', 'Storm King Productions', 'Ani...","Set in the second half of the 22nd century, th...",...,7.058599,4.8,299.0,"['Natasha Henstridge', 'Ice Cube', 'Jason Stat...",False,,14010832.0,2001-08-24,98.0,
1,tt0094320,White of the Eye,0,"Thriller, Erotic thriller, Psychological thriller",English Language,en,1,United Kingdom,"[""Mrs. White's Productions""]",A series of murders of rich young women throug...,...,3.121105,5.7,15.0,"['David Keith', 'Cathy Moriarty', 'Alan Rosenb...",False,,,,110.0,
2,tt0029852,Alexander's Ragtime Band,2000000,"Musical, Comedy, Black-and-white",English Language,en,1,United States of America,['Twentieth Century Fox Film Corporation'],,...,0.632261,4.8,6.0,"['Tyrone Power', 'Alice Faye', 'Don Ameche', '...",False,,3600000.0,1938-08-16,106.0,
3,tt0053719,The City of the Dead,0,"Horror, Supernatural",English Language,en,1,United Kingdom,['Vulcan Productions Inc.'],,...,1.514972,6.5,34.0,"['Christopher Lee', 'Dennis Lotis', 'Patricia ...",False,,,,76.0,
4,tt0119548,Little City,0,"Romantic comedy, Ensemble Film, Comedy-drama, ...",English Language,en,0,United States of America,['Bandeira Entertainment'],"Adam, a San Francisco-based artist who works a...",...,0.036814,6.0,1.0,"['Jon Bon Jovi', 'Penelope Ann Miller', 'Annab...",False,,93.0,1997-04-04,,


In [7]:
def formating_imdbId(x) :
    x= round(x)
    x_str= str(x)
    while len(x_str) < 7 :
        x_str= '0' + x_str
    return 'tt' + x_str

In [8]:
df.head()

Unnamed: 0,userId,rating,imdbId
0,1,1.0,112573
1,1,4.5,112461
2,1,5.0,68646
3,1,5.0,71562
4,1,5.0,97165


In [9]:
# Generate unique new IDs for movies globally
movie_mapping = {old_id: new_id for new_id, old_id in enumerate(df['imdbId'].unique(), start=0)}

# Add the new_movieId column using the global mapping
df['new_movieId'] = df['imdbId'].map(movie_mapping)

reverse_movie_mapping = {old_id: new_id for new_id, old_id in zip(df['imdbId'].unique(),df['new_movieId'].unique())}

# Generate unique new IDs for user globally
user_mapping = {old_id: new_id for new_id, old_id in enumerate(df['userId'].unique(), start=0)}

# Add the new_userId column using the global mapping
df['new_userId'] = df['userId'].map(user_mapping)

reverse_user_mapping= {old_id: new_id for new_id, old_id in zip(df['userId'].unique(),df['new_userId'].unique())}
df.head()

Unnamed: 0,userId,rating,imdbId,new_movieId,new_userId
0,1,1.0,112573,0,0
1,1,4.5,112461,1,0
2,1,5.0,68646,2,0
3,1,5.0,71562,3,0
4,1,5.0,97165,4,0


In [10]:
def create_sparse_matrix_representation(pd_dataframe) :
    ratings = pd_dataframe['rating'].values - 2.75
    userIds = pd_dataframe['new_userId'].values
    movieIds = pd_dataframe['new_movieId'].values
    shape = (np.max(userIds) + 1, np.max(movieIds) + 1)
    return scipy.sparse.csr_matrix((ratings, (userIds, movieIds)), shape= shape)

In [11]:
sparse_matrix_rep= create_sparse_matrix_representation(df)

In [12]:
print(sparse_matrix_rep)

  (0, 0)	-1.75
  (0, 1)	1.75
  (0, 2)	2.25
  (0, 3)	2.25
  (0, 4)	2.25
  (0, 5)	1.25
  (0, 6)	1.75
  (0, 7)	2.25
  (0, 8)	1.25
  (0, 9)	1.25
  (0, 10)	2.25
  (0, 11)	2.25
  (0, 12)	1.25
  (0, 13)	0.75
  (0, 14)	1.25
  (0, 15)	2.25
  (0, 16)	2.25
  (0, 17)	2.25
  (0, 18)	2.25
  (0, 19)	2.25
  (0, 20)	-0.25
  (0, 21)	2.25
  (0, 22)	2.25
  (0, 23)	2.25
  (0, 24)	1.25
  :	:
  (232049, 1571)	0.75
  (232049, 1587)	2.25
  (232049, 1590)	0.25
  (232049, 1595)	0.25
  (232049, 1596)	0.75
  (232049, 1607)	0.75
  (232049, 1618)	0.75
  (232049, 1625)	2.25
  (232049, 1652)	1.75
  (232049, 1678)	0.25
  (232049, 1765)	0.75
  (232049, 1770)	2.25
  (232049, 1779)	1.25
  (232049, 1839)	0.25
  (232049, 1956)	0.75
  (232049, 2015)	0.75
  (232049, 2119)	2.25
  (232049, 2237)	1.75
  (232049, 2239)	0.75
  (232049, 2280)	0.75
  (232049, 2395)	2.25
  (232049, 2512)	0.75
  (232049, 5848)	-0.25
  (232049, 7075)	-0.75
  (232049, 11567)	1.25


In [11]:
scipy.sparse.save_npz("Sparse_hyperspace_user_movie.npz", sparse_matrix_rep)

In [12]:
sparse_matrix_rep_loaded= scipy.sparse.load_npz("Sparse_hyperspace_user_movie.npz")

In [13]:
print(sparse_matrix_rep_loaded)

  (0, 0)	-1.75
  (0, 1)	1.75
  (0, 2)	2.25
  (0, 3)	2.25
  (0, 4)	2.25
  (0, 5)	1.25
  (0, 6)	1.75
  (0, 7)	2.25
  (0, 8)	1.25
  (0, 9)	1.25
  (0, 10)	2.25
  (0, 11)	2.25
  (0, 12)	1.25
  (0, 13)	0.75
  (0, 14)	1.25
  (0, 15)	2.25
  (0, 16)	2.25
  (0, 17)	2.25
  (0, 18)	2.25
  (0, 19)	2.25
  (0, 20)	-0.25
  (0, 21)	2.25
  (0, 22)	2.25
  (0, 23)	2.25
  (0, 24)	1.25
  :	:
  (232049, 1571)	0.75
  (232049, 1587)	2.25
  (232049, 1590)	0.25
  (232049, 1595)	0.25
  (232049, 1596)	0.75
  (232049, 1607)	0.75
  (232049, 1618)	0.75
  (232049, 1625)	2.25
  (232049, 1652)	1.75
  (232049, 1678)	0.25
  (232049, 1765)	0.75
  (232049, 1770)	2.25
  (232049, 1779)	1.25
  (232049, 1839)	0.25
  (232049, 1956)	0.75
  (232049, 2015)	0.75
  (232049, 2119)	2.25
  (232049, 2237)	1.75
  (232049, 2239)	0.75
  (232049, 2280)	0.75
  (232049, 2395)	2.25
  (232049, 2512)	0.75
  (232049, 5848)	-0.25
  (232049, 7075)	-0.75
  (232049, 11567)	1.25


In [13]:
# Function that generate a sparse vector of the hyperspace user-movies from the ratings and movie-ids given by the website user
def generate_sparse_vector_from_ratings(list_of_imdbid_and_rating, total_nbr_of_movies) :
    small_df= pd.DataFrame(data= list_of_imdbid_and_rating, columns= ['imdbId', 'rating'])
    ratings= np.array(small_df['rating'].values, dtype= float)
    movie_ids= small_df['imdbId'].map(movie_mapping).values
    imdb_ids= small_df['imdbId'].apply(formating_imdbId)
    return scipy.sparse.csr_matrix((ratings, (np.zeros(len(movie_ids)), movie_ids)), shape= (1, total_nbr_of_movies)), movie_ids, imdb_ids

In [14]:
def generate_list_from_user(userid) :
    movies_watched_imdbids= df[df['userId']== userid][['imdbId', 'rating']]
    return movies_watched_imdbids.values.tolist()

def find_liked_movies_user(userid, threshold= 2) :
    movies_id= df[(df['userId']== userid) & (df['rating']>= threshold)]
    movies_id= movies_id['imdbId'].apply(formating_imdbId)
    names_of_the_moovies= df_names[df_names['imdb_id'].isin(movies_id.values)]['title']
    return names_of_the_moovies

In [16]:
knn_function= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=30, n_jobs=-1)
knn_function.fit(sparse_matrix_rep)

In [17]:
def weighted_rating(R, v, m, C):
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

def recommand_movies_for_website_user(list_, sparse_matrix_rep, n_neighbors=30, n_movies= 5, show_movie_names = False, user_id = 1) :
    total_nbr_of_movies= sparse_matrix_rep.shape[1]
    sparse_vec, movies_watched_id, movies_watched= generate_sparse_vector_from_ratings(list_, total_nbr_of_movies)
    # Generate a fit to approximate nearest neighbors of a given user in the database
    distances, indices= knn_function.kneighbors(sparse_vec, n_neighbors= n_neighbors)
    indices= indices[0, :]
    moviesid_to_check= df[df['new_userId'].isin(indices)]
    df_temp= moviesid_to_check.groupby('imdbId')
    averages= df_temp['rating'].mean()
    C= averages.mean()
    number_of_votes= df_temp['new_userId'].count()
    m= number_of_votes.quantile(0.8)
    scores= weighted_rating(averages, number_of_votes, m , C)
    sorted_scores_id= pd.DataFrame(data= scores.sort_values(ascending= False).index, columns= ['imdbId'])
    sorted_scores_id= sorted_scores_id['imdbId'].apply(formating_imdbId)
    if show_movie_names :
        print("Movies liked : ")
        print(find_liked_movies_user(user_id))
        print("")
    final_recommandation= []
    n_temp= n_movies
    while len(final_recommandation) < n_movies :
        movies_recommanded= df_names[df_names['imdb_id'].isin(sorted_scores_id[:n_temp])]['imdb_id']
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
        n_temp+=1
    if show_movie_names :
        print("Movies recommanded : ")
        print(df_names[df_names['imdb_id'].isin(final_recommandation[:n_movies])]['title'])
    return final_recommandation[:n_movies]

In [29]:
user_id= 29437
truc= recommand_movies_for_website_user(generate_list_from_user(user_id), sparse_matrix_rep, 30, 5, False, user_id)
print(truc)

['tt0114746', 'tt0107290', 'tt0112346', 'tt0109830', 'tt0115759']


In [20]:
def generate_partial_list_from_user(userid) :
    liked_movies_id = movies_id= df[(df['userId']== userid) & (df['rating']>= 2.2)]['imdbId'].values
    movies_droped= liked_movies_id[:1]
    movies_watched_imdbids= df[(df['userId']== userid) & (~df['imdbId'].isin(movies_droped))][['imdbId', 'rating']]
    return movies_watched_imdbids.values.tolist(), movies_droped

In [21]:
def test_recommandation(num_to_test) :
    rng= np.random.default_rng()
    accuracy= 0
    for i in range(num_to_test) :
        pos_id= rng.integers(0, df.shape[0])
        user_id= df.iloc[pos_id, 0]
        list_, target = generate_partial_list_from_user(user_id)
        target= formating_imdbId(target[0])
        recommandation= recommand_movies_for_website_user(list_, sparse_matrix_rep, 30, 5, False, user_id)
        if target in recommandation :
            accuracy+=1
        if i%10==0 :
            print(i)
    return accuracy/num_to_test
test_recommandation(100)

0
10
20
30
40
50
60
70
80
90


0.27

In [None]:
accuray= 0.273