In [5]:
# Required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from statsmodels.stats import diagnostic
import statsmodels.api as sm
import statsmodels.formula.api as smf
import sklearn
from sklearn.neighbors import NearestNeighbors

In [2]:
data_folder = './MovieSummaries/MovieSummaries/'
#paths to files
movie_metadata_path = data_folder + 'movie.metadata.tsv'

# load the data
movie_metadata_df = pd.read_csv(movie_metadata_path, delimiter='\t', names=['wikipedia_movie_id', 'freebase_movie_id', 
                                                                            'movie_name', 'release_date', 'box_office_revenue',
                                                                            'runtime', 'languages', 'countries', 'genres'], 
                                 encoding='utf-8')

df = pd.read_csv('./ml-32m/ratings.csv')

movies = pd.read_csv('ml-32m/movies.csv')

In [3]:
# Formatting the movie names in the two database such that they can latter be merged on the names
movie_metadata_df['movie_name_formatted'] = movie_metadata_df['movie_name'].str.lower().str.strip()
movies['title_format'] = movies['title'].str[:-6].str.strip().str.lower()

In [4]:
# calculate the number of common movies between the two datasets
common_movies = set(movie_metadata_df['movie_name_formatted']).intersection(set(movies['title_format']))
print('Number of common movies:', len(common_movies))

# merge the two datasets
merged_df = pd.merge(movies, movie_metadata_df, left_on='title_format', right_on='movie_name_formatted', how='inner')

Number of common movies: 25632


In [9]:
# Only take common movies for analysis
movieId_to_keep= set(merged_df[merged_df['movie_name_formatted'].isin(common_movies)]['movieId'])
df_reduced= df[df['movieId'].isin(movieId_to_keep)]
print(df_reduced.shape)
# Number of user is reduced for now otherwise the code won't run because the matrix is too big
df_more_reduced= df_reduced[df_reduced['userId'] < 80000].copy()
df_more_reduced['rating']= df_more_reduced['rating'].apply(lambda x : x -2.5)
print(df_more_reduced)

(20323585, 4)
          userId  movieId  rating  timestamp
0              1       17     1.5  944249077
1              1       25    -1.5  944250228
5              1       34    -0.5  943228491
6              1       36    -1.5  944249008
8              1      110     0.5  943231119
...          ...      ...     ...        ...
12781363   79999     3114     0.5  974951172
12781364   79999     3362     0.5  974951172
12781365   79999     3424    -1.5  974951104
12781366   79999     3504     1.5  974951254
12781367   79999     3916     2.5  974951915

[8124111 rows x 4 columns]


In [None]:
hyper_space= df_more_reduced.pivot(index='userId',columns='movieId',values='rating')

hyper_space.fillna(0, inplace=True)
print(hyper_space.head())

sparse_matrix_rep= scipy.sparse.csr_matrix(hyper_space.values)
print(sparse_matrix_rep)
print(sparse_matrix_rep.shape)

In [13]:
scipy.sparse.save_npz("Sparse_hyperspace_user_movie.npz", sparse_matrix_rep)

In [14]:
sparse_matrix_rep_loaded= scipy.sparse.load_npz("Sparse_hyperspace_user_movie.npz")

In [15]:
print(sparse_matrix_rep_loaded)

  (0, 15)	1.5
  (0, 23)	-1.5
  (0, 29)	-0.5
  (0, 31)	-1.5
  (0, 80)	0.5
  (0, 81)	2.5
  (0, 109)	-1.5
  (0, 123)	1.5
  (0, 214)	1.5
  (0, 237)	-0.5
  (0, 358)	0.5
  (0, 368)	2.5
  (0, 380)	2.5
  (0, 405)	-0.5
  (0, 500)	1.5
  (0, 518)	0.5
  (0, 521)	2.5
  (0, 555)	2.5
  (0, 558)	-0.5
  (0, 560)	2.5
  (0, 561)	1.5
  (0, 565)	2.5
  (0, 568)	2.5
  (0, 573)	2.5
  (0, 600)	0.5
  :	:
  (79997, 3719)	-1.5
  (79997, 3727)	2.0
  (79997, 3761)	1.5
  (79997, 3875)	1.5
  (79997, 3936)	2.5
  (79998, 0)	0.5
  (79998, 133)	2.5
  (79998, 237)	0.5
  (79998, 405)	2.5
  (79998, 698)	-1.5
  (79998, 735)	2.5
  (79998, 741)	1.5
  (79998, 747)	0.5
  (79998, 754)	0.5
  (79998, 841)	1.5
  (79998, 1252)	2.5
  (79998, 1409)	2.5
  (79998, 1740)	2.5
  (79998, 1835)	2.5
  (79998, 1898)	2.5
  (79998, 1907)	0.5
  (79998, 2051)	0.5
  (79998, 2089)	-1.5
  (79998, 2147)	1.5
  (79998, 2410)	2.5


In [16]:
# Function that generate a sparse vector of the hyperspace user-movies from the ratings and movie-ids given by the website user
def generate_sparse_vector_from_ratings(list_of_imdbid_and_rating, total_nbr_of_movies) :
    bidule= np.array(list_of_imdbid_and_rating)
    imdb_ids= bidule[:, 0]
    ratings= np.array(bidule[:, 1], dtype= float)
    movie_ids= merged_df[merged_df['freebase_movie_id'].isin(imdb_ids)]['movieId'].values
    movie_ids= movie_ids - 1
    return scipy.sparse.csr_matrix((ratings, (np.zeros(len(movie_ids)), movie_ids)), shape= (1, total_nbr_of_movies)), movie_ids + 1

In [49]:
def weighted_rating(R, v, m, C):
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

def recommand_movies_for_website_user(list_, sparse_matrix_rep, n_neighbors=30, n_movies= 5) :
    total_nbr_of_movies= sparse_matrix_rep.shape[1]
    sparse_vec, movies_watched_id= generate_sparse_vector_from_ratings(list_, total_nbr_of_movies)
    # Generate a fit to approximate nearest neighbors of a given user in the database
    knn_function= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    knn_function.fit(sparse_matrix_rep)
    distances, indices= knn_function.kneighbors(sparse_vec, n_neighbors= n_neighbors)
    indices= indices[0, 1:] + 1
    moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices)) & (df_more_reduced['rating']> -5)]
    df_temp= moviesid_to_check.groupby('movieId')
    averages= df_temp['rating'].mean()
    C= averages.mean()
    number_of_votes= df_temp['userId'].count()
    m= number_of_votes.quantile(0.8)
    scores= weighted_rating(averages, number_of_votes, m , C)
    sorted_scores_id= scores.sort_values(ascending= False).index
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name_formatted']
    print(movies_watched)
    final_recommandation= []
    n_temp= n_movies
    while len(final_recommandation) < n_movies :
        movies_recommanded= merged_df[merged_df['movieId'].isin(sorted_scores_id[:n_temp])]['movie_name_formatted']
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
        n_temp+=1
    return final_recommandation[:n_movies]

# Testing of the function
list_= [["/m/0dyb1", 4.5], ["/m/09w353", 3], ["/m/0676dr", 2], ["/m/03vny7", 3.5], ["/m/094g2z", 5]]
recommand_movies_for_website_user(list_, sparse_matrix_rep)

0                      toy story
1                        jumanji
2               grumpier old men
3              waiting to exhale
4    father of the bride part ii
Name: movie_name_formatted, dtype: object


['executive decision',
 "mr. holland's opus",
 'dead man walking',
 'ransom',
 'jerry maguire']

In [50]:
def find_liked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 2)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

def find_disliked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']< -1)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

In [138]:
def weighted_rating(R, v, m, C):
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

def recommand_movies_for_user(userid, sparse_matrix_rep, n_neighbors=30, n_movies= 5) :
    userid_matrix= userid-1
    knn_function= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=n_neighbors, n_jobs=-1)
    knn_function.fit(sparse_matrix_rep)
    distances, indices= knn_function.kneighbors(sparse_matrix_rep[userid_matrix], n_neighbors= n_neighbors)
    indices= indices[0, 1:] + 1
    moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices)) & (df_more_reduced['rating']> -5)]
    df_temp= moviesid_to_check.groupby('movieId')
    averages= df_temp['rating'].mean()
    C= averages.mean()
    number_of_votes= df_temp['userId'].count()
    m= number_of_votes.quantile(0.8)
    scores= weighted_rating(averages, number_of_votes, m , C)
    sorted_scores_id= scores.sort_values(ascending= False).index
    movies_watched_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']> -5)]['movieId'].values
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name_formatted']
    print(find_liked_movies_user(userid))
    n_temp= n_movies
    final_recommandation= []
    while len(final_recommandation) < n_movies :
        movies_recommanded= merged_df[merged_df['movieId'].isin(sorted_scores_id[:n_temp])]['movie_name_formatted']
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
        n_temp+=1
    return final_recommandation[:n_movies]

recommand_movies_for_user(24205, sparse_matrix_rep)

194                                    Waterworld
2674                                  Pitch Black
3535                                        Shrek
4032                               Monsters, Inc.
5153                                 Finding Nemo
6359     Harry Potter and the Prisoner of Azkaban
6515                                     I, Robot
7905                                Batman Begins
8207          Harry Potter and the Goblet of Fire
9598                                     Iron Man
9599                                     Iron Man
10274                                   Star Trek
10317                                          Up
10318                                          Up
10439      Harry Potter and the Half-Blood Prince
10829                                      Avatar
10830                                      Avatar
10831                                      Avatar
11118                    How to Train Your Dragon
11790                                     Tangled


['toy story',
 'the hunger games: catching fire',
 'kung fu panda',
 'harry potter and the chamber of secrets',
 'toy story 2']

In [None]:
['indiana jones and the last crusade',
 'pirates of the caribbean: the curse of the black pearl',
 'toy story',
 'monty python and the holy grail',
 'back to the future']

In [None]:
# Testing this recommandation model by comparing the recommandation with what the user liked and disliked
test_recommand= recommand_movies_for_user(28)
test_liked= find_liked_movies_user(28)
test_disliked= find_disliked_movies_user(28)
print("Number of recommanded movies : ", test_recommand.shape)
print("Number of liked movies : ", test_liked.shape)
print("Number of disliked movies : ", test_disliked.shape)
print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_liked.values))) / test_liked.shape[0])
print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_disliked.values))) / test_disliked.shape[0])

In [None]:
def test_model(ids_to_test, n_neighbors=30, n_duplicates= 8, print_inter= False) :
    scores_positive= []
    scores_negative= []
    count= 0
    for i in ids_to_test :
        test_recommand= recommand_movies_for_user(i, n_neighbors, n_duplicates)
        test_liked= find_liked_movies_user(i)
        test_disliked= find_disliked_movies_user(i)
        if print_inter :
            print("Number of recommanded movies : ", test_recommand.shape)
            print("Number of liked movies : ", test_liked.shape)
            print("Number of disliked movies : ", test_disliked.shape)
        nbr_movies_liked_recommanded= len(set(test_recommand.values).intersection(set(test_liked.values)))
        if test_liked.shape[0] != 0 :
            proportion_liked_recommanded=  nbr_movies_liked_recommanded / test_liked.shape[0]
        else :
            proportion_liked_recommanded = 0

        if print_inter :
            print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", proportion_liked_recommanded)
        
        nbr_movies_disliked_recommanded= len(set(test_recommand.values).intersection(set(test_disliked.values)))
        if test_disliked.shape[0] != 0 :
            proportion_disliked_recommanded= nbr_movies_disliked_recommanded / test_disliked.shape[0]
        else :
            proportion_disliked_recommanded= 0

        if print_inter :
            print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", proportion_disliked_recommanded)
        
        if test_recommand.shape[0] != 0 :
            scores_positive.append(nbr_movies_liked_recommanded/test_recommand.shape[0])
            scores_negative.append(nbr_movies_disliked_recommanded/test_recommand.shape[0])
        else :
            scores_positive.append(0)
            scores_negative.append(1)

        count+=1
        if count%50 == 0 :
            print(f"{count} users have been tested")
    return scores_positive, scores_negative

In [None]:
ids_to_test= []
ids_to_test= range(1000)

In [None]:
scores_pos, scores_neg= test_model(ids_to_test)

In [None]:
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

In [None]:
def parameter_scanning(values_neighbors, values_duplicates) :
    ids_to_test= np.array(range(1, 49998), dtype= int)
    values_pos= 0
    values_neg= 1
    best_n_neighbors_pos= values_neighbors[0]
    best_n_duplicates_pos= values_duplicates[0]
    best_n_neighbors_neg= values_neighbors[0]
    best_n_duplicates_neg= values_duplicates[0]
    for n_neighbors in values_neighbors :
        for n_duplicates in values_duplicates :
            np.random.shuffle(ids_to_test)
            ids_to_test_prime= ids_to_test[:100]
            scores_pos, scores_neg= test_model(ids_to_test_prime, n_neighbors, n_duplicates)
            if np.mean(np.array(scores_pos)) > values_pos :
                values_pos= np.mean(np.array(scores_pos))
                best_n_neighbors_pos= n_neighbors
                best_n_duplicates_pos= n_duplicates
            if np.mean(np.array(scores_neg)) < values_neg :
                values_neg= np.mean(np.array(scores_neg))
                best_n_neighbors_neg= n_neighbors
                best_n_duplicates_neg= n_duplicates
            print(f"The couple of parameters ({n_neighbors}, {n_duplicates}) has been tested.")
    print(f"The best parameters for the positive score are : ({best_n_neighbors_pos}, {best_n_duplicates_pos})")
    print(f"The best parameters for the negative score are : ({best_n_neighbors_neg}, {best_n_duplicates_neg})")

In [None]:
parameter_scanning([10,20,30,40,50], [2,3,4,5,6,7,8,9,10])

In [None]:
ids_to_test= range(1000)
scores_pos, scores_neg= test_model(ids_to_test, 40, 2)
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

0.5758076498476176 0.06482582511974802 : (30, 8)
0.6206855745121398 0.10482759500198037 : (30, 10)
0.2068181921610013 0.024991750957941428 : (40, 2)

In [22]:
def movie_recommandation(userid, n_movies= 5) :
    final_recommandation= []
    n_duplicates= 8
    movies_watched_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 0)]['movieId'].values
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name']
    print(find_liked_movies_user(userid))
    while len(final_recommandation) < n_movies and n_duplicates >= 2:
        print(n_duplicates)
        movies_recommanded= recommand_movies_for_user(userid, 30, n_duplicates)
        n_duplicates-= 1
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
    return final_recommandation[:n_movies]

In [25]:
movie_recommandation(78932, 6)

0                            Toy Story
13                           GoldenEye
27                   Leaving Las Vegas
50                     Dead Presidents
82                        Bed of Roses
                     ...              
9354                              Juno
9387              Charlie Wilson's War
9388    Walk Hard: The Dewey Cox Story
9472                 Definitely, Maybe
9759                     Step Brothers
Name: movie_name, Length: 418, dtype: object
8
7
6


['Gladiator',
 'Stand by Me',
 'Shakespeare in Love',
 'Indiana Jones and the Last Crusade',
 'Fight Club',
 'To Kill a Mockingbird']