In [1]:
# Required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from statsmodels.stats import diagnostic
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
import sklearn
from sklearn.neighbors import NearestNeighbors

In [2]:
data_folder = './MovieSummaries/MovieSummaries/'
#paths to files
plot_summaries_path = data_folder + 'plot_summaries.txt'
movie_metadata_path = data_folder + 'movie.metadata.tsv'
character_metadata_path = data_folder + 'character.metadata.tsv'
role_classification_path = data_folder + 'tvtropes.clusters.txt'

# load the data
# 1. Plot summaries data
plot_summaries_df = pd.read_csv(plot_summaries_path, delimiter='\t', names=['wikipedia_movie_id', 'plot_summary'], 
                                 encoding='utf-8')

# 2. Movie metadata
movie_metadata_df = pd.read_csv(movie_metadata_path, delimiter='\t', names=['wikipedia_movie_id', 'freebase_movie_id', 
                                                                            'movie_name', 'release_date', 'box_office_revenue',
                                                                            'runtime', 'languages', 'countries', 'genres'], 
                                 encoding='utf-8')

# 3. Character metadata
character_metadata_df = pd.read_csv(character_metadata_path, delimiter='\t', names=['wikipedia_movie_id', 'freebase_movie_id', 'release_date', 'character_name', 
                                                                                    'actor_dob', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 
                                                                                    'actor_age_at_release', 'freebase_character_actor_map_id', 'freebase_character_id', 
                                                                                    'freebase_actor_id'], 
                                     encoding='utf-8')
role_classification_df = pd.read_csv(role_classification_path, delimiter='\t', names= ['role_type', 'poubelle1'])


In [3]:
print(role_classification_df.head())

                 role_type                                          poubelle1
0  absent_minded_professor  {"char": "Professor Philip Brainard", "movie":...
1  absent_minded_professor  {"char": "Professor Keenbean", "movie": "Richi...
2  absent_minded_professor  {"char": "Dr. Reinhardt Lane", "movie": "The S...
3  absent_minded_professor  {"char": "Dr. Harold Medford", "movie": "Them!...
4  absent_minded_professor  {"char": "Daniel Jackson", "movie": "Stargate"...


In [4]:
role_classification_df['character_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"char": "(.+.?)", "movie":', x).group(1) if re.search('"char": "(.+.?)", "movie":', x) else np.nan)

role_classification_df['movie_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"movie": "(.+.?)", "id":', x).group(1) if re.search('"char": "(.+.?)", "id":', x) else np.nan)

role_classification_df['freebase_movie_id']= role_classification_df['poubelle1'].apply(lambda x : re.search('"id": "(.+.?)", "actor":', x).group(1) if re.search('"id": "(.+.?)", "actor":', x) else np.nan)

role_classification_df['actor_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"actor": "(.+.?)"}', x).group(1) if re.search('"actor": "(.+.?)"}', x) else np.nan)

role_classification_df.drop(columns= 'poubelle1', inplace= True)

In [5]:
print(role_classification_df)

                   role_type                     character_name  \
0    absent_minded_professor          Professor Philip Brainard   
1    absent_minded_professor                 Professor Keenbean   
2    absent_minded_professor                 Dr. Reinhardt Lane   
3    absent_minded_professor                 Dr. Harold Medford   
4    absent_minded_professor                     Daniel Jackson   
..                       ...                                ...   
496                young_gun                        Morgan Earp   
497                young_gun                      Colorado Ryan   
498                young_gun                         Tom Sawyer   
499                young_gun  William H. 'Billy the Kid' Bonney   
500                young_gun                               Jake   

                                movie_name freebase_movie_id       actor_name  
0                                  Flubber         /m/0jy9q0   Robin Williams  
1                              Rich

In [6]:
df = pd.read_csv('./ml-32m/ratings.csv')

In [7]:
movies = pd.read_csv('ml-32m/movies.csv')

In [8]:
print(df)
print(movies.head())

          userId  movieId  rating   timestamp
0              1       17     4.0   944249077
1              1       25     1.0   944250228
2              1       29     2.0   943230976
3              1       30     5.0   944249077
4              1       32     5.0   943228858
...          ...      ...     ...         ...
32000199  200948    79702     4.5  1294412589
32000200  200948    79796     1.0  1287216292
32000201  200948    80350     0.5  1294412671
32000202  200948    80463     3.5  1350423800
32000203  200948    87304     4.5  1350423523

[32000204 rows x 4 columns]
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres  
0  Adventure|Animation|Children|Comedy|Fantasy  
1                   

In [9]:
movie_metadata_df['movie_name_formatted'] = movie_metadata_df['movie_name'].str.lower().str.strip()
movies['title_format'] = movies['title'].str[:-6].str.strip().str.lower()

In [10]:
# calculate the number of common movies between the two datasets

common_movies = set(movie_metadata_df['movie_name_formatted']).intersection(set(movies['title_format']))
print('Number of common movies:', len(common_movies))

# merge the two datasets

merged_df = pd.merge(movies, movie_metadata_df, left_on='title_format', right_on='movie_name_formatted', how='inner')

Number of common movies: 25632


In [11]:
# Remove this user from the dataset, because it has too many ratings
df = df[df['userId'] != 175325]

In [65]:
print(merged_df.head())
print(df.head())
print(df.shape)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                      genres_x                 title_format  \
0  Adventure|Animation|Children|Comedy|Fantasy                    toy story   
1                   Adventure|Children|Fantasy                      jumanji   
2                               Comedy|Romance             grumpier old men   
3                         Comedy|Drama|Romance            waiting to exhale   
4                                       Comedy  father of the bride part ii   

   wikipedia_movie_id freebase_movie_id                   movie_name  \
0               53085          /m/0dyb1                    Toy Story   
1             3700174         /m/09w353                      Jumanji   
2       

In [13]:
# Only take common movies for analysis
movieId_to_keep= set(merged_df[merged_df['movie_name_formatted'].isin(common_movies)]['movieId'])
df_reduced= df[df['movieId'].isin(movieId_to_keep)]
print(df_reduced.shape)
df_more_reduced= df_reduced[df_reduced['userId'] < 80000]
print(df_more_reduced)

(20309328, 4)
          userId  movieId  rating  timestamp
0              1       17     4.0  944249077
1              1       25     1.0  944250228
5              1       34     2.0  943228491
6              1       36     1.0  944249008
8              1      110     3.0  943231119
...          ...      ...     ...        ...
12781363   79999     3114     3.0  974951172
12781364   79999     3362     3.0  974951172
12781365   79999     3424     1.0  974951104
12781366   79999     3504     4.0  974951254
12781367   79999     3916     5.0  974951915

[8124111 rows x 4 columns]


In [14]:
hyper_space= df_more_reduced.pivot(index='userId',columns='movieId',values='rating')

In [15]:
print(hyper_space.shape)
print(hyper_space.head())

(79999, 24489)
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     3.5     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  292031  292035  292037  292051  292055  292057  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     4.0  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     NaN     Na

In [16]:
hyper_space.fillna(0, inplace=True)
print(hyper_space.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     3.5     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  292031  292035  292037  292051  292055  292057  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     4.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [17]:
sparse_matrix_rep= scipy.sparse.csr_matrix(hyper_space.values)
print(sparse_matrix_rep)
print(sparse_matrix_rep.shape)

  (0, 15)	4.0
  (0, 23)	1.0
  (0, 29)	2.0
  (0, 31)	1.0
  (0, 80)	3.0
  (0, 81)	5.0
  (0, 109)	1.0
  (0, 123)	4.0
  (0, 214)	4.0
  (0, 237)	2.0
  (0, 358)	3.0
  (0, 368)	5.0
  (0, 380)	5.0
  (0, 405)	2.0
  (0, 500)	4.0
  (0, 518)	3.0
  (0, 521)	5.0
  (0, 555)	5.0
  (0, 558)	2.0
  (0, 560)	5.0
  (0, 561)	4.0
  (0, 565)	5.0
  (0, 568)	5.0
  (0, 573)	5.0
  (0, 600)	3.0
  :	:
  (79997, 3727)	4.5
  (79997, 3761)	4.0
  (79997, 3875)	4.0
  (79997, 3936)	5.0
  (79997, 4024)	2.5
  (79998, 0)	3.0
  (79998, 133)	5.0
  (79998, 237)	3.0
  (79998, 405)	5.0
  (79998, 698)	1.0
  (79998, 735)	5.0
  (79998, 741)	4.0
  (79998, 747)	3.0
  (79998, 754)	3.0
  (79998, 841)	4.0
  (79998, 1252)	5.0
  (79998, 1409)	5.0
  (79998, 1740)	5.0
  (79998, 1835)	5.0
  (79998, 1898)	5.0
  (79998, 1907)	3.0
  (79998, 2051)	3.0
  (79998, 2089)	1.0
  (79998, 2147)	4.0
  (79998, 2410)	5.0
(79999, 24489)


In [None]:
scipy.sparse.save_npz("Sparse_hyperspace_user_movie.npz", sparse_matrix_rep)

In [18]:
print(sparse_matrix_rep[0])
N_films= sparse_matrix_rep.shape[1]

  (0, 15)	4.0
  (0, 23)	1.0
  (0, 29)	2.0
  (0, 31)	1.0
  (0, 80)	3.0
  (0, 81)	5.0
  (0, 109)	1.0
  (0, 123)	4.0
  (0, 214)	4.0
  (0, 237)	2.0
  (0, 358)	3.0
  (0, 368)	5.0
  (0, 380)	5.0
  (0, 405)	2.0
  (0, 500)	4.0
  (0, 518)	3.0
  (0, 521)	5.0
  (0, 555)	5.0
  (0, 558)	2.0
  (0, 560)	5.0
  (0, 561)	4.0
  (0, 565)	5.0
  (0, 568)	5.0
  (0, 573)	5.0
  (0, 600)	3.0
  :	:
  (0, 1363)	1.0
  (0, 1366)	5.0
  (0, 1408)	1.0
  (0, 1420)	5.0
  (0, 1424)	5.0
  (0, 1462)	5.0
  (0, 1471)	1.0
  (0, 1483)	1.0
  (0, 1533)	3.0
  (0, 1544)	5.0
  (0, 1552)	5.0
  (0, 1593)	5.0
  (0, 1611)	5.0
  (0, 1654)	1.0
  (0, 1662)	1.0
  (0, 1702)	5.0
  (0, 1757)	1.0
  (0, 1764)	4.0
  (0, 1785)	4.0
  (0, 1817)	5.0
  (0, 1825)	5.0
  (0, 1835)	4.0
  (0, 1856)	4.0
  (0, 1885)	2.0
  (0, 1891)	3.0


In [55]:
def special_dot_prod(s1, mu1, s2, mu2) :
    return s1.multiply(s2).sum()- mu2*s1.sum() - mu1*s2.sum() + mu1*mu2
    
def correlation_distance_by_vincent_uwu(sparse_vec1, sparse_vec2) :
    mu1= sparse_vec1.data.mean()/N_films if sparse_vec1.nnz else 0
    mu2= sparse_vec2.data.mean()/N_films if sparse_vec1.nnz else 0
    
    dot_prod= special_dot_prod(sparse_vec1, mu1, sparse_vec2, mu2)
    norm1= np.sqrt(special_dot_prod(sparse_vec1, mu1, sparse_vec1, mu1))
    norm2= np.sqrt(special_dot_prod(sparse_vec2, mu2, sparse_vec2, mu2))
    
    if norm1 == 0 or norm2 == 0 :
        return 1
    else :
        return 1 - dot_prod/(norm1*norm2)

import time
start= time.perf_counter()
print(correlation_distance_by_vincent_uwu(sparse_matrix_rep[0], sparse_matrix_rep[1]))
end= time.perf_counter()
print(end - start)
start= time.perf_counter()
print(sklearn.metrics.pairwise.cosine_distances(sparse_matrix_rep[0], sparse_matrix_rep[1]))
end= time.perf_counter()
print(end - start)

0.9798072303681449
0.003437599982134998
[[0.97972868]]
0.003977500018663704


In [19]:
knn_function= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=30, n_jobs=-1)
knn_function.fit(sparse_matrix_rep)

In [20]:
distances, indices= knn_function.kneighbors(sparse_matrix_rep[1], n_neighbors= 30)
print(indices) #[[    0 33142 28126 35091 27981 14588  8379 17592 16920 35810 30721 37216 7372   371  5548  4371 10453 24242 40471 41932 29154 39551 23217 24154 44740 35181 26485 48094 37339 16318]]

[[    1 69114 15318 61757 11822 74767 34786 19316 38594 57398 14389 25327
   6575 25398 24969 38355 60797 42746 24770 75186  9934 33441 75862 51287
  29568 48219 57635 49022 46049 74239]]


In [None]:
[[    1 69114 15318 61757 11822 74767 34786 19316 38594 57398 14389 25327
   6575 25398 24969 38355 60797 42746 24770 75186  9934 33441 75862 51287
  29568 48219 57635 49022 46049 74239]]

In [None]:
moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices[0, 1:])) & (df_more_reduced['rating']>= 4.5)]
print(moviesid_to_check)
duplicates = moviesid_to_check['movieId'].value_counts()
print(duplicates[duplicates.values >= 2].index.values)
#duplicates= moviesid_to_check[]

In [78]:
def generate_sparse_vector_from_ratings(list_of_imdbid_and_rating, total_nbr_of_movies) :
    bidule= np.array(list_of_imdbid_and_rating)
    imdb_ids= bidule[:, 0]
    ratings= np.array(bidule[:, 1], dtype= float)
    movie_ids= merged_df[merged_df['freebase_movie_id'].isin(imdb_ids)]['movieId'].values
    movie_ids= movie_ids - 1
    return scipy.sparse.csr_matrix((ratings, (np.zeros(len(movie_ids)), movie_ids)), shape= (1, total_nbr_of_movies))
    
list_= [["/m/0dyb1", 4.5], ["/m/09w353", 3], ["/m/0676dr", 2], ["/m/03vny7", 3.5], ["/m/094g2z", 5]]
total_nbr_of_movies= 24489
def weighted_rating(R, v, m, C):
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

def recommand_movies_for_website_user(list_, n_neighbors=30, n_movies= 5) :
    total_nbr_of_movies= 24489
    sparse_vec= generate_sparse_vector_from_ratings(list_, total_nbr_of_movies)
    distances, indices= knn_function.kneighbors(sparse_vec, n_neighbors= n_neighbors)
    indices= indices[0, 1:] + 1
    moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices)) & (df_more_reduced['rating']>= 0)]
    df_temp= moviesid_to_check.groupby('movieId')
    averages= df_temp['rating'].mean()
    C= averages.mean()
    number_of_votes= df_temp['userId'].count()
    m= number_of_votes.quantile(0.8)
    scores= weighted_rating(averages, number_of_votes, m , C)
    sorted_scores_id= scores.sort_values(ascending= False).index
    movies_recommanded= merged_df[merged_df['movieId'].isin(sorted_scores_id[:n_movies])]['movie_name_formatted']
    final_recommandation= list(set(movies_recommanded.values))
    print()
    return final_recommandation[:n_movies]

recommand_movies_for_website_user(list_)

['father of the bride part ii',
 'dead man walking',
 'grumpier old men',
 'toy story',
 "mr. holland's opus"]

In [None]:
def weighted_rating(R, v, m, C):
    # Calculation based on the IMDB formula
    return (v/(v+m) * R) + (m/(m+v) * C)

def recommand_movies_for_user(userid_or_list_of_taste, n_neighbors=30, n_movies= 5) :
    userid= userid-1
    distances, indices= knn_function.kneighbors(sparse_matrix_rep[userid], n_neighbors= n_neighbors)
    indices= indices[0, 1:] + 1
    moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices)) & (df_more_reduced['rating']>= 0)]
    df_temp= moviesid_to_check.groupby('movieId')
    averages= df_temp['rating'].mean()
    C= averages.mean()
    number_of_votes= df_temp['userId'].count()
    m= number_of_votes.quantile(0.8)
    scores= weighted_rating(averages, number_of_votes, m , C)
    sorted_scores_id= scores.sort_values(ascending= False).index
    movies_watched_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 0)]['movieId'].values
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name']
    print(find_liked_movies_user(userid))
    n_temp= n_movies
    final_recommandation= []
    while len(final_recommandation) < n_movies :
        movies_recommanded= merged_df[merged_df['movieId'].isin(sorted_scores_id[:n_temp])]['movie_name_formatted']
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
    return final_recommandation[:n_movies]

recommand_movies_for_user(13792)

(1, 24489)
213                             Ed Wood
295         What's Eating Gilbert Grape
300          Ace Ventura: Pet Detective
526                               Fargo
972     One Flew Over the Cuckoo's Nest
981                          Goodfellas
1585                              Rocky
1586                              Rocky
1912                          Elizabeth
2325                    American Beauty
2326                    American Beauty
2422                         Fight Club
2434              Bringing Out the Dead
2451               Being John Malkovich
2497                  Anywhere But Here
2541                           Flawless
2542                           Flawless
2569                  Anna and the King
2585                    Man on the Moon
2594                  Girl, Interrupted
2673                         Hanging Up
2906                          Gladiator
2907                          Gladiator
2995                    Blazing Saddles
3001                         

['pirates of the caribbean: the curse of the black pearl',
 'monty python and the holy grail',
 'indiana jones and the last crusade',
 'back to the future',
 'toy story']

In [24]:
def find_liked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 4)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

def find_disliked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']< 3)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

In [None]:
# Testing this recommandation model by comparing the recommandation with what the user liked and disliked
test_recommand= recommand_movies_for_user(28)
test_liked= find_liked_movies_user(28)
test_disliked= find_disliked_movies_user(28)
print("Number of recommanded movies : ", test_recommand.shape)
print("Number of liked movies : ", test_liked.shape)
print("Number of disliked movies : ", test_disliked.shape)
print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_liked.values))) / test_liked.shape[0])
print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_disliked.values))) / test_disliked.shape[0])

In [None]:
def test_model(ids_to_test, n_neighbors=30, n_duplicates= 8, print_inter= False) :
    scores_positive= []
    scores_negative= []
    count= 0
    for i in ids_to_test :
        test_recommand= recommand_movies_for_user(i, n_neighbors, n_duplicates)
        test_liked= find_liked_movies_user(i)
        test_disliked= find_disliked_movies_user(i)
        if print_inter :
            print("Number of recommanded movies : ", test_recommand.shape)
            print("Number of liked movies : ", test_liked.shape)
            print("Number of disliked movies : ", test_disliked.shape)
        nbr_movies_liked_recommanded= len(set(test_recommand.values).intersection(set(test_liked.values)))
        if test_liked.shape[0] != 0 :
            proportion_liked_recommanded=  nbr_movies_liked_recommanded / test_liked.shape[0]
        else :
            proportion_liked_recommanded = 0

        if print_inter :
            print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", proportion_liked_recommanded)
        
        nbr_movies_disliked_recommanded= len(set(test_recommand.values).intersection(set(test_disliked.values)))
        if test_disliked.shape[0] != 0 :
            proportion_disliked_recommanded= nbr_movies_disliked_recommanded / test_disliked.shape[0]
        else :
            proportion_disliked_recommanded= 0

        if print_inter :
            print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", proportion_disliked_recommanded)
        
        if test_recommand.shape[0] != 0 :
            scores_positive.append(nbr_movies_liked_recommanded/test_recommand.shape[0])
            scores_negative.append(nbr_movies_disliked_recommanded/test_recommand.shape[0])
        else :
            scores_positive.append(0)
            scores_negative.append(1)

        count+=1
        if count%50 == 0 :
            print(f"{count} users have been tested")
    return scores_positive, scores_negative

In [None]:
ids_to_test= []
ids_to_test= range(1000)

In [None]:
scores_pos, scores_neg= test_model(ids_to_test)

In [None]:
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

In [None]:
def parameter_scanning(values_neighbors, values_duplicates) :
    ids_to_test= np.array(range(1, 49998), dtype= int)
    values_pos= 0
    values_neg= 1
    best_n_neighbors_pos= values_neighbors[0]
    best_n_duplicates_pos= values_duplicates[0]
    best_n_neighbors_neg= values_neighbors[0]
    best_n_duplicates_neg= values_duplicates[0]
    for n_neighbors in values_neighbors :
        for n_duplicates in values_duplicates :
            np.random.shuffle(ids_to_test)
            ids_to_test_prime= ids_to_test[:100]
            scores_pos, scores_neg= test_model(ids_to_test_prime, n_neighbors, n_duplicates)
            if np.mean(np.array(scores_pos)) > values_pos :
                values_pos= np.mean(np.array(scores_pos))
                best_n_neighbors_pos= n_neighbors
                best_n_duplicates_pos= n_duplicates
            if np.mean(np.array(scores_neg)) < values_neg :
                values_neg= np.mean(np.array(scores_neg))
                best_n_neighbors_neg= n_neighbors
                best_n_duplicates_neg= n_duplicates
            print(f"The couple of parameters ({n_neighbors}, {n_duplicates}) has been tested.")
    print(f"The best parameters for the positive score are : ({best_n_neighbors_pos}, {best_n_duplicates_pos})")
    print(f"The best parameters for the negative score are : ({best_n_neighbors_neg}, {best_n_duplicates_neg})")

In [None]:
parameter_scanning([10,20,30,40,50], [2,3,4,5,6,7,8,9,10])

In [None]:
ids_to_test= range(1000)
scores_pos, scores_neg= test_model(ids_to_test, 40, 2)
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

0.5758076498476176 0.06482582511974802 : (30, 8)
0.6206855745121398 0.10482759500198037 : (30, 10)
0.2068181921610013 0.024991750957941428 : (40, 2)

In [22]:
def movie_recommandation(userid, n_movies= 5) :
    final_recommandation= []
    n_duplicates= 8
    movies_watched_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 0)]['movieId'].values
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name']
    print(find_liked_movies_user(userid))
    while len(final_recommandation) < n_movies and n_duplicates >= 2:
        print(n_duplicates)
        movies_recommanded= recommand_movies_for_user(userid, 30, n_duplicates)
        n_duplicates-= 1
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
    return final_recommandation[:n_movies]

In [25]:
movie_recommandation(78932, 6)

0                            Toy Story
13                           GoldenEye
27                   Leaving Las Vegas
50                     Dead Presidents
82                        Bed of Roses
                     ...              
9354                              Juno
9387              Charlie Wilson's War
9388    Walk Hard: The Dewey Cox Story
9472                 Definitely, Maybe
9759                     Step Brothers
Name: movie_name, Length: 418, dtype: object
8
7
6


['Gladiator',
 'Stand by Me',
 'Shakespeare in Love',
 'Indiana Jones and the Last Crusade',
 'Fight Club',
 'To Kill a Mockingbird']