In [1]:
# Required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from statsmodels.stats import diagnostic
import statsmodels.api as sm
import statsmodels.formula.api as smf
import re
from sklearn.neighbors import NearestNeighbors

In [2]:
data_folder = './MovieSummaries/MovieSummaries/'
#paths to files
plot_summaries_path = data_folder + 'plot_summaries.txt'
movie_metadata_path = data_folder + 'movie.metadata.tsv'
character_metadata_path = data_folder + 'character.metadata.tsv'
role_classification_path = data_folder + 'tvtropes.clusters.txt'

# load the data
# 1. Plot summaries data
plot_summaries_df = pd.read_csv(plot_summaries_path, delimiter='\t', names=['wikipedia_movie_id', 'plot_summary'], 
                                 encoding='utf-8')

# 2. Movie metadata
movie_metadata_df = pd.read_csv(movie_metadata_path, delimiter='\t', names=['wikipedia_movie_id', 'freebase_movie_id', 
                                                                            'movie_name', 'release_date', 'box_office_revenue',
                                                                            'runtime', 'languages', 'countries', 'genres'], 
                                 encoding='utf-8')

# 3. Character metadata
character_metadata_df = pd.read_csv(character_metadata_path, delimiter='\t', names=['wikipedia_movie_id', 'freebase_movie_id', 'release_date', 'character_name', 
                                                                                    'actor_dob', 'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name', 
                                                                                    'actor_age_at_release', 'freebase_character_actor_map_id', 'freebase_character_id', 
                                                                                    'freebase_actor_id'], 
                                     encoding='utf-8')
role_classification_df = pd.read_csv(role_classification_path, delimiter='\t', names= ['role_type', 'poubelle1'])


In [3]:
print(role_classification_df.head())

                 role_type                                          poubelle1
0  absent_minded_professor  {"char": "Professor Philip Brainard", "movie":...
1  absent_minded_professor  {"char": "Professor Keenbean", "movie": "Richi...
2  absent_minded_professor  {"char": "Dr. Reinhardt Lane", "movie": "The S...
3  absent_minded_professor  {"char": "Dr. Harold Medford", "movie": "Them!...
4  absent_minded_professor  {"char": "Daniel Jackson", "movie": "Stargate"...


In [4]:
role_classification_df['character_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"char": "(.+.?)", "movie":', x).group(1) if re.search('"char": "(.+.?)", "movie":', x) else np.nan)

role_classification_df['movie_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"movie": "(.+.?)", "id":', x).group(1) if re.search('"char": "(.+.?)", "id":', x) else np.nan)

role_classification_df['freebase_movie_id']= role_classification_df['poubelle1'].apply(lambda x : re.search('"id": "(.+.?)", "actor":', x).group(1) if re.search('"id": "(.+.?)", "actor":', x) else np.nan)

role_classification_df['actor_name']= role_classification_df['poubelle1'].apply(lambda x : re.search('"actor": "(.+.?)"}', x).group(1) if re.search('"actor": "(.+.?)"}', x) else np.nan)

role_classification_df.drop(columns= 'poubelle1', inplace= True)

In [5]:
print(role_classification_df)

                   role_type                     character_name  \
0    absent_minded_professor          Professor Philip Brainard   
1    absent_minded_professor                 Professor Keenbean   
2    absent_minded_professor                 Dr. Reinhardt Lane   
3    absent_minded_professor                 Dr. Harold Medford   
4    absent_minded_professor                     Daniel Jackson   
..                       ...                                ...   
496                young_gun                        Morgan Earp   
497                young_gun                      Colorado Ryan   
498                young_gun                         Tom Sawyer   
499                young_gun  William H. 'Billy the Kid' Bonney   
500                young_gun                               Jake   

                                movie_name freebase_movie_id       actor_name  
0                                  Flubber         /m/0jy9q0   Robin Williams  
1                              Rich

In [6]:
df = pd.read_csv('./ml-32m/ratings.csv')

In [7]:
movies = pd.read_csv('ml-32m/movies.csv')

In [24]:
print(df)
print(movies.head())

          userId  movieId  rating   timestamp
0              1       17     4.0   944249077
1              1       25     1.0   944250228
2              1       29     2.0   943230976
3              1       30     5.0   944249077
4              1       32     5.0   943228858
...          ...      ...     ...         ...
32000199  200948    79702     4.5  1294412589
32000200  200948    79796     1.0  1287216292
32000201  200948    80350     0.5  1294412671
32000202  200948    80463     3.5  1350423800
32000203  200948    87304     4.5  1350423523

[31966872 rows x 4 columns]
   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                        genres                 title_format  
0  Adventure|Animation|Children|Comedy|F

In [9]:
movie_metadata_df['movie_name_formatted'] = movie_metadata_df['movie_name'].str.lower().str.strip()
movies['title_format'] = movies['title'].str[:-6].str.strip().str.lower()

In [10]:
# calculate the number of common movies between the two datasets

common_movies = set(movie_metadata_df['movie_name_formatted']).intersection(set(movies['title_format']))
print('Number of common movies:', len(common_movies))

# merge the two datasets

merged_df = pd.merge(movies, movie_metadata_df, left_on='title_format', right_on='movie_name_formatted', how='inner')

Number of common movies: 25632


In [11]:
# Remove this user from the dataset, because it has too many ratings
df = df[df['userId'] != 175325]

In [16]:
print(merged_df.head())
print(df.head())
print(df.shape)

   movieId                               title  \
0        1                    Toy Story (1995)   
1        2                      Jumanji (1995)   
2        3             Grumpier Old Men (1995)   
3        4            Waiting to Exhale (1995)   
4        5  Father of the Bride Part II (1995)   

                                      genres_x                 title_format  \
0  Adventure|Animation|Children|Comedy|Fantasy                    toy story   
1                   Adventure|Children|Fantasy                      jumanji   
2                               Comedy|Romance             grumpier old men   
3                         Comedy|Drama|Romance            waiting to exhale   
4                                       Comedy  father of the bride part ii   

   wikipedia_movie_id freebase_movie_id                   movie_name  \
0               53085          /m/0dyb1                    Toy Story   
1             3700174         /m/09w353                      Jumanji   
2       

In [49]:
# Only take common movies for analysis
movieId_to_keep= set(merged_df[merged_df['movie_name_formatted'].isin(common_movies)]['movieId'])
df_reduced= df[df['movieId'].isin(movieId_to_keep)]
print(df_reduced.shape)
df_more_reduced= df_reduced[df_reduced['userId'] < 50000]
print(df_more_reduced)

(20309328, 4)
         userId  movieId  rating   timestamp
0             1       17     4.0   944249077
1             1       25     1.0   944250228
5             1       34     2.0   943228491
6             1       36     1.0   944249008
8             1      110     3.0   943231119
...         ...      ...     ...         ...
7993792   49999   286901     3.0  1686913018
7993796   49999   287397     2.5  1694800199
7993800   49999   287635     2.5  1685997776
7993803   49999   287823     3.0  1688731713
7993813   49999   290091     2.5  1693394863

[5080173 rows x 4 columns]


In [None]:
hyper_space= df_more_reduced.pivot(index='userId',columns='movieId',values='rating')

In [None]:
print(hyper_space.shape)
print(hyper_space.head())

(49999, 21669)
movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     3.5     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   
5           NaN     NaN     NaN     NaN     NaN     NaN     NaN     NaN   

movieId  9       10      ...  292031  292035  292037  292051  292055  292057  \
userId                   ...                                                   
1           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
2           NaN     NaN  ...     NaN     NaN     NaN     NaN     NaN     NaN   
3           NaN     4.0  ...     NaN     NaN     NaN     NaN     NaN     NaN   
4           NaN     NaN  ...     NaN     NaN     NaN     Na

In [52]:
hyper_space.fillna(0, inplace=True)
print(hyper_space.head())

movieId  1       2       3       4       5       6       7       8       \
userId                                                                    
1           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     3.5     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   
5           0.0     0.0     0.0     0.0     0.0     0.0     0.0     0.0   

movieId  9       10      ...  292031  292035  292037  292051  292055  292057  \
userId                   ...                                                   
1           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
2           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
3           0.0     4.0  ...     0.0     0.0     0.0     0.0     0.0     0.0   
4           0.0     0.0  ...     0.0     0.0     0.0     0.0     0.0     0

In [56]:
sparse_matrix_rep= scipy.sparse.csr_matrix(hyper_space.values)
print(sparse_matrix_rep)

  (0, 15)	4.0
  (0, 23)	1.0
  (0, 29)	2.0
  (0, 31)	1.0
  (0, 80)	3.0
  (0, 81)	5.0
  (0, 109)	1.0
  (0, 123)	4.0
  (0, 214)	4.0
  (0, 237)	2.0
  (0, 358)	3.0
  (0, 368)	5.0
  (0, 380)	5.0
  (0, 405)	2.0
  (0, 500)	4.0
  (0, 518)	3.0
  (0, 521)	5.0
  (0, 555)	5.0
  (0, 558)	2.0
  (0, 560)	5.0
  (0, 561)	4.0
  (0, 565)	5.0
  (0, 568)	5.0
  (0, 573)	5.0
  (0, 600)	3.0
  :	:
  (49998, 20760)	4.0
  (49998, 20799)	2.0
  (49998, 20803)	4.0
  (49998, 20988)	3.5
  (49998, 20994)	3.5
  (49998, 20995)	3.0
  (49998, 21135)	3.0
  (49998, 21169)	3.5
  (49998, 21286)	3.0
  (49998, 21289)	3.0
  (49998, 21291)	3.0
  (49998, 21333)	3.5
  (49998, 21381)	3.0
  (49998, 21391)	2.5
  (49998, 21405)	2.5
  (49998, 21426)	2.0
  (49998, 21452)	3.5
  (49998, 21519)	2.0
  (49998, 21527)	3.5
  (49998, 21546)	3.0
  (49998, 21579)	3.0
  (49998, 21585)	2.5
  (49998, 21590)	2.5
  (49998, 21596)	3.0
  (49998, 21640)	2.5


In [265]:
knn_function= NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=30, n_jobs=-1)
knn_function.fit(sparse_matrix_rep)

In [266]:
distances, indices= knn_function.kneighbors(sparse_matrix_rep[0], n_neighbors= 30)
print(indices) #[[    0 33142 28126 35091 27981 14588  8379 17592 16920 35810 30721 37216 7372   371  5548  4371 10453 24242 40471 41932 29154 39551 23217 24154 44740 35181 26485 48094 37339 16318]]

[[    0 33142 28126 35091 27981 14588  8379 17592 16920 35810 30721 37216
   7372   371  5548  4371 10453 24242 40471 41932 29154 39551 23217 24154
  44740 35181 26485 48094 37339 16318]]


In [110]:
moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices[0, 1:])) & (df_more_reduced['rating']>= 4.5)]
print(moviesid_to_check)
duplicates = moviesid_to_check['movieId'].value_counts()
print(duplicates[duplicates.values >= 2].index.values)
#duplicates= moviesid_to_check[]

         userId  movieId  rating   timestamp
1310384    8379      588     5.0  1193288800
1310399    8379     2396     5.0  1193288807
1310400    8379     2485     4.5  1193288483
1310409    8379     4963     4.5  1193288822
1310413    8379     6377     5.0  1193288824
...         ...      ...     ...         ...
5731304   35810     1245     5.0   901379382
5731309   35810     1589     5.0   901379134
5731313   35810     1639     5.0   901378679
5731317   35810     1673     5.0   901378058
5731320   35810     1704     5.0   901378019

[139 rows x 4 columns]
[ 296 2329  480  608 7361 6377  110   36 1704 1673 2395 4226 1060]


In [295]:
def recommand_movies_for_user(userid, n_neighbors=30, n_duplicates= 8) :
    userid= userid-1
    distances, indices= knn_function.kneighbors(sparse_matrix_rep[userid], n_neighbors= n_neighbors)
    indices= indices[0, 1:] + 1
    moviesid_to_check= df_more_reduced[(df_more_reduced['userId'].isin(indices)) & (df_more_reduced['rating']== 5)]
    duplicates= moviesid_to_check['movieId'].value_counts()
    moviesid_to_return= duplicates[duplicates.values >= n_duplicates].index.values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(moviesid_to_return)]['movie_name']
    return names_of_the_moovies

In [291]:
def find_liked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']>= 4)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

def find_disliked_movies_user(userid) :
    movies_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']< 3)]['movieId'].values
    names_of_the_moovies= merged_df[merged_df['movieId'].isin(movies_id)]['movie_name']
    return names_of_the_moovies

In [292]:
# Testing this recommandation model by comparing the recommandation with what the user liked and disliked
test_recommand= recommand_movies_for_user(28)
test_liked= find_liked_movies_user(28)
test_disliked= find_disliked_movies_user(28)
print("Number of recommanded movies : ", test_recommand.shape)
print("Number of liked movies : ", test_liked.shape)
print("Number of disliked movies : ", test_disliked.shape)
print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_liked.values))) / test_liked.shape[0])
print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", len(set(test_recommand.values).intersection(set(test_disliked.values))) / test_disliked.shape[0])

Number of recommanded movies :  (38,)
Number of liked movies :  (1077,)
Number of disliked movies :  (537,)
Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem :  0.029712163416898793
Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem :  0.0


In [250]:
def test_model(ids_to_test, n_neighbors=30, n_duplicates= 8, print_inter= False) :
    scores_positive= []
    scores_negative= []
    count= 0
    for i in ids_to_test :
        test_recommand= recommand_movies_for_user(i, n_neighbors, n_duplicates)
        test_liked= find_liked_movies_user(i)
        test_disliked= find_disliked_movies_user(i)
        if print_inter :
            print("Number of recommanded movies : ", test_recommand.shape)
            print("Number of liked movies : ", test_liked.shape)
            print("Number of disliked movies : ", test_disliked.shape)
        nbr_movies_liked_recommanded= len(set(test_recommand.values).intersection(set(test_liked.values)))
        if test_liked.shape[0] != 0 :
            proportion_liked_recommanded=  nbr_movies_liked_recommanded / test_liked.shape[0]
        else :
            proportion_liked_recommanded = 0

        if print_inter :
            print("Percentage of movies liked by user 1 (rating >= 4) that are recommanded by the sytem : ", proportion_liked_recommanded)
        
        nbr_movies_disliked_recommanded= len(set(test_recommand.values).intersection(set(test_disliked.values)))
        if test_disliked.shape[0] != 0 :
            proportion_disliked_recommanded= nbr_movies_disliked_recommanded / test_disliked.shape[0]
        else :
            proportion_disliked_recommanded= 0

        if print_inter :
            print("Percentage of movies disliked by user 1 (rating < 3) that are recommanded by the sytem : ", proportion_disliked_recommanded)
        
        if test_recommand.shape[0] != 0 :
            scores_positive.append(nbr_movies_liked_recommanded/test_recommand.shape[0])
            scores_negative.append(nbr_movies_disliked_recommanded/test_recommand.shape[0])
        else :
            scores_positive.append(0)
            scores_negative.append(1)

        count+=1
        if count%50 == 0 :
            print(f"{count} users have been tested")
    return scores_positive, scores_negative

In [244]:
ids_to_test= []
ids_to_test= range(1000)

In [232]:
scores_pos, scores_neg= test_model(ids_to_test)

User 0 has been tested.
User 50 has been tested.
User 100 has been tested.
User 150 has been tested.
User 200 has been tested.
User 250 has been tested.
User 300 has been tested.
User 350 has been tested.
User 400 has been tested.
User 450 has been tested.
User 500 has been tested.
User 550 has been tested.
User 600 has been tested.
User 650 has been tested.
User 700 has been tested.
User 750 has been tested.
User 800 has been tested.
User 850 has been tested.
User 900 has been tested.
User 950 has been tested.


In [233]:
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

0.6048399683273293 0.01767418605015549


In [253]:
def parameter_scanning(values_neighbors, values_duplicates) :
    ids_to_test= np.array(range(1, 49998), dtype= int)
    values_pos= 0
    values_neg= 1
    best_n_neighbors_pos= values_neighbors[0]
    best_n_duplicates_pos= values_duplicates[0]
    best_n_neighbors_neg= values_neighbors[0]
    best_n_duplicates_neg= values_duplicates[0]
    for n_neighbors in values_neighbors :
        for n_duplicates in values_duplicates :
            np.random.shuffle(ids_to_test)
            ids_to_test_prime= ids_to_test[:100]
            scores_pos, scores_neg= test_model(ids_to_test_prime, n_neighbors, n_duplicates)
            if np.mean(np.array(scores_pos)) > values_pos :
                values_pos= np.mean(np.array(scores_pos))
                best_n_neighbors_pos= n_neighbors
                best_n_duplicates_pos= n_duplicates
            if np.mean(np.array(scores_neg)) < values_neg :
                values_neg= np.mean(np.array(scores_neg))
                best_n_neighbors_neg= n_neighbors
                best_n_duplicates_neg= n_duplicates
            print(f"The couple of parameters ({n_neighbors}, {n_duplicates}) has been tested.")
    print(f"The best parameters for the positive score are : ({best_n_neighbors_pos}, {best_n_duplicates_pos})")
    print(f"The best parameters for the negative score are : ({best_n_neighbors_neg}, {best_n_duplicates_neg})")

In [254]:
parameter_scanning([10,20,30,40,50], [2,3,4,5,6,7,8,9,10])

50 users have been tested
100 users have been tested
the couple of parameters (10, 2) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 3) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 4) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 5) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 6) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 7) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 8) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 9) has been tested.
50 users have been tested
100 users have been tested
the couple of parameters (10, 10) has been tested.
50 users have been tested
100 users have been tested
the couple of param

In [259]:
ids_to_test= range(1000)
scores_pos, scores_neg= test_model(ids_to_test, 40, 2)
scores_pos= np.array(scores_pos)
scores_neg= np.array(scores_neg)
print(np.mean(scores_pos), np.mean(scores_neg))

50 users have been tested
100 users have been tested
150 users have been tested
200 users have been tested
250 users have been tested
300 users have been tested
350 users have been tested
400 users have been tested
450 users have been tested
500 users have been tested
550 users have been tested
600 users have been tested
650 users have been tested
700 users have been tested
750 users have been tested
800 users have been tested
850 users have been tested
900 users have been tested
950 users have been tested
1000 users have been tested
0.2068181921610013 0.024991750957941428


0.5758076498476176 0.06482582511974802 : (30, 8)
0.6206855745121398 0.10482759500198037 : (30, 10)
0.2068181921610013 0.024991750957941428 : (40, 2)

In [308]:
def movie_recommandation(userid, n_movies= 5) :
    final_recommandation= []
    n_duplicates= 8
    movies_watched_id= df_more_reduced[(df_more_reduced['userId']== userid) & (df_more_reduced['rating']> 0)]['movieId'].values
    movies_watched=  merged_df[merged_df['movieId'].isin(movies_watched_id)]['movie_name']
    print(find_liked_movies_user(userid)[:30])
    while len(final_recommandation) < 5 and n_duplicates >= 2:
        print(n_duplicates)
        movies_recommanded= recommand_movies_for_user(userid, 30, n_duplicates)
        n_duplicates-= 1
        final_recommandation= list(set(movies_recommanded.values) - set(movies_recommanded.values).intersection(set(movies_watched.values)))
    return final_recommandation[:5]

In [319]:
movie_recommandation(23754, 5)

9                   Sabrina
10                  Sabrina
19    Sense and Sensibility
27        Leaving Las Vegas
Name: movie_name, dtype: object
8
7
6
5
4


['Ransom', 'Phenomenon', 'Heat', 'Fargo', 'Toy Story']