# Collaborative Recommender System user Id 5 

# Prepare data

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Data Cleaning

In [4]:
#Convert to lower case
movies['title'] = movies.title.str.lower()

In [5]:
#split title name into name and year
movies[['movie_name', 'year']] = movies['title'].str.split(' \(', 1, expand=True)

In [6]:
movies.head(3)

Unnamed: 0,movieId,title,genres,movie_name,year
0,1,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,toy story,1995)
1,2,jumanji (1995),Adventure|Children|Fantasy,jumanji,1995)
2,3,grumpier old men (1995),Comedy|Romance,grumpier old men,1995)


In [7]:
#Replace Special characters
movies['year'] = movies['year'].str.replace("\)", "", regex=True) 
movies['year'] 

0       1995
1       1995
2       1995
3       1995
4       1995
        ... 
9737    2017
9738    2017
9739    2017
9740    2018
9741    1991
Name: year, Length: 9742, dtype: object

In [8]:
movies['movie_name'] = movies['movie_name'].str.replace('[^a-zA-Z0-9\n\.]', ' ', regex=True)

In [9]:
movies['genres'] = movies['genres'].str.replace("[^a-zA-Z0-9_]", ",", regex=True)
movies['genres']

0       Adventure,Animation,Children,Comedy,Fantasy
1                        Adventure,Children,Fantasy
2                                    Comedy,Romance
3                              Comedy,Drama,Romance
4                                            Comedy
                           ...                     
9737                Action,Animation,Comedy,Fantasy
9738                       Animation,Comedy,Fantasy
9739                                          Drama
9740                               Action,Animation
9741                                         Comedy
Name: genres, Length: 9742, dtype: object

In [10]:
#Ratings Df whats in it
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [11]:
#Merge the two Dataframes
movies_ratings_df = pd.merge(movies,ratings, on = 'movieId')
movies_ratings_df

Unnamed: 0,movieId,title,genres,movie_name,year,userId,rating,timestamp
0,1,toy story (1995),"Adventure,Animation,Children,Comedy,Fantasy",toy story,1995,1,4.0,964982703
1,1,toy story (1995),"Adventure,Animation,Children,Comedy,Fantasy",toy story,1995,5,4.0,847434962
2,1,toy story (1995),"Adventure,Animation,Children,Comedy,Fantasy",toy story,1995,7,4.5,1106635946
3,1,toy story (1995),"Adventure,Animation,Children,Comedy,Fantasy",toy story,1995,15,2.5,1510577970
4,1,toy story (1995),"Adventure,Animation,Children,Comedy,Fantasy",toy story,1995,17,4.5,1305696483
...,...,...,...,...,...,...,...,...
100831,193581,black butler: book of the atlantic (2017),"Action,Animation,Comedy,Fantasy",black butler book of the atlantic,2017,184,4.0,1537109082
100832,193583,no game no life: zero (2017),"Animation,Comedy,Fantasy",no game no life zero,2017,184,3.5,1537109545
100833,193585,flint (2017),Drama,flint,2017,184,3.5,1537109805
100834,193587,bungo stray dogs: dead apple (2018),"Action,Animation",bungo stray dogs dead apple,2018,184,3.5,1537110021


In [12]:
#Keep relevant columns
movies_ratings_df = movies_ratings_df[['movieId','userId','movie_name','year','genres', 'rating']]
movies_ratings_df

Unnamed: 0,movieId,userId,movie_name,year,genres,rating
0,1,1,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.0
1,1,5,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.0
2,1,7,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.5
3,1,15,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",2.5
4,1,17,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.5
...,...,...,...,...,...,...
100831,193581,184,black butler book of the atlantic,2017,"Action,Animation,Comedy,Fantasy",4.0
100832,193583,184,no game no life zero,2017,"Animation,Comedy,Fantasy",3.5
100833,193585,184,flint,2017,Drama,3.5
100834,193587,184,bungo stray dogs dead apple,2018,"Action,Animation",3.5


In [13]:
# Aggregate by movie
avg_ratings = movies_ratings_df.groupby('movie_name').agg(avg_rating = ('rating', 'mean'),
                                                number_of_ratings = ('rating', 'count')).reset_index()

# Keep the movies with over 100 ratings
avg_ratings100 = avg_ratings[avg_ratings['number_of_ratings']>100]
avg_ratings100                     

Unnamed: 0,movie_name,avg_rating,number_of_ratings
74,2001 a space odyssey,3.894495,109
205,ace ventura pet detective,3.040373,161
295,aladdin,3.792350,183
319,alien,3.969178,146
326,aliens,3.964286,126
...,...,...,...
8815,usual suspects the,4.237745,204
8939,wall e,4.057692,104
8990,waterworld,2.913043,115
9174,willy wonka the chocolate factory,3.873950,119


In [14]:
# Check popular movies
avg_ratings100.sort_values(by='number_of_ratings', ascending=False).head()

Unnamed: 0,movie_name,avg_rating,number_of_ratings
3060,forrest gump,4.164134,329
7325,shawshank redemption the,4.429022,317
6623,pulp fiction,4.197068,307
7412,silence of the lambs the,4.16129,279
5321,matrix the,4.192446,278


In [15]:
# Merge data
merged_df = pd.merge(movies_ratings_df, avg_ratings100[['movie_name']], on='movie_name', how='inner')
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19952 entries, 0 to 19951
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movieId     19952 non-null  int64  
 1   userId      19952 non-null  int64  
 2   movie_name  19952 non-null  object 
 3   year        19952 non-null  object 
 4   genres      19952 non-null  object 
 5   rating      19952 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 1.1+ MB


In [16]:
merged_df

Unnamed: 0,movieId,userId,movie_name,year,genres,rating
0,1,1,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.0
1,1,5,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.0
2,1,7,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.5
3,1,15,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",2.5
4,1,17,toy story,1995,"Adventure,Animation,Children,Comedy,Fantasy",4.5
...,...,...,...,...,...,...
19947,79132,596,inception,2010,"Action,Crime,Drama,Mystery,Sci,Fi,Thriller,IMAX",4.0
19948,79132,598,inception,2010,"Action,Crime,Drama,Mystery,Sci,Fi,Thriller,IMAX",5.0
19949,79132,599,inception,2010,"Action,Crime,Drama,Mystery,Sci,Fi,Thriller,IMAX",3.0
19950,79132,601,inception,2010,"Action,Crime,Drama,Mystery,Sci,Fi,Thriller,IMAX",5.0


In [17]:
# Pivot the data to get a user-item matrix
movies_ratings_pivot = pd.pivot_table(merged_df, index='userId', columns='movie_name', values='rating', fill_value = 0)


In [18]:
movies_ratings_pivot

movie_name,2001 a space odyssey,ace ventura pet detective,aladdin,alien,aliens,amelie,american beauty,american history x,american pie,apocalypse now,...,true lies,truman show the,twelve monkeys,twister,up,usual suspects the,wall e,waterworld,willy wonka the chocolate factory,x men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,4.0,0.0,0.0,5.0,5.0,0.0,4.0,...,0.0,0.0,0.0,3.0,0.0,5.0,0.0,0.0,5.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,4.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
5,0.0,3.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,0.0,0.0,4.0,3.5,4.5,4.5,4.0,1.0,4.5,...,0.0,4.5,4.0,0.0,0.0,4.5,4.0,0.0,0.0,0.0
607,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,3.0,0.0,3.0
608,3.0,3.5,3.0,4.0,4.5,0.0,5.0,4.0,2.5,3.0,...,3.0,4.5,3.5,3.0,0.0,4.5,0.0,3.0,3.5,4.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


In [19]:
# Calculate the cosine similarity matrix of users
similarity_matrix = cosine_similarity(movies_ratings_pivot)

In [20]:
# Convert it to a DataFrame
similarity_matrix_df = pd.DataFrame(similarity_matrix, index=movies_ratings_pivot.index, columns=movies_ratings_pivot.index)
similarity_matrix_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.055674,0.148282,0.497143,0.314129,0.356333,0.375438,0.289184,0.160039,0.093287,...,0.255794,0.412053,0.648917,0.183075,0.300662,0.588922,0.576125,0.624547,0.244138,0.604437
2,0.055674,1.000000,0.000000,0.019890,0.046363,0.050349,0.035471,0.061566,0.000000,0.227727,...,0.310043,0.051480,0.084093,0.000000,0.000000,0.189484,0.048930,0.157831,0.073012,0.222941
3,0.148282,0.000000,1.000000,0.000000,0.274411,0.107280,0.000000,0.218635,0.000000,0.000000,...,0.177109,0.182818,0.091108,0.000000,0.000000,0.131511,0.173762,0.100763,0.000000,0.080513
4,0.497143,0.019890,0.000000,1.000000,0.198991,0.230690,0.306955,0.146191,0.061477,0.080363,...,0.187644,0.254814,0.531973,0.139434,0.264859,0.377129,0.355105,0.469134,0.073255,0.442488
5,0.314129,0.046363,0.274411,0.198991,1.000000,0.624101,0.210509,0.614356,0.000000,0.115703,...,0.147746,0.622074,0.286678,0.498556,0.387118,0.253337,0.358570,0.388487,0.466030,0.261967
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.588922,0.189484,0.131511,0.377129,0.253337,0.286401,0.563569,0.311680,0.260981,0.300137,...,0.569716,0.345732,0.671373,0.213851,0.373437,1.000000,0.447433,0.847175,0.219934,0.818628
607,0.576125,0.048930,0.173762,0.355105,0.358570,0.444902,0.386105,0.363188,0.045372,0.025585,...,0.214808,0.454901,0.553561,0.308719,0.279023,0.447433,1.000000,0.598579,0.360426,0.565600
608,0.624547,0.157831,0.100763,0.469134,0.388487,0.464821,0.572239,0.443358,0.181983,0.303049,...,0.517867,0.488162,0.723425,0.356527,0.418503,0.847175,0.598579,1.000000,0.323308,0.815615
609,0.244138,0.073012,0.000000,0.073255,0.466030,0.593401,0.192289,0.586720,0.000000,0.078089,...,0.073474,0.583037,0.175201,0.432681,0.283740,0.219934,0.360426,0.323308,1.000000,0.217702


In [21]:
select_userid = 5
similarities = similarity_matrix_df[select_userid].drop(select_userid)
weights = similarities/similarities.sum()

In [22]:
similarities

userId
1      0.314129
2      0.046363
3      0.274411
4      0.198991
6      0.624101
         ...   
606    0.253337
607    0.358570
608    0.388487
609    0.466030
610    0.261967
Name: 5, Length: 596, dtype: float64

In [23]:
weights

userId
1      0.002174
2      0.000321
3      0.001899
4      0.001377
6      0.004318
         ...   
606    0.001753
607    0.002481
608    0.002688
609    0.003225
610    0.001813
Name: 5, Length: 596, dtype: float64

In [24]:
n =10 #num of silimar users
user_similarity_threshold =  0.5
# Get top n similar users
top_similar_users = similarity_matrix_df[similarity_matrix_df[select_userid]>user_similarity_threshold][select_userid].sort_values(ascending=False)[:n]

In [25]:
# Print out top n similar users
print(f'The top similar users for user {select_userid} are', top_similar_users)

The top similar users for user 5 are userId
5      1.000000
470    0.703033
117    0.684129
40     0.681779
142    0.672452
109    0.661853
565    0.655640
455    0.654090
38     0.652586
229    0.651689
Name: 5, dtype: float64


In [26]:
watched_movies = movies_ratings_pivot.loc[movies_ratings_pivot.index== select_userid, movies_ratings_pivot.loc[select_userid,:]>0]

In [27]:
watched_movies
#movies watched by user 5 and his ratings

movie_name,ace ventura pet detective,aladdin,apollo 13,babe,batman,batman forever,beauty and the beast,braveheart,clear and present danger,clueless,...,mask the,pretty woman,pulp fiction,schindler s list,shawshank redemption the,stargate,terminator 2 judgment day,toy story,true lies,usual suspects the
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,3.0,4.0,3.0,4.0,3.0,3.0,5.0,4.0,3.0,3.0,...,4.0,3.0,5.0,5.0,3.0,2.0,3.0,4.0,2.0,4.0


In [28]:
watched_movies.columns

Index(['ace ventura  pet detective', 'aladdin', 'apollo 13', 'babe', 'batman',
       'batman forever', 'beauty and the beast', 'braveheart',
       'clear and present danger', 'clueless', 'dances with wolves', 'fargo',
       'four weddings and a funeral', 'fugitive  the',
       'interview with the vampire  the vampire chronicles', 'lion king  the',
       'mask  the', 'pretty woman', 'pulp fiction', 'schindler s list',
       'shawshank redemption  the', 'stargate', 'terminator 2  judgment day',
       'toy story', 'true lies', 'usual suspects  the'],
      dtype='object', name='movie_name')

In [29]:
# Movies that similar users watched.
similar_user_movies = movies_ratings_pivot[movies_ratings_pivot.index.isin(top_similar_users.index)].replace(0, np.nan).dropna(axis=1, how='all')

#similar_user_movies = movies_ratings_pivot[movies_ratings_pivot.index.isin(top similar_users.index)].dropna(axis=1, how='all')
similar_user_movies

movie_name,ace ventura pet detective,aladdin,apollo 13,babe,batman,batman forever,beauty and the beast,braveheart,clear and present danger,clerks,...,taxi driver,terminator 2 judgment day,toy story,trainspotting,true lies,twelve monkeys,twister,usual suspects the,waterworld,willy wonka the chocolate factory
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5,3.0,4.0,3.0,4.0,3.0,3.0,5.0,4.0,3.0,,...,,3.0,4.0,,2.0,,,4.0,,
38,,3.0,4.0,,4.0,3.0,4.0,5.0,,5.0,...,,,,,3.0,,2.0,5.0,1.0,
40,4.0,4.0,5.0,5.0,4.0,3.0,3.0,4.0,,5.0,...,5.0,,5.0,,3.0,,,,2.0,
109,2.0,3.0,3.0,3.0,4.0,3.0,,5.0,3.0,3.0,...,,3.0,,4.0,3.0,4.0,,4.0,,
117,2.0,4.0,4.0,3.0,3.0,3.0,4.0,5.0,3.0,,...,,4.0,,3.0,4.0,3.0,2.0,4.0,,
142,4.0,5.0,5.0,4.0,3.0,,4.0,,3.0,,...,,3.0,,,4.0,,,5.0,,
229,3.0,,5.0,5.0,3.0,3.0,,5.0,4.0,,...,,5.0,5.0,,4.0,,4.0,,,
455,3.0,3.0,3.0,4.0,3.0,,3.0,4.0,4.0,,...,3.0,3.0,,,3.0,,,3.0,,
470,3.0,3.0,3.0,4.0,3.0,4.0,3.0,5.0,4.0,,...,,5.0,4.0,,5.0,3.0,4.0,3.0,3.0,4.0
565,3.0,3.0,3.0,4.0,2.0,2.0,,,,,...,,,,,4.0,5.0,,5.0,,


In [30]:
# Remove the select_userid's watched movies from the movie list
similar_user_movies = similar_user_movies.drop(select_userid, axis=0)


In [31]:
similar_user_movies

movie_name,ace ventura pet detective,aladdin,apollo 13,babe,batman,batman forever,beauty and the beast,braveheart,clear and present danger,clerks,...,taxi driver,terminator 2 judgment day,toy story,trainspotting,true lies,twelve monkeys,twister,usual suspects the,waterworld,willy wonka the chocolate factory
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
38,,3.0,4.0,,4.0,3.0,4.0,5.0,,5.0,...,,,,,3.0,,2.0,5.0,1.0,
40,4.0,4.0,5.0,5.0,4.0,3.0,3.0,4.0,,5.0,...,5.0,,5.0,,3.0,,,,2.0,
109,2.0,3.0,3.0,3.0,4.0,3.0,,5.0,3.0,3.0,...,,3.0,,4.0,3.0,4.0,,4.0,,
117,2.0,4.0,4.0,3.0,3.0,3.0,4.0,5.0,3.0,,...,,4.0,,3.0,4.0,3.0,2.0,4.0,,
142,4.0,5.0,5.0,4.0,3.0,,4.0,,3.0,,...,,3.0,,,4.0,,,5.0,,
229,3.0,,5.0,5.0,3.0,3.0,,5.0,4.0,,...,,5.0,5.0,,4.0,,4.0,,,
455,3.0,3.0,3.0,4.0,3.0,,3.0,4.0,4.0,,...,3.0,3.0,,,3.0,,,3.0,,
470,3.0,3.0,3.0,4.0,3.0,4.0,3.0,5.0,4.0,,...,,5.0,4.0,,5.0,3.0,4.0,3.0,3.0,4.0
565,3.0,3.0,3.0,4.0,2.0,2.0,,,,,...,,,,,4.0,5.0,,5.0,,


In [32]:
similar_user_movies.columns

Index(['ace ventura  pet detective', 'aladdin', 'apollo 13', 'babe', 'batman',
       'batman forever', 'beauty and the beast', 'braveheart',
       'clear and present danger', 'clerks', 'cliffhanger', 'clueless',
       'crimson tide', 'dances with wolves', 'die hard',
       'die hard  with a vengeance', 'dumb   dumber', 'fargo', 'firm  the',
       'forrest gump', 'four weddings and a funeral', 'fugitive  the', 'ghost',
       'goldeneye', 'heat', 'home alone', 'independence day',
       'interview with the vampire  the vampire chronicles', 'jumanji',
       'jurassic park', 'l on  the professional', 'lion king  the',
       'mask  the', 'mission  impossible', 'mrs. doubtfire', 'net  the',
       'outbreak', 'pretty woman', 'pulp fiction', 'rock  the',
       'schindler s list', 'seven', 'shawshank redemption  the',
       'silence of the lambs  the', 'sleepless in seattle', 'speed',
       'star trek  generations', 'stargate', 'taxi driver',
       'terminator 2  judgment day', 'to

In [33]:
similar_movies_not_watched =similar_user_movies.columns[~similar_user_movies.columns.isin(watched_movies.columns)]

In [34]:
similar_movies_not_watched

Index(['clerks', 'cliffhanger', 'crimson tide', 'die hard',
       'die hard  with a vengeance', 'dumb   dumber', 'firm  the',
       'forrest gump', 'ghost', 'goldeneye', 'heat', 'home alone',
       'independence day', 'jumanji', 'jurassic park',
       'l on  the professional', 'mission  impossible', 'mrs. doubtfire',
       'net  the', 'outbreak', 'rock  the', 'seven',
       'silence of the lambs  the', 'sleepless in seattle', 'speed',
       'star trek  generations', 'taxi driver', 'trainspotting',
       'twelve monkeys', 'twister', 'waterworld',
       'willy wonka   the chocolate factory'],
      dtype='object', name='movie_name')

In [35]:
print(len(similar_movies_not_watched))
print(similar_movies_not_watched)

32
Index(['clerks', 'cliffhanger', 'crimson tide', 'die hard',
       'die hard  with a vengeance', 'dumb   dumber', 'firm  the',
       'forrest gump', 'ghost', 'goldeneye', 'heat', 'home alone',
       'independence day', 'jumanji', 'jurassic park',
       'l on  the professional', 'mission  impossible', 'mrs. doubtfire',
       'net  the', 'outbreak', 'rock  the', 'seven',
       'silence of the lambs  the', 'sleepless in seattle', 'speed',
       'star trek  generations', 'taxi driver', 'trainspotting',
       'twelve monkeys', 'twister', 'waterworld',
       'willy wonka   the chocolate factory'],
      dtype='object', name='movie_name')


# Recommender

In [36]:
#movies not watched by select_userid 5 
#not_watched_movies = movies_ratings_pivot.loc[movies_ratings_pivot.index!= select_userid, movies_ratings_pivot.loc[select_userid,:]==5]
not_watched_movies = movies_ratings_pivot.loc[movies_ratings_pivot.index != select_userid, movies_ratings_pivot.loc[select_userid,:]== 0]


In [37]:
not_watched_movies

movie_name,2001 a space odyssey,alien,aliens,amelie,american beauty,american history x,american pie,apocalypse now,austin powers the spy who shagged me,back to the future,...,titanic,trainspotting,truman show the,twelve monkeys,twister,up,wall e,waterworld,willy wonka the chocolate factory,x men
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,4.0,0.0,0.0,5.0,5.0,0.0,4.0,0.0,5.0,...,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,5.0,5.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,...,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,5.0,0.0,0.0,3.0,3.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,5.0,4.0,3.5,4.5,4.5,4.0,1.0,4.5,0.0,3.5,...,4.0,4.0,4.5,4.0,0.0,0.0,4.0,0.0,0.0,0.0
607,0.0,3.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,3.0,...,0.0,0.0,0.0,0.0,5.0,0.0,0.0,3.0,0.0,3.0
608,3.0,4.0,4.5,0.0,5.0,4.0,2.5,3.0,3.0,2.0,...,2.0,3.0,4.5,3.5,3.0,0.0,0.0,3.0,3.5,4.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0


In [38]:
weighted_averages = pd.DataFrame(not_watched_movies.T.dot(weights.to_numpy()), columns=["weighted_avg"])
#weighted_averages = pd.DataFrame(not_watched_movies.T.dot(weights.to_numpy()), columns=["predicted_rating"])

In [39]:
#weighted_averages = pd.DataFrame(not_watched_pivot.T.dot(weights), columns=["predicted_rating"])


In [58]:
weighted_averages

Unnamed: 0_level_0,weighted_avg
movie_name,Unnamed: 1_level_1
2001 a space odyssey,0.774267
alien,1.014306
aliens,0.907501
amelie,0.812702
american beauty,1.368601
...,...
up,0.576634
wall e,0.601733
waterworld,0.977729
willy wonka the chocolate factory,0.908833


In [57]:
weighted_averages.sort_values(by='weighted_avg', ascending=False).head(20)

Unnamed: 0_level_0,weighted_avg
movie_name,Unnamed: 1_level_1
forrest gump,2.953774
silence of the lambs the,2.51756
jurassic park,2.21177
seven,1.859381
matrix the,1.82197
star wars episode iv a new hope,1.781582
speed,1.601298
twelve monkeys,1.557773
star wars episode v the empire strikes back,1.554452
fight club,1.469447


In [42]:
# Average rating for the select_userid
aver_rating = movies_ratings_pivot[movies_ratings_pivot.index == select_userid].T.mean()[select_userid]
aver_rating

0.6666666666666666

# Similarity Score

In [61]:
#initialize an empty dictionary called ratings_score to store the predicted ratings for each movie.
ratings_score = {}

# loop through each movie in the dataset.
for i in similar_user_movies.columns:
  # For each movie, it retrieves the ratings given by similar users from the similar_user_movies DataFrame. 
#The ratings for the movie are stored in the movie_rating variable.
  movie_rating = similar_user_movies[i] 
  # Create a variable to store the score
  total = 0  
  # Create a variable to store the number of scores
  count = 0
  # Loop through similar users
  for u in top_similar_users.index:
    # If the movie has rating
    # If the movie has a rating
    if u in movie_rating.index and not pd.isna(movie_rating[u]):
  # Score is the sum of user similarity score multiply by the movie rating
        score = top_similar_users[u] * movie_rating[u] 
      #Add the score to the total score for the movie so far
        total+= score
      # Add 1 to the count
        count +=1
# Get the average score for the item
  
   # Get the average score for the item
  ratings_score[i] = total / count   # Adjusting the weights

# Convert dictionary to pandas dataframe
ratings_score = pd.DataFrame(ratings_score.items(), columns=['movie_name', 'similarity_score'])

# Remove all movies already watched by select_userid from ratings_score dataframe.
#watched_movies = movies_ratings_pivot.loc[movies_ratings_pivot.index== select_userid, movies_ratings_pivot.loc[select_userid,:]>0]
watched_movies_list = list(watched_movies.columns)
ratings_score = ratings_score[~ratings_score['movie_name'].isin(watched_movies_list)]
    
# Sort the movies by score
ranked_ratings_score = ratings_score.sort_values(by='similarity_score', ascending=False)



In [62]:
# Select top movies
ranked_ratings_score

Unnamed: 0,movie_name,similarity_score
9,clerks,2.885794
43,silence of the lambs the,2.877547
19,forrest gump,2.838535
57,willy wonka the chocolate factory,2.812134
22,ghost,2.692552
48,taxi driver,2.685582
53,twelve monkeys,2.521774
12,crimson tide,2.456002
44,sleepless in seattle,2.449408
41,seven,2.428709


In [64]:
# normalize the similarity scores
ranked_ratings_score['normalized_scores'] = ranked_ratings_score['similarity_score'] / ranked_ratings_score['similarity_score'].max()


In [65]:
ranked_ratings_score

Unnamed: 0,movie_name,similarity_score,normalized_scores
9,clerks,2.885794,1.0
43,silence of the lambs the,2.877547,0.997142
19,forrest gump,2.838535,0.983624
57,willy wonka the chocolate factory,2.812134,0.974475
22,ghost,2.692552,0.933037
48,taxi driver,2.685582,0.930621
53,twelve monkeys,2.521774,0.873858
12,crimson tide,2.456002,0.851066
44,sleepless in seattle,2.449408,0.848781
41,seven,2.428709,0.841608
