In [99]:
# import the "ratings.csv" file and see what it contains
import numpy as np
import pandas as pd
# Directory holding the CSV files; the trailing slash matters because the
# file paths are built by plain string concatenation.
base = "Movies Data/"
ratings_path = base + "ratings.csv"
ratings_data = pd.read_csv(ratings_path)
# Preview the first five rows.
ratings_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [100]:
# Each row in the dataset corresponds to one rating. 
# The userId column contains the ID of the user who left the rating. 
# The movieId column contains the Id of the movie, the rating column contains the rating left by the user. 
# Ratings range from 0.5 to 5 and include half-star values such as 2.5 and 4.5.
# And finally, the timestamp refers to the time at which the user left the rating.

In [101]:
# This dataset contains the IDs of the movies but not their titles. 
# We'll need movie names for the movies we're recommending. 
# The movie names are stored in the "movies.csv" file.

In [102]:
# Load the movieId -> title/genres lookup table and preview its first rows.
movies_path = base + "movies.csv"
movie_names = pd.read_csv(movies_path)
movie_names.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [103]:
# We need a dataset that contains the userId, movie title, and its ratings. 
# We have this information in two different dataframe objects: "ratings_data" and "movie_names". 
# To get the desired information in a single dataframe, we can merge the two dataframe objects
# on the movieId column, since it is common to both dataframes.

In [104]:
# Attach the title and genres to every rating by joining the two frames on
# their shared movieId column with pandas' merge().
movie_data = pd.merge(
    left=ratings_data,
    right=movie_names,
    on='movieId',
)
movie_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [105]:
# Now let's take a look at the average rating of each movie. 
# Group the dataset by the title of the movie and then calculate the mean of the rating for each movie.

In [106]:
# Average rating of every movie title; preview the first five entries.
avg_rating_by_title = movie_data.groupby('title')['rating'].mean()
avg_rating_by_title.head()


title
'71 (2014)                                 4.0
'Hellboy': The Seeds of Creation (2004)    4.0
'Round Midnight (1986)                     3.5
'Salem's Lot (2004)                        5.0
'Til There Was You (1997)                  4.0
Name: rating, dtype: float64

In [107]:
# Titles with the lowest average rating first
# (sort_values() sorts in ascending order by default).
lowest_avg_ratings = movie_data.groupby('title')['rating'].mean().sort_values()
lowest_avg_ratings.head()

title
Rust and Bone (De rouille et d'os) (2012)     0.5
The Emoji Movie (2017)                        0.5
The Butterfly Effect 3: Revelations (2009)    0.5
Follow Me, Boys! (1966)                       0.5
The Beast of Hollow Mountain (1956)           0.5
Name: rating, dtype: float64

In [108]:
# Order the movies from the highest average rating to the lowest and
# preview the top five.
mean_rating_per_title = movie_data.groupby('title')['rating'].mean()
highest_avg_ratings = mean_rating_per_title.sort_values(ascending=False)
highest_avg_ratings.head()

title
Karlson Returns (1970)                           5.0
Winter in Prostokvashino (1984)                  5.0
My Love (2006)                                   5.0
Sorority House Massacre II (1990)                5.0
Winnie the Pooh and the Day of Concern (1972)    5.0
Name: rating, dtype: float64

In [109]:
# A movie can make it to the top of the above list even if only a single user has given it five stars. 
# Therefore, the above stats can be misleading. 
# Normally, a really good movie receives high ratings from a large number of users.

In [110]:
# Number of ratings each title received, most-rated titles first.
ratings_per_title = movie_data.groupby('title')['rating'].count()
ratings_per_title.sort_values(ascending=False).head()

title
Forrest Gump (1994)                 329
Shawshank Redemption, The (1994)    317
Pulp Fiction (1994)                 307
Silence of the Lambs, The (1991)    279
Matrix, The (1999)                  278
Name: rating, dtype: int64

In [111]:
# Now we know that both the average rating per movie and the number of ratings per movie are important attributes. 
# Let's create a new dataframe that contains both of these attributes.

In [112]:
# Start the per-title summary table with each movie's average rating.
# to_frame() keeps the Series name, so the column is called 'rating'.
ratings_mean_count = movie_data.groupby('title')['rating'].mean().to_frame()


In [113]:
# Add the number of ratings each movie received. The grouped counts are
# indexed by title, so the assignment lines up with the existing rows.
ratings_mean_count['rating_counts'] = movie_data.groupby('title')['rating'].count()

In [114]:
ratings_mean_count.head()

Unnamed: 0_level_0,rating,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),4.0,1
'Hellboy': The Seeds of Creation (2004),4.0,1
'Round Midnight (1986),3.5,2
'Salem's Lot (2004),5.0,1
'Til There Was You (1997),4.0,2


In [115]:
# We will use the correlation between the ratings of two movies as the similarity metric. 
# To compute these correlations, we need to create a matrix 
# where each column is a movie title and each row contains the ratings given by one specific user.

In [116]:
# Display the merged ratings/titles frame; the rendered output is abbreviated
# to the first and last rows.
movie_data

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
...,...,...,...,...,...,...
100831,610,160341,2.5,1479545749,Bloodmoon (1997),Action|Thriller
100832,610,160527,4.5,1479544998,Sympathy for the Underdog (1971),Action|Crime|Drama
100833,610,160836,3.0,1493844794,Hazard (2005),Action|Drama|Thriller
100834,610,163937,3.5,1493848789,Blair Witch (2016),Horror|Thriller


In [117]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per title, cells holding the rating. Movies a user has
# not rated show up as NaN.
user_movie_rating = movie_data.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
user_movie_rating.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [118]:
# find all the user ratings for the movie "Forrest Gump (1994)".

In [119]:
# Pull the "Forrest Gump (1994)" column out of the user x movie matrix.
# Users who did not rate the movie remain in the Series as NaN.
forrest_gump_title = 'Forrest Gump (1994)'
forrest_gump_ratings = user_movie_rating[forrest_gump_title]
forrest_gump_ratings.head()



userId
1    4.0
2    NaN
3    NaN
4    NaN
5    NaN
Name: Forrest Gump (1994), dtype: float64

In [120]:
# Show the complete "Forrest Gump (1994)" rating column (one entry per user)
# rather than only its first rows.
forrest_gump_all_users = user_movie_rating['Forrest Gump (1994)']
forrest_gump_all_users


userId
1      4.0
2      NaN
3      NaN
4      NaN
5      NaN
      ... 
606    4.0
607    NaN
608    3.0
609    4.0
610    3.0
Name: Forrest Gump (1994), Length: 610, dtype: float64

In [121]:
# Now let's retrieve all the movies that are similar to "Forrest Gump (1994)". 
# We can find the correlation between the user ratings for "Forrest Gump (1994)" and all the other movies using corrwith().

In [122]:
# Correlate every movie column of the user x movie matrix with the
# "Forrest Gump (1994)" ratings. The result is a Series indexed by title.
movies_like_forest_gump = user_movie_rating.corrwith(forrest_gump_ratings, axis=0)

# Turn the Series into a one-column frame and drop the titles whose
# correlation is NaN. This is a non-mutating chain instead of
# dropna(inplace=True), so the cell only ever binds freshly built objects.
corr_forrest_gump = movies_like_forest_gump.to_frame(name='Correlation').dropna()
corr_forrest_gump.head()



Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
"'burbs, The (1989)",0.197712
(500) Days of Summer (2009),0.234095
*batteries not included (1987),0.89271
...And Justice for All (1979),0.928571
10 Cent Pistol (2015),-1.0


In [123]:
# Raw corrwith() result, one entry per title. The NaN entries are the rows
# that dropna() removes when building corr_forrest_gump.
movies_like_forest_gump

title
'71 (2014)                                        NaN
'Hellboy': The Seeds of Creation (2004)           NaN
'Round Midnight (1986)                            NaN
'Salem's Lot (2004)                               NaN
'Til There Was You (1997)                         NaN
                                               ...   
eXistenZ (1999)                              0.011189
xXx (2002)                                   0.306817
xXx: State of the Union (2005)               0.383482
¡Three Amigos! (1986)                        0.449692
À nous la liberté (Freedom for Us) (1931)         NaN
Length: 9719, dtype: float64

In [124]:
# In the above script, we first retrieved the list of all the movies related to "Forrest Gump (1994)" 
# along with their correlation value, using corrwith() function. 
# Next, we created a dataframe that contains movie title and correlation columns. 
# We then removed all the NA values from the dataframe

In [125]:
# Ten titles with the highest correlation to "Forrest Gump (1994)".
# No minimum number of ratings is enforced at this point.
top_correlated = corr_forrest_gump.sort_values(by='Correlation', ascending=False)
top_correlated.head(10)

Unnamed: 0_level_0,Correlation
title,Unnamed: 1_level_1
Lost & Found (1999),1.0
"Cercle Rouge, Le (Red Circle, The) (1970)",1.0
Play Time (a.k.a. Playtime) (1967),1.0
Killers (2010),1.0
Playing God (1997),1.0
"Girl Walks Home Alone at Night, A (2014)",1.0
Tampopo (1985),1.0
"Century of the Self, The (2002)",1.0
Welcome to the Jungle (2013),1.0
Poison Ivy: New Seduction (1997),1.0


In [126]:
# Retrieve only those correlated movies that have more than 50 ratings. 
# To do so, we will add the rating_counts column from the ratings_mean_count dataframe to our corr_forrest_gump dataframe. 

In [127]:
# Attach the number of ratings per title (joined on the title index).
# Only the 'Correlation' column is selected before joining, so the cell is
# safe to re-run: joining onto a frame that already has 'rating_counts'
# would raise a ValueError about overlapping columns.
corr_forrest_gump = corr_forrest_gump[['Correlation']].join(
    ratings_mean_count['rating_counts']
)
corr_forrest_gump.head()

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"'burbs, The (1989)",0.197712,17
(500) Days of Summer (2009),0.234095,42
*batteries not included (1987),0.89271,7
...And Justice for All (1979),0.928571,3
10 Cent Pistol (2015),-1.0,2


In [128]:
# Filter the movies correlated to "Forrest Gump (1994)" down to those that have more than 50 ratings.

In [129]:
# Keep only titles with more than 50 ratings, then rank them by correlation.
# "Forrest Gump (1994)" itself is part of the result, with correlation 1.0.
has_enough_ratings = corr_forrest_gump['rating_counts'] > 50
corr_forrest_gump.loc[has_enough_ratings].sort_values(by='Correlation', ascending=False).head()

Unnamed: 0_level_0,Correlation,rating_counts
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump (1994),1.0,329
Mr. Holland's Opus (1995),0.652144,80
Pocahontas (1995),0.550118,68
Grumpier Old Men (1995),0.534682,52
Caddyshack (1980),0.520328,52


In [130]:
# Download the week 10 - Music Data.zip dataset and create a Jupyter Notebook “RecommenderSystemsMusic.ipynb”. Do the following

#     a) Clean the dataset if necessary

#     b) Display the top 10 artists correlated to "Michael Jackson" based on the user's listen_count

#     c) Display the top 10 artists correlated to "Michael Jackson" based on the user tags

#     d) Recommend 5 artists to the user with userID = 129 based on their friends' listen_counts 

#     e) Submit the “RecommenderSystemsMusic.ipynb”

# Hint: Combine different data files together as necessary for parts b), c) and d)