In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#Reading the movie catalogue and the ratings table, in that order
movie, rating = (pd.read_csv(csv_name) for csv_name in ('movie.csv', 'rating.csv'))

In [3]:
#Previewing the first rows of the raw movie table
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Previewing the first rows of the raw rating table
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [5]:
#Extracting the four digits found between parentheses in the title into a 'year' column.
#The capture group keeps only the digits, so no second pass is needed to remove the parentheses.
#A raw string is used because '\(' and '\d' are invalid escape sequences in a normal string literal.
#Titles without a "(dddd)" part get NaN.
movie['year'] = movie.title.str.extract(r'\((\d{4})\)', expand=False)

In [6]:
#Removing the years from the 'title' column.
#regex=True is passed explicitly: the pattern is a regular expression, and newer pandas versions
#treat the pattern as a literal string by default, which would leave the years in place.
#The raw string avoids invalid escape sequences in the pattern literal.
#.str.strip() removes the surrounding whitespace left behind and, unlike apply(lambda x: x.strip()),
#passes missing titles through instead of raising.
movie['title'] = movie.title.str.replace(r'\(\d{4}\)', '', regex=True).str.strip()

  movie['title'] = movie.title.str.replace('(\(\d\d\d\d\))', '')


In [7]:
#Previewing the movie table again, now with the cleaned title and the new 'year' column
movie.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [8]:
#Dropping the timestamp column, which is not used by the recommender.
#Rebinding instead of inplace=True, together with errors='ignore', makes the cell safe to re-run:
#a second execution no longer raises KeyError because the column is already gone.
rating = rating.drop(columns=['timestamp'], errors='ignore')

In [9]:
#Movies rated by the active user, as one {'title', 'rating'} record per movie
user = [
    {'title': watched, 'rating': score}
    for watched, score in (
        ('The Breakfast Club', 4),
        ('Toy Story', 2.5),
        ('Jumanji', 3),
        ("Pulp Fiction", 4.5),
        ('Akira', 5),
    )
]
#Turning the records into a two-column dataframe and displaying it
inputMovie = pd.DataFrame(data=user)
inputMovie

Unnamed: 0,title,rating
0,The Breakfast Club,4.0
1,Toy Story,2.5
2,Jumanji,3.0
3,Pulp Fiction,4.5
4,Akira,5.0


In [10]:
#Filtering out the movies by title
Id = movie[movie['title'].isin(inputMovie['title'])]
#Then merging it so we can get the movieId. The key is named explicitly instead of relying on
#whichever columns the two frames happen to share, and only title/rating are taken from the
#input so the merge key stays 'title' even if this cell is executed again.
inputMovie = pd.merge(Id, inputMovie[['title', 'rating']], on='title')
#Dropping information we won't use from the input dataframe. The keyword form is used because
#passing the axis positionally (drop('year', 1)) is deprecated and rejected by newer pandas versions.
inputMovie = inputMovie.drop(columns='year')
#NOTE(review): the merge is an exact string match on title, so inputs whose title is spelled
#differently in the catalogue are silently dropped (presumably the catalogue stores leading
#articles at the end, e.g. "Breakfast Club, The" - confirm against movie.csv).
inputMovie

  inputMovie = inputMovie.drop('year', 1)


Unnamed: 0,movieId,title,genres,rating
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,2.5
1,2,Jumanji,Adventure|Children|Fantasy,3.0
2,296,Pulp Fiction,Comedy|Crime|Drama|Thriller,4.5
3,1274,Akira,Action|Adventure|Animation|Sci-Fi,5.0


In [11]:
#Keeping only the rating rows whose movie is one of the movies rated by the input user
sharesInputMovie = rating['movieId'].isin(inputMovie['movieId'].tolist())
users = rating.loc[sharesInputMovie]
users.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
11,1,296,4.0
236,3,1,4.0
451,5,2,3.0
517,6,1,5.0


In [12]:
#Number of (rows, columns) left after keeping only ratings of the input user's movies
users.shape

(147572, 3)

In [13]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#The key is passed as a plain string rather than a one-element list: with a list of length 1,
#newer pandas versions yield 1-tuples such as (91,) as group names when iterating, which would
#end up as the user ids in later cells, and get_group() would need a tuple as well.
userSubsetGroup = users.groupby('userId')

In [14]:
#Showing one such group as an example by getting all the rows of a particular userId
userSubsetGroup.get_group(1131)

Unnamed: 0,userId,movieId,rating
166669,1131,1,4.0


In [15]:
#Materialising the (userId, ratings) pairs and ordering them so that users sharing the most
#movies with the input come first
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

  userSubsetGroup = sorted(userSubsetGroup,  key=lambda x: len(x[1]), reverse=True)


In [16]:
#Showing the first three (userId, ratings) pairs after sorting
userSubsetGroup[0:3]

[(91,
        userId  movieId  rating
  9621      91        1     4.0
  9622      91        2     3.5
  9669      91      296     3.5
  9826      91     1274     2.5),
 (220,
         userId  movieId  rating
  28095     220        1     4.0
  28096     220        2     3.0
  28112     220      296     4.0
  28172     220     1274     3.0),
 (232,
         userId  movieId  rating
  29007     232        1     3.0
  29008     232        2     3.5
  29023     232      296     5.0
  29062     232     1274     4.5)]

In [17]:
#Keeping at most the first 100 users so the correlation step below iterates over a bounded subset
userSubsetGroup = userSubsetGroup[:100]

In [18]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorDict = {}

#Sorting the input once, outside the loop: it does not depend on the current user,
#so re-sorting it on every iteration was wasted work
inputMovie = inputMovie.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:
    #A groupby built from a one-element list yields 1-tuples as names on newer pandas versions;
    #unwrap them so the dictionary is always keyed by the plain user id
    if isinstance(name, tuple) and len(name) == 1:
        name = name[0]
    #Sorting the current user group by movieId so its ratings line up with the sorted input
    group = group.sort_values(by='movieId')
    #Get the N for the formula
    n = float(len(group))
    #Get the input review scores for the movies that they both have in common
    temp = inputMovie[inputMovie['movieId'].isin(group['movieId'])]
    #x: ratings of the input user, y: ratings of the current user
    #NOTE(review): zip() below pairs x and y by position, which assumes one row per movieId on both sides
    tempRatingList = temp['rating'].tolist()
    tempGroupList = group['rating'].tolist()
    #The two plain sums are reused by all three terms of the formula
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    #Now let's calculate the pearson correlation between two users, so called, x and y
    Sxx = sum(i**2 for i in tempRatingList) - sumX**2/n
    Syy = sum(j**2 for j in tempGroupList) - sumY**2/n
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/n

    #Divide only when the denominator is strictly positive, else 0 correlation.
    #Testing the product with > 0 (rather than each factor with != 0) also covers a tiny negative
    #value produced by floating point rounding, which would otherwise make sqrt() raise.
    denominator = Sxx*Syy
    if denominator > 0:
        pearsonCorDict[name] = Sxy/sqrt(denominator)
    else:
        pearsonCorDict[name] = 0


In [19]:
#Displaying the (userId, correlation) pairs computed above
pearsonCorDict.items()

dict_items([(91, -0.8346223261119858), (220, -0.24253562503633297), (232, 0.9203579866168444), (294, 0), (367, 0.7893862224383407), (388, 0.34608266424225137), (440, 0.5007733956671915), (586, 0.852537120233408), (648, 0.5007733956671915), (729, 0.6441288386528846), (775, 0.46537892099551725), (812, -0.21693045781865616), (869, 0.21693045781865616), (903, 0.42008402520840293), (1200, 0.25890435250935817), (1244, 0.36563621206356534), (1290, 0.42008402520840293), (1341, 0.3473299378728699), (1525, 0.2869720215917757), (1568, 0.9701425001453319), (1715, 0.6287677132370127), (1748, 0.5144957554275265), (1763, 0.7001400420140048), (1810, 0.6969320524371696), (1813, 0.36030187928883595), (1825, 0.4509560339299333), (1849, 0.055641488407465724), (1864, 0.5144957554275265), (1942, 0.21693045781865616), (1969, -0.10188534162169867), (1984, -0.8557169633109855), (2024, 0.9701425001453319), (2047, 0.540728715025007), (2099, 0), (2107, 0.9848916356764205), (2138, 0.9701425001453319), (2367, -0.10

In [20]:
#Turning the {userId: correlation} dictionary into a dataframe with one row per user:
#the coefficients become 'similarityIndex', the dictionary keys are copied into 'userId',
#and the index is replaced by a plain 0..n-1 range
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorDict, orient='index')
    .set_axis(['similarityIndex'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.834622,91
1,-0.242536,220
2,0.920358,232
3,0.0,294
4,0.789386,367


In [21]:
#Ranking users from most to least similar and keeping the first 50
rankedUsers = pearsonDF.sort_values('similarityIndex', ascending=False)
topUsers = rankedUsers.head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
34,0.984892,2107
61,0.970143,3572
89,0.970143,5995
35,0.970143,2138
90,0.970143,6057


In [22]:
#Attaching every rating made by the selected users (inner join on the shared userId column)
topUsersRating = pd.merge(topUsers, rating, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,0.984892,2107,1,3.0
1,0.984892,2107,2,3.5
2,0.984892,2107,16,4.0
3,0.984892,2107,18,5.0
4,0.984892,2107,19,4.0


In [23]:
#Weighting each rating by the similarity of the user who gave it
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,0.984892,2107,1,3.0,2.954675
1,0.984892,2107,2,3.5,3.447121
2,0.984892,2107,16,4.0,3.939567
3,0.984892,2107,18,5.0,4.924458
4,0.984892,2107,19,4.0,3.939567


In [24]:
#Applies a sum to the topUsers ratings after grouping them up by movieId (one row per movie).
#The two columns needed are selected before summing, so userId and rating are not summed
#only to be discarded afterwards.
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={
        'similarityIndex': 'sum_similarityIndex',
        'weightedRating': 'sum_weightedRating',
    })
)
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,33.460964,119.779834
2,33.460964,98.21246
3,8.14695,21.244382
4,1.354664,3.223823
5,5.627908,15.750595


In [25]:
#Weighted average per movie: the summed weighted ratings divided by the summed similarities,
#stored as a one-column dataframe indexed by movieId
recommendation_df = (
    tempTopUsersRating['sum_weightedRating']
    .div(tempTopUsersRating['sum_similarityIndex'])
    .to_frame('weighted average recommendation score')
)
#Also exposing the movieId as a regular column
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.579689,1
2,2.935135,2
3,2.607648,3
4,2.379796,4
5,2.798659,5


In [26]:
#Ordering the movies from the highest to the lowest recommendation score and showing the first ten
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(scoreColumn, ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4454,5.0,4454
4278,5.0,4278
8809,5.0,8809
7068,5.0,7068
4970,5.0,4970
944,5.0,944
3350,5.0,3350
4272,5.0,4272
793,5.0,793
69830,5.0,69830


In [27]:
#Looking up the catalogue rows of the ten best-scored movies
#(the rows are displayed in catalogue order, not in score order)
topTenIds = recommendation_df['movieId'].head(10).tolist()
movie[movie['movieId'].isin(topTenIds)]

Unnamed: 0,movieId,title,genres,year
780,793,My Life and Times With Antonin Artaud (En comp...,Drama,1993
927,944,Lost Horizon,Drama,1937
3263,3350,"Raisin in the Sun, A",Drama,1961
4177,4272,Tuvalu,Comedy,1999
4183,4278,Triumph of the Will (Triumph des Willens),Documentary,1934
4359,4454,More,Animation|Drama|Sci-Fi|IMAX,1998
4874,4970,"Blue Angel, The (Blaue Engel, Der)",Drama,1930
6956,7068,Last Year at Marienbad (L'Année dernière à Mar...,Drama|Mystery|Romance,1961
8126,8809,Danny Deckchair,Comedy|Romance,2003
13933,69830,Iron Maiden: Flight 666,Documentary,2009
