## Importing Data & Important Libraries

In [218]:
import pandas as pd
import numpy as np

In [219]:
# Movies Information Dataframe
movies_df = pd.read_csv("data/movies.csv")
movies_df.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [220]:
# Ratings Information DataFrame
ratings_df = pd.read_csv('data/ratings.csv')
ratings_df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [221]:
#Using regular expressions to find a year stored between parentheses

#We specify the parantheses so we don’t conflict with movies that have years in their titles
movies_df['year'] = movies_df.title.str.extract('(\(\d\d\d\d\))',expand=False)

#Removing the parentheses
movies_df['year'] = movies_df.year.str.extract('(\d\d\d\d)',expand=False)


#Removing the years from the ‘title’ column
movies_df['title'] = movies_df.title.str.replace('(\(\d\d\d\d\))', '')

#Applying the strip function to get rid of any ending whitespace characters that may have appeared
movies_df['title'] = movies_df['title'].apply(lambda x: x.strip())


#Dropping the genres column
movies_df = movies_df.drop('genres',axis=1)

movies_df.head(2)

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995


## User Input

In [222]:
user_input = [
    {'title':'Breakfast Club, The', 'rating':5},
    {'title':'Toy Story', 'rating':3.5},
    {'title':'Jumanji', 'rating':2},
    {'title':'Pulp Fiction', 'rating':5},
    {'title':'Akira', 'rating':4.5}
]
movie_ratings_input = pd.DataFrame(user_input)
movie_ratings_input

Unnamed: 0,title,rating
0,"Breakfast Club, The",5.0
1,Toy Story,3.5
2,Jumanji,2.0
3,Pulp Fiction,5.0
4,Akira,4.5


In [236]:
user_input = [
    {'title':'Avengers: Infinity War - Part I', 'rating':5},
    {'title':'Justice League', 'rating':3.5},
    {'title':'Man of Steel', 'rating':4.5},
]
movie_ratings_input = pd.DataFrame(user_input)
movie_ratings_input

Unnamed: 0,title,rating
0,Avengers: Infinity War - Part I,5.0
1,Justice League,3.5
2,Man of Steel,4.5


In [237]:
# Filtering out the movie titles from movies_df, so we can get the movieIds
filtered_movies_df = movies_df[movies_df['title'].isin(movie_ratings_input['title'])]
movie_ratings_input = pd.merge(filtered_movies_df, movie_ratings_input)

# Drop info that we won’t use from the input dataframe
movie_ratings_input = movie_ratings_input.drop('year', 1)

movie_ratings_input.head()

Unnamed: 0,movieId,title,rating
0,103042,Man of Steel,4.5
1,122898,Justice League,3.5
2,122912,Avengers: Infinity War - Part I,5.0


In [238]:
# Filtering out users that have watched movies the same movies as userInput and storing it
userSubset = ratings_df[ratings_df['movieId'].isin(movie_ratings_input['movieId'])]
userSubset.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
4031,25,122912,5.0,1535470461
7359,50,122898,0.5,1514240068


In [239]:
"""
Group up the rows by userId, 
and sort these groups so the users that share the 
most movies in common with the input have higher priority
"""
# Groupby creates several sub dataframes where they all have 
# the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby('userId')
userSubsetGroup = sorted(userSubsetGroup, key=lambda x: len(x[1]), reverse=True)  # sort users by num movies watched 
userSubsetGroup

[(62,
        userId  movieId  rating   timestamp
  9053      62   103042     3.5  1525555304
  9102      62   122898     4.0  1523048682
  9105      62   122912     4.0  1526028975),
 (184,
         userId  movieId  rating   timestamp
  27164     184   103042     3.0  1537099001
  27189     184   122898     2.0  1537109736
  27192     184   122912     5.0  1537110034),
 (380,
         userId  movieId  rating   timestamp
  57962     380   103042     3.0  1494708602
  58016     380   122898     3.0  1536872751
  58020     380   122912     5.0  1526165814),
 (567,
         userId  movieId  rating   timestamp
  88004     567   103042     2.0  1525288223
  88077     567   122898     2.0  1525288625
  88080     567   122912     3.0  1525282047),
 (596,
         userId  movieId  rating   timestamp
  92077     596   103042     3.0  1535709903
  92103     596   122898     3.5  1535711652
  92108     596   122912     4.0  1535627215),
 (249,
         userId  movieId  rating   timestamp
  37253 

## Finding Similarity Weights

In [240]:
"""
Next, we are going to compare all users to our specified user and 
find users that are most similar using the Pearson Correlation Coefficient. 
In this case we will select a subset of users (first 100) to iterate through. 
This limit is imposed so we can focus on the users that have most impact, and to save computation time.
"""
userSubsetGroup = userSubsetGroup[0:100]

# Using Pearson's Correlation between Input User and UserSubsetGroup

In [241]:
from math import sqrt

#Store the Pearson Correlation in a dictionary, where the key = user Id and the value = correlation with userInput
pearson_correlation_dict = {}

#For every user group in our subset... name = userId, group = df of all movies he/she has watched
for name, group in userSubsetGroup:
    
    #Let’s start by sorting the movieIds in userInput and this current user for comparison/calculation later
    movie_ratings_input = movie_ratings_input.sort_values(by='movieId')
    group = group.sort_values(by='movieId')
    
    #Get the N for the formula
    nRatings = len(group)
    
    #Get movie review scores for movies that both this user & userInput have in common
    input_df = movie_ratings_input[movie_ratings_input['movieId'].isin(group['movieId'])]
    input_ratings = input_df['rating']
    
    #Save this user's reviews
    user_ratings = group['rating']
    
    # Now let’s calculate the pearson correlation between two users, aka User x (input user) and User y (current user)
    Sxx = sum([i**2 for i in input_ratings]) - pow(sum(input_ratings),2) / float(nRatings)
    Syy = sum([i**2 for i in user_ratings]) - pow(sum(user_ratings),2) / float(nRatings)
    Sxy = sum( i*j for i, j in zip(input_ratings, user_ratings)) - sum(input_ratings)*sum(user_ratings)/float(nRatings)
    
    # If the denominator isn't zero, then divide, else, there is 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearson_correlation_dict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearson_correlation_dict[name] = 0

In [242]:
# Converting the dictionary to a dataframe
similarity_df = pd.DataFrame.from_dict(pearson_correlation_dict, orient='index')
similarity_df = similarity_df.reset_index()
similarity_df = similarity_df.rename(columns={0: 'correlation', 'index': 'userId'})
similarity_df.head()

Unnamed: 0,userId,correlation
0,62,-0.188982
1,184,0.928571
2,380,0.755929
3,567,0.755929
4,596,0.327327


In [243]:
# Top 50 Similar Users to our input
top50_similar_users = similarity_df.sort_values(by='correlation', ascending=False)[0:50]
top50_similar_users.head(2)

Unnamed: 0,userId,correlation
5,249,1.0
6,305,1.0


## Creating the Weighting Rating Matrix

In [244]:
"""
Using this list of similar users, now let’s start recommending movies to the input user. 
We’re going to do this by taking the 
weighted average of these users' movie ratings (using the Pearson Correlation as the weight).
"""

similar_users_ratings = top50_similar_users.merge(ratings_df, on="userId")
similar_users_ratings.head()

Unnamed: 0,userId,correlation,movieId,rating,timestamp
0,249,1.0,1,4.0,1347317775
1,249,1.0,2,4.0,1353800871
2,249,1.0,19,3.5,1354107358
3,249,1.0,20,3.5,1355366891
4,249,1.0,32,5.0,1346752537


In [245]:
"""
First we simply multiply the movie rating by its weight (the correlation or similarity index).
Then we regroup the dataframe by movieId, before we sum up the weighted ratings and divide it by the sum of correlation weights. 
This helps us arrive at the weighted average recommendation score.
Basically SUM(Correlation * Ratings) / SUM((Correlation))
"""

similar_users_ratings['weightedRating'] = similar_users_ratings['correlation'] * similar_users_ratings['rating']
scores = similar_users_ratings.groupby('movieId')[['correlation','weightedRating']].sum()
scores.columns = ['sum_similarityIndex','sum_weightedRating']
scores['weighted average recommendation score'] = scores['sum_weightedRating'] / scores['sum_similarityIndex']
scores.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating,weighted average recommendation score
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.839185,11.734703,4.133124
2,2.566947,10.523716,4.099702
3,0.0,0.0,
5,0.0,0.0,
6,1.566947,6.429225,4.103027


In [246]:
recommendation_df = scores[['weighted average recommendation score']].copy().reset_index()
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
recommendation_df.head(10)

Unnamed: 0,movieId,weighted average recommendation score
4476,104879,5.466037
2942,37733,5.333333
3319,55908,5.233019
5255,168248,5.166667
1577,3949,5.116509
1680,4223,5.116509
871,2076,5.0
322,720,5.0
1430,3503,5.0
1433,3508,5.0


In [247]:
merged_recommendation_movieTitle = recommendation_df.merge(movies_df, on='movieId')
merged_recommendation_movieTitle.head()

Unnamed: 0,movieId,weighted average recommendation score,title,year
0,104879,5.466037,Prisoners,2013
1,37733,5.333333,"History of Violence, A",2005
2,55908,5.233019,"Man from Earth, The",2007
3,168248,5.166667,John Wick: Chapter Two,2017
4,3949,5.116509,Requiem for a Dream,2000


In [255]:
result = merged_recommendation_movieTitle.copy()
result

Unnamed: 0,movieId,weighted average recommendation score,title,year
0,104879,5.466037,Prisoners,2013
1,37733,5.333333,"History of Violence, A",2005
2,55908,5.233019,"Man from Earth, The",2007
3,168248,5.166667,John Wick: Chapter Two,2017
4,3949,5.116509,Requiem for a Dream,2000
...,...,...,...,...
5403,190209,,Jeff Ross Roasts the Border,2017
5404,190213,,John From,2015
5405,190215,,Liquid Truth,2017
5406,190219,,Bunny,1998


In [261]:
for index, row in result.iterrows():
    print(row['title'])
    if index == 5:
        break

Prisoners
History of Violence, A
Man from Earth, The
John Wick: Chapter Two
Requiem for a Dream
Enemy at the Gates
