In [95]:
import pandas as pd
import numpy as np
from math import sqrt

In [50]:
# Read the two source tables from CSV files in the working directory:
# movies (movieId, title, genres) and ratings (userId, movieId, rating, timestamp).
movies_df, ratings_df = (pd.read_csv(csvPath) for csvPath in ('movies.csv', 'ratings.csv'))

In [51]:
# Preview the first rows of the movies table to see which columns were loaded.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [52]:
# Genres are not used anywhere later in this notebook, so drop the column.
# Use the `columns=` keyword: passing the axis positionally (`drop('genres', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
movies_df = movies_df.drop(columns='genres')

In [53]:
# Confirm that the genres column is no longer present.
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [54]:
# Pull the release year out of titles such as "Toy Story (1995)".
# One pattern anchors on the parentheses and captures only the four digits inside.
# The earlier two-step version ran its second extract on `title` again, which
# discarded the parenthesised match and kept the first four-digit run anywhere
# in the title (so a title that itself starts with a number got the wrong year).
# Raw string avoids invalid-escape warnings for `\(` and `\d`.
movies_df['year'] = movies_df['title'].str.extract(r'\((\d{4})\)', expand=False)


In [55]:
# Remove the "(YYYY)" suffix from each title and trim the whitespace it leaves behind.
# `regex=True` is required: since pandas 2.0 `str.replace` treats the pattern as a
# literal string by default, which would silently leave every title unchanged.
# `.str.strip()` replaces the per-row lambda and also tolerates missing titles.
movies_df['title'] = (
    movies_df['title']
    .str.replace(r'\(\d{4}\)', '', regex=True)
    .str.strip()
)

In [56]:
# Check the cleaned titles and the new year column.
movies_df.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [57]:
# Preview the first rows of the ratings table.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [58]:
# The rating timestamp is not used anywhere later in this notebook, so drop it.
# Use the `columns=` keyword: passing the axis positionally (`drop('timestamp', 1)`)
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
ratings_df = ratings_df.drop(columns='timestamp')

In [62]:
# Ratings supplied by the user we are recommending for, written as
# (title, rating) pairs and expanded into one dict per movie.
userInput = [
    dict(title=movieTitle, rating=movieRating)
    for movieTitle, movieRating in (
        ('Skyfall', 4.6),
        ('Casino Royale', 5),
        ('Spectre', 4.3),
        ("Sharknado 3: Oh Hell No!", 2),
        ('Tomorrow Never Dies', 4.2),
        ("GoldenEye", 4.4),
    )
]
# One row per rated title.
inputMovies = pd.DataFrame(userInput)
inputMovies

Unnamed: 0,rating,title
0,4.6,Skyfall
1,5.0,Casino Royale
2,4.3,Spectre
3,2.0,Sharknado 3: Oh Hell No!
4,4.2,Tomorrow Never Dies
5,4.4,GoldenEye


In [76]:
# Catalogue rows whose title exactly matches one the user rated.
# Matching is by title only, so every film sharing a title is kept
# (e.g. both the 1967 and 2006 "Casino Royale").
titleMatches = movies_df['title'].isin(inputMovies['title'].tolist())
inputId = movies_df[titleMatches]

In [78]:
# Attach the catalogue's movieId and year to each user rating.
# No key is given, so pandas joins on the columns the two frames have in common.
inputMovies = inputId.merge(inputMovies)
inputMovies

Unnamed: 0,movieId,title,year,rating
0,10,GoldenEye,1995,4.4
1,1722,Tomorrow Never Dies,1997,4.2
2,5796,Casino Royale,1967,5.0
3,49272,Casino Royale,2006,5.0
4,96079,Skyfall,2012,4.6
5,136020,Spectre,2015,4.3
6,136305,Sharknado 3: Oh Hell No!,2015,2.0


In [79]:
# The release year plays no part in the similarity computation below, so remove it.
# The result is assigned back: previously the dropped frame was only displayed and
# inputMovies itself kept the column. `columns=` replaces the positional axis
# argument, which pandas 2.0 no longer accepts; `errors='ignore'` keeps the cell
# safe to re-run once the column is already gone.
inputMovies = inputMovies.drop(columns='year', errors='ignore')
inputMovies

Unnamed: 0,movieId,title,rating
0,10,GoldenEye,4.4
1,1722,Tomorrow Never Dies,4.2
2,5796,Casino Royale,5.0
3,49272,Casino Royale,5.0
4,96079,Skyfall,4.6
5,136020,Spectre,4.3
6,136305,Sharknado 3: Oh Hell No!,2.0


Now we find the subset of users who have rated the same movies as the input user.

In [84]:
# Keep only the ratings given to movies that also appear in the input user's list.
ratedInputMovie = ratings_df['movieId'].isin(inputMovies['movieId'].tolist())
userSubset = ratings_df[ratedInputMovie]
userSubset.head()

Unnamed: 0,userId,movieId,rating
567,6,10,3.0
1022,7,49272,4.5
1027,8,10,2.0
1181,10,49272,5.0
1236,10,96079,5.0


In [89]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
#Group by the column name itself rather than the one-element list ['userId']:
#with a list, pandas 2.x yields 1-tuple keys such as (21,) when the groups are
#iterated, so the user ids collected from this grouping further down would no
#longer be plain integers that can be matched against ratings_df['userId'].
userSubsetGroup = userSubset.groupby('userId')
#Show the shared ratings of one example user (userId 10)
userSubsetGroup.get_group(10)

Unnamed: 0,userId,movieId,rating
1181,10,49272,5.0
1236,10,96079,5.0
1256,10,136020,5.0


In [90]:
# Materialise the (userId, sub-frame) pairs and order them so that users sharing
# the most rated movies with the input user come first.
userSubsetGroup = list(userSubsetGroup)
userSubsetGroup.sort(key=lambda pair: len(pair[1]), reverse=True)

In [92]:
# Inspect the ten users who share the most rated movies with the input user.
userSubsetGroup[0:10]

[(21,       userId  movieId  rating
  3221      21       10     5.0
  3276      21     1722     5.0
  3374      21     5796     2.0
  3440      21    49272     4.0
  3537      21    96079     4.0
  3640      21   136020     4.0), (448,        userId  movieId  rating
  68659     448       10     4.0
  68917     448     1722     4.0
  69718     448    49272     4.5
  70060     448    96079     4.0
  70388     448   136020     3.0
  70389     448   136305     1.0), (380,        userId  movieId  rating
  56881     380       10     5.0
  57104     380     1722     5.0
  57738     380    49272     5.0
  57934     380    96079     2.0
  58035     380   136020     4.0), (408,        userId  movieId  rating
  61575     408       10     4.5
  61596     408     1722     4.0
  61644     408    49272     5.0
  61685     408    96079     3.0
  61709     408   136020     4.0), (50,       userId  movieId  rating
  7278      50    49272     2.5
  7323      50    96079     2.5
  7372      50   136020   

In [93]:
# Only the first 150 users (those with the most movies in common) are compared
# against the input user in the steps that follow.
MAX_CANDIDATE_USERS = 150
userSubsetGroup = userSubsetGroup[:MAX_CANDIDATE_USERS]

Now we calculate the Pearson correlation between the input user and each user in the subset, and store it in a dictionary where the key is the user ID and the value is the correlation coefficient.

In [96]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#The input user's ratings are the same for every comparison, so sort them once
#here instead of re-sorting inside the loop on every iteration
inputMovies = inputMovies.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:
    #Sort the current user's ratings by movieId too, so both rating lists line up position by position
    group = group.sort_values(by='movieId')
    #Get the N for the formula
    nRatings = len(group)
    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]
    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
    #The plain sums are shared by all three terms, so compute them once
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    #Now let's calculate the pearson correlation between two users, so called, x and y
    #Sxx and Syy are the sums of squared deviations, Sxy is the sum of cross deviations
    Sxx = sum(i**2 for i in tempRatingList) - sumX**2/float(nRatings)
    Syy = sum(j**2 for j in tempGroupList) - sumY**2/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/float(nRatings)

    #Sxx and Syy cannot be negative mathematically, but floating-point rounding can
    #leave a tiny negative value when all ratings are equal. Testing "> 0" instead
    #of "!= 0" keeps such a value away from sqrt (which would raise ValueError)
    #and treats it as "no variance", i.e. 0 correlation.
    if Sxx > 0 and Syy > 0:
        correlation = Sxy/sqrt(Sxx*Syy)
        #Rounding can also push the ratio marginally outside the valid [-1, 1] range
        pearsonCorrelationDict[name] = max(-1.0, min(1.0, correlation))
    else:
        pearsonCorrelationDict[name] = 0


In [106]:
# Turn the {userId: coefficient} dictionary into a two-column table:
# the coefficients become 'similarityIndex', the dictionary keys become
# 'userId', and the rows are renumbered from 0.
pearsonDF = (
    pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
    .set_axis(['similarityIndex'], axis=1)
    .assign(userId=lambda frame: frame.index)
    .reset_index(drop=True)
)
pearsonDF.head()

Unnamed: 0,similarityIndex,userId
0,-0.732793,21
1,0.956779,448
2,-0.060634,380
3,0.373101,408
4,0.979521,50


In [107]:
# Rank users from most to least similar and keep the first 50.
rankedBySimilarity = pearsonDF.sort_values(by='similarityIndex', ascending=False)
topUsers = rankedBySimilarity.head(50)
topUsers.head()

Unnamed: 0,similarityIndex,userId
48,1.0,195
38,1.0,57
79,1.0,608
57,1.0,304
56,1.0,288


In [108]:
# Bring in every rating made by the top users (not just those for the input
# movies), carrying each user's similarityIndex along on every row.
topUsersRating = pd.merge(topUsers, ratings_df, on='userId', how='inner')
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating
0,1.0,195,6,4.0
1,1.0,195,10,4.0
2,1.0,195,16,4.0
3,1.0,195,25,4.0
4,1.0,195,32,4.0


In [110]:
# Weight each rating by how similar its author is to the input user.
topUsersRating['weightedRating'] = topUsersRating['similarityIndex'].mul(topUsersRating['rating'])
topUsersRating.head()

Unnamed: 0,similarityIndex,userId,movieId,rating,weightedRating
0,1.0,195,6,4.0,4.0
1,1.0,195,10,4.0,4.0
2,1.0,195,16,4.0,4.0
3,1.0,195,25,4.0,4.0
4,1.0,195,32,4.0,4.0


In [111]:
#For each movie (rows are grouped by movieId, not userId), add up the similarity
#weights and the similarity-weighted ratings contributed by the top users.
#Only the two needed columns are selected before summing, so userId and rating
#are not aggregated just to be thrown away.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30.46219,120.443807
2,18.018093,53.375862
3,8.750136,23.834067
5,8.214227,20.648679
6,15.996294,61.162791


In [112]:
#NOTE(review): this cell repeats the preceding cell's computation verbatim; it
#yields the same table and is a candidate for removal.
#For each movie (rows are grouped by movieId, not userId), add up the similarity
#weights and the similarity-weighted ratings contributed by the top users.
#Only the two needed columns are selected before summing, so userId and rating
#are not aggregated just to be thrown away.
tempTopUsersRating = topUsersRating.groupby('movieId')[['similarityIndex','weightedRating']].sum()
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
tempTopUsersRating.head()

Unnamed: 0_level_0,sum_similarityIndex,sum_weightedRating
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,30.46219,120.443807
2,18.018093,53.375862
3,8.750136,23.834067
5,8.214227,20.648679
6,15.996294,61.162791


In [113]:
# Similarity-weighted mean rating per movie:
# sum(similarity * rating) / sum(similarity), indexed by movieId.
weightedMean = tempTopUsersRating['sum_weightedRating'].div(tempTopUsersRating['sum_similarityIndex'])
recommendation_df = weightedMean.to_frame(name='weighted average recommendation score')
# Keep the movieId available as an ordinary column as well as the index.
recommendation_df['movieId'] = tempTopUsersRating.index
recommendation_df.head()

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.953879,1
2,2.962348,2
3,2.723851,3
5,2.51377,5
6,3.82356,6


In [114]:
# Order the movies from highest to lowest score and show the ten best.
scoreColumn = 'weighted average recommendation score'
recommendation_df = recommendation_df.sort_values(by=scoreColumn, ascending=False)
recommendation_df.head(10)

Unnamed: 0_level_0,weighted average recommendation score,movieId
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1147,5.0,1147
33649,5.0,33649
86345,5.0,86345
92494,5.0,92494
26681,5.0,26681
3677,5.0,3677
6370,5.0,6370
6375,5.0,6375
90888,5.0,90888
6442,5.0,6442


In [115]:
# Look up title and year for the ten highest-scoring movie ids.
# The result follows movies_df's row order, not the ranking order.
topTenMovieIds = recommendation_df['movieId'].head(10).tolist()
movies_df[movies_df['movieId'].isin(topTenMovieIds)]

Unnamed: 0,movieId,title,year
867,1147,When We Were Kings,1996
2739,3677,Baraka,1992
4354,6370,"Spanish Apartment, The (L'auberge espagnole)",2002
4358,6375,Gigantic (A Tale of Two Johns),2002
4390,6442,Belle époque,1992
5549,26681,Bullet in the Head,1990
5906,33649,Saving Face,2004
7590,86345,Louis C.K.: Hilarious,2010
7742,90888,Immortals,2011
7812,92494,Dylan Moran: Monster,2004
