In [16]:
import pandas as pd
from math import sqrt
import numpy as np

In [17]:
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')
print(movies_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  9742 non-null   int64 
 1   title    9742 non-null   object
 2   genres   9742 non-null   object
dtypes: int64(1), object(2)
memory usage: 228.5+ KB
None


In [18]:
userInput = [{'title': 'Tommy Boy (1995)', 'rating': 5},
             {'title':'Escape to Witch Mountain (1975)', 'rating': 3},
             {'title':'Winnie the Pooh and the Blustery Day (1968)', 'rating': 5},
             {'title':'Three Caballeros, The (1945)', 'rating': 5},
             {'title':'Sword in the Stone, The (1963)', 'rating': 5}]
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                         title  rating
0                             Tommy Boy (1995)       5
1              Escape to Witch Mountain (1975)       3
2  Winnie the Pooh and the Blustery Day (1968)       5
3                 Three Caballeros, The (1945)       5
4               Sword in the Stone, The (1963)       5


In [20]:
inputId = movies_df[movies_df['title'].isin(inputMovies['title'].tolist())]
inputMovies = pd.merge(inputId, inputMovies)
print(inputMovies)

   movieId                                        title  \
0      333                             Tommy Boy (1995)   
1     1009              Escape to Witch Mountain (1975)   
2     1023  Winnie the Pooh and the Blustery Day (1968)   
3     1024                 Three Caballeros, The (1945)   
4     1025               Sword in the Stone, The (1963)   

                               genres  rating  
0                              Comedy       5  
1          Adventure|Children|Fantasy       3  
2          Animation|Children|Musical       5  
3          Animation|Children|Musical       5  
4  Animation|Children|Fantasy|Musical       5  


In [21]:
userSubset = ratings_df[ratings_df['movieId'].isin(inputMovies['movieId'].tolist())]
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
333          50      50         50
1009          9       9          9
1023         13      13         13
1024          6       6          6
1025         25      25         25


In [22]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter
userSubsetGroup = userSubset.groupby(['userId'])

def take_5_elem(x):
    # print (len(x[1]))
    return len(x[1])
    

#Sorting it so users with movie most in common with the input will have priority
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:5])

[((1,),     userId  movieId  rating  timestamp
18       1      333     5.0  964981179
49       1     1009     3.0  964981775
50       1     1023     5.0  964982681
51       1     1024     5.0  964982876
52       1     1025     5.0  964982791), ((414,),        userId  movieId  rating   timestamp
62439     414      333     3.0   961439132
62656     414     1009     2.0  1026225826
62665     414     1023     4.0   961519060
62666     414     1024     4.0   961518975
62667     414     1025     4.0   961519060), ((288,),        userId  movieId  rating  timestamp
42238     288     1009     3.0  978468314
42250     288     1023     5.0  978469199
42251     288     1024     2.0  978469502
42252     288     1025     2.0  978469270), ((51,),       userId  movieId  rating   timestamp
7442      51      333     2.0  1230929800
7482      51     1009     3.5  1230930546
7485      51     1024     4.0  1230931449), ((274,),        userId  movieId  rating   timestamp
39297     274      333     3.5  1171

In [23]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='movieId')
    inputMovies = inputMovies.sort_values(by='movieId')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputMovies[inputMovies['movieId'].isin(group['movieId'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [24]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

   similarityIndex  userId
0          1.00000    (1,)
1          0.87500  (414,)
2          0.00000  (288,)
3         -0.27735   (51,)
4          0.00000  (274,)


In [28]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())
topUsers['userId'] = topUsers['userId'].apply(lambda x:x[0])

    similarityIndex  userId
0             1.000    (1,)
10            1.000  (385,)
1             0.875  (414,)
74            0.000  (600,)
73            0.000  (599,)


In [29]:
topUsersRating = topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating  timestamp
0               1.0       1        1     4.0  964982703
1               1.0       1        3     4.0  964981247
2               1.0       1        6     4.0  964982224
3               1.0       1       47     5.0  964983815
4               1.0       1       50     5.0  964982931
..              ...     ...      ...     ...        ...
95              1.0       1     1445     3.0  964984112
96              1.0       1     1473     4.0  964980875
97              1.0       1     1500     4.0  964980985
98              1.0       1     1517     5.0  964981107
99              1.0       1     1552     4.0  964982620

[100 rows x 5 columns]


In [30]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0       1        1     4.0  964982703             4.0
1              1.0       1        3     4.0  964981247             4.0
2              1.0       1        6     4.0  964982224             4.0
3              1.0       1       47     5.0  964983815             5.0
4              1.0       1       50     5.0  964982931             5.0


In [31]:
#Multiplies the similarity by the user’s ratings
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating  timestamp  weightedRating
0              1.0       1        1     4.0  964982703             4.0
1              1.0       1        3     4.0  964981247             4.0
2              1.0       1        6     4.0  964982224             4.0
3              1.0       1       47     5.0  964983815             5.0
4              1.0       1       50     5.0  964982931             5.0


In [32]:
#Applies a sum to the topUsers after grouping it up by movieId
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                      2.875              11.500
2                      0.875               2.625
3                      1.875               7.500
4                      0.000               0.000
5                      0.875               1.750


In [33]:
#Creates an empty dataframe
recommendation_df = pd.DataFrame()

#Now we take the weighted average
recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     4.000000        1
2                                     3.000000        2
3                                     4.000000        3
4                                          NaN        4
5                                     2.000000        5
6                                     3.347826        6
7                                     3.000000        7
8                                     3.000000        8
9                                          NaN        9
10                                    3.000000       10


In [34]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)

         weighted average recommendation score  movieId
movieId                                                
3578                                       5.0     3578
589                                        5.0      589
2542                                       5.0     2542
2395                                       5.0     2395
3034                                       5.0     3034
...                                        ...      ...
183301                                     NaN   183301
184245                                     NaN   184245
188675                                     NaN   188675
188833                                     NaN   188833
189381                                     NaN   189381

[6822 rows x 2 columns]


In [35]:
recommended_movie=movies_df.loc[movies_df['movieId'].isin(recommendation_df['movieId'])]

#we don't want to recommend the same movie
recommended_movie=recommended_movie.loc[~recommended_movie.movieId.isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                      title  \
0           1                           Toy Story (1995)   
1           2                             Jumanji (1995)   
2           3                    Grumpier Old Men (1995)   
3           4                   Waiting to Exhale (1995)   
4           5         Father of the Bride Part II (1995)   
...       ...                                        ...   
9695   184791  Fred Armisen: Standup for Drummers (2018)   
9710   187595             Solo: A Star Wars Story (2018)   
9714   188675                              Dogman (2018)   
9717   188833      The Man Who Killed Don Quixote (2018)   
9721   189381                            SuperFly (2018)   

                                           genres  
0     Adventure|Animation|Children|Comedy|Fantasy  
1                      Adventure|Children|Fantasy  
2                                  Comedy|Romance  
3                            Comedy|Drama|Romance  
4                  