In [1]:
import pandas as pd
from math import sqrt
import numpy as np

In [23]:
# Raw CSV locations of the four tables used by this notebook (links, movies, ratings, tags).
url1 = 'https://raw.githubusercontent.com/consumerofbeanss/FoDS_W10/main/links.csv'
url2 = 'https://raw.githubusercontent.com/consumerofbeanss/FoDS_W10/main/movies.csv'
url3 = 'https://raw.githubusercontent.com/consumerofbeanss/FoDS_W10/main/ratings.csv'
url4 = 'https://raw.githubusercontent.com/consumerofbeanss/FoDS_W10/main/tags.csv'

# Download and parse the files in the order listed, one DataFrame per URL.
links_df, movies_df, ratings_df, tags_df = (
    pd.read_csv(url) for url in (url1, url2, url3, url4)
)

# Last expression of the cell: display the movie catalogue.
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [24]:
# The movies the input user has already seen, each with the rating they gave it.
userInput = [
    {'title': title, 'rating': rating}
    for title, rating in (
        ('Terminator, The (1984)', 4),
        ('The Lego Batman Movie (2017)', 5),
        ('Scott Pilgrim vs. the World (2010)', 4.5),
        ('Transformers: Age of Extinction (2014)', 2),
        ('Fifty Shades of Grey (2015)', 1.5),
    )
]

# One row per rated movie, with the columns 'title' and 'rating'.
inputMovies = pd.DataFrame(userInput)
print(inputMovies)

                                    title  rating
0                  Terminator, The (1984)     4.0
1            The Lego Batman Movie (2017)     5.0
2      Scott Pilgrim vs. the World (2010)     4.5
3  Transformers: Age of Extinction (2014)     2.0
4             Fifty Shades of Grey (2015)     1.5


In [25]:
# Catalogue rows whose title matches one of the titles the user rated.
inputId = movies_df.loc[movies_df['title'].isin(inputMovies['title'])]

# Join on the columns the two frames share so every rated title gains its movieId,
# then keep only the columns needed from here on.
inputMovies = inputId.merge(inputMovies).loc[:, ['movieId', 'title', 'rating']]
print(inputMovies)

   movieId                                   title  rating
0     1240                  Terminator, The (1984)     4.0
1    79702      Scott Pilgrim vs. the World (2010)     4.5
2   112370  Transformers: Age of Extinction (2014)     2.0
3   125916             Fifty Shades of Grey (2015)     1.5
4   167746            The Lego Batman Movie (2017)     5.0


In [26]:
# Every rating in the data set that was given to one of the input user's movies.
userSubset = ratings_df.loc[ratings_df['movieId'].isin(inputMovies['movieId'])]

# How many ratings exist for each of those movies.
print(userSubset.groupby('movieId').count())

         userId  rating  timestamp
movieId                           
1240        131     131        131
79702        44      44         44
112370        4       4          4
125916        3       3          3
167746        7       7          7


In [27]:
#Groupby creates several sub dataframes where they all have the same value in the column specified as the parameter.
#The key is passed as a plain label rather than the one-element list ['userId']: iterating a groupby built from a
#one-element list emits a FutureWarning on pandas 1.5 and yields 1-tuple keys such as (448,) on pandas 2.x,
#which would then be stored as the user ids instead of the plain integers the rest of the notebook expects.
userSubsetGroup = userSubset.groupby('userId')

def take_5_elem(x):
    """Sort key for the pairs produced by iterating a groupby.

    Returns the length of the second item of ``x``. For a ``(key, sub-DataFrame)``
    pair that is the number of rows in the sub-DataFrame.
    """
    second_item = x[1]
    return len(second_item)


#Users who rated the largest number of the input movies come first (ties keep their
#existing relative order); only the first 100 of them are kept for the comparison
userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)[:100]

#Peek at the five users with the biggest overlap
print(userSubsetGroup[:5])

[(448,        userId  movieId  rating   timestamp
68843     448     1240     3.0  1019124194
69883     448    79702     3.0  1310228058
70245     448   112370     1.5  1433790679
70333     448   125916     0.5  1448552452), (125,        userId  movieId  rating   timestamp
19254     125     1240     4.0  1474289509
19415     125    79702     4.0  1474142151
19579     125   167746     4.0  1513774320), (249,        userId  movieId  rating   timestamp
36464     249     1240     4.0  1346757874
37072     249    79702     4.5  1346847799
37400     249   167746     4.0  1504979314), (298,        userId  movieId  rating   timestamp
44612     298     1240     4.0  1447584728
45260     298    79702     3.0  1447516910
45414     298   112370     2.5  1453033058), (380,        userId  movieId  rating   timestamp
57042     380     1240     5.0  1493473846
57869     380    79702     4.0  1493420626
58070     380   167746     5.0  1508436102)]


  userSubsetGroup = sorted(userSubsetGroup, key=take_5_elem, reverse=True)


In [28]:
#Store the Pearson Correlation in a dictionary, where the key is the user Id and the value is the coefficient
pearsonCorrelationDict = {}

#The ordering of the input does not depend on the user being compared, so sort it once here
#instead of re-sorting it on every iteration of the loop
inputMovies = inputMovies.sort_values(by='movieId')

#For every user group in our subset
for name, group in userSubsetGroup:

    #Pair the input rating and this user's rating of the same movie by joining on movieId.
    #Joining (rather than zipping two independently sorted lists) guarantees that position i
    #of both lists below refers to the same movie, even if either side contains repeated rows.
    paired = inputMovies[['movieId', 'rating']].merge(
        group[['movieId', 'rating']], on='movieId', suffixes=('_input', '_user')
    ).sort_values(by='movieId')

    #Get the N for the formula: the number of movies rated by both the input and this user
    nRatings = len(paired)

    #Ratings of the input user (x) and of the current user (y), in the same movie order
    tempRatingList = paired['rating_input'].tolist()
    tempGroupList = paired['rating_user'].tolist()

    #Nothing in common means nothing to correlate (and would divide by zero below)
    if nRatings == 0:
        pearsonCorrelationDict[name] = 0
        continue

    #Now let's calculate the pearson correlation between two users, so called, x and y manually
    #(check the formula from week 7 slide):
    #    r = Sxy / sqrt(Sxx * Syy)
    #    Sxx = sum(x^2) - (sum x)^2 / N,  Syy = sum(y^2) - (sum y)^2 / N,  Sxy = sum(x*y) - (sum x)(sum y) / N
    sumX = sum(tempRatingList)
    sumY = sum(tempGroupList)
    Sxx = sum(i**2 for i in tempRatingList) - pow(sumX, 2)/float(nRatings)
    Syy = sum(i**2 for i in tempGroupList) - pow(sumY, 2)/float(nRatings)
    Sxy = sum(i*j for i, j in zip(tempRatingList, tempGroupList)) - sumX*sumY/float(nRatings)

    #Only divide when the denominator is strictly positive, else 0 correlation. Testing the product
    #(instead of Sxx != 0 and Syy != 0) also keeps sqrt from being handed a negative number
    #should rounding ever push one of the two terms slightly below zero.
    denominator = Sxx*Syy
    if denominator > 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(denominator)
    else:
        pearsonCorrelationDict[name] = 0

In [29]:
# One row per compared user: the dictionary keys become the index and the
# coefficients the single column, which is then given its real name.
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']

# Copy the user ids out of the index into an ordinary column and fall back to
# a plain 0..n-1 row index.
pearsonDF = pearsonDF.assign(userId=pearsonDF.index).reset_index(drop=True)
print(pearsonDF.head())

   similarityIndex  userId
0         0.970725     448
1         0.000000     125
2         0.000000     249
3         0.618590     298
4         0.000000     380


In [30]:
# The 50 users whose ratings correlate best with the input user's, best first.
topUsers = pearsonDF.sort_values(by='similarityIndex', ascending=False).iloc[:50]
print(topUsers.head())

    similarityIndex  userId
17              1.0     477
8               1.0     212
21              1.0     596
20              1.0     567
19              1.0     534


In [31]:
# Attach to each top user every rating that user has given, matched on userId.
topUsersRating = pd.merge(topUsers, ratings_df, how='inner', on='userId')
print(topUsersRating.head(100))

    similarityIndex  userId  movieId  rating   timestamp
0               1.0     477        1     4.0  1200939636
1               1.0     477        2     4.0  1200939962
2               1.0     477        3     3.0  1200941177
3               1.0     477       19     3.0  1200939977
4               1.0     477       24     4.0  1201159341
..              ...     ...      ...     ...         ...
95              1.0     477      924     4.5  1200939944
96              1.0     477      968     2.0  1200939188
97              1.0     477     1005     2.5  1201195982
98              1.0     477     1020     3.5  1200939255
99              1.0     477     1033     3.5  1200947060

[100 rows x 5 columns]


In [32]:
#Weight every rating by how similar its author is to the input user
topUsersRating['weightedRating'] = topUsersRating['rating'].mul(topUsersRating['similarityIndex'])
print(topUsersRating.head())

   similarityIndex  userId  movieId  rating   timestamp  weightedRating
0              1.0     477        1     4.0  1200939636             4.0
1              1.0     477        2     4.0  1200939962             4.0
2              1.0     477        3     3.0  1200941177             3.0
3              1.0     477       19     3.0  1200939977             3.0
4              1.0     477       24     4.0  1201159341             4.0


In [33]:
#Per movie, add up the similarity of the top users who rated it and their similarity-weighted ratings.
#The two columns are selected before summing so that userId, rating and timestamp are not summed
#for nothing, and the result columns are renamed by name rather than by position.
tempTopUsersRating = (
    topUsersRating
    .groupby('movieId')[['similarityIndex', 'weightedRating']]
    .sum()
    .rename(columns={'similarityIndex': 'sum_similarityIndex',
                     'weightedRating': 'sum_weightedRating'})
)
print(tempTopUsersRating.head())

         sum_similarityIndex  sum_weightedRating
movieId                                         
1                   6.589315           25.590806
2                   4.589315           13.221471
3                   2.970725            8.912176
5                   0.970725            2.912176
6                   0.000000            0.000000


In [34]:
#Total similarity of the users who rated each movie: the denominator of the weighted average
similaritySum = tempTopUsersRating['sum_similarityIndex']

#Now we take the weighted average. It is only defined for a strictly positive total weight, so
#every other denominator is masked to NaN first. Without the mask a zero total paired with a
#non-zero weighted sum gives +/-inf, which would survive any later "score > threshold" filter,
#and a negative total gives a number that is not an average of the ratings at all.
weightedAverage = tempTopUsersRating['sum_weightedRating']/similaritySum.where(similaritySum > 0)

#One row per movie (indexed by movieId), with the movieId repeated as an ordinary column
recommendation_df = weightedAverage.to_frame('weighted average recommendation score')
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

         weighted average recommendation score  movieId
movieId                                                
1                                     3.883682        1
2                                     2.880925        2
3                                     3.000000        3
5                                     3.000000        5
6                                          NaN        6
7                                          NaN        7
9                                          NaN        9
10                                    4.000000       10
11                                         NaN       11
12                                    2.000000       12


In [42]:
# Best-scoring movies first.
recommendation_df = recommendation_df.sort_values('weighted average recommendation score', ascending=False)

# Keep only the movies whose score is above 4.9.
recommendation_df_sorted = recommendation_df.loc[
    lambda scores: scores['weighted average recommendation score'] > 4.9
]
print(recommendation_df_sorted)

         weighted average recommendation score  movieId
movieId                                                
2991                                       5.0     2991
1147                                       5.0     1147
47200                                      5.0    47200
7247                                       5.0     7247
1250                                       5.0     1250
2394                                       5.0     2394
2524                                       5.0     2524
4396                                       5.0     4396
4454                                       5.0     4454
1078                                       5.0     1078
4623                                       5.0     4623
51709                                      5.0    51709
148881                                     5.0   148881
2013                                       5.0     2013
1031                                       5.0     1031
56782                                      5.0  

In [44]:
# Look up the catalogue entries of the movies that passed the score threshold.
recommended_movie = movies_df[movies_df['movieId'].isin(recommendation_df_sorted['movieId'])]

# We don't want to recommend the same movie: userSubset only holds ratings of the
# input user's own movies, so its movieIds are the ones to leave out.
recommended_movie = recommended_movie[~recommended_movie['movieId'].isin(userSubset['movieId'])]

print(recommended_movie)

      movieId                                              title  \
52         58                  Postman, The (Postino, Il) (1994)   
515       599                             Wild Bunch, The (1969)   
788      1031                    Bedknobs and Broomsticks (1971)   
818      1078                                     Bananas (1971)   
867      1147                          When We Were Kings (1996)   
910      1209  Once Upon a Time in the West (C'era una volta ...   
928      1227                 Once Upon a Time in America (1984)   
949      1250               Bridge on the River Kwai, The (1957)   
955      1256                                   Duck Soup (1933)   
961      1262                           Great Escape, The (1963)   
1031     1343                                   Cape Fear (1991)   
1488     2013                     Poseidon Adventure, The (1972)   
1701     2288                                  Thing, The (1982)   
1768     2366                                   