In [7]:
import pandas as pd
import numpy as np
from math import sqrt

In [13]:
books_df = pd.read_csv('BX-Books.csv',encoding='latin-1',on_bad_lines='skip')
ratings_df = pd.read_csv('BX-Book-Ratings.csv',encoding='latin-1',on_bad_lines='skip')
books_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228148 entries, 0 to 228147
Data columns (total 1 columns):
 #   Column                                                                                                       Non-Null Count   Dtype 
---  ------                                                                                                       --------------   ----- 
 0   ISBN;"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L"  228148 non-null  object
dtypes: object(1)
memory usage: 1.7+ MB


In [22]:
userInput = [{'Title':'Classical Mythology', 'Rating':'0'},
            {'Title':'Clara Callan', 'Rating':'5'},
            {'Title':'Jane Doe','Rating':'5'}]
        
inputBooks = pd.DataFrame(userInput)
print(inputBooks)

                 Title Rating
0  Classical Mythology      0
1         Clara Callan      5
2             Jane Doe      5


In [24]:
inputID = books_df[books_df['Book-Title'].isin(inputBooks['Title'].tolist())]
inputBooks = pd.merge(inputID,inputBooks)
inputBooks = inputBooks[['ISBN','Title','Rating']]
print(inputBooks)

KeyError: 'Book-Title'

In [16]:
userSubset = ratings_df[ratings_df['ISBN'].isin(inputBooks['ISBN'].tolist())]
print(userSubset.groupby('ISBN').count())

KeyError: 'ISBN'

In [None]:
userSubsetGroup = userSubset.groupby(['ISBN'])

def take_3_elem(x):
    # print (len(x[1]))
    return len(x[1])
    
userSubsetGroup = sorted(userSubsetGroup, key=take_3_elem, reverse=True)

userSubsetGroup = userSubsetGroup[0:100]
print(userSubsetGroup[0:3])

In [None]:
pearsonCorrelationDict = {}

#For every user group in our subset
for name, group in userSubsetGroup:

    #Let's start by sorting the input and current user group so the values aren't mixed up later on
    group = group.sort_values(by='ISBN')
    inputBooks = inputBooks.sort_values(by='ISBN')

    #Get the N for the formula
    nRatings = len(group)

    #Get the review scores for the movies that they both have in common
    temp_df = inputBooks[inputBooks['ISBN'].isin(group['ISBN'].tolist())]

    #And then store them in a temporary buffer variable in a list format to facilitate future calculations
    tempRatingList = temp_df['rating'].tolist()
   
    #Let's also put the current user group reviews in a list format
    tempGroupList = group['rating'].tolist()
   
    
    #Now let's calculate the pearson correlation between two users, so called, x and y manually (check the formula from week 7 slide)
    Sxx = sum([i**2 for i in tempRatingList]) - pow(sum(tempRatingList),2)/float(nRatings)
    Syy = sum([i**2 for i in tempGroupList]) - pow(sum(tempGroupList),2)/float(nRatings)
    Sxy = sum( i*j for i, j in zip(tempRatingList, tempGroupList)) - sum(tempRatingList)*sum(tempGroupList)/float(nRatings)

    #If the denominator is different than zero, then divide, else, 0 correlation.
    if Sxx != 0 and Syy != 0:
        pearsonCorrelationDict[name] = Sxy/sqrt(Sxx*Syy)
    else:
        pearsonCorrelationDict[name] = 0

In [None]:
pearsonDF = pd.DataFrame.from_dict(pearsonCorrelationDict, orient='index')
pearsonDF.columns = ['similarityIndex']
pearsonDF['userId'] = pearsonDF.index
pearsonDF.index = range(len(pearsonDF))
print(pearsonDF.head())

In [None]:
topUsers=pearsonDF.sort_values(by='similarityIndex', ascending=False)[0:50]
print(topUsers.head())

In [None]:
topUsersRating=topUsers.merge(ratings_df, left_on='userId', right_on='userId', how='inner')
print(topUsersRating.head(100))

In [None]:
topUsersRating['weightedRating'] = topUsersRating['similarityIndex']*topUsersRating['rating']
print(topUsersRating.head())

In [None]:
tempTopUsersRating = topUsersRating.groupby('movieId').sum()[['similarityIndex','weightedRating']]
tempTopUsersRating.columns = ['sum_similarityIndex','sum_weightedRating']
print(tempTopUsersRating.head())

In [None]:
recommendation_df = pd.DataFrame()

recommendation_df['weighted average recommendation score'] = tempTopUsersRating['sum_weightedRating']/tempTopUsersRating['sum_similarityIndex']
recommendation_df['movieId'] = tempTopUsersRating.index
print(recommendation_df.head(10))

In [None]:
recommendation_df = recommendation_df.sort_values(by='weighted average recommendation score', ascending=False)
print(recommendation_df)