In [1]:
import sys
import time

import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

---
Nearest Neighbor Recommendation System
=====
***

http://www2.informatik.uni-freiburg.de/~cziegler/BX/

<img src="./Assets/bookcrossing.png" />

#####Load the 3 files

In [2]:
#df_users = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Users.csv", delimiter=';')

In [3]:
#df_book_ratings = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Book-Ratings.csv", delimiter=';')

In [4]:
#df_books = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Books.csv", delimiter=';')

#####Have a look at each of the file attributes

In [1]:
#df_users.info()

In [2]:
#df_book_ratings.info()

In [3]:
#df_books.info()

In [4]:
#df_book_ratings.head(2)

In [5]:
#df_users.head(2)

In [6]:
#df_books.head(2)

#####To manage this amount of data in Python requires some consideration
#####While Pandas data frames are convenient, they can also be slow
#####Let's make a new data frame consisting of just the ISBN number and book title

In [7]:
#df_bk = df_books[["ISBN", "Book-Title"]]

In [8]:
#df_bk.head(2)

#####We are going to be looking up book titles using ISBN numbers
#####Pandas indexing is fast, so make the ISBN number the index

In [9]:
#isb = df_bk['ISBN']
#df_bk.index = isb
#del(df_bk["ISBN"])

In [10]:
#df_bk.head()

#####Create an empty title list

In [11]:
#title_list = list(np.zeros(len(df_book_ratings["ISBN"])))

#####Now create a fast routine for creating a list of titles - we'll see why this is a good idea in a few cells time!!

In [16]:
#Build the list of book titles, one entry per row of the ratings data frame.
#Relies on df_book_ratings, df_bk (indexed by ISBN) and a pre-allocated title_list from earlier cells.
count = 0
#run through the books in the ratings data frame
for count, isbn in enumerate(df_book_ratings["ISBN"], 1):

    try:
        #see if there is a title in the books data frame, use the isbn to index into the table efficiently
        #(.loc is the label-based indexer; the older .ix indexer no longer exists in current pandas)
        title = df_bk.loc[isbn, "Book-Title"]

    except KeyError:
        #if a key error is generated then there is no title to match the isbn number, make up a title!!
        title = "missing title, isbn = " + str(isbn)

    #Now fill up the empty title list; count is 1-based, the list position is 0-based
    title_list[count - 1] = title

    if count % 100000 == 0:
        #give encouragement to the user that something is happening
        #written straight to stdout and flushed so the dots appear immediately, without pausing the loop
        sys.stdout.write('. ')
        sys.stdout.flush()

. . . . . . . . . . .


#####Check out the title list

In [12]:
#print len(title_list)

In [13]:
#title_list[0:5]

#####Initialize all the internal dictionaries, i.e. we are making a dictionary of dictionaries

In [14]:
#users = {}
#for user in df_users['User-ID']:
#    users[str(user)] = {}

#####Produce a ratings list

In [15]:
#ratings_list = df_book_ratings["Book-Rating"].values
#print len(ratings_list)

#####Produce an ISBN list

In [16]:
#isbn_list = df_book_ratings["ISBN"]
#print len(isbn_list)

#####Now traverse the ratings and populate the users dictionary

---

In [18]:
#def get_book_title(df_books, isbn):
#    b = df_books[df_books.ISBN == isbn]
#    if len(b.index.values) > 0:
#        return b['Book-Title'].values[0]
#    else:
#        return "missing book title, isbn = {:s}".format(isbn)

#i = 0;
#userid = df_book_ratings['User-ID']
    
#%time ratingA = df_book_ratings.iloc[i]['Book-Rating']
    
#%time isbn = df_book_ratings.iloc[i]['ISBN']
        
#%time book_title = get_book_title(df_books, isbn)

#print "\n"

#%time rating = ratings_list[i]

#%time isbn = isbn_list[i]
    
#%time book_title = title_list[i]

#####Let's compare in detail the next two cells
#####These 2 cells do the same thing!!

In [24]:
def get_book_title(df_books, isbn):
    '''Look up the title for an ISBN by scanning the ISBN column of the books data frame.

    Returns the "Book-Title" of the first matching row, or a placeholder string
    naming the ISBN when no row matches.'''

    #titles of every row whose ISBN equals the one requested
    matching_titles = df_books.loc[df_books.ISBN == isbn, 'Book-Title'].values

    #nothing matched: make up a title so the caller always gets a string
    if len(matching_titles) == 0:
        return "missing book title, isbn = {:s}".format(isbn)

    return matching_titles[0]

#Slow version: populate the users dictionary by reading every value straight from the data frames.
#Relies on df_book_ratings, df_books, users and get_book_title from earlier cells.
count = 0

#traverse the ratings data frame
for i, userid in enumerate(df_book_ratings['User-ID']):

    #report progress to the user every 100 rows
    #written straight to stdout and flushed so the dots appear immediately, without pausing the loop
    if i > 0 and i % 100 == 0:
        sys.stdout.write('. ')
        sys.stdout.flush()
        count += 100

    #The alternative to using a ratings list is to use the data frame directly
    #(the value must be named rating, because that is the name stored in the dictionary below)
    rating = df_book_ratings.iloc[i]['Book-Rating']

    #The alternative to using an isbn list is to use the data frame directly
    isbn = df_book_ratings.iloc[i]['ISBN']

    #The alternative to using the book title list is to use a routine to look up the book title
    book_title = get_book_title(df_books, isbn)

    #Add to the users dictionary and update the user's dictionary with book titles they have read and their ratings
    # for those books; setdefault creates the inner dictionary for a user ID that has no entry yet
    index = str(userid)
    users.setdefault(index, {})[book_title] = rating

    #this slow path is only a demonstration, so stop once more than 500 rows have been counted
    if count > 500:
        break

. . . . . .


---

In [25]:
#Fast version: populate the users dictionary from the pre-built ratings and title lists.
#Relies on df_book_ratings, ratings_list, title_list and users from earlier cells.
#traverse the ratings data frame
for i, userid in enumerate(df_book_ratings['User-ID']):

    #report progress to the user
    #written straight to stdout and flushed so the dots appear immediately, without pausing the loop
    if i > 0 and i % 100000 == 0:
        sys.stdout.write('. ')
        sys.stdout.flush()

    #The alternative to using a ratings list is to use the data frame directly
    #ratingA = df_book_ratings.iloc[i]['Book-Rating']
    rating = ratings_list[i]

    #The alternative to using an isbn list is to use the data frame directly
    #isbn = df_book_ratings.iloc[i]['ISBN']
    #(no ISBN lookup is done here: the title list is already in row order, so the ISBN is never used)

    #The alternative to using the book title list is to use a routine to look up the book title
    #book_title = get_book_title(df_books, isbn)
    book_title = title_list[i]

    #Add to the users dictionary and update the user's dictionary with book titles they have read and their ratings
    # for those books; setdefault creates the inner dictionary for a user ID that has no entry yet
    index = str(userid)
    users.setdefault(index, {})[book_title] = rating

. . . . . . . . . . .


In [19]:
#users['8']

###These are unused functions for calculating similarity

In [27]:
def calc_minkowski_dist(rating1, rating2, r):
    '''Minkowski distance of order r between two users, measured over the books both have rated.

    rating1 and rating2 map a book to its rating. With r = 1 this is the Manhattan distance and
    with r = 2 the euclidean distance. Returns 0 when the two users share no books.'''

    #absolute rating gap for every book in the first user's list that the second user has also rated
    gaps = [abs(rating1[book] - rating2[book]) for book in rating1 if book in rating2]

    #without common ratings there is nothing to calculate
    if not gaps:
        return 0

    if r == 1:
        #manhattan distance: the gaps are summed as they are
        total = sum(gaps)
    else:
        #general case: every gap is raised to the power r before summing
        total = sum(np.power(gap, r) for gap in gaps)

    return np.power(total, 1.0/r)

In [56]:
def calc_cc(rating1, rating2, in_common):
    '''This routine calculates a Pearsons correlation coefficient between the 2 lists of books from the 2 users

    rating1 and rating2 map a book to its rating; only books present in both are compared.
    in_common is the minimum number of shared books required.

    Returns the correlation coefficient, or 0 when it cannot be calculated: too few shared books,
    fewer than 2 shared books (the coefficient needs at least 2 points), or a user whose shared
    ratings are all the same value (the coefficient is undefined for a constant list).'''

    #for every key in the first users list, check to see if the book is in the second user's list
    shared_books = [key for key in rating1 if key in rating2]

    #We insist that the 2 users must have a least in_common number of books in common,
    #and never fewer than 2 because pearsonr cannot work with a single point
    if len(shared_books) < max(in_common, 2):
        return 0

    #create the vectors of ratings
    a_list = [rating1[key] for key in shared_books]
    b_list = [rating2[key] for key in shared_books]

    #a list with no variation has no defined correlation, so treat it as "no similarity"
    if len(set(a_list)) < 2 or len(set(b_list)) < 2:
        return 0

    #r is the correlation coefficient, and p is the p-value (which we ignore)
    r, p = pearsonr(a_list, b_list)

    #guard against any remaining undefined result so callers always get a usable number
    if np.isnan(r):
        return 0

    return(r)

In [29]:
def user_ratings(userid, users, N):
    '''This function prints the top N ratings of a user, with the highest rating first

    userid is a key of users, and users[userid] maps a book title to its rating.
    One line is printed per book: the title (cut to 99 characters and padded to 100),
    a tab, then the rating as an integer. Nothing is returned.
    Raises KeyError when userid is not in users.'''

    #get the ratings as (title, rating) pairs, sort with the highest rating first, keep the top N
    top_rated = sorted(users[userid].items(), key = lambda x: x[1], reverse = True)[:N]

    #print - the call form with a single argument behaves the same under Python 2 and Python 3
    for title, rating in top_rated:
        print("{:100s}\t{:d}".format(title[0:99], int(rating)))

In [30]:
def calc_cs(rating1, rating2, in_common):
    '''This function implements cosine similarity, between the 2 lists of rated books from the 2 users

    rating1 and rating2 map a book to its rating; only books present in both are compared.
    in_common is the minimum number of shared books required.

    Returns the cosine similarity as a single number, or 0 when the users share no books,
    share fewer than in_common books, or when either user's shared ratings are all zero.'''

    #for every key in the first users list, check to see if the book is in the second user's list
    shared_books = [key for key in rating1 if key in rating2]

    #This time we insist that the 2 users must have a least in_common number of books in common
    if not shared_books or len(shared_books) < in_common:
        return 0

    #create the vectors of ratings
    a_vec = np.array([rating1[key] for key in shared_books], dtype = float)
    b_vec = np.array([rating2[key] for key in shared_books], dtype = float)

    #the similarity is calculated directly from the two vectors so that the result is one number
    #(sklearn's cosine_similarity expects 2-dimensional input and hands back a matrix)
    norm_product = np.linalg.norm(a_vec) * np.linalg.norm(b_vec)

    #an all-zero vector has no direction; report "no similarity" instead of dividing by zero
    if norm_product == 0:
        return 0

    return(np.dot(a_vec, b_vec) / norm_product)

#####Compare cosine similarity with Pearson's correlation coefficient
#####Which do you like better and why?

In [54]:
def compute_closest_person(userid, users, in_common = 1):
    '''Score every other user against userid and return the ones with a non-zero score.

    users maps a user ID to that user's {book: rating} dictionary. The result is a list of
    (score, user) tuples sorted with the highest score first; when it is empty a message is
    printed and the empty list is returned.'''

    #this list holds the (score, user) pairs for the other users
    mdist_list = []

    #traverse the main dictionary of users
    for user in users:

        #never compare the requested user with itself
        if user == userid:
            continue

        #Pearson's correlation coefficient is the measure of closeness in use; in_common says
        #how many books the two users must share. The other measures can be swapped in here:
        #mdist = calc_minkowski_dist(users[user], users[userid], 2)
        #mdist = calc_cs(users[user], users[userid], in_common)
        mdist = calc_cc(users[user], users[userid], in_common)

        #only scores that are not zero are recorded
        if np.abs(mdist) > 0:
            mdist_list.append((mdist, user))

    if mdist_list:
        #highest score first
        mdist_list.sort(reverse = True)
    else:
        print("no matches found with those search criteria")

    return mdist_list

In [95]:
def recommend(user_id, users, in_common = 2, number_of_recommendations = 10):
    '''Recommend books to user_id taken from the ratings of their nearest neighbor.

    users maps a user ID to that user's {book title: rating} dictionary. in_common is passed on to
    compute_closest_person, and number_of_recommendations caps the length of the recommendation list.

    Always returns a 2-tuple (nearest_list, recommendations) so the result can be unpacked into
    two names: nearest_list is whatever compute_closest_person returned, and recommendations is a
    list of (book title, neighbor's rating) pairs, highest rating first, for books the neighbor has
    rated and user_id has not. When no neighbor is found both lists are empty.
    The ID of the chosen neighbor is printed.'''

    #get the ID of the nearest person with in_common ratings
    nearest_list = compute_closest_person(user_id, users, in_common)

    #no neighbor: still hand back two (empty) lists so callers can unpack the result
    if len(nearest_list) == 0:
        return ([], [])

    nearest_id = nearest_list[0][1]

    print(nearest_id)

    #Now get the list of books for each - the requested user-ID and their nearest neighbor
    neighbor_ratings = users[nearest_id]
    own_ratings = users[user_id]

    #keep the neighbor's books that haven't already been reviewed by the requested user
    recommendations = [(book_title, neighbor_ratings[book_title])
                       for book_title in neighbor_ratings
                       if book_title not in own_ratings]

    #sort the list based on the rating, returning the highest rated book first
    sr = sorted(recommendations, key = lambda x: x[1], reverse = True)[:number_of_recommendations]

    return (nearest_list, sr)

In [97]:
def recommend_for_specific_match(useridA, useridB, users, number_of_recommendations = 10):
    '''Recommend books to useridA out of the books rated by useridB.

    users maps a user ID to that user's {book title: rating} dictionary. Returns a list of
    (book title, rating given by useridB) pairs, highest rating first and at most
    number_of_recommendations long, for books useridB has rated and useridA has not.
    An empty list is returned when either user has no ratings or there is nothing new to suggest.'''

    #the second user supplies the candidate books
    neighbor_ratings = users[useridB]
    if not neighbor_ratings:
        return []

    #the first user's own ratings tell us which books to leave out
    user_ratings = users[useridA]
    if not user_ratings:
        return []

    #candidate books the first user has not rated yet, paired with the second user's rating
    unseen = [(book_title, rating)
              for book_title, rating in neighbor_ratings.items()
              if book_title not in user_ratings]

    #best rated first, then cut down to the number asked for
    unseen.sort(key = lambda pair: pair[1], reverse = True)
    return unseen[:number_of_recommendations]

In [84]:
def evaluate_comparison(user_id1, user_id2, users, N = 0):
    '''This function compares the book ratings for 2 users

    users maps a user ID (a string) to that user's {book title: rating} dictionary.
    For every book both users have rated, the title is printed, followed by a line with both
    ratings and a blank separator. N limits how many books are shown; N = 0 shows them all.
    Nothing is returned.'''

    ul1 = users[user_id1]
    ul2 = users[user_id2]

    count = 0

    for b in ul1:

        #only books rated by both users are of interest
        if b not in ul2:
            continue

        #print calls with a single argument behave the same under Python 2 and Python 3
        print("{:100s}".format(b))

        #ratings are cast with int() because the {:5d} format only accepts integers
        print("Users {:s} & {:s} ratings = {:5d} {:5d}".format(user_id1, user_id2,
                                                               int(ul1[b]), int(ul2[b])))
        print("\n")

        count += 1

        #stop once N books have been shown, unless N is 0 (no limit)
        if N != 0 and count >= N:
            return

In [20]:
#closest_list = compute_closest_person('8', users, in_common = 5)

#show_top = len(closest_list)
#if show_top > 10:
#    show_top = 10
    
#for i in xrange(show_top):
#    print "User ID: {:10s} Similarity = {:5.5f}".format(closest_list[i][1], float(closest_list[i][0]))

In [21]:
#evaluate_comparison('8', '11676', users)

In [22]:
#a = [0,0,0,0,5]
#b = [9,8,8,0,8]
#print cosine_similarity(a, b).ravel()[0]
#print pearsonr(a, b)[0]

In [23]:
#recommend('8', users, in_common = 5, number_of_recommendations = 5)

In [24]:
#user_ratings('171118', users, 10)

In [25]:
#neighbor_list, recommendation_list = recommend('171118', users, in_common = 10, number_of_recommendations = 5)

In [26]:
#for i in xrange(5):
#    print "{:20s} {:5.4f}".format(neighbor_list[i][1], float(neighbor_list[i][0]))

In [27]:
#evaluate_comparison('171118', '21659', users, 0)

In [28]:
#evaluate_comparison('171118', '62881', users, 0)

In [29]:
#a = [0,0,0,1,0,0,0,0,0,0,0]
#b = [0,0,0,10,0,0,0,0,0,0,0]
#print cosine_similarity(a, b).ravel()[0]
#print pearsonr(a, b)[0]

In [30]:
#evaluate_comparison('171118', '196148', users, 0)

In [31]:
#a = [8,9,7,7,8,8,8,8,8,8,0,7]
#b = [10,10,10,10,10,10,9,10,10,10,0,8]
#print cosine_similarity(a,b).ravel()[0]
#print pearsonr(a,b)[0]

In [32]:
#recommend_for_specific_match('171118', '196148', users, number_of_recommendations = 5)

In [33]:
#neighbor_list, recommendation_list = recommend('171118', users, in_common = 10, number_of_recommendations = 5)
#print recommendation_list