In [2]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import time
from scipy.stats import pearsonr
from sklearn.metrics.pairwise import cosine_similarity

---
Nearest Neighbor Recommendation System
=====
***

http://www2.informatik.uni-freiburg.de/~cziegler/BX/

<img src="./Assets/bookcrossing.png" />

#####Load the 3 files

In [3]:
# Read the semicolon-delimited users table into a DataFrame.
# NOTE(review): the path is absolute and machine-specific - point it at your local copy of the BX-CSV-Dump.
df_users = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Users.csv", delimiter=';')

In [4]:
# Read the semicolon-delimited ratings table into a DataFrame.
# NOTE(review): the path is absolute and machine-specific - point it at your local copy of the BX-CSV-Dump.
df_book_ratings = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Book-Ratings.csv", delimiter=';')

In [5]:
# Read the semicolon-delimited books table into a DataFrame.
# NOTE(review): the path is absolute and machine-specific - point it at your local copy of the BX-CSV-Dump.
df_books = pd.read_csv("/Users/mrgholt/GADS-22-NYC/Datasets/BX-CSV-Dump/BX-Books.csv", delimiter=';')

#####Have a look at each of the file attributes

In [6]:
# Summarise the users table: number of rows, column dtypes and non-null counts.
df_users.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 278858 entries, 0 to 278857
Data columns (total 3 columns):
User-ID     278858 non-null int64
Location    278858 non-null object
Age         168096 non-null float64
dtypes: float64(1), int64(1), object(1)
memory usage: 8.5+ MB


In [7]:
# Summarise the ratings table: number of rows, column dtypes and non-null counts.
df_book_ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1149780 entries, 0 to 1149779
Data columns (total 3 columns):
User-ID        1149780 non-null int64
ISBN           1149780 non-null object
Book-Rating    1149780 non-null int64
dtypes: int64(2), object(1)
memory usage: 35.1+ MB


In [8]:
# Summarise the books table: number of rows, column dtypes and non-null counts.
df_books.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 271379 entries, 0 to 271378
Data columns (total 8 columns):
ISBN                   271379 non-null object
Book-Title             271379 non-null object
Book-Author            271379 non-null object
Year-Of-Publication    271379 non-null int64
Publisher              271377 non-null object
Image-URL-S            271379 non-null object
Image-URL-M            271379 non-null object
Image-URL-L            271379 non-null object
dtypes: int64(1), object(7)
memory usage: 18.6+ MB


In [9]:
# Peek at the first two rating rows (user id, ISBN, rating).
df_book_ratings.head(2)

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5


In [10]:
# Peek at the first two user rows (user id, location, age).
df_users.head(2)

Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0


In [11]:
# Peek at the first two book rows.
df_books.head(2)

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...


#####To manage this amount of data in Python requires some consideration
#####While Pandas data frames are convenient, they can also be slow
#####Let's make a new data frame consisting of just the ISBN number and book title

In [12]:
# Keep only the two columns needed for ISBN -> title lookups.
# .copy() makes df_bk an independent frame, so later changes to df_bk cannot
# touch df_books or trigger a SettingWithCopyWarning.
df_bk = df_books[["ISBN", "Book-Title"]].copy()

In [13]:
# Check the slimmed-down ISBN / title frame.
df_bk.head(2)

Unnamed: 0,ISBN,Book-Title
0,195153448,Classical Mythology
1,2005018,Clara Callan


#####We are going to be looking up book titles using ISBN numbers
#####Pandas indexing is fast, so make the ISBN number the index

In [14]:
# Make ISBN the index (removing it as a column) so titles can be looked up by label.
# set_index returns a new frame rather than modifying the existing one in place.
df_bk = df_bk.set_index("ISBN")

In [15]:
# Confirm that the ISBN is now the index and Book-Title the only column.
df_bk.head()

Unnamed: 0_level_0,Book-Title
ISBN,Unnamed: 1_level_1
195153448,Classical Mythology
2005018,Clara Callan
60973129,Decision in Normandy
374157065,Flu: The Story of the Great Influenza Pandemic...
393045218,The Mummies of Urumchi


#####Create an empty title list

In [16]:
# Pre-allocate one slot per rating row; None marks a title that has not been filled in yet.
title_list = [None] * len(df_book_ratings["ISBN"])

#####Now create a fast routine for processing the book ratings data frame

In [17]:
count = 0
#run through the books in the ratings data frame
for isbn in df_book_ratings["ISBN"]:
    
    try:
        #see if there is a title in the books data frame, use the isbn to index into the table efficiently
        title =  df_bk.ix[isbn]["Book-Title"]
    
    except KeyError:
        #if a key error is generated then there is no title to match the isbn number, make up a title!!
        title = "missing title, isbn = " + str(isbn)
    
    #Now fill up the empty title list, the memory for which has been created 2 cells up
    title_list[count] = title
    count += 1
    if count % 100000 == 0:
        #give encouragement to the user that something is happening
        print '.',
        time.sleep(1)

. . . . . . . . . . .


#####Check out the title list

In [18]:
# Number of titles collected.
print len(title_list)

1149780


In [19]:
# Look at the first five titles.
title_list[0:5]

['Flesh Tones: A Novel',
 'Rites of Passage',
 'The Notebook',
 'Help!: Level 1',
 'The Amsterdam Connection : Level 4 (Cambridge English Readers)']

#####Initialize all the internal dictionaries, i.e. we are making a dictionary of dictionaries

In [20]:
# One (initially empty) inner dictionary per user, keyed by the user id as a string.
users = {str(user_id): {} for user_id in df_users['User-ID']}

#####Produce a ratings list

In [21]:
# Pull the ratings column out as a NumPy array (.values) for positional indexing.
ratings_list = df_book_ratings["Book-Rating"].values
# Show how many ratings there are.
print len(ratings_list)

1149780


#####Produce an ISBN list

In [22]:
isbn_list = df_book_ratings["ISBN"]
print len(isbn_list)

1149780


#####Now traverse the ratings and populate the users dictionary

---

In [23]:
def get_book_title(df_books, isbn):
    '''Return the title of the first row of df_books whose ISBN column equals isbn.

    When no row matches, a "missing book title, isbn = ..." placeholder string is
    returned instead.
    '''
    matching_rows = df_books[df_books.ISBN == isbn]

    #no book with this isbn: hand back the placeholder
    if not len(matching_rows.index.values) > 0:
        return "missing book title, isbn = {:s}".format(isbn)

    titles = matching_rows['Book-Title'].values
    return titles[0]

# Time a single-row lookup (row 0) both ways: through the data frames, then through the pre-built lists.
i = 0;
userid = df_book_ratings['User-ID']
    
# data-frame route: rating, isbn, then a scan of df_books for the title
%time ratingA = df_book_ratings.iloc[i]['Book-Rating']
    
%time isbn = df_book_ratings.iloc[i]['ISBN']
        
%time book_title = get_book_title(df_books, isbn)


# list route: plain indexing into ratings_list, isbn_list and title_list
%time rating = ratings_list[i]

%time isbn = isbn_list[i]
    
%time book_title = title_list[i]

CPU times: user 316 µs, sys: 58 µs, total: 374 µs
Wall time: 342 µs
CPU times: user 202 µs, sys: 68 µs, total: 270 µs
Wall time: 235 µs
CPU times: user 27.3 ms, sys: 1.77 ms, total: 29.1 ms
Wall time: 28.8 ms
CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 7.15 µs
CPU times: user 2.72 ms, sys: 84 µs, total: 2.8 ms
Wall time: 3.29 ms
CPU times: user 2 µs, sys: 1 µs, total: 3 µs
Wall time: 5.96 µs


In [24]:
def get_book_title(df_books, isbn):
    '''Return the title of the first row of df_books whose ISBN column equals isbn.

    When no row matches, a "missing book title, isbn = ..." placeholder string is
    returned instead.

    NOTE(review): this repeats, line for line, the definition in the previous cell.
    '''
    b = df_books[df_books.ISBN == isbn]
    if len(b.index.values) > 0:
        return b['Book-Title'].values[0]
    else:
        return "missing book title, isbn = {:s}".format(isbn)

count = 0

#traverse the ratings data frame
for i, userid in enumerate(df_book_ratings['User-ID']):
    
    #report progress to the user
    if (i > 0) & (i % 100 == 0):
        print '.',
        count += 100
        time.sleep(1)
    
    #The alternative to using a ratings list is to use the data frame directly
    ratingA = df_book_ratings.iloc[i]['Book-Rating']
    
    #The alternative to using an isbn list is to use the data frame directly
    isbn = df_book_ratings.iloc[i]['ISBN']
        
    #The alternative to using the book title list is to use a routine to look up the book title
    book_title = get_book_title(df_books, isbn)
        
    #Add to the users dictionary and update the user's dictionary with book titles they have read and their ratings
    # for those books
    index = str(userid)
    users[index].update({book_title: rating})
    
    if count > 500:
        break

. . . . . .


---

In [25]:
#traverse the ratings data frame
for i, userid in enumerate(df_book_ratings['User-ID']):
    
    #report progress to the user
    if (i > 0) & (i % 100000 == 0):
        print '.',
        time.sleep(1)
    
    #The alternative to using a ratings list is to use the data frame directly
    #ratingA = df_book_ratings.iloc[i]['Book-Rating']
    rating = ratings_list[i]
    
    #The alternative to using an isbn list is to use the data frame directly
    #isbn = df_book_ratings.iloc[i]['ISBN']
    isbn = isbn_list[i]
    
    #The alternative to using the book title list is to use a routine to look up the book title
    #book_title = get_book_title(df_books, isbn)
    book_title = title_list[i]
    
    #Add to the users dictionary and update the user's dictionary with book titles they have read and their ratings
    # for those books
    index = str(userid)
    users[index].update({book_title: rating})

. . . . . . . . . . .


In [26]:
# Inspect the {book title: rating} dictionary built for user '8'.
users['8']

{"A Second Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series)": 0,
 'Clara Callan': 5,
 'Decision in Normandy': 0,
 'Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It': 0,
 'Goodbye to the Buttermilk Sky': 7,
 "Hitler's Secret Bankers: The Myth of Swiss Neutrality During the Holocaust": 0,
 'Jane Doe': 5,
 'More Cunning Than Man: A Social History of Rats and Man': 6,
 'Nights Below Station Street': 0,
 'PLEADING GUILTY': 0,
 "The Kitchen God's Wife": 0,
 'The Middle Stories': 5,
 'The Mummies of Urumchi': 0,
 'The Witchfinder (Amos Walker Mystery Series)': 6,
 'Under the Black Flag: The Romance and the Reality of Life Among the Pirates': 0,
 "What If?: The World's Foremost Military Historians Imagine What Might Have Been": 0,
 "Where You'll Find Me: And Other Stories": 5,
 'missing title, isbn = 0771025661': 0}

###Functions for calculating similarity (only `calc_cc` is called below; the others are alternatives)

In [47]:
def calc_minkowski_dist(rating1, rating2, r, in_common):
    '''Minkowski distance of order r between two users, averaged over their shared books.

    Only books that appear in both rating dictionaries contribute.  r = 1 gives the
    Manhattan distance and r = 2 the euclidean distance; in both cases the result is
    divided by the number of shared books.  Returns 0 when the users share no books,
    or fewer than in_common of them.
    '''

    #absolute rating differences for every book of the first user that the second also rated
    gaps = [abs(rating1[title] - rating2[title]) for title in rating1 if title in rating2]
    shared = len(gaps)

    #nothing in common, or less than the required overlap: nothing to calculate
    if shared == 0 or shared < in_common:
        return 0

    if r == 1:
        #manhattan distance: plain sum of the differences
        total = sum(gaps)
    else:
        #general case: sum of the differences raised to the power r
        total = sum(np.power(gap, r) for gap in gaps)

    #r-th root of the sum, averaged over the shared books
    return(np.power(total, 1.0/r)/float(shared))

In [28]:
def calc_cc(rating1, rating2, in_common):
    '''Pearson correlation coefficient between two users' ratings of the books they share.

    rating1, rating2 -- dictionaries mapping book title to rating
    in_common        -- minimum number of shared books required

    Returns the correlation coefficient r (the p-value is discarded), or 0 when the
    coefficient cannot be computed: fewer than in_common shared books, fewer than two
    shared books, or one user giving every shared book the same rating.
    '''

    #the books of the first user that the second user has rated as well
    shared = [key for key in rating1 if key in rating2]

    #insist on the requested overlap; a correlation also needs at least 2 points
    if len(shared) < max(in_common, 2):
        return 0

    #create the paired vectors of ratings
    a_list = [rating1[key] for key in shared]
    b_list = [rating2[key] for key in shared]

    #a constant vector has zero variance, which leaves the coefficient undefined
    if min(a_list) == max(a_list) or min(b_list) == max(b_list):
        return 0

    #r is the correlation coefficient, and p is the p-value (which we ignore)
    r, p = pearsonr(a_list, b_list)

    #never hand a NaN back to the caller
    if np.isnan(r):
        return 0
    return(r)

In [29]:
def user_ratings(userid, users, N):
    '''Print the N highest-rated books of one user, highest rating first.

    userid -- key into the users dictionary
    users  -- dictionary mapping user id to a {book title: rating} dictionary
    N      -- number of books to show

    Each line holds the title (cut to 99 characters, padded to 100), a tab, and the
    rating as an integer.  Nothing is returned.
    '''

    #order the (title, rating) pairs by rating, highest first, and keep the top N
    top_rated = sorted(users[userid].items(), key = lambda pair: pair[1], reverse = True)[:N]

    #single-argument print(...) gives the same output under Python 2 and Python 3
    for title, rating in top_rated:
        print("{:100s}\t{:d}".format(title[0:99], int(rating)))

In [30]:
def calc_cs(rating1, rating2, in_common):
    '''Cosine similarity between two users' ratings of the books they share.

    rating1, rating2 -- dictionaries mapping book title to rating
    in_common        -- minimum number of shared books required

    Returns the similarity as a plain float, or 0 when the users share no books, share
    fewer than in_common books, or when either vector of shared ratings is all zeros.
    '''

    #the books of the first user that the second user has rated as well
    shared = [key for key in rating1 if key in rating2]

    #nothing in common, or less than the required overlap
    if len(shared) == 0 or len(shared) < in_common:
        return 0

    #create the paired vectors of ratings
    a_vec = np.array([rating1[key] for key in shared], dtype = float)
    b_vec = np.array([rating2[key] for key in shared], dtype = float)

    #The similarity is computed directly: the scikit-learn cosine_similarity helper
    #works on 2-D arrays and returns a matrix, so it is a poor fit for two flat
    #vectors and a scalar result.
    norm_product = np.linalg.norm(a_vec) * np.linalg.norm(b_vec)

    #an all-zero vector has no direction; call the similarity 0
    if norm_product == 0:
        return 0

    return float(np.dot(a_vec, b_vec) / norm_product)

#####Compare cosine similarity with Pearson's correlation coefficient
#####Which do you like better and why?

In [66]:
def compute_closest_person(userid, users, in_common = 1):
    '''Return the users most similar to userid, judged by the books they have both rated.

    userid    -- key of the user to find neighbours for
    users     -- dictionary mapping user id to a {book title: rating} dictionary
    in_common -- minimum number of books two users must share to be compared

    Returns a list of (similarity, user id) tuples, largest similarity first.  Users
    whose similarity is 0 are left out; when nobody qualifies a message is printed
    and the returned list is empty.  Raises KeyError if userid is not a key of users.
    '''

    #look the requested user's ratings up once instead of on every pass of the loop
    target_ratings = users[userid]

    #this list holds the similarity measures of the other users
    mdist_list = []

    #traverse the main dictionary of users
    for user in users:

        #obviously don't compare the userid with itself
        if user == userid:
            continue

        #Choose a measure of similarity and use the in_common argument to specify
        #the minimum number of books they must have rated
        #mdist = calc_minkowski_dist(users[user], target_ratings, 1, in_common)
        mdist = calc_cc(users[user], target_ratings, in_common)
        #mdist = calc_cs(users[user], target_ratings, in_common)

        #record the measure and the user as long as the measure is not 0
        if np.abs(mdist) > 0:
            mdist_list.append((mdist, user))

    if mdist_list:
        #largest value first (ties fall back to comparing the user ids)
        mdist_list.sort(reverse = True)
    else:
        print("no matches found with those search criteria")

    return mdist_list

In [106]:
def recommend(user_id, users, in_common = 2, number_of_recommendations = 10):
    '''Recommend books to user_id, taken from the ratings of their nearest neighbour.

    user_id                   -- key of the user to recommend for
    users                     -- dictionary mapping user id to {book title: rating}
    in_common                 -- minimum number of shared books for a neighbour to count
    number_of_recommendations -- how many books to return; 0 means all of them and a
                                 negative value is treated as 1

    Returns a (neighbours, recommendations) tuple.  neighbours is the list of
    (similarity, user id) tuples from compute_closest_person; recommendations is a list
    of (book title, neighbour's rating) tuples, highest rating first, holding only books
    user_id has not rated.  The same two-element shape is returned when there are no
    neighbours or no new books (recommendations is then empty), so a caller can always
    unpack the result into two names.
    '''

    #get the neighbours that share at least in_common ratings, nearest first
    nearest_list = compute_closest_person(user_id, users, in_common)

    if len(nearest_list) == 0:
        return (nearest_list, [])

    nearest_id = nearest_list[0][1]

    #show which neighbour the recommendations are taken from
    print(nearest_id)

    #Now get the books of each - the requested user-ID and their nearest neighbor
    neighbor_ratings = users[nearest_id]
    already_rated = users[user_id]

    #keep the neighbour's books that the user has not reviewed yet
    recommendations = [(book_title, rating)
                       for book_title, rating in neighbor_ratings.items()
                       if book_title not in already_rated]

    if len(recommendations) == 0:
        print("No recommedations found")
        return (nearest_list, [])

    #work out how many books to hand back
    if number_of_recommendations < 0:
        limit = 1
    elif number_of_recommendations == 0 or number_of_recommendations > len(recommendations):
        limit = len(recommendations)
    else:
        limit = number_of_recommendations

    #sort the list based on the rating, returning the highest rated book first
    sr = sorted(recommendations, key = lambda x: x[1], reverse = True)[:limit]

    return (nearest_list, sr)

In [105]:
def recommend_for_specific_match(useridA, useridB, users, number_of_recommendations = 10):
    '''This function takes 2 user-IDs and makes recommendations for the first based on books from the second

    useridA                   -- key of the user to recommend for
    useridB                   -- key of the user whose books are recommended
    users                     -- dictionary mapping user id to {book title: rating}
    number_of_recommendations -- how many books to return; 0 means all of them and a
                                 negative value is treated as 1

    Returns a list of (book title, rating given by useridB) tuples, highest rating
    first, holding only books useridA has not rated.  The list is empty when either
    user has no ratings or when useridB has no book that is new to useridA (a message
    is printed in that last case).
    '''

    neighbor_ratings = users[useridB]
    if len(neighbor_ratings) == 0:
        return []

    already_rated = users[useridA]
    if len(already_rated) == 0:
        return []

    #keep the second user's books that the first user has not rated
    recommendations = [(book_title, rating)
                       for book_title, rating in neighbor_ratings.items()
                       if book_title not in already_rated]

    if len(recommendations) == 0:
        print("No recommedations found")
        return []

    #work out how many books to hand back
    if number_of_recommendations < 0:
        limit = 1
    elif number_of_recommendations == 0 or number_of_recommendations > len(recommendations):
        limit = len(recommendations)
    else:
        limit = number_of_recommendations

    #highest rated book first
    sr = sorted(recommendations, key = lambda x: x[1], reverse = True)[:limit]

    return sr

In [34]:
def evaluate_comparison(user_id1, user_id2, users, N = 0):
    '''This function compares the book ratings for 2 users

    user_id1, user_id2 -- keys (strings) of the two users to compare
    users              -- dictionary mapping user id to {book title: rating}
    N                  -- stop after N shared books; 0 (the default) shows them all

    For every book both users have rated, the title and the two ratings are printed.
    Nothing is returned.
    '''

    ul1 = users[user_id1]
    ul2 = users[user_id2]

    count = 0

    for b in ul1:

        #only books that the second user has rated as well are of interest
        if b not in ul2:
            continue

        print("{:100s}".format(b))
        print("Users {:s} & {:s} ratings = {:5d} {:5d}".format(user_id1, user_id2, ul1[b], ul2[b]))
        print("\n")

        #stop once N shared books have been shown (N = 0 means no limit)
        count += 1
        if N != 0 and count >= N:
            return

In [85]:
closest_list = compute_closest_person('8', users, in_common = 5)

#show at most the ten most similar users
for similarity, neighbor_id in closest_list[:10]:
    print("User ID: {:10s} Similarity = {:5.5f}".format(neighbor_id, float(similarity)))

User ID: 11676      Similarity = 0.21068


In [86]:
# List every book that users '8' and '11676' have both rated, with both ratings.
evaluate_comparison('8', '11676', users)

A Second Chicken Soup for the Woman's Soul (Chicken Soup for the Soul Series)                       
Users 8 & 11676 ratings =     0     9


The Kitchen God's Wife                                                                              
Users 8 & 11676 ratings =     0     8


PLEADING GUILTY                                                                                     
Users 8 & 11676 ratings =     0     8


Nights Below Station Street                                                                         
Users 8 & 11676 ratings =     0     0


Clara Callan                                                                                        
Users 8 & 11676 ratings =     5     8




In [87]:
#the ratings the two users gave to their five shared books, as printed above
a = [0,0,0,0,5]
b = [9,8,8,0,8]
#cosine_similarity works on 2-D arrays (one row per sample), so each vector is wrapped in a list
print(cosine_similarity([a], [b]).ravel()[0])
print(pearsonr(a, b)[0])

0.484182026135
0.2106752429


In [88]:
# Five recommendations for user '8', using only neighbours that share at least 5 books.
recommend('8', users, in_common = 5, number_of_recommendations = 5)

11676


([(0.21067524290009609, '11676')],
 [('Onkel Wolfram. Erinnerungen', 10),
  ("Precious and Few: Volume I in the 'Polly's Heartsongs' Trilogy", 10),
  ('Journey', 10),
  ('Bruno and Boots: The War with Mr. Wizzle', 10),
  ('The Little Zen Companion', 10)])

In [68]:
# Print the ten highest-rated books of user '171118'.
user_ratings('171118', users, 10)

Pride and Prejudice (World's Classics)                                                              	10
Pilgrim at Tinker Creek                                                                             	10
Twelfth Night (Folger Shakespeare Library)                                                          	10
Just So Stories (Penguin Twentieth-Century Classics)                                                	10
The Careful Writer                                                                                  	10
missing title, isbn = 0099430967                                                                    	10
The Griffin &amp; Sabine Trilogy Boxed Set: Griffin &amp; Sabine/Sabine's Notebook/The Golden Mean  	10
Robert Bateman an Artist In Nature                                                                  	10
The Summer Tree (The Fionavar Tapestry, Book 1)                                                     	10
The Visual Display of Quantitative Information                  

In [89]:
# Neighbours of, and five recommendations for, user '171118', requiring at least 10 shared books.
neighbor_list, recommendation_list = recommend('171118', users, in_common = 10, number_of_recommendations = 5)

21659


In [90]:
#show the five nearest neighbours; slicing also copes with a list shorter than five
for similarity, neighbor_id in neighbor_list[:5]:
    print("{:20s} {:5.4f}".format(neighbor_id, float(similarity)))

21659                1.0000
62881                1.0000
196148               0.9662
167349               0.8173
135831               0.8058


In [91]:
# List the books shared with the first neighbour, '21659' (N = 0 shows all of them).
evaluate_comparison('171118', '21659', users, 0)

The Deep End of the Ocean                                                                           
Users 171118 & 21659 ratings =     0     0


missing title, isbn = 0452277205                                                                    
Users 171118 & 21659 ratings =     0     0


Snow Falling on Cedars                                                                              
Users 171118 & 21659 ratings =     0     0


The Pelican Brief                                                                                   
Users 171118 & 21659 ratings =     4    10


The Outsiders                                                                                       
Users 171118 & 21659 ratings =     0     0


A Painted House                                                                                     
Users 171118 & 21659 ratings =     0     0


The Most Wanted                                                                                     
Users 171118 & 21659 ra

In [92]:
# List the books shared with the second neighbour, '62881' (N = 0 shows all of them).
evaluate_comparison('171118', '62881', users, 0)

Franny and Zooey                                                                                    
Users 171118 & 62881 ratings =     0     0


Jurassic Park                                                                                       
Users 171118 & 62881 ratings =     0     0


Couplehood                                                                                          
Users 171118 & 62881 ratings =     0     0


Swim With the Sharks: Without Being Eaten Alive : Outsell, Outmanage, Outmotivate, and Outnegotiate Your Competition
Users 171118 & 62881 ratings =     0     0


The Pelican Brief                                                                                   
Users 171118 & 62881 ratings =     4     5


Airframe                                                                                            
Users 171118 & 62881 ratings =     0     0


Lord of the Flies                                                                                   
Users 1

In [93]:
#two vectors that are zero everywhere except for one shared position
a = [0,0,0,1,0,0,0,0,0,0,0]
b = [0,0,0,10,0,0,0,0,0,0,0]
#cosine_similarity works on 2-D arrays (one row per sample), so each vector is wrapped in a list
print(cosine_similarity([a], [b]).ravel()[0])
print(pearsonr(a, b)[0])

1.0
1.0


In [94]:
# List the books shared with the third neighbour, '196148' (N = 0 shows all of them).
evaluate_comparison('171118', '196148', users, 0)

Scientific Progress Goes 'Boink':  A Calvin and Hobbes Collection                                   
Users 171118 & 196148 ratings =     8    10


Calvin and Hobbes                                                                                   
Users 171118 & 196148 ratings =     9    10


Yukon Ho!                                                                                           
Users 171118 & 196148 ratings =     7    10


There's Treasure Everywhere--A Calvin and Hobbes Collection                                         
Users 171118 & 196148 ratings =     7    10


Something Under the Bed Is Drooling                                                                 
Users 171118 & 196148 ratings =     8    10


Homicidal Psycho Jungle Cat: A Calvin and Hobbes Collection                                         
Users 171118 & 196148 ratings =     8    10


The Revenge Of The Baby-Sat                                                                         
Users 171118 & 19

In [95]:
#two vectors of mostly non-zero ratings
a = [8,9,7,7,8,8,8,8,8,8,0,7]
b = [10,10,10,10,10,10,9,10,10,10,0,8]
#cosine_similarity works on 2-D arrays (one row per sample), so each vector is wrapped in a list
print(cosine_similarity([a], [b]).ravel()[0])
print(pearsonr(a,b)[0])

0.997039870111
0.966248837092


In [108]:
# Five books rated by '196148' that '171118' has not rated, highest rating first.
recommend_for_specific_match('171118', '196148', users, number_of_recommendations = 5)

[('The Indispensable Calvin And Hobbes', 10),
 ('The Complete Little Nemo in Slumberland, 1907-1908 (Complete Little Nemo)',
  10),
 ('The Dog Is Not a Toy: House Rule #4', 10),
 ("Krazy &amp; Ignatz 1929-1930: 'A Mice, A Brick, A Lovely Night' (Krazy Kat)",
  10),
 ('Peepshow', 10)]

In [151]:
# Ask for a single recommendation, this time requiring neighbours to share at least 20 books.
neighbor_list, recommendation_list = recommend('171118', users, in_common = 20, number_of_recommendations = 1)
print recommendation_list

229741
[("The Princess Bride: S. Morgenstern's Classic Tale of True Love and High Adventure (The 'Good Parts' Version)", 10)]


---