In [12]:
import pandas as pd
# Ratings file: semicolon-delimited, one (user, isbn, rating) row per rating.
dataFile = 'BX-Book-Ratings.csv'
# header=0 consumes the file's own header row; `names` supplies the column labels
# the rest of the notebook relies on.
data = pd.read_csv(
    dataFile,
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
)

In [13]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0


In [14]:
# Book metadata: keep only the first three columns (ISBN, title, author) and index
# the frame by ISBN so title/author can be looked up directly by ISBN.
bookFile = 'BX-Books.csv'
books = pd.read_csv(
    bookFile,
    sep=";",
    header=0,
    usecols=[0, 1, 2],
    index_col=0,
    names=['isbn', "title", "author"],
    # Drop malformed rows and warn about each one. This replaces
    # `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in 2.0
    # (requires pandas >= 1.3).
    on_bad_lines="warn",
)

In [15]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [16]:
def bookMeta(isbn):
    """Return the ``(title, author)`` pair stored for ``isbn`` in the ``books`` table.

    ``books`` is the notebook-level frame indexed by ISBN. Each field is read with a
    scalar ``.at`` lookup, title first and author second.
    """
    return tuple(books.at[isbn, column] for column in ("title", "author"))
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [17]:
# Keep only ratings whose ISBN appears in the index of `books`.
data = data.loc[data["isbn"].isin(books.index)]

In [53]:
def faveBooks(user,N):
    """Return the ``N`` highest-rated rows of ``data`` for ``user``, with book metadata.

    The result keeps the columns of ``data`` and gains a ``title`` column holding
    whatever ``bookMeta`` returns for each ISBN (a ``(title, author)`` tuple).
    """
    ratingsByUser = data.loc[data["user"] == user]  # only this user's ratings
    # Highest ratings first, then keep the leading N rows.
    topRated = ratingsByUser.sort_values(by="rating", ascending=False).iloc[:N]
    # bookMeta runs once per ISBN; its return value is stored as-is under "title".
    return topRated.assign(title=topRated["isbn"].apply(bookMeta))

In [54]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844935,204622,671027360,10.0,"(Angels &amp; Demons, Dan Brown)"
844926,204622,385504209,10.0,"(The Da Vinci Code, Dan Brown)"
844966,204622,1878424114,9.0,(The Seven Spiritual Laws of Success: A Practi...
844920,204622,60935464,9.0,"(To Kill a Mockingbird, Harper Lee)"
844937,204622,671666258,8.0,"(American Star, Jackie Collins)"


In [55]:
data.shape

(405709, 3)

In [21]:
usersPerISBN = data.isbn.value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [22]:
usersPerISBN.shape

(270170,)

In [23]:
ISBNsPerUser = data.user.value_counts()

In [24]:
ISBNsPerUser.shape

(92107,)

In [25]:
# Keep only ISBNs whose count in usersPerISBN is greater than 10.
data = data.loc[data["isbn"].isin(usersPerISBN[usersPerISBN > 10].index)]

In [26]:
# Keep only users whose count in ISBNsPerUser is greater than 10. Those counts are
# not recomputed here, so they reflect `data` as it was when ISBNsPerUser was built.
data = data.loc[data["user"].isin(ISBNsPerUser[ISBNsPerUser > 10].index)]

In [57]:
data.head(10)

Unnamed: 0,user,isbn,rating
31,276762,034544003X,0.0
33,276762,0380711524,5.0
34,276762,0451167317,0.0
89,276798,3423084049,0.0
97,276798,3548603203,6.0
133,276822,0060096195,10.0
137,276822,0375821813,9.0
145,276822,0786817070,10.0
157,276828,2253044903,0.0
158,276828,2253150711,7.0


In [27]:
# User x ISBN matrix of ratings: one row per user, one column per ISBN, NaN where the
# user has no rating. pivot_table's default aggregation (mean) applies to any
# repeated (user, isbn) pair.
userItemRatingMatrix = data.pivot_table(
    values='rating',
    index=['user'],
    columns=['isbn'],
)

In [28]:
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [29]:
userItemRatingMatrix.shape

(10706, 15451)

In [30]:
user1 = 204622
user2 = 255489

In [31]:
user1Ratings = userItemRatingMatrix.transpose()[user1] # User 1's ratings for all books
user1Ratings.head()

isbn
0002005018   NaN
0002251760   NaN
0002259834   NaN
0002558122   NaN
0006480764   NaN
Name: 204622, dtype: float64

In [32]:
user2Ratings = userItemRatingMatrix.transpose()[user2] # User 2's ratings for all books

In [33]:
from scipy.spatial.distance import hamming 
hamming(user1Ratings,user2Ratings)

0.99993527926995018

In [34]:
import numpy as np
def distance(user1,user2):
    """Return the Hamming distance between two users' rating vectors.

    Each user is represented by their row of ``userItemRatingMatrix`` (one entry per
    ISBN). The distance is the fraction of ISBNs on which the two rows differ. NaN
    never compares equal to NaN, so books neither user rated count as differing.

    Parameters
    ----------
    user1, user2
        Row labels (user ids) of ``userItemRatingMatrix``.

    Returns
    -------
    float
        The distance in [0, 1], or ``np.nan`` when it cannot be computed, for
        example when a user id is not present in the matrix.
    """
    try:
        # Direct row lookup; avoids transposing the whole matrix on every call.
        ratings1 = userItemRatingMatrix.loc[user1]
        ratings2 = userItemRatingMatrix.loc[user2]
        return hamming(ratings1, ratings2)
    except Exception:
        # Best-effort contract kept: any lookup/compute failure yields NaN.
        # `except Exception` lets KeyboardInterrupt/SystemExit propagate, and
        # `np.nan` replaces `np.NaN`, which NumPy 2.0 removed.
        return np.nan

In [35]:
distance(204622,10118)

0.99987055853990037

In [36]:
user = 204622 #active user
allUsers = pd.DataFrame(userItemRatingMatrix.index) #get the ids for all users
allUsers = allUsers[allUsers.user!=user] #remove the active user
allUsers.head()

Unnamed: 0,user
0,8
1,99
2,242
3,243
4,254


In [37]:
allUsers["distance"] = allUsers["user"].apply(lambda x: distance(user,x)) #distance between the users and the active user

In [38]:
allUsers.head()

Unnamed: 0,user,distance
0,8,1.0
1,99,1.0
2,242,0.999935
3,243,0.999935
4,254,1.0


In [39]:
K = 10 # the number of nearest neighbors
KnearestUsers = allUsers.sort_values(["distance"],ascending=True)["user"][:K] # ids of the K users with the smallest distance

In [40]:
KnearestUsers # K nearest users

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

In [41]:
def nearestNeighbors(user,K=10):
    """Return the ids of the ``K`` users closest to ``user``.

    Every other user in the index of ``userItemRatingMatrix`` is scored with
    ``distance(user, other)``. The candidates are sorted by ascending distance and
    the first ``K`` entries of the ``user`` column are returned as a Series.
    """
    candidates = pd.DataFrame(userItemRatingMatrix.index)
    candidates = candidates.loc[candidates["user"] != user]  # exclude the active user
    scores = candidates["user"].apply(lambda other: distance(user, other))
    ranked = candidates.assign(distance=scores).sort_values(["distance"], ascending=True)
    return ranked["user"][:K]

In [42]:
KnearestUsers = nearestNeighbors(user)

In [43]:
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

In [44]:
NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)] # Get the ratings of the nearest neighbors for 
                                                                                 # all books
NNRatings

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


In [45]:
# Mean rating per book over the nearest neighbors, ignoring NaN. Books with no
# neighbor rating come out as NaN and are dropped. DataFrame.mean() skips NaN by
# default and avoids the "Mean of empty slice" RuntimeWarning that np.nanmean
# raises for every all-NaN column.
avgRating = NNRatings.mean().dropna()
avgRating.head()

  labels=labels)


isbn
0007154615    1.5
0020125305    0.0
0020125607    0.0
0020198817    0.0
0020198906    8.0
dtype: float64

In [58]:
booksAlreadyRead = userItemRatingMatrix.transpose()[user].dropna().index # get the ratings of the active user 
                                                                         # and drop books without a rating
booksAlreadyRead

Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='isbn')

In [47]:
avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)] # remove the average ratings for books already read by the user

In [48]:
N=3
topNISBNs = avgRating.sort_values(ascending=False).index[:N] # sorting ratings in descending order and pick top N

In [49]:
pd.Series(topNISBNs).apply(bookMeta) # Apply the bookMeta function to the top N ISBSs

0              (Love, Greg &amp; Lauren, Greg Manning)
1    (The Two Towers (The Lord of the Rings, Part 2...
2    (Harry Potter and the Sorcerer's Stone (Book 1...
Name: isbn, dtype: object

In [50]:
def topN(user,N=3,K=10):
    """Recommend up to ``N`` books for ``user`` based on their nearest neighbors.

    Parameters
    ----------
    user
        Row label (user id) of ``userItemRatingMatrix``.
    N : int, default 3
        Maximum number of recommendations to return.
    K : int, default 10
        Number of nearest neighbors whose ratings are averaged.

    Returns
    -------
    pandas.Series
        One entry per recommended book, holding the value ``bookMeta`` returns
        for its ISBN, ordered from highest to lowest average neighbor rating.
    """
    KnearestUsers = nearestNeighbors(user, K)
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    # DataFrame.mean() skips NaN and yields NaN for books no neighbor rated, without
    # the per-column "Mean of empty slice" RuntimeWarning of apply(np.nanmean).
    avgRating = NNRatings.mean().dropna()
    # Row lookup instead of transposing the full matrix.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
    # Never recommend a book the user has already rated.
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    topNISBNs = avgRating.sort_values(ascending=False).index[:N]
    return pd.Series(topNISBNs).apply(bookMeta)

In [51]:
faveBooks(204813,10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10.0,"(Birthright, Nora Roberts)"
845407,204813,385504209,10.0,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10.0,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10.0,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10.0,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10.0,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9.0,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9.0,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9.0,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9.0,"(The Wedding, Nicholas Sparks)"


In [52]:
topN(204813,10)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: isbn, dtype: object