In [18]:
import pandas as pd

In [39]:
# Ratings dump: semicolon-separated, ISO-8859-1 encoded, three columns.
dataFile = "BX-CSV-Dump/BX-Book-Ratings.csv"
data = pd.read_csv(
    dataFile,
    sep=";",
    encoding="ISO-8859-1",
    header=0,  # consume the file's own header row; `names` replaces it
    names=["user", "isbn", "rating"],
)

In [20]:
# Preview the first five parsed rating rows.
data.head(5)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### Book metadata

In [21]:
# Book metadata: keep only the first three columns (ISBN, title, author),
# with the ISBN as the index so books can be looked up by ISBN.
bookFile = "BX-CSV-Dump/BX-Books.csv"
books = pd.read_csv(
    bookFile,
    sep=";",
    encoding="ISO-8859-1",
    header=0,  # consume the file's own header row; `names` replaces it
    names=["isbn", "title", "author"],
    usecols=[0, 1, 2],
    index_col=0,
    # Malformed rows are skipped with a warning instead of aborting the load.
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
    # `on_bad_lines="warn"` is its replacement (requires pandas >= 1.3).
    on_bad_lines="warn",
)

In [22]:
# Preview the book metadata (indexed by ISBN).
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [23]:
def bookMeta(isbn):
    """Look up the title and author of a book.

    Reads the module-level ``books`` frame, which is indexed by ISBN.

    Parameters
    ----------
    isbn
        Index label of the book in ``books`` (the notebook passes ISBN strings).

    Returns
    -------
    tuple
        ``(title, author)`` for the given ISBN.

    Raises
    ------
    KeyError
        If ``isbn`` is not present in the index of ``books``.
    """
    return tuple(books.at[isbn, field] for field in ("title", "author"))

In [24]:
# Example lookup; the ISBN is passed as a string, leading zero included.
bookMeta("0195153448")

('Classical Mythology', 'Mark P. O. Morford')

#### Top N books for user

In [25]:
def faveBooks(user, N):
    """Return the ``N`` highest-rated books of ``user`` with their metadata.

    Parameters
    ----------
    user
        User id matched against the ``user`` column of the module-level
        ``data`` frame.
    N : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rows of ``data`` sorted by ``rating`` (highest first) and
        truncated to ``N`` rows, with an extra ``title`` column holding the
        ``(title, author)`` tuple returned by ``bookMeta`` for each ISBN.

    Raises
    ------
    KeyError
        Propagated from ``bookMeta`` when one of the ISBNs has no entry in the
        book metadata.
    """
    userRatings = data[data["user"] == user]  # rows relevant to this user
    # Work on an explicit copy: adding a column to a slice of another frame
    # raises SettingWithCopyWarning and must never write through to `data`.
    topRatings = userRatings.sort_values(by="rating", ascending=False).iloc[:N].copy()
    topRatings["title"] = topRatings["isbn"].apply(bookMeta)
    return topRatings

In [26]:
# Optional (disabled): keep only the ratings whose ISBN exists in the books df
#data = data[data["isbn"].isin(books.index)]

In [27]:
# Five highest-rated books of user 204622.
faveBooks(204622, 5)

Unnamed: 0,user,isbn,rating,title
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


# Construct rating matrix

In [40]:
# About 1.15 million ratings (rows) x 3 columns
data.shape

(1149780, 3)

###### Before building the rating matrix: count users per ISBN and ISBNs per user

In [29]:
# Number of rating rows per ISBN (each row pairs one user with one ISBN).
usersPerISBN = data.isbn.value_counts()

In [30]:
# Index: ISBN, value: number of users who rated it (10 most-rated ISBNs)
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0679781587     639
0142001740     615
067976402X     614
0671027360     586
Name: isbn, dtype: int64

In [31]:
# Number of unique ISBNs,
# i.e. the number of columns the rating matrix would end up with
usersPerISBN.shape

(340556,)

In [32]:
# Number of rating rows per user.
ISBNsPerUser = data.user.value_counts()

In [33]:
# Index: user id, value: number of ISBNs rated by that user (10 most active users)
ISBNsPerUser.head(10)

11676     13602
198711     7550
153662     6109
98391      5891
35859      5850
212898     4785
278418     4533
76352      3367
110973     3100
235105     3067
Name: user, dtype: int64

In [34]:
# Number of unique users,
# i.e. the number of rows the rating matrix would end up with
ISBNsPerUser.shape

(105283,)

In [None]:
# Shape of the resulting matrix without any filtering:
# 105283 users x 340556 ISBNs

In [41]:
# Keep only the ratings whose ISBN was rated by more than 10 users.
data = data.loc[
    data["isbn"].isin(usersPerISBN.loc[usersPerISBN.gt(10)].index)
]
data.shape

(515060, 3)

In [42]:
# Keep only the ratings of users who rated more than 10 books
# (per the counts stored in ISBNsPerUser).
data = data.loc[
    data["user"].isin(ISBNsPerUser.loc[ISBNsPerUser.gt(10)].index)
]
data.shape

(419407, 3)

In [44]:
# Confirm the column names before pivoting.
data.columns

Index(['user', 'isbn', 'rating'], dtype='object')

In [46]:
# Users as rows, ISBNs as columns, ratings as values; user/ISBN pairs without
# a rating show up as NaN.
userItemRatingMatrix = data.pivot_table(
    values="rating",
    index=["user"],
    columns=["isbn"],
)

In [47]:
# Preview the user x ISBN matrix; NaN marks a pair with no rating.
userItemRatingMatrix.head()

isbn,000000000,0002005018,0002251760,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,0006479286,...,9722015184,9722020609,9722319345,9724113361,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,5.0,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [71]:
# Number of users (rows) in the rating matrix.
len(userItemRatingMatrix)

11707

# Find the K Nearest Neighbours

In [48]:
# Two example users whose rating vectors are compared below.
user1 = 204622
user2 = 255489

In [60]:
# Ratings row of user1, indexed by ISBN. `.loc` reads the row directly
# instead of transposing the entire matrix just to select one user.
user1Ratings = userItemRatingMatrix.loc[user1]
user1Ratings.head()

isbn
000000000    NaN
0002005018   NaN
0002251760   NaN
0002259001   NaN
0002259834   NaN
Name: 204622, dtype: float64

In [61]:
# Ratings row of user2, indexed by ISBN. `.loc` reads the row directly
# instead of transposing the entire matrix just to select one user.
user2Ratings = userItemRatingMatrix.loc[user2]

In [62]:
# Hamming distance: fraction of positions (ISBNs) on which the two rating vectors differ.
# NOTE(review): unrated entries are NaN and NaN never compares equal, so books that
# neither user rated presumably count as disagreements too, which would explain a
# value this close to 1. Confirm this is the intended similarity measure.
from scipy.spatial.distance import hamming
hamming(user1Ratings, user2Ratings)

0.99993812646949631

In [64]:
import numpy as np
def distance(user1, user2):
    """Hamming distance between the rating rows of two users.

    Both rows are read from the module-level ``userItemRatingMatrix``
    (users as index, ISBNs as columns).

    Parameters
    ----------
    user1, user2
        Index labels (user ids) in ``userItemRatingMatrix``.

    Returns
    -------
    float
        Fraction of ISBN columns on which the two rows differ, or NaN when
        either user is not present in the matrix.
    """
    try:
        # Direct row lookup; transposing the whole matrix on every call is
        # needlessly expensive when this runs once per candidate user.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
    except KeyError:
        # Unknown user: report "no distance" instead of failing the caller.
        # Only the lookup failure is caught, so Ctrl-C and genuine bugs are
        # no longer silently turned into NaN. `np.nan` replaces the `np.NaN`
        # alias that was removed in NumPy 2.0.
        return np.nan
    return hamming(user1Ratings, user2Ratings)

In [65]:
# Sanity check: the helper should reproduce the direct hamming() result for the same pair.
distance(user1, user2)

0.99993812646949631

In [76]:
# Pick the active user whose K nearest neighbours are to be found.
# The commented-out lines are the step-by-step draft of nearestNeighbour().
user = 204622
# allUsers = pd.DataFrame(userItemRatingMatrix.index)
# print(allUsers.shape)
# allUsers  = allUsers[allUsers.user != user] #all the users except active user
# print(allUsers.shape)
# allUsers.head()

In [77]:
# allUsers["distance"] = allUsers["user"].apply(lambda x: distance(user, x))

In [78]:
# K =10
# KnearestUsers = allUsers.sort_values(["distance"], ascending=True)["user"][:K]
# KnearestUsers

In [79]:
def nearestNeighbour(user, K=10):
    """Return the ``K`` users closest to ``user`` according to ``distance``.

    Candidates are all users in the index of the module-level
    ``userItemRatingMatrix`` except ``user`` itself; the index must be named
    ``user``.

    Parameters
    ----------
    user
        Id of the active user.
    K : int, default 10
        Number of neighbours to return.

    Returns
    -------
    pandas.Series
        User ids of the ``K`` nearest users, closest first. The Series is
        named ``user`` and its index holds each user's row position in
        ``userItemRatingMatrix``.
    """
    allUsers = pd.DataFrame(userItemRatingMatrix.index)
    # Explicit copy so the distance column is added to an independent frame;
    # assigning onto the filtered slice raises SettingWithCopyWarning.
    otherUsers = allUsers[allUsers["user"] != user].copy()
    otherUsers["distance"] = otherUsers["user"].apply(
        lambda other: distance(user, other)
    )
    # `.iloc` makes the "first K rows" slice explicitly positional.
    return otherUsers.sort_values("distance", ascending=True)["user"].iloc[:K]

In [80]:
# K nearest neighbours (default K=10) of user 204622.
KnearestUsers = nearestNeighbour(204622)

In [81]:
# Index: row position in the rating matrix, value: neighbour's user id (closest first)
KnearestUsers

3509      82893
2872      68555
3693      87555
1991      48046
10582    251422
273        7346
620       16795
8287     198711
9689     232131
5901     140036
Name: user, dtype: int64

## Find the top N recommendations

###### - Average the ratings of nearest neighbours for unrated books
###### - sort in descending order
###### - pick the top  N

In [83]:
# Rating rows of the nearest neighbours, across all books.
# Rows keep the matrix's own order, not the closest-first order of KnearestUsers.
NNRatings = userItemRatingMatrix.loc[
    userItemRatingMatrix.index.isin(KnearestUsers)
]
NNRatings

isbn,000000000,0002005018,0002251760,0002259001,0002259834,0002558122,0006172768,0006374921,0006475973,0006479286,...,9722015184,9722020609,9722319345,9724113361,9726106141,9726116902,9727591965,9727722458,9770390107900,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


In [None]:
# TODO: unfinished cell. This only references the bound `apply` method without
# calling it; the averaging / sorting / top-N steps listed above are not implemented yet.
NNRatings.apply