#### Collaborative Filtering Technique
* Nearest Neighbours Model - use the ratings of the most similar users

In [62]:
import pandas as pd

In [63]:
# Load the ratings CSV (';'-separated) and label its three columns.
dataFile = "BX-CSV-Dump/BX-Book-Ratings.csv"
data = pd.read_csv(
    dataFile,
    sep=";",
    header=0,  # row 0 is the file's own header; `names` below replaces it
    names=["user", "isbn", "rating"],
    encoding="ISO-8859-1",
)

In [64]:
#Rating in the range of 0-10
data.head(5)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### Book Meta data

In [65]:
# Load the book metadata, keeping only the first three columns and indexing
# the frame by ISBN so that rows can be looked up by ISBN.
bookFile = "BX-CSV-Dump/BX-Books.csv"
books = pd.read_csv(
    bookFile,
    sep=";",
    header=0,  # row 0 is the file's own header; `names` below replaces it
    # Malformed rows are skipped with a warning. `on_bad_lines="warn"` replaces
    # `error_bad_lines=False`, which was removed in pandas 2.0
    # (pandas < 1.3 still needs the old keyword).
    on_bad_lines="warn",
    usecols=[0, 1, 2],
    index_col=0,
    names=["isbn", "title", "author"],
    encoding="ISO-8859-1",
)

In [66]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [67]:
def bookMeta(isbn):
    """Return the ``(title, author)`` pair stored in ``books`` for ``isbn``.

    ``books`` is the module-level metadata frame indexed by ISBN. An ISBN
    that is not in its index raises ``KeyError``.
    """
    return tuple(books.at[isbn, field] for field in ("title", "author"))

In [68]:
bookMeta("0195153448")

('Classical Mythology', 'Mark P. O. Morford')

#### Top N books for user

In [69]:
def faveBooks(user, N):
    """Return the ``N`` highest-rated books of ``user``.

    Parameters
    ----------
    user
        User id as stored in the ``user`` column of ``data``.
    N
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rows of ``data`` sorted by ``rating`` (highest first) and
        cut to ``N`` rows, plus a ``title`` column holding the
        ``(title, author)`` tuple returned by ``bookMeta`` for each ISBN.
    """
    userRatings = data[data["user"] == user]  # rows belonging to this user
    # Take an explicit copy so that adding the column below writes to an
    # independent frame rather than to a slice of `data`
    # (avoids SettingWithCopyWarning).
    topRated = userRatings.sort_values(by="rating", ascending=False)[:N].copy()
    topRated["title"] = topRated["isbn"].apply(bookMeta)
    return topRated

In [70]:
# Keep only the ratings whose ISBN appears in the index of `books`.
data = data.loc[data["isbn"].isin(books.index)]

In [71]:
faveBooks(204622, 5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


# Construct rating matrix

In [72]:
# 1 million ratings
data.shape

(1031175, 3)

Before converting the data into the rating matrix, use `usersPerISBN` and `ISBNsPerUser` to remove sparsely rated data:
- Keep only those ISBNs which have been read by more than 10 users
- Keep only those users who have read more than 10 books

In [73]:
usersPerISBN = data.isbn.value_counts()

In [74]:
#col1: isbns col2: user count
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [75]:
#These many unique isbn
#Which means that the rating matrix will end up having these many columns
usersPerISBN.shape

(270170,)

In [76]:
ISBNsPerUser = data.user.value_counts()

In [77]:
#col1: userId col2: isbns count
ISBNsPerUser.head(10)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
212898     4290
278418     3996
76352      3329
110973     2971
235105     2943
Name: user, dtype: int64

In [78]:
#Number of users
#Which means that the rating matrix will end up having these many rows
ISBNsPerUser.shape

(92107,)

In [79]:
# Resultant matrix shape (users x ISBNs) before filtering:
# 92107 x 270170

In [80]:
# Keep only the ISBNs that more than 10 users have rated
# (counts come from usersPerISBN).
data = data.loc[data["isbn"].isin(usersPerISBN.loc[usersPerISBN.gt(10)].index)]
data.shape

(500229, 3)

In [81]:
# Keep only the users that have rated more than 10 books.
# The counts come from ISBNsPerUser, which was computed before the ISBN
# filter was applied to `data`.
data = data.loc[data["user"].isin(ISBNsPerUser.loc[ISBNsPerUser.gt(10)].index)]
data.shape

(405709, 3)

In [82]:
data.columns

Index(['user', 'isbn', 'rating'], dtype='object')

In [83]:
# Pivot the long ratings table into a user x ISBN matrix.
# Cells without a rating are NaN; pivot_table's default aggregation (mean)
# is applied to repeated (user, isbn) pairs.
userItemRatingMatrix = data.pivot_table(values="rating", index="user", columns="isbn")

In [84]:
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [85]:
len(userItemRatingMatrix)

10706

# Find the K Nearest Neighbours

In [86]:
user1 = 204622
user2 = 255489

In [109]:
# user1Ratings = userItemRatingMatrix.transpose()[user1]
# user1Ratings.head()

In [88]:
# user2Ratings = userItemRatingMatrix.transpose()[user2]

In [None]:
# % of disagreement between the rating rows of the two example users.
from scipy.spatial.distance import hamming

# The rows have to be defined in this cell: the cells that used to assign them
# are commented out, so a fresh "Run All" would otherwise stop with NameError.
user1Ratings = userItemRatingMatrix.loc[user1]
user2Ratings = userItemRatingMatrix.loc[user2]
hamming(user1Ratings, user2Ratings)

###### Find the distance between two users

In [90]:
import numpy as np
def distance(user1, user2):
    """Return the Hamming distance between the rating rows of two users.

    Both rows are read from ``userItemRatingMatrix``; the result is the
    fraction of ISBN columns in which the two rows differ.

    NOTE(review): unrated cells are NaN and NaN != NaN, so books that neither
    user rated presumably count as disagreements -- confirm this is intended.

    Returns
    -------
    float
        The Hamming distance, or ``np.nan`` when either user id is not a row
        of the matrix.
    """
    try:
        # .loc reads the user's row directly; transposing the whole matrix on
        # every call (twice) only to pick one column is very expensive.
        user1Ratings = userItemRatingMatrix.loc[user1]
        user2Ratings = userItemRatingMatrix.loc[user2]
        return hamming(user1Ratings, user2Ratings)
    except (KeyError, ValueError):
        # Unknown user (or rows that cannot be compared): no distance.
        # A bare `except:` would also swallow KeyboardInterrupt, making the
        # long neighbour search impossible to interrupt.
        return np.nan

In [91]:
distance(user1, user2)

0.99993527926995018

In [92]:
# Take an active user & a no. K and find the kNNs for that user
user = 204622
# allUsers = pd.DataFrame(userItemRatingMatrix.index)
# print(allUsers.shape)
# allUsers  = allUsers[allUsers.user != user] #all the users except active user
# print(allUsers.shape)
# allUsers.head()

In [93]:
# allUsers["distance"] = allUsers["user"].apply(lambda x: distance(user, x))

In [94]:
# K =10
# KnearestUsers = allUsers.sort_values(["distance"], ascending=True)["user"][:K]
# KnearestUsers

###### K nearest neighbours to active users

In [95]:
def nearestNeighbour(user, K=10):
    """Return the ids of the ``K`` users closest to ``user``.

    Every other user in the index of ``userItemRatingMatrix`` is scored with
    ``distance(user, other)`` and the ``K`` smallest distances win.

    Parameters
    ----------
    user
        Id of the active user; it is excluded from the candidates.
    K : int, default 10
        Number of neighbours to return.

    Returns
    -------
    pandas.Series
        The neighbours' user ids (Series named ``user``), nearest first.
    """
    # Relies on the matrix index being named "user", which becomes the
    # column name of this one-column frame.
    allUsers = pd.DataFrame(userItemRatingMatrix.index)
    # .copy() makes the filtered frame independent, so adding the distance
    # column below does not trigger SettingWithCopyWarning.
    allUsers = allUsers[allUsers["user"] != user].copy()
    allUsers["distance"] = allUsers["user"].apply(lambda other: distance(user, other))
    return allUsers.sort_values("distance", ascending=True)["user"][:K]

In [96]:
KnearestUsers = nearestNeighbour(204622)

In [97]:
KnearestUsers

3201     82893
3368     87555
2624     68555
1813     48046
5401    140036
7584    198711
565      16795
8866    232131
239       7346
9693    251422
Name: user, dtype: int64

## Find the top N recommendations

###### - Average the ratings of nearest neighbours for unrated books
###### - sort in descending order
###### - pick the top  N

In [98]:
# Rows of the rating matrix that belong to the K nearest neighbours (all ISBN columns).
NNRatings = userItemRatingMatrix.loc[userItemRatingMatrix.index.isin(KnearestUsers)]
NNRatings

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7346,,,,,,,,,,,...,,,,,,,,,,
16795,,,,,,,,,,,...,,,,,,,,,,
48046,,,,,,,,,,,...,,,,,,,,,,
68555,,,,,,,,,,,...,,,,,,,,,,
82893,,,,,,,,,,,...,,,,,,,,,,
87555,,,,,,,,,,,...,,,,,,,,,,
140036,,,,,,,,,,,...,,,,,,,,,,
198711,,,,,,,,,,,...,,,,,,,,,,
232131,,,,,,,,,,,...,,,,,,,,,,
251422,,,,,,,,,,,...,,,,,,,,,,


In [99]:
# Average each ISBN over the neighbours' ratings. DataFrame.mean skips NaN by
# default and yields NaN for ISBNs that no neighbour rated, which dropna()
# then removes. Unlike apply(np.nanmean), this emits no "Mean of empty slice"
# warning and avoids a Python-level call per column.
avgRating = NNRatings.mean().dropna()
avgRating.head()

  labels=labels)


isbn
0007154615    1.5
0020125305    0.0
0020125607    0.0
0020198817    0.0
0020198906    8.0
dtype: float64

###### Get the books which have already been read by active user

In [100]:
# ISBNs the active user has already rated: the non-NaN entries of the user's row.
# .loc reads the row directly instead of transposing the whole matrix first.
booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
print(len(booksAlreadyRead))
booksAlreadyRead

30


Index(['006016848X', '0060935464', '0140042598', '0140178724', '0142004278',
       '0380732238', '0385504209', '0425109720', '0425152898', '0440136482',
       '0440241162', '0451191145', '0451197127', '0553096060', '0671027360',
       '0671027387', '0671666258', '0688174574', '0743225708', '076790592X',
       '0785264280', '0786868716', '0802131867', '0802132952', '0971880107',
       '1853260045', '1853260126', '1853260207', '185326041X', '1878424114'],
      dtype='object', name='isbn')

In [101]:
# Drop the ISBNs the active user has already rated, keeping only unread books.
avgRating = avgRating.loc[~avgRating.index.isin(booksAlreadyRead)]

In [102]:
# Number of recommendations to keep.
N = 3
# ISBNs of the N best average neighbour ratings, highest first.
topNISBNs = avgRating.sort_values(ascending=False).head(N).index

In [103]:
topNISBNs

Index(['0553802976', '0618002235', '0590353403'], dtype='object', name='isbn')

In [104]:
# Look up the (title, author) pair of every recommended ISBN.
pd.Series(topNISBNs).map(bookMeta)

0              (Love, Greg &amp; Lauren, Greg Manning)
1    (The Two Towers (The Lord of the Rings, Part 2...
2    (Harry Potter and the Sorcerer's Stone (Book 1...
Name: isbn, dtype: object

In [105]:
def topN(user, N=3, K=10):
    """Recommend up to ``N`` books that ``user`` has not rated yet.

    The ratings of the ``K`` nearest neighbours are averaged per ISBN, the
    ISBNs already rated by ``user`` are removed, and the best averages win.

    Parameters
    ----------
    user
        Id of the active user; must be a row of ``userItemRatingMatrix``
        (``KeyError`` otherwise).
    N : int, default 3
        Number of recommendations to return.
    K : int, default 10
        Number of nearest neighbours whose ratings are averaged.

    Returns
    -------
    pandas.Series
        One ``(title, author)`` tuple per recommended ISBN, best first.
    """
    KnearestUsers = nearestNeighbour(user, K)
    # Ratings of the nearest neighbours for all books.
    NNRatings = userItemRatingMatrix[userItemRatingMatrix.index.isin(KnearestUsers)]
    # DataFrame.mean skips NaN and returns NaN for books that no neighbour
    # rated; those are dropped. (apply(np.nanmean) gives the same values but
    # warns "Mean of empty slice" for every such book.)
    avgRating = NNRatings.mean().dropna()
    # Books already rated by the active user: non-NaN entries of the user's
    # row, read with .loc instead of transposing the whole matrix.
    booksAlreadyRead = userItemRatingMatrix.loc[user].dropna().index
    # Only books the active user has not rated are candidates.
    avgRating = avgRating[~avgRating.index.isin(booksAlreadyRead)]
    topNISBNs = avgRating.sort_values(ascending=False).index[:N]
    # Resolve each ISBN to its (title, author) pair.
    return pd.Series(topNISBNs).apply(bookMeta)

### Evaluate

In [106]:
#faveBooks of the user already read
faveBooks(204813, 10)

Unnamed: 0,user,isbn,rating,title
845417,204813,399149848,10,"(Birthright, Nora Roberts)"
845407,204813,385504209,10,"(The Da Vinci Code, Dan Brown)"
845382,204813,373218036,10,"(Truly, Madly Manhattan, Nora Roberts)"
845359,204813,142001805,10,"(The Eyre Affair: A Novel, Jasper Fforde)"
845431,204813,446527793,10,"(The Guardian, Nicholas Sparks)"
845416,204813,399149392,10,"(Chesapeake Blue (Quinn Brothers (Hardcover)),..."
845432,204813,446531332,9,"(Nights in Rodanthe, Nicholas Sparks)"
845434,204813,446606243,9,"(The Tenth Justice, Brad Meltzer)"
845451,204813,671027360,9,"(Angels &amp; Demons, Dan Brown)"
845433,204813,446532452,9,"(The Wedding, Nicholas Sparks)"


In [107]:
#Recommending top 10 books
topN(204813, 10)

  labels=labels)


0    (Waiting For Nick (Silhouette Special Edition)...
1           (Wringer (Trophy Newbery), Jerry Spinelli)
2    (The Star Wars Trilogy: Star Wars, the Empire ...
3          (One, Two, Buckle My Shoe, Agatha Christie)
4                          (On the Road, Jack Kerouac)
5                 (Dead Poets Society, N.H. Kleinbaum)
6     (Go Ask Alice (Avon/Flare Book), James Jennings)
7                        (Carolina Moon, Nora Roberts)
8    (Illusions: The Adventures of a Reluctant Mess...
9    (You Just Don't Duct Tape a Baby!: True Tales ...
Name: isbn, dtype: object

###### The recommendations include 'Nora Roberts' books, an author this user has already rated highly