#### Collaborative Filtering Technique
* Nearest Neighbours Model - use ratings of most similar user

In [1]:
import pandas as pd

In [2]:
dataFile = "BX-CSV-Dump/BX-Book-Ratings.csv"
# The dump is ';'-separated. header=0 together with `names` discards the
# file's own header row and substitutes the shorter column names below.
data = pd.read_csv(
    dataFile,
    sep=";",
    header=0,
    names=["user", "isbn", "rating"],
    encoding="ISO-8859-1",
)

In [3]:
#Rating in the range of 0-10
data.head(5)

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


### Book Meta data

In [4]:
bookFile = "BX-CSV-Dump/BX-Books.csv"
# Only the first three columns (ISBN, title, author) are read, and the ISBN
# column becomes the index so that a book can be looked up by its ISBN.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# `on_bad_lines="warn"` is its replacement with the same effect: malformed
# rows are skipped and a warning is emitted for each of them.
books = pd.read_csv(
    bookFile,
    sep=";",
    header=0,
    on_bad_lines="warn",
    usecols=[0, 1, 2],
    index_col=0,
    names=["isbn", "title", "author"],
    encoding="ISO-8859-1",
)

In [5]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [6]:
def bookMeta(isbn):
    """Look up a book in the `books` table by its ISBN.

    Returns a `(title, author)` tuple read from the row of `books` whose
    index label equals `isbn`.
    """
    return tuple(books.at[isbn, column] for column in ("title", "author"))

In [7]:
bookMeta("0195153448")

('Classical Mythology', 'Mark P. O. Morford')

#### Top N books for user

In [8]:
def faveBooks(user, N):
    """Return the N highest-rated books of one user, with metadata attached.

    Parameters
    ----------
    user
        User id, compared against the "user" column of `data`.
    N
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        The user's rows of `data`, sorted by "rating" in descending order and
        cut to the first N rows, plus a "title" column that holds the
        `(title, author)` tuple returned by `bookMeta` for each ISBN.
    """
    userRatings = data[data["user"] == user]  # rows belonging to this user
    topRatings = userRatings.sort_values(by="rating", ascending=False).iloc[:N]
    # assign() returns a new frame instead of writing a column into a slice,
    # which is what triggered pandas' SettingWithCopyWarning before.
    return topRatings.assign(title=topRatings["isbn"].apply(bookMeta))

In [9]:
# Drop every rating whose ISBN has no entry in the books metadata table
data = data.loc[data["isbn"].isin(books.index)]

In [10]:
faveBooks(204622, 5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


# Construct rating matrix

In [11]:
# 1 million ratings
data.shape

(1031175, 3)

Before pivoting the data into the user–item rating matrix, use `usersPerISBN` and `ISBNsPerUser` to remove sparsely occurring data:
- Keep only those ISBNs which have been read by more than 10 users
- Keep only those users who have read more than 10 books

In [12]:
# Number of rating rows per ISBN, most frequently rated first
usersPerISBN = data["isbn"].value_counts()

In [13]:
#col1: isbns col2: user count
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [14]:
#These many unique isbn
#Which means that the rating matrix will end up having these many columns
usersPerISBN.shape

(270170,)

In [15]:
# Number of rating rows per user, most active user first
ISBNsPerUser = data["user"].value_counts()

In [16]:
#col1: userId col2: isbns count
ISBNsPerUser.head(10)

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
212898     4290
278418     3996
76352      3329
110973     2971
235105     2943
Name: user, dtype: int64

In [17]:
#Number of users
#Which means that the rating matrix will end up having these many rows
ISBNsPerUser.shape

(92107,)

In [18]:
# Resultant matrix shape before any filtering:
# 92107 users x 270170 ISBNs

In [19]:
# Keep only the ISBNs that have been rated by more than 10 users
popularISBNs = usersPerISBN[usersPerISBN.gt(10)].index
data = data[data["isbn"].isin(popularISBNs)]
data.shape

(500229, 3)

In [20]:
# Keep only the users that have rated more than 10 books.
# ISBNsPerUser was counted before the ISBN filter above was applied, so the
# threshold refers to each user's book count prior to that filter.
activeUsers = ISBNsPerUser[ISBNsPerUser.gt(10)].index
data = data[data["user"].isin(activeUsers)]
data.shape

(405709, 3)

In [21]:
data.columns

Index(['user', 'isbn', 'rating'], dtype='object')

In [22]:
# Dense user x ISBN table of ratings; user/ISBN pairs without a rating are NaN
userItemRatingMatrix = data.pivot_table(
    values='rating',
    index=['user'],
    columns=['isbn'],
)

In [23]:
userItemRatingMatrix.head()

isbn,0002005018,0002251760,0002259834,0002558122,0006480764,000648302X,0006485200,000649840X,000651202X,0006512062,...,8845906884,8845915611,8878188212,8885989403,9074336329,9074336469,950491036X,9681500830,9681500954,9871138016
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,5.0,,,,,,,,,,...,,,,,,,,,,
99,,,,,,,,,,,...,,,,,,,,,,
242,,,,,,,,,,,...,,,,,,,,,,
243,,,,,,,,,,,...,,,,,,,,,,
254,,,,,,,,,,,...,,,,,,,,,,


In [31]:
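# Sparse (COO) storage is far more memory-efficient than the dense pivot table;
# categorical codes provide contiguous integer row/column indices for it.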
from scipy.sparse import coo_matrix
# Convert both id columns to categoricals in one step. astype() with a dict
# returns a new frame, so nothing is written into the filtered slice that
# `data` currently is (which raised SettingWithCopyWarning before).
data = data.astype({"user": "category", "isbn": "category"})

In [35]:
# Build the sparse rating matrix from (value, (row, col)) triplets:
# rows are the categorical codes of the users, columns those of the ISBNs.
ratingValues = data['rating'].astype(float)
userCodes = data['user'].cat.codes.copy()
isbnCodes = data['isbn'].cat.codes.copy()
R = coo_matrix((ratingValues, (userCodes, isbnCodes)))

In [36]:
R.shape

(10706, 15451)

In [38]:
len(R.data)

405709

In [50]:
R.row[0], R.row[1], R.row[2], R.row[88434]

(10633, 10633, 10633, 2208)

In [43]:
R.col[0], R.col[2], R.col[1]

(3053, 7873, 4025)

In [59]:
R.data[90]

9.0

In [66]:
len(R.row)

405709

### Initialize factor matrices

In [60]:
# R has one row per user code and one column per ISBN code
M = R.shape[0]
N = R.shape[1]
K = 3  # number of latent factors

In [67]:
import numpy as np
# P holds one K-dimensional factor row per user (M x K).
P = np.random.rand(M, K)
# Q holds one K-dimensional factor column per item, so it must be K x N
# (it was K x M): error() reads Q[:, i] with i taken from R's column indices,
# which run up to N - 1, and P.dot(Q) must have the same shape as R.
Q = np.random.rand(K, N)

### Compute the error

In [64]:
from numpy.linalg import norm

def error(R, P, Q, lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0
    for ui in range(len(ratings)):
        rui = ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui > 0:
            e = e + pow(rui-np.dot(P[u, :], Q[:, i]), 2) + \
                lamda*(pow(norm(P[u, :]), 2)) + 