# Collaborative Filtering Technique
* Recommending Products Based on the Latent Factors Model


In [140]:
import pandas as pd
dataFile='BX-CSV-Dump/BX-Book-Ratings.csv'
data = pd.read_csv(dataFile, sep=";", header=0, names=["user", "isbn", "rating"], encoding = "ISO-8859-1")

In [141]:
data.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [142]:
bookFile='BX-CSV-Dump/BX-Books.csv'
books = pd.read_csv(bookFile, sep=";", header=0, error_bad_lines=False, usecols=[0, 1, 2], index_col=0, names=["isbn", "title", "author"], encoding = "ISO-8859-1")

In [143]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


In [144]:
def bookMeta(isbn):
    title = books.at[isbn,"title"]
    author = books.at[isbn,"author"]
    return title, author
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [145]:
data = data[data["isbn"].isin(books.index)]

In [146]:
def faveBooks(user,N):
    userRatings = data[data["user"]==user]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["isbn"].apply(bookMeta)
    return sortedRatings

In [147]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844955,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844935,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844926,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844958,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844920,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [148]:
data.shape

(1031175, 3)

In [149]:
usersPerISBN = data.isbn.value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [150]:
usersPerISBN.shape

(270170,)

In [151]:
ISBNsPerUser = data.user.value_counts()

In [152]:
ISBNsPerUser.shape

(92107,)

In [153]:
data = data[data["isbn"].isin(usersPerISBN[usersPerISBN>10].index)]

In [154]:
data = data[data["user"].isin(ISBNsPerUser[ISBNsPerUser>10].index)]

In [155]:
data.shape

(405709, 3)

In [156]:
from scipy.sparse import coo_matrix
data['user'] = data['user'].astype("category")
data['isbn'] = data['isbn'].astype("category")

R = coo_matrix((data['rating'].astype(float),
                       (data['user'].cat.codes.copy(),
                        data['isbn'].cat.codes.copy())))

In [157]:
R.shape

(10706, 15451)

In [158]:
len(R.data)

405709

In [159]:
R.data[0]

0.0

In [160]:
R.row[0]

10633

In [161]:
R.col[0]

3053

### Intialize factor matrices

In [162]:
M,N = R.shape
K = 3

In [163]:
import numpy as np
P = np.random.rand(M,K)
Q = np.random.rand(K,N)

### Compute the error

###### This is the error when you intialize matrix

In [167]:
from numpy.linalg import norm

def error(R,P,Q,lamda=0.02):
    ratings = R.data #Accessing non-null ratings 
    rows = R.row #Corresponing row id 
    cols = R.col #Corresponing column id
    e = 0 #Intialize error zero
    #Iterate through all the ratings that are already known and compute individual error
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        #If rating is greater than 0 then apply optimization algo
        if rui>0:
            e= e + pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
                lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

In [170]:
#This is the total sqaurred error, minimize using the stochastic GD 
error(R,P,Q)

7143351.4029583698

In [192]:
# norm is a just root squarred of vector elements
a = [6, 8]
norm(a)

10.0

In [171]:
#This is the measure of error per rating
rmse = np.sqrt(error(R, P, Q)/len(R.data))

In [172]:
rmse

4.1960792935284354

### Using SGD update factor matrices with minimizing error

In [202]:
def SGD(R, K, lamda=0.02, steps=10, gamma=0.001):
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    #Intial error bfr opti. algo
    rmse = np.sqrt(error(R, P, Q)/len(R.data))
    print("Intial RMSE: "+str(rmse))
    
    for step in range(steps):
        for ui in range(len(ratings)):
            rui=ratings[ui]
            u = R.row[ui]
            i = R.col[ui]
            #If rating is greater than 0 then apply optimization algo
            if rui>0:
                eui = rui - np.dot(P[u,:], Q[:,i])
                P[u,:] = P[u,:]+gamma*2*(eui*Q[:,i] - lamda*P[u,:])
                Q[:,i] = Q[:,i]+gamma*2*(eui*P[u,:] - lamda*Q[:,i])
        rmse = np.sqrt(error(R, P, Q)/len(R.data))
        if rmse < 0.5:
            break
        print("step: {} RMSE {}".format(step, str(rmse)))    
    print("Final RMSE: "+str(rmse))
    return P, Q

In [203]:
(P,Q)=SGD(R,K=2,gamma=0.0007,lamda=0.01, steps=100)

Intial RMSE: 4.3332340944
step: 0 RMSE 3.98432105402
step: 1 RMSE 3.62455856946
step: 2 RMSE 3.2929923469
step: 3 RMSE 3.01472552122
step: 4 RMSE 2.78275339114
step: 5 RMSE 2.58626168809
step: 6 RMSE 2.41721227925
step: 7 RMSE 2.27003843428
step: 8 RMSE 2.14074507547
step: 9 RMSE 2.02634482377
step: 10 RMSE 1.92453334071
step: 11 RMSE 1.83349098985
step: 12 RMSE 1.75175349214
step: 13 RMSE 1.67812336344
step: 14 RMSE 1.61160713635
step: 15 RMSE 1.55136984612
step: 16 RMSE 1.49670158322
step: 17 RMSE 1.44699267319
step: 18 RMSE 1.40171506252
step: 19 RMSE 1.36040816408
step: 20 RMSE 1.32266790893
step: 21 RMSE 1.28813811874
step: 22 RMSE 1.25650358215
step: 23 RMSE 1.2274844071
step: 24 RMSE 1.20083134958
step: 25 RMSE 1.17632190571
step: 26 RMSE 1.15375701098
step: 27 RMSE 1.13295822978
step: 28 RMSE 1.1137653454
step: 29 RMSE 1.09603428048
step: 30 RMSE 1.07963529251
step: 31 RMSE 1.0644514001
step: 32 RMSE 1.0503770044
step: 33 RMSE 1.03731667688
step: 34 RMSE 1.02518408983
step: 35 

#### Final RMSE: 0.825386354411