In [1]:
import pandas as pd
dataFile='BX-CSV-Dump/BX-Book-Ratings.csv'
data=pd.read_csv(dataFile,sep=";",header=0,names=["user","isbn","rating"])

In [2]:
data.head()

Unnamed: 0,user,isbn,rating
0,276726,0155061224,5
1,276727,0446520802,0
2,276729,052165615X,3
3,276729,0521795028,6
4,276733,2080674722,0


In [3]:
bookFile='BX-CSV-Dump/BX-Books.csv'
books=pd.read_csv(bookFile,sep=";",header=0,error_bad_lines=False, usecols=[0,1,2],index_col=0,names=['isbn',"title","author"])

In [4]:
books.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber
399135782,The Kitchen God's Wife,Amy Tan


In [5]:
def bookMeta(isbn):
    title = books.at[isbn,"title"]
    author = books.at[isbn,"author"]
    return title, author
bookMeta("0671027360")

('Angels &amp; Demons', 'Dan Brown')

In [9]:
data = data[data["isbn"].isin(books.index)]

In [10]:
def faveBooks(user,N):
    userRatings = data[data["user"]==user]
    sortedRatings = pd.DataFrame.sort_values(userRatings,['rating'],ascending=[0])[:N] 
    sortedRatings["title"] = sortedRatings["isbn"].apply(bookMeta)
    return sortedRatings

In [11]:
faveBooks(204622,5)

Unnamed: 0,user,isbn,rating,title
844954,204622,0967560500,10,"(Natural Hormonal Enhancement, Rob Faigin)"
844934,204622,0671027360,10,"(Angels &amp; Demons, Dan Brown)"
844925,204622,0385504209,10,"(The Da Vinci Code, Dan Brown)"
844957,204622,097173660X,9,"(Life After School Explained, Cap &amp; Compass)"
844919,204622,0060935464,9,"(To Kill a Mockingbird, Harper Lee)"


In [12]:
data.shape

(1031173, 3)

In [13]:
usersPerISBN = data.isbn.value_counts()
usersPerISBN.head(10)

0971880107    2502
0316666343    1295
0385504209     883
0060928336     732
0312195516     723
044023722X     647
0142001740     615
067976402X     614
0671027360     586
0446672211     585
Name: isbn, dtype: int64

In [14]:
usersPerISBN.shape

(270169,)

In [15]:
ISBNsPerUser = data.user.value_counts()

In [16]:
ISBNsPerUser.shape

(92105,)

In [17]:
data = data[data["isbn"].isin(usersPerISBN[usersPerISBN>10].index)]

In [18]:
data = data[data["user"].isin(ISBNsPerUser[ISBNsPerUser>10].index)]

In [19]:
from scipy.sparse import coo_matrix
data['user'] = data['user'].astype("category")
data['isbn'] = data['isbn'].astype("category")

R = coo_matrix((data['rating'].astype(float),
                       (data['user'].cat.codes.copy(),
                        data['isbn'].cat.codes.copy())))

In [20]:
R.shape

(10706, 15451)

In [21]:
len(R.data)

405709

In [22]:
R.data[0]

0.0

In [23]:
R.row[0]

10633

In [24]:
R.col[0]

3053

In [25]:
M,N = R.shape
K = 3

In [26]:
import numpy as np
P = np.random.rand(M,K)
Q = np.random.rand(K,N)

In [27]:
from numpy.linalg import norm

def error(R,P,Q,lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0 
    for ui in range(len(ratings)):
        rui=ratings[ui]
        u = rows[ui]
        i = cols[ui]
        if rui>0:
            e= e + pow(rui-np.dot(P[u,:],Q[:,i]),2)+\
                lamda*(pow(norm(P[u,:]),2)+pow(norm(Q[:,i]),2))
    return e

In [28]:
error(R,P,Q)

7144326.299961165

In [29]:
rmse = np.sqrt(error(R,P,Q)/len(R.data))

In [27]:
rmse

4.1954897042123482

In [36]:
def SGD(R, K, lamda=0.02,steps=10, gamma=0.001):
    
    M,N = R.shape
    P = np.random.rand(M,K)
    Q = np.random.rand(K,N)
    
    rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
    print("Initial RMSE: "+str(rmse))
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui=rui-np.dot(P[u,:],Q[:,i])
                P[u,:]=P[u,:]+gamma*2*(eui*Q[:,i]-lamda*P[u,:])
                Q[:,i]=Q[:,i]+gamma*2*(eui*P[u,:]-lamda*Q[:,i])
        rmse = np.sqrt(error(R,P,Q,lamda)/len(R.data))
        print("RMSE@step{}: {}".format(step,rmse))
        if rmse<0.5:
            break
    print("Final RMSE: "+str(rmse))
    return P,Q

In [40]:
(P,Q)=SGD(R,K=2,gamma=0.008,lamda=0.01, steps=100)

Initial RMSE: 4.330798158809477
RMSE@step0: 2.8832436661791525
RMSE@step1: 1.2787505956027954
RMSE@step2: 1.0159084021444387
RMSE@step3: 0.9173881871001106
RMSE@step4: 0.8720316532857231
RMSE@step5: 0.8471326672441736
RMSE@step6: 0.831343748103382
RMSE@step7: 0.8201943499898001
RMSE@step8: 0.8116916522647106
RMSE@step9: 0.8048316653486705
RMSE@step10: 0.7990534008609659
RMSE@step11: 0.7940212636222769
RMSE@step12: 0.7895281083424187
RMSE@step13: 0.7854445221030989
RMSE@step14: 0.7816883399697242
RMSE@step15: 0.7782056165900775
RMSE@step16: 0.7749592353142277
RMSE@step17: 0.7719224252637686
RMSE@step18: 0.7690750451745186
RMSE@step19: 0.7664012763281702
RMSE@step20: 0.763888070742802
RMSE@step21: 0.7615241203640807
RMSE@step22: 0.7592992612156517
RMSE@step23: 0.7572042085131673
RMSE@step24: 0.7552304706531373
RMSE@step25: 0.7533703023881557
RMSE@step26: 0.7516166243655671
RMSE@step27: 0.7499629028515441
RMSE@step28: 0.748403017778483
RMSE@step29: 0.7469311520071424
RMSE@step30: 0.745541