In [1]:
import numpy as np
import pandas as pd
from math import sqrt
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv("BX-Book-Ratings.csv", encoding="latin1")
df.sort_values(["user_id", "isbn"], inplace=True)
df = df.head(10000)
df.reset_index()
df



Unnamed: 0,user_id,isbn,rating
9561,2,195153448,0
9562,7,34542252,0
9571,8,074322678X,5
9574,8,080652121X,0
9576,8,1552041778,5
...,...,...,...
19556,3728,553574671,9
19553,3728,60008776,7
19554,3728,61057819,0
19557,3728,761513779,0


In [4]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df["isbn"] = le.fit_transform(df["isbn"])
df

Unnamed: 0,user_id,isbn,rating
9561,2,1467,0
9562,7,2490,0
9571,8,443,5
9574,8,475,0
9576,8,877,5
...,...,...,...
19556,3728,5189,9
19553,3728,5426,7
19554,3728,5703,0
19557,3728,6861,0


In [6]:
books = pd.read_csv("BX-Books.csv", encoding="latin1", low_memory=False)
books

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company
...,...,...,...,...,...
271374,440400988,There's a Bat in Bunk Five,Paula Danziger,1988,Random House Childrens Pub (Mm)
271375,525447644,From One to One Hundred,Teri Sloat,1991,Dutton Books
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker,2004,HarperSanFrancisco
271377,192126040,Republic (World's Classics),Plato,1996,Oxford University Press


In [7]:
users = pd.read_csv("BX-Users.csv", encoding="latin1", low_memory=False)
users

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",
...,...,...,...
278854,278854,"portland, oregon, usa",
278855,278855,"tacoma, washington, united kingdom",50.0
278856,278856,"brampton, ontario, canada",
278857,278857,"knoxville, tennessee, usa",


In [8]:
user1 = df["user_id"].unique().shape[0]
user1

1323

In [9]:
books1 = df["isbn"].unique().shape[0]
books1

8742

In [19]:
data_matrix = np.zeros((user1, books1))
for line in df.head().itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]
data_matrix.shape

(1323, 8742)

In [20]:
from sklearn.metrics.pairwise import pairwise_distances 

user_sim = pairwise_distances(data_matrix, metric='cosine')
item_sim = pairwise_distances(data_matrix.T, metric='cosine')

In [23]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        rating_diff = (ratings - mean_user_rating[:, np.newaxis])
        predict = mean_user_rating[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'book':
        predict = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return predict

In [32]:
user_predict = predict(data_matrix, user_sim, type='user')
item_predict = predict(data_matrix, item_sim, type='book')


In [34]:
def rmse(pred, test):
    predict = pred[test.nonzero()].flatten()
    test = test[test.nonzero()].flatten()
    return sqrt(mean_squared_error(predict, test))


In [35]:
rmse(user_predict, data_matrix)


4.9988560970029745

In [37]:
rmse(item_predict, data_matrix)


5.0