In [5]:
# Libraries for data preparation & visualization
import numpy as np
import seaborn as sns
import pandas as pd
import plotly.io as pio
import matplotlib.pyplot as plt
pio.renderers.default = "png"

# Ignore printing warnings for general readability
import warnings 
warnings.filterwarnings("ignore")


In [6]:
# Loading the dataset 
def loaddata(filename):
    df = pd.read_csv(f'{filename}.csv',sep=';',error_bad_lines=False,warn_bad_lines=False,encoding='latin-1')
    return df

book   = loaddata("../../BX-Books")
user   = loaddata("../../BX-Users")
rating = loaddata("../../BX-Book-Ratings")

In [7]:
#Preprocessing Data
book = book[['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher']]
book.rename(columns = {'Book-Title':'title', 'Book-Author':'author', 'Year-Of-Publication':'year', 'Publisher':'publisher'}, inplace=True)
user.rename(columns = {'User-ID':'user_id', 'Location':'location', 'Age':'age'}, inplace=True)
rating.rename(columns = {'User-ID':'user_id', 'Book-Rating':'rating'}, inplace=True)

In [8]:
rating

Unnamed: 0,user_id,ISBN,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6
...,...,...,...
1149775,276704,1563526298,9
1149776,276706,0679447156,0
1149777,276709,0515107662,10
1149778,276721,0590442449,10


In [9]:
# Let's visualize ratings given by users

rating_users = rating['user_id'].value_counts().reset_index().\
               rename({'index':'user_id','user_id':'rating'}, axis=1)
rating_users

Unnamed: 0,user_id,rating
0,11676,13602
1,198711,7550
2,153662,6109
3,98391,5891
4,35859,5850
...,...,...
105278,116180,1
105279,116166,1
105280,116154,1
105281,116137,1


In [10]:
# Let's visualize ratings received by books

rating_books = rating['ISBN'].value_counts().reset_index().\
               rename({'index':'ISBN','ISBN':'rating'}, axis=1)
rating_books

Unnamed: 0,ISBN,rating
0,0971880107,2502
1,0316666343,1295
2,0385504209,883
3,0060928336,732
4,0312195516,723
...,...,...
340551,1568656386,1
340552,1568656408,1
340553,1569551553,1
340554,1570081808,1


In [11]:
# In order to avoid rating bias & for making good recommendations, limit the dataset to only those
# users that have made at least 250 ratings & books that have received at least 50 ratings

rating = rating[rating['user_id'].isin(rating_users[rating_users['rating']>250]['user_id'])]
rating = rating[rating['ISBN'].isin(rating_books[rating_books['rating']> 50]['ISBN'])]

rating


Unnamed: 0,user_id,ISBN,rating
1456,277427,002542730X,10
1468,277427,006092988X,0
1469,277427,0060930535,0
1470,277427,0060932139,0
1471,277427,0060934417,0
...,...,...,...
1147440,275970,1400031354,0
1147441,275970,1400031362,0
1147470,275970,1558744606,0
1147517,275970,1573229725,0


In [12]:
len(rating['user_id'].unique())

686

In [13]:
# For the recommendation system, it is prefered to have the book titles rather than ISBN for easier interpretation

rating = rating.merge(book, on="ISBN")[['user_id','title','rating','ISBN']] # merging with the book dataframe
rating                                                                         # on 'ISBN' to get 'Book-Title'


Unnamed: 0,user_id,title,rating,ISBN
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6,002542730X
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
...,...,...,...,...
79308,234828,Ringworld,8,0345333926
79309,236283,Ringworld,0,0345333926
79310,249628,Ringworld,0,0345333926
79311,261829,Ringworld,0,0345333926


In [14]:
# Check for duplicate values
print(f'Duplicate entries: {rating.duplicated().sum()}')

Duplicate entries: 0


In [15]:
rating.drop_duplicates(inplace=True)
rating

Unnamed: 0,user_id,title,rating,ISBN
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6,002542730X
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
...,...,...,...,...
79308,234828,Ringworld,8,0345333926
79309,236283,Ringworld,0,0345333926
79310,249628,Ringworld,0,0345333926
79311,261829,Ringworld,0,0345333926


In [16]:
rating

Unnamed: 0,user_id,title,rating,ISBN
0,277427,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
1,3363,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
2,11676,Politically Correct Bedtime Stories: Modern Ta...,6,002542730X
3,12538,Politically Correct Bedtime Stories: Modern Ta...,10,002542730X
4,13552,Politically Correct Bedtime Stories: Modern Ta...,0,002542730X
...,...,...,...,...
79308,234828,Ringworld,8,0345333926
79309,236283,Ringworld,0,0345333926
79310,249628,Ringworld,0,0345333926
79311,261829,Ringworld,0,0345333926


In [17]:
book_pivot = rating.pivot_table(columns='user_id', index='title', values="rating")
#book_pivot.fillna(0, inplace=True)

In [18]:
book_pivot

user_id,254,2276,2766,3363,3757,4385,6251,6543,6575,7158,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
16 Lighthouse Road,,,,,,,,,,,...,,,,,,,,,,
1984,9.0,,,,,,,,,,...,10.0,,,,,,0.0,,,
1st to Die: A Novel,,,,,,,,9.0,,0.0,...,,,,,,,,,,
2010: Odyssey Two,,0.0,,,,,,,,,...,,,,,,,,,,
204 Rosewood Lane,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
You Belong To Me,,,,,,,,,,0.0,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,0.0,,,0.0,,,,...,,,,,,,0.0,,,
Zoya,,,,,,,,,,,...,,0.0,,,,,,,,
"\O\"" Is for Outlaw""",,,,,,,,,,,...,,,,,8.0,,,,,


In [19]:
book_pivot1 = rating.pivot_table(columns='user_id', index='ISBN', values="rating")
#book_pivot1.fillna(0, inplace=True)

In [20]:
book_pivot1

user_id,254,2276,2766,3363,3757,4385,6251,6543,6575,7158,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,,,,,,,,,,,...,,,0.0,,,,,,,
002026478X,,,,,,,,,,,...,,,,,,,,,,
0020442203,,,,,,,,,,,...,0.0,,,,,,,,,0.0
002542730X,,,,0.0,,,,,,,...,0.0,,,,,,,10.0,,
0028604199,,,,0.0,,,,,0.0,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257229534,,0.0,,,,,,,,,...,,,,,,,,,,
3404148665,,,,,,,,,,,...,,,,,,,,,,
3423202327,,,,,,,,,,,...,,,,,,,,,,
3442541751,,,,,,,,,,,...,,,,,,,,,,


# Normalization utility matrix

In [21]:
rating = rating[['user_id','ISBN','rating']]

In [22]:
rating

Unnamed: 0,user_id,ISBN,rating
0,277427,002542730X,10
1,3363,002542730X,0
2,11676,002542730X,6
3,12538,002542730X,10
4,13552,002542730X,0
...,...,...,...
79308,234828,0345333926,8
79309,236283,0345333926,0
79310,249628,0345333926,0
79311,261829,0345333926,0


In [23]:
len(rating['user_id'].unique())

686

In [24]:
len(rating['ISBN'].unique())

2101

In [25]:
rating.values

array([[277427, '002542730X', 10],
       [3363, '002542730X', 0],
       [11676, '002542730X', 6],
       ...,
       [249628, '0345333926', 0],
       [261829, '0345333926', 0],
       [264321, '0345333926', 8]], dtype=object)

In [26]:
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
book_sparse = csr_matrix(book_pivot1)

In [27]:
book_sparse

<2101x686 sparse matrix of type '<class 'numpy.float64'>'
	with 1380228 stored elements in Compressed Sparse Row format>

In [28]:
book_pivot1

user_id,254,2276,2766,3363,3757,4385,6251,6543,6575,7158,...,271705,273979,274004,274061,274301,274308,275970,277427,277639,278418
ISBN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
000649840X,,,,,,,,,,,...,,,0.0,,,,,,,
002026478X,,,,,,,,,,,...,,,,,,,,,,
0020442203,,,,,,,,,,,...,0.0,,,,,,,,,0.0
002542730X,,,,0.0,,,,,,,...,0.0,,,,,,,10.0,,
0028604199,,,,0.0,,,,,0.0,,...,,,,,,,,,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3257229534,,0.0,,,,,,,,,...,,,,,,,,,,
3404148665,,,,,,,,,,,...,,,,,,,,,,
3423202327,,,,,,,,,,,...,,,,,,,,,,
3442541751,,,,,,,,,,,...,,,,,,,,,,


In [29]:
Y_data = rating.values
Y_data

array([[277427, '002542730X', 10],
       [3363, '002542730X', 0],
       [11676, '002542730X', 6],
       ...,
       [249628, '0345333926', 0],
       [261829, '0345333926', 0],
       [264321, '0345333926', 8]], dtype=object)

In [30]:
len(Y_data[: , 0])

79313

In [31]:
first = Y_data[:, 0] # first col of the Y_data
sec = Y_data[:, 1]
n_users = int(np.max(Y_data[:, 0])) + 1
n_items = int(np.max(Y_data[:, 1])) + 1
Y = np.copy(Y_data)
mu = np.zeros((n_users,))

In [None]:

        for n in range(n_users):
            # row indices of rating done by user n
            # since indices need to be integers, we need to convert
            # ids = np.where(first == n)[0].astype(np.int32)
            ids = np.where(first == n)[0].astype(np.int32)
            # indices of all ratings associated with user n
            item_ids = Y_data[ids, 1] 
            # and the corresponding ratings 
            ratingsbook = Y_data[ids, 2]
            # take mean
            m = np.mean(ratingsbook) 
            if np.isnan(m):
                m = 0 # to avoid empty array and nan value
            mu[n] = m
            # normalize
            Y[ids,2] = ratingsbook - mu[n]

In [None]:
Y

In [None]:
book_pivot = pd.DataFrame(Y)
book_pivot

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
dist_func = cosine_similarity

## user-based1

In [None]:
book_pivot2 = book_pivot1.T
book_pivot2

In [None]:
dist_func = cosine_similarity

In [None]:
def standardize(row):
    new_row = (row-row.mean())
    return new_row
ratingustd = book_pivot2.apply(standardize)
ratingustdnan = ratingustd.copy(deep=True)
ratingustdnan 

In [None]:
ratingustd.fillna(0, inplace=True)
ratingustd

In [None]:
ratingustdnan

In [None]:
eps = 1e-6
user_sim = dist_func(ratingustd)
print(user_sim)

In [None]:
sparse_df = sparse.csr_matrix(ratingustd)
corrMatrix = pd.DataFrame(cosine_similarity(sparse_df),index=ratingustd.T.columns,columns=ratingustd.T.columns)
corrMatrix

In [None]:
user = 2276
sim_score = corrMatrix[user]
sim_score.sort_values(ascending = False)[1:4]

In [None]:
corrMatrix.values

In [None]:
Y_data

In [None]:
corrMatrix

In [None]:
corrMatrix.values[2,3]

In [None]:
book_pivot2

In [None]:
picked_u = 3363
picked_b = ['000649840X']
# Tìm các user đã rate book
picked = pd.DataFrame(ratingustdnan[picked_b].dropna(axis=0))
picked

In [None]:
# pick user similarity high 
pickuser_sim = pd.DataFrame(corrMatrix[picked_u])
pickuser_sim

In [None]:
k = 5
pickuser_id = pd.merge(left=picked,right=pickuser_sim, on='user_id', how='inner')

In [None]:
pickK_user = pickuser_id.sort_values(3363, ascending=False)[:k]
pickK_user

In [None]:
#caluculate rating prediction 
print(pickK_user.values[: ,0]*pickK_user.values[:,1])

In [None]:
np.sum(np.abs(pickK_user.values[: ,0]))

In [None]:
np.sum(pickK_user.values[: ,0]*pickK_user.values[:,1])

## User-based

In [None]:
Y

In [None]:
book_pivot2

In [None]:
#normalize user-item matrix
# axis = 1 cloumns
# axis = 0 index 
matrix_norm = book_pivot2.subtract(book_pivot2.mean(1), axis=0 )
matrix_norm

In [None]:
matrix_norm0 = matrix_norm.copy(deep=True)
matrix_norm0

In [None]:
matrix_norm.fillna(0, inplace=True)
matrix_norm.T

In [None]:
user_similarity = sparse.csr_matrix(matrix_norm)
user_similarity

In [None]:
user_sim_cosine = cosine_similarity(matrix_norm)
user_sim_cosine

In [None]:
u_sim = pd.DataFrame(cosine_similarity(user_similarity), index=matrix_norm.T.columns, columns=matrix_norm.T.columns)
u_sim

In [None]:
# number of similar users
k = 5

In [None]:
picked_u

In [None]:
# picked = pd.DataFrame(ratingustdnan[picked_b].dropna(axis=0))
# picked
similar_user = u_sim[picked_u].sort_values(ascending=False)[1:k+1]
similar_user

In [None]:
#narrow down item
similar_user_book = matrix_norm0[matrix_norm0.index.isin(similar_user.index)].dropna(axis=1, how='all')
similar_user_book

In [None]:
picked_userid_read = matrix_norm0[matrix_norm0.index == picked_u].dropna(axis=1, how='all')
picked_userid_read

Next we will drop the movies that user id have watch 

In [None]:
similar_user_book.drop(picked_userid_read.columns,axis=1, inplace = True,errors = 'ignore')
similar_user_book

In [None]:
similar_user_book.columns

Recommend book

In [None]:
item_score = {}

for i in similar_user_book.columns:
    book_rating = similar_user_book[i]
    # create a variable to score the score
    total = 0
    # variable to store the number of score
    count = 0
    for u in similar_user.index:
        if pd.isna(book_rating[u]) == False:
            score = similar_user[u] * book_rating[u]
            total +=score
            count +=1
    # get average score for item 
    item_score[i] = total/count
    
item_score = pd.DataFrame(item_score.items(), columns=['book', 'book_score'])
ranked_item_score = item_score.sort_values(by= 'book_score', ascending=False)
    
m=10 
ranked_item_score.head(m)
