In [None]:
import numpy as np
import pandas as pd

In [64]:
# Load the raw book, user and rating tables from the local ./data directory.
# low_memory=False is passed for Books.csv only; the other two use pandas defaults.
books = pd.read_csv(
    './data/Books.csv',
    low_memory=False,
)
users = pd.read_csv('./data/Users.csv')
ratings = pd.read_csv('./data/Ratings.csv')

In [None]:
# Preview the first five rows of the books table.
# Swap in one of the commented lines to inspect the other tables instead.
books.head(5)
# users.head(5)
# ratings.head(5)

In [None]:
# Inner-join every rating with the metadata of the book it refers to;
# ratings whose ISBN is absent from `books` are dropped by the join.
ratings_with_books = ratings.merge(books, how='inner', on='ISBN')
ratings_with_books

In [None]:
# using Bayesian Average to find the most popular books of all time
# Bayesian Average = (v/(v+m)) * R + (m/(v+m)) * C

# v = number of votes for the book
# m = minimum votes required to be listed in the chart
# R = average rating of the book
# C = mean rating across all books (the prior that books with few votes are pulled toward)

### Using Bayesian Average to find most popular books of all time

Formula for the Bayesian Average:

$$\text{Bayesian Average} = \frac{v}{v+m} \times \text{Average Rating} + \frac{m}{v+m} \times \text{Average Rating of All Books}$$

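where $v$ is the number of votes for the book, $m$ is the minimum number of votes required to be listed in the chart, "Average Rating" is the book's own mean rating, and "Average Rating of All Books" is the mean rating across all books.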

In [None]:
# v in the Bayesian average: how many (non-null) ratings each ISBN received.
number_of_rating = (
    ratings_with_books
    .groupby('ISBN')['Book-Rating']
    .count()
    .rename('number_of_rating')
    .reset_index()
)
number_of_rating

In [None]:
# R in the Bayesian average: the mean rating each ISBN received.
average_rating = (
    ratings_with_books
    .groupby('ISBN')['Book-Rating']
    .mean()
    .rename('average_rating')
    .reset_index()
)
average_rating

In [None]:
# C is the prior mean for the Bayesian average: the unweighted mean of the
# per-book average ratings, so every book counts once regardless of how many
# ratings it has.
# NOTE(review): this differs from the mean over all individual ratings
# (ratings_with_books['Book-Rating'].mean()) — confirm which one is intended.
C = average_rating['average_rating'].mean()
C

In [None]:
# M is the smoothing weight of the Bayesian average: a book with exactly M
# ratings has its own mean and the global mean C weighted equally.
M = 250

In [None]:
# Combine vote counts and mean ratings per ISBN, then score every book with
# the Bayesian average: (v / (v + M)) * R + (M / (v + M)) * C.
popular_books = (
    number_of_rating
    .merge(average_rating, on='ISBN')
    .assign(
        weighted_average=lambda scored: (
            ((scored['number_of_rating'] / (scored['number_of_rating'] + M)) * scored['average_rating'])
            + ((M / (scored['number_of_rating'] + M)) * C)
        )
    )
)
popular_books

In [None]:
# Show the ten books with the highest Bayesian-average score.
(
    popular_books
    .sort_values(by='weighted_average', ascending=False)
    .head(n=10)
)

In [None]:
# Attach the book metadata columns to the scored table, then keep the 25
# highest-scoring rows.
popular_books_with_title = pd.merge(popular_books, books, on='ISBN')

top_25_books = (
    popular_books_with_title
    .sort_values(by='weighted_average', ascending=False)
    .head(n=25)
)
top_25_books

In [None]:
# Active readers: users who have at least 200 ratings in ratings_with_books.
active_readers = (
    ratings_with_books
    .groupby('User-ID')['Book-Rating']
    .count()
    .rename('number_of_rating')
    .reset_index()
    .loc[lambda per_user: per_user['number_of_rating'] >= 200]
)
active_readers

In [None]:
# Keep only the ratings that were given by the active readers.
filtered_ratings = ratings_with_books.loc[
    lambda rated: rated['User-ID'].isin(active_readers['User-ID'])
]
filtered_ratings

In [None]:
# Books with at least 50 ratings from the active readers (the threshold is
# inclusive). Only the 'Book-Rating' column is counted, and the result is built
# in one expression so `known_books` is only ever an Index of ISBNs.
known_books = (
    filtered_ratings
    .groupby('ISBN')['Book-Rating']
    .count()
    .loc[lambda rating_counts: rating_counts >= 50]
    .index
)
known_books

In [None]:
# Restrict the active readers' ratings to the well-known books selected above.
final_ratings = filtered_ratings.loc[
    lambda rated: rated['ISBN'].isin(known_books)
]
final_ratings

In [None]:
# Book x user rating matrix: one row per ISBN, one column per User-ID.
# Cells with no rating come out of the pivot as NaN and are replaced by 0.
pt = pd.pivot_table(
    final_ratings,
    index='ISBN',
    columns='User-ID',
    values='Book-Rating',
).fillna(0)
pt

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Pairwise cosine similarity between the rows of pt (one row per ISBN), so
# row/column i of the result corresponds to pt.index[i].
similarity_scores = cosine_similarity(pt)

In [None]:
def recommend_books(book_id):
    """Return details of the books most similar to ``book_id``.

    Similarity is read from the precomputed ``similarity_scores`` matrix, whose
    rows and columns follow the order of ``pt.index``.

    Args:
        book_id: ISBN of the query book; it must be present in ``pt.index``.

    Returns:
        A list of at most 10 dicts, ordered from most to least similar. Each
        dict holds the book's first matching row from ``books`` plus
        ``number_of_rating`` and ``average_rating`` computed from
        ``ratings_with_books``. The query book itself is never included.

    Raises:
        IndexError: if ``book_id`` is not in ``pt.index``.
    """
    matches = np.where(pt.index == book_id)[0]
    if len(matches) == 0:
        # Same exception type as the bare [0][0] lookup raised, but the
        # message now names the actual cause.
        raise IndexError(f"book_id {book_id!r} not found in pt.index")
    index = matches[0]

    # Exclude the query book by position rather than assuming it sorts first:
    # a tie with another book (or an all-zero rating row, whose self-similarity
    # is 0) would otherwise return the query book as its own recommendation.
    candidates = [
        (position, score)
        for position, score in enumerate(similarity_scores[index])
        if position != index
    ]
    # list.sort is stable, so equally similar books keep their pt.index order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    # sending only 10 books
    isbns = [pt.index[position] for position, _ in candidates[:10]]

    # Aggregate the rating statistics for all recommended books in one pass
    # over ratings_with_books instead of two full scans per book.
    relevant = ratings_with_books[ratings_with_books['ISBN'].isin(isbns)]
    grouped = relevant.groupby('ISBN')['Book-Rating']
    counts = grouped.count()
    means = grouped.mean()

    books_with_details = []
    for isbn in isbns:
        book = books[books['ISBN'] == isbn].iloc[0].to_dict()
        # Defaults mirror what count()/mean() give for a book with no ratings.
        book['number_of_rating'] = counts.get(isbn, 0)
        book['average_rating'] = means.get(isbn, float('nan'))
        books_with_details.append(book)

    return books_with_details

In [None]:
# Example call: list the recommendations for a single ISBN.
recommend_books(book_id="0439136350")

In [None]:
import pickle

In [None]:
# Persist the top-25 table under ../api-server/model. The context manager
# closes (and flushes) the file handle even if pickling raises; the bare
# open(...) used before was never closed.
with open('../api-server/model/top_25_books.pkl', 'wb') as top_25_file:
    pickle.dump(top_25_books, top_25_file)

<!-- def recommend_books(book_id):
    index = pt.index.get_loc(book_id)

    similar_books = list(enumerate(similarity_scores[index]))
    similar_books = sorted(similar_books, key=lambda x: x[1], reverse=True)

    # sending only 10 books
    similar_books = similar_books[1:11]

    books_with_details = []

    for i in similar_books:
        book = books[books['ISBN'] == pt.index[i[0]]]
        book = book.iloc[0]
        book = book.to_dict()
        book['number_of_rating'] = ratings_with_books[ratings_with_books['ISBN'] == book['ISBN']]['Book-Rating'].count()
        book['average_rating'] = ratings_with_books[ratings_with_books['ISBN'] == book['ISBN']]['Book-Rating'].mean()
        books_with_details.append(book)

    return books_with_details -->


In [None]:
# Persist the objects used by recommend_books under ../api-server/model.
# Each file is opened in a context manager so its handle is closed (and the
# data flushed) even if pickling raises; the bare open(...) calls used before
# were never closed.
for _path, _artifact in (
    ('../api-server/model/pt.pkl', pt),  # pt
    ('../api-server/model/similarity_scores.pkl', similarity_scores),  # similar books
    ('../api-server/model/ratings_with_books.pkl', ratings_with_books),  # ratings_with_books
    ('../api-server/model/books.pkl', books),  # books
):
    with open(_path, 'wb') as _artifact_file:
        pickle.dump(_artifact, _artifact_file)