In [None]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the books table from books.csv and preview its first rows.
books = pd.read_csv('books.csv', low_memory=False)
books.head()

In [None]:
# Load the ratings table from ratings.csv and preview its first rows.
ratings = pd.read_csv('ratings.csv', low_memory=False)
ratings.head()

In [None]:
# Load the tag definitions from tags.csv and preview the first rows.
tags = pd.read_csv('tags.csv', low_memory=False)
tags.head()

In [None]:
# Load the book-to-tag assignments from book_tags.csv and preview the first rows.
book_tags = pd.read_csv('book_tags.csv', low_memory=False)
book_tags.head()

In [None]:
# Load to_read.csv and preview the first rows.
# NOTE(review): this table is not used anywhere else in the visible notebook.
to_read = pd.read_csv('to_read.csv', low_memory=False)
to_read.head()

In [None]:
# Column names available in the books table.
books.columns

In [None]:
# Column names available in the ratings table.
ratings.columns

In [None]:
# Column names available in the tags table.
tags.columns

In [None]:
# Column names available in the book_tags table.
book_tags.columns

In [None]:
# Column names available in the to_read table.
to_read.columns

# Clean data set

Drop the `image_url` and `small_image_url` columns and remove rows that contain null values.

In [None]:
# Drop the image-URL columns first so that a missing URL alone does not cause a
# row to be discarded, then remove rows with a null in any remaining column.
# errors='ignore' keeps this cell re-runnable: `books` is reassigned here, so on
# a second run the two columns are already gone.
books = (
    books
    .drop(columns=['image_url', 'small_image_url'], errors='ignore')
    .dropna()
)

In [None]:
# (rows, columns) of the books table after cleaning.
books.shape

In [None]:
# Replace missing publication years with the sentinel -1 and store the years as
# integers. A vectorised cast does the same job as the former per-element
# lambda (int(-1) is already -1, so the special case was redundant).
books['original_publication_year'] = (
    books['original_publication_year'].fillna(-1).astype('int64')
)

In [None]:
# Discard exact duplicate rating rows.
ratings_rmv_duplicates = ratings.drop_duplicates()
# Count the remaining ratings per user and keep only the users with fewer than 3.
unwanted_users = (
    ratings_rmv_duplicates
    .groupby('user_id')['user_id']
    .count()
    .loc[lambda counts: counts < 3]
)
# Flag the rows that belong to one of those low-activity users.
from_unwanted_user = ratings_rmv_duplicates['user_id'].isin(unwanted_users.index)
unwanted_ratings = ratings_rmv_duplicates[from_unwanted_user]
# The cleaned ratings: duplicates removed and low-activity users excluded.
new_ratings = ratings_rmv_duplicates[~from_unwanted_user]

In [None]:
# (rows, columns) of the cleaned ratings table.
new_ratings.shape

In [None]:
# Preview the first 10 cleaned ratings.
new_ratings.head(10)

In [None]:
# Preview the first 10 cleaned books.
books.head(10)

In [None]:
# Attach book metadata to every rating. merge() defaults to an inner join, so
# ratings without a matching book (and books without ratings) are left out.
# NOTE(review): assumes `book_id` identifies the same thing in both tables — confirm.
bookMerge = books.merge(new_ratings,on='book_id')

In [None]:
# Preview the first 150 rows of the merged table.
bookMerge.head(150)

In [None]:
# Number of null values per column in the merged table.
bookMerge.isnull().sum()

==============================================================================================================================

In [None]:
# First rows when ordering by average rating, then ratings count, both descending.
books.sort_values(by = ['average_rating','ratings_count'], ascending = False).head()

In [None]:
# Keep only the title and the two rating columns used for the weighted-rating chart.
df_book = books[['original_title', 'average_rating', 'ratings_count']]
df_book

In [None]:
# Summary statistics of the numeric columns.
df_book.describe()

**Weighted rating**

v is the number of ratings the book has received (`ratings_count`)

m is the minimum number of ratings required to be listed in the chart (calculated below)

R is the average rating of the book (`average_rating`)

C is the mean of `average_rating` across all books (calculated below)

$$\text{weighted rating} = \frac{v}{v + m}\,R + \frac{m}{v + m}\,C$$

In [None]:
# m: the 90th percentile of ratings_count, used below as the minimum number of
# ratings a book needs to qualify for the chart.
m = books['ratings_count'].quantile(0.90)

In [None]:
# C: the mean of average_rating over all rows of books.
C = books['average_rating'].mean()

In [None]:
# Weighted rating of a book, blending its own average with the global mean
def weighted_rating(x, m=m, C=C):
    """Return the IMDB-style weighted rating for ``x``.

    ``x`` must support ``x['ratings_count']`` and ``x['average_rating']``.
    ``m`` is the ratings-count threshold and ``C`` the overall mean rating;
    both default to the module-level values at the time of definition.
    """
    votes = x['ratings_count']
    own_average = x['average_rating']
    total = votes + m

    # The more ratings a book has, the more its own average outweighs C.
    return (votes / total * own_average) + (m / total * C)

In [None]:
# Show the ratings-count threshold computed above.
m

A book qualifies only if its `ratings_count` is at least m, the 90th percentile of `ratings_count`.

In [None]:
# Collect the books that reach the ratings-count threshold into their own DataFrame
has_enough_ratings = df_book['ratings_count'] >= m
qualified_books = df_book.loc[has_enough_ratings].copy()
qualified_books

In [None]:
# Number of books that meet the threshold.
len(qualified_books)

Add the weighted rating as a new column

In [None]:
# weighted_rating only reads two columns and does arithmetic on them, so it can
# be applied to the whole frame at once instead of row by row with apply(axis=1).
qualified_books['weighted_rating'] = weighted_rating(qualified_books)


In [None]:
# Preview the qualified books together with their weighted rating.
qualified_books.head()

# Recommender: Top 10 Books

In [None]:

# Order the qualified books from highest to lowest weighted rating and show the top 10.
qualified_books = (
    qualified_books
    .loc[:, ['original_title', 'average_rating', 'ratings_count', 'weighted_rating']]
    .sort_values(by='weighted_rating', ascending=False)
)
qualified_books.head(n=10)

# Top Books by Genre

In [None]:
# Preview the first 10 book-to-tag assignments.
book_tags.head(10)

In [None]:
# Preview the first 10 tag definitions.
tags.head(10)

In [None]:
# Candidate genre names to look for among the tag names.
# NOTE(review): only names that match a tag_name exactly (ignoring case) are
# picked up later — confirm multi-word names match the spelling used in tags.csv.
genres = ["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
          "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction", 
          "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult"]

In [None]:
# Lower-case every genre name so it can be compared with lower-cased tag names.
genres = [name.lower() for name in genres]
genres[:4]

In [None]:
# Tags whose lower-cased name is one of the genre names.
available_genres = tags[tags['tag_name'].str.lower().isin(genres)]

In [None]:
# Preview up to 30 of the matched genre tags.
available_genres.head(30)

In [None]:
# Keep only the book-tag rows whose tag is one of the matched genre tags.
# .copy() makes the result an independent frame, so a column can be added to it
# later without writing into a slice of book_tags.
available_genres_books = book_tags[book_tags.tag_id.isin(available_genres.tag_id)].copy()

In [None]:
# Count distinct books: available_genres_books has one row per (book, tag)
# pair, so its row count would count a book once for every genre tag it has.
print('There are {} books that are tagged with above genres'.format(available_genres_books['goodreads_book_id'].nunique()))

In [None]:
# Preview the genre-tagged book rows.
available_genres_books.head()

In [None]:
# Look genre names up by tag_id value rather than by row label: indexing
# available_genres with .loc[tag_id] is only correct while the row index of
# `tags` happens to equal tag_id. Names are stored lower-cased because
# recommender_genre compares this column against genre.lower().
genre_by_tag_id = available_genres.set_index('tag_id')['tag_name'].str.lower()
# assign() returns a new frame, so nothing is written into a slice of book_tags
# and re-running the cell simply recomputes the column.
available_genres_books = available_genres_books.assign(
    genre=available_genres_books['tag_id'].map(genre_by_tag_id)
)
available_genres_books.head()

In [None]:
# Preview the books table again before joining it with the genre tags.
books.head()

In [None]:
def recommender_genre(genre, percentile=0.85):
    """Rank the books tagged with ``genre`` by weighted rating.

    Reads the module-level ``available_genres_books`` (columns ``genre`` and
    ``goodreads_book_id``) and ``books`` (columns ``book_id``,
    ``ratings_count`` and ``average_rating``).

    Parameters
    ----------
    genre : str
        Genre name; matched case-insensitively against the ``genre`` column.
    percentile : float, default 0.85
        Quantile of ``ratings_count`` among the genre's books that is used as
        the weight ``m`` in the weighted-rating formula.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``books`` indexed by ``book_id``, with an added
        ``weighted_rating`` column, sorted by that column in descending order.
        Empty when no book matches.
    """
    # Lower-case both sides so the match does not depend on how the genre
    # column happens to be capitalised.
    genre_rows = available_genres_books[
        available_genres_books['genre'].str.lower() == genre.lower()
    ]
    # Index by book_id so the tagged ids can be looked up by label.
    books_indexed = books.set_index('book_id')

    # Keep only ids that exist in `books`, and each id once, so a book tagged
    # more than once for the genre is not listed (or weighted) twice.
    # NOTE(review): this matches goodreads_book_id against books.book_id —
    # confirm both columns hold the same identifier.
    tagged_ids = genre_rows['goodreads_book_id']
    valid_book_ids = tagged_ids[tagged_ids.isin(books_indexed.index)].drop_duplicates()

    qualified = books_indexed.loc[valid_book_ids].copy()

    v = qualified['ratings_count']
    m = v.quantile(percentile)
    R = qualified['average_rating']
    C = R.mean()
    qualified['weighted_rating'] = (v / (v + m) * R) + (m / (m + v) * C)

    return qualified.sort_values('weighted_rating', ascending=False)

In [None]:
# Display the genre-tagged book rows.
available_genres_books

In [None]:
# Columns shown in the genre recommendation tables below.
cols = ['title','authors','original_publication_year','average_rating','ratings_count','work_text_reviews_count','weighted_rating']

In [None]:
# Top 15 books for the 'Fiction' genre, ranked by weighted rating.
genre = 'Fiction'
recommender_genre(genre)[cols].head(15)

In [None]:
# List the matched genre tag names together with their positions.
list(enumerate(available_genres.tag_name))

In [None]:
# Find the romance genre by name instead of a hard-coded position, which would
# silently point at a different genre if the order of the tags ever changed.
genre_names = list(available_genres.tag_name)
idx = [name.lower() for name in genre_names].index('romance')
recommender_genre(genre_names[idx])[cols].head(15)

In [None]:
# List the genre positions again as a reference for the interactive choice below.
list(enumerate(available_genres.tag_name))

# Top 20 Books by Genre

In [None]:
# Ask which genre to rank; the number is a position in the list of genre names.
genre_choices = list(available_genres.tag_name)
idx = int(input("choose one genres : "))
# Reject out-of-range positions explicitly: a negative number would otherwise
# silently wrap around and select a genre from the end of the list.
if not 0 <= idx < len(genre_choices):
    raise IndexError(
        'genre index must be between 0 and {}, got {}'.format(len(genre_choices) - 1, idx)
    )
recommender_genre(genre_choices[idx])[cols].head(20)