BASELINE

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

In [2]:
books = pd.read_csv('/content/books.csv.zip')
ratings = pd.read_csv('/content/ratings.csv.zip')
book_tags = pd.read_csv('/content/book_tags.csv.zip')
tags = pd.read_csv('/content/tags.csv')

In [3]:
books['original_publication_year'] = books['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

In [4]:
ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [5]:
new_ratings['title'] = books.set_index('id').title.loc[new_ratings.book_id].values

In [6]:
new_ratings.head(10)

Unnamed: 0,book_id,user_id,rating,title
0,1,314,5,"The Hunger Games (The Hunger Games, #1)"
1,1,439,3,"The Hunger Games (The Hunger Games, #1)"
2,1,588,5,"The Hunger Games (The Hunger Games, #1)"
3,1,1169,4,"The Hunger Games (The Hunger Games, #1)"
4,1,1185,4,"The Hunger Games (The Hunger Games, #1)"
5,1,2077,4,"The Hunger Games (The Hunger Games, #1)"
6,1,2487,4,"The Hunger Games (The Hunger Games, #1)"
7,1,2900,5,"The Hunger Games (The Hunger Games, #1)"
8,1,3662,4,"The Hunger Games (The Hunger Games, #1)"
9,1,3922,5,"The Hunger Games (The Hunger Games, #1)"


The Baseline Recommender provides generalized recommendations based on book popularity and sometimes genre. It prioritizes books that are both popular and critically acclaimed, assuming they're more likely to be enjoyed by the average audience.

In [7]:
v = books['ratings_count']
m = books['ratings_count'].quantile(0.95)
R = books['average_rating']
C = books['average_rating'].mean()
W = (R*v + C*m) / (v + m)


In [8]:
books['weighted_rating'] = W
qualified  = books.sort_values('weighted_rating', ascending=False).head(250)


In [9]:
#Top Books
qualified[['title', 'authors', 'average_rating', 'weighted_rating']].head(15)

Unnamed: 0,title,authors,average_rating,weighted_rating
24,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",4.61,4.555956
26,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",4.54,4.490428
17,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",4.53,4.48509
23,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",4.53,4.483227
1,Harry Potter and the Sorcerer's Stone (Harry P...,"J.K. Rowling, Mary GrandPré",4.44,4.424365
20,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",4.46,4.419054
30,The Help,Kathryn Stockett,4.45,4.405158
38,"A Game of Thrones (A Song of Ice and Fire, #1)",George R.R. Martin,4.45,4.398759
134,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,4.54,4.396645
421,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,4.74,4.391147


In [10]:
#Top Genres of Books
book_tags.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [11]:
tags.head()

Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-
2,2,--10-
3,3,--12-
4,4,--122-


In [12]:
genres = ["Art", "Biography", "Business", "Chick Lit", "Children's", "Christian", "Classics",
          "Comics", "Contemporary", "Cookbooks", "Crime", "Ebooks", "Fantasy", "Fiction",
          "Gay and Lesbian", "Graphic Novels", "Historical Fiction", "History", "Horror",
          "Humor and Comedy", "Manga", "Memoir", "Music", "Mystery", "Nonfiction", "Paranormal",
          "Philosophy", "Poetry", "Psychology", "Religion", "Romance", "Science", "Science Fiction",
          "Self Help", "Suspense", "Spirituality", "Sports", "Thriller", "Travel", "Young Adult"]


In [13]:
genres = list(map(str.lower, genres))
genres[:4]


['art', 'biography', 'business', 'chick lit']

In [14]:
available_genres = tags.loc[tags.tag_name.str.lower().isin(genres)]
available_genres.head()

Unnamed: 0,tag_id,tag_name
2938,2938,art
4605,4605,biography
5951,5951,business
7077,7077,christian
7457,7457,classics


In [15]:
available_genres_books = book_tags[book_tags.tag_id.isin(available_genres.tag_id)]
print('There are {} books that are tagged with above genres'.format(available_genres_books.shape[0]))

There are 60573 books that are tagged with above genres


In [16]:
available_genres_books['genre'] = available_genres.tag_name.loc[available_genres_books.tag_id].values
available_genres_books.head()

Unnamed: 0,goodreads_book_id,tag_id,count,genre
1,1,11305,37174,fantasy
5,1,11743,9954,fiction
25,1,7457,958,classics
38,1,22973,673,paranormal
52,1,20939,465,mystery


In [17]:
def build_chart(genre, percentile=0.85):
    df = available_genres_books[available_genres_books['genre'] == genre.lower()]
    qualified = books.set_index('book_id').loc[df.goodreads_book_id]

    v = qualified['ratings_count']
    m = qualified['ratings_count'].quantile(percentile)
    R = qualified['average_rating']
    C = qualified['average_rating'].mean()
    qualified['weighted_rating'] = (R*v + C*m) / (v + m)

    qualified.sort_values('weighted_rating', ascending=False, inplace=True)
    return qualified

In [18]:
cols = ['title','authors','original_publication_year','average_rating','ratings_count','work_text_reviews_count','weighted_rating']

In [19]:
genre = 'Fiction'
build_chart(genre)[cols].head(15)

Unnamed: 0_level_0,title,authors,original_publication_year,average_rating,ratings_count,work_text_reviews_count,weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
136251,Harry Potter and the Deathly Hallows (Harry Po...,"J.K. Rowling, Mary GrandPré",2007,4.61,1746574,51942,4.587098
862041,"Harry Potter Boxset (Harry Potter, #1-7)",J.K. Rowling,1998,4.74,190050,6508,4.544691
1,Harry Potter and the Half-Blood Prince (Harry ...,"J.K. Rowling, Mary GrandPré",2005,4.54,1678823,27520,4.518933
5,Harry Potter and the Prisoner of Azkaban (Harr...,"J.K. Rowling, Mary GrandPré, Rufus Beck",1999,4.53,1832823,36099,4.510997
6,Harry Potter and the Goblet of Fire (Harry Pot...,"J.K. Rowling, Mary GrandPré",2000,4.53,1753043,31084,4.510164
62291,"A Storm of Swords (A Song of Ice and Fire, #3)",George R.R. Martin,2000,4.54,469022,19497,4.471466
186074,The Name of the Wind (The Kingkiller Chronicle...,Patrick Rothfuss,2007,4.55,400101,28631,4.469922
1215032,"The Wise Man's Fear (The Kingkiller Chronicle,...",Patrick Rothfuss,2011,4.57,245686,15503,4.446163
18512,"The Return of the King (The Lord of the Rings,...",J.R.R. Tolkien,1955,4.51,463959,6644,4.444645
2,Harry Potter and the Order of the Phoenix (Har...,"J.K. Rowling, Mary GrandPré",2003,4.46,1735368,28685,4.442607


In [20]:
list(enumerate(available_genres.tag_name))

[(0, 'art'),
 (1, 'biography'),
 (2, 'business'),
 (3, 'christian'),
 (4, 'classics'),
 (5, 'comics'),
 (6, 'contemporary'),
 (7, 'cookbooks'),
 (8, 'crime'),
 (9, 'ebooks'),
 (10, 'fantasy'),
 (11, 'fiction'),
 (12, 'history'),
 (13, 'horror'),
 (14, 'manga'),
 (15, 'memoir'),
 (16, 'music'),
 (17, 'mystery'),
 (18, 'nonfiction'),
 (19, 'paranormal'),
 (20, 'philosophy'),
 (21, 'poetry'),
 (22, 'psychology'),
 (23, 'religion'),
 (24, 'romance'),
 (25, 'science'),
 (26, 'spirituality'),
 (27, 'sports'),
 (28, 'suspense'),
 (29, 'thriller'),
 (30, 'travel')]

In [22]:
idx = 12  #history
build_chart(list(available_genres.tag_name)[idx])[cols].head(15)

Unnamed: 0_level_0,title,authors,original_publication_year,average_rating,ratings_count,work_text_reviews_count,weighted_rating
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4667024,The Help,Kathryn Stockett,2009,4.45,1531753,78204,4.431319
21853621,The Nightingale,Kristin Hannah,2015,4.54,253606,37279,4.428014
8664353,"Unbroken: A World War II Story of Survival, Re...",Laura Hillenbrand,2010,4.4,487775,41572,4.351917
19063,The Book Thief,Markus Zusak,2005,4.36,1159741,93611,4.340421
11422,Redeeming Love,Francine Rivers,1991,4.48,157506,9147,4.337177
128029,A Thousand Splendid Suns,Khaled Hosseini,2007,4.34,818742,43645,4.314338
1923820,Holy Bible: King James Version,Anonymous,1611,4.43,159457,4438,4.302898
561909,The Hiding Place: The Triumphant True Story of...,"Corrie ten Boom, John Sherrill, Elizabeth Sher...",1971,4.42,171384,7801,4.302061
256008,Lonesome Dove,Larry McMurtry,1985,4.47,111498,5343,4.29363
4406,East of Eden,John Steinbeck,1952,4.35,320919,14454,4.28955
