The hybrid recommender combines content-based and collaborative filtering techniques. Given a user ID and a book title, it outputs similar books sorted based on expected ratings by that user, integrating personalized preferences with item similarities.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

import warnings
warnings.filterwarnings('ignore')

In [4]:
books = pd.read_csv('/content/books.csv.zip')
ratings = pd.read_csv('/content/ratings.csv.zip')
book_tags = pd.read_csv('/content/book_tags.csv.zip')
tags = pd.read_csv('/content/tags.csv')

In [6]:
books['original_publication_year'] = books['original_publication_year'].fillna(-1).apply(lambda x: int(x) if x != -1 else -1)

ratings_rmv_duplicates = ratings.drop_duplicates()
unwanted_users = ratings_rmv_duplicates.groupby('user_id')['user_id'].count()
unwanted_users = unwanted_users[unwanted_users < 3]
unwanted_ratings = ratings_rmv_duplicates[ratings_rmv_duplicates.user_id.isin(unwanted_users.index)]
new_ratings = ratings_rmv_duplicates.drop(unwanted_ratings.index)

In [23]:
! pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3162992 sha256=98d70f47ab5a3ab8452f05f959a483d59e66aed11e2eee4406468902ee8ccce3
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [28]:
from surprise.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD

reader = Reader()
data = Dataset.load_from_df(new_ratings[['user_id', 'book_id', 'rating']], reader)

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

books['authors'] = books['authors'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])


def get_genres(x):
    t = book_tags[book_tags.goodreads_book_id==x]
    return [i.lower().replace(" ", "") for i in tags.tag_name.loc[t.tag_id].values]
books['genres'] = books.book_id.apply(get_genres)

books['soup'] = books.apply(lambda x: ' '.join([x['title']] + x['authors'] + x['genres']), axis=1)


count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(books['soup'])

cosine_sim = cosine_similarity(count_matrix, count_matrix)
indices = pd.Series(books.index, index=books['title'])
titles = books['title']

def hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]

    df = books.iloc[book_indices][['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating']]
    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)
    df = df.sort_values('est', ascending=False)
    return df.head(n)

hybrid(4, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
80,7445,The Glass Castle,2005,621099,4.24,4.009757
382,1241,A Million Little Pieces,2003,184241,3.62,3.942398
4038,6365221,Mennonite in a Little Black Dress: A Memoir of...,2009,23096,3.17,3.830599
604,40173,"Are You There, Vodka? It's Me, Chelsea",2007,127096,3.85,3.830599
3984,46190,Love Is a Mix Tape,2007,21971,3.83,3.830599
744,12868761,Let's Pretend This Never Happened: A Mostly Tr...,2012,118475,3.9,3.830599
4724,13642929,My Beloved World,2013,17742,4.03,3.830599
5702,316558,Kabul Beauty School: An American Woman Goes Be...,2007,17002,3.63,3.830599
2803,18039963,A House in the Sky,2013,29369,4.2,3.830599
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.830599


In [29]:
hybrid(10, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,est
80,7445,The Glass Castle,2005,621099,4.24,3.977734
3118,3465,Three Weeks With My Brother,2004,32208,4.01,3.864404
4038,6365221,Mennonite in a Little Black Dress: A Memoir of...,2009,23096,3.17,3.849192
4079,1918305,The Geography of Bliss: One Grump's Search for...,2008,23023,3.83,3.849192
3984,46190,Love Is a Mix Tape,2007,21971,3.83,3.849192
744,12868761,Let's Pretend This Never Happened: A Mostly Tr...,2012,118475,3.9,3.849192
4724,13642929,My Beloved World,2013,17742,4.03,3.849192
5702,316558,Kabul Beauty School: An American Woman Goes Be...,2007,17002,3.63,3.849192
2803,18039963,A House in the Sky,2013,29369,4.2,3.849192
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.849192


In [30]:
def improved_hybrid(user_id, title, n=10):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:51]
    book_indices = [i[0] for i in sim_scores]

    df = books.iloc[book_indices][['book_id', 'title', 'ratings_count', 'average_rating', 'original_publication_year']]
    v = df['ratings_count']
    m = df['ratings_count'].quantile(0.60)
    R = df['average_rating']
    C = df['average_rating'].mean()
    df['weighted_rating'] = (R*v + C*m) / (v + m)

    df['est'] = df['book_id'].apply(lambda x: svd.predict(user_id, x).est)

    df['score'] = (df['est'] + df['weighted_rating']) / 2
    df = df.sort_values('score', ascending=False)
    return df[['book_id', 'title', 'original_publication_year', 'ratings_count', 'average_rating', 'score']].head(n)

In [31]:
improved_hybrid(4, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
80,7445,The Glass Castle,2005,621099,4.24,4.114372
328,2318271,The Last Lecture,2008,241869,4.25,4.014882
198,12691,Marley and Me: Life and Love With the World's ...,2005,367304,4.12,3.963863
1669,104189,Same Kind of Different as Me,2005,52964,4.21,3.950569
2803,18039963,A House in the Sky,2013,29369,4.2,3.924032
753,6366437,Half Broke Horses,2008,110597,4.05,3.918108
1067,29209,The Color of Water: A Black Man's Tribute to H...,1996,80906,4.06,3.915971
6286,8564644,Little Princes: One Man's Promise to Bring Hom...,2010,14765,4.25,3.905824
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.902792
4593,31845516,Love Warrior,2016,20094,4.1,3.892257


In [32]:
improved_hybrid(10, 'Eat, Pray, Love')

Unnamed: 0,book_id,title,original_publication_year,ratings_count,average_rating,score
80,7445,The Glass Castle,2005,621099,4.24,4.09836
328,2318271,The Last Lecture,2008,241869,4.25,4.024178
198,12691,Marley and Me: Life and Love With the World's ...,2005,367304,4.12,3.97316
1669,104189,Same Kind of Different as Me,2005,52964,4.21,3.959865
2803,18039963,A House in the Sky,2013,29369,4.2,3.933329
753,6366437,Half Broke Horses,2008,110597,4.05,3.927404
1067,29209,The Color of Water: A Black Man's Tribute to H...,1996,80906,4.06,3.925268
6286,8564644,Little Princes: One Man's Promise to Bring Hom...,2010,14765,4.25,3.915121
2701,6114607,"The Midwife: A Memoir of Birth, Joy, and Hard ...",2002,19176,4.17,3.912088
4593,31845516,Love Warrior,2016,20094,4.1,3.901553
