# Content based recommendation system

## Background

## Set-up

### Dependencies

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Import data

In [2]:
books = pd.read_csv('../data/raw/books.csv')
to_read = pd.read_csv('../data/raw/to_read.csv')
ratings = pd.read_csv('../data/raw/ratings.csv')
book_tags = pd.read_csv('../data/raw/book_tags.csv')
tags = pd.read_csv('../data/raw/tags.csv')

## Data exploration

### books

In [3]:
# Dimensions
print(f"Number of records: {books.shape[0]}")
print(f"Number of columns: {books.shape[1]}")
books.head(2)

Number of records: 10000
Number of columns: 23


Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...


In [4]:
# Data types
display(books.dtypes.value_counts())
print("\n")
books.dtypes

int64      13
object      7
float64     3
Name: count, dtype: int64





book_id                        int64
goodreads_book_id              int64
best_book_id                   int64
work_id                        int64
books_count                    int64
isbn                          object
isbn13                       float64
authors                       object
original_publication_year    float64
original_title                object
title                         object
language_code                 object
average_rating               float64
ratings_count                  int64
work_ratings_count             int64
work_text_reviews_count        int64
ratings_1                      int64
ratings_2                      int64
ratings_3                      int64
ratings_4                      int64
ratings_5                      int64
image_url                     object
small_image_url               object
dtype: object

In [5]:
# Missing values
books.isnull().sum()[(books.isnull().sum() > 1)]

isbn                          700
isbn13                        585
original_publication_year      21
original_title                585
language_code                1084
dtype: int64

In [6]:
# Duplicate values
books[books.duplicated()]

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url


### ratings

In [7]:
# Dimensions
print(f"Number of records: {ratings.shape[0]}")
print(f"Number of columns: {ratings.shape[1]}")
ratings.head(2)

Number of records: 5976479
Number of columns: 3


Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4


In [8]:
# Data types
display(ratings.dtypes.value_counts())
print("\n")
ratings.dtypes

int64    3
Name: count, dtype: int64





user_id    int64
book_id    int64
rating     int64
dtype: object

In [9]:
# Missing values
ratings.isnull().sum()[(ratings.isnull().sum() > 1)]

Series([], dtype: int64)

In [10]:
# Duplicate values
ratings[ratings.duplicated()]

Unnamed: 0,user_id,book_id,rating


### to read

In [11]:
# Dimensions
print(f"Number of records: {to_read.shape[0]}")
print(f"Number of columns: {to_read.shape[1]}")
to_read.head(2)

Number of records: 912705
Number of columns: 2


Unnamed: 0,user_id,book_id
0,9,8
1,15,398


In [12]:
# Data types
display(to_read.dtypes.value_counts())
print("\n")
to_read.dtypes

int64    2
Name: count, dtype: int64





user_id    int64
book_id    int64
dtype: object

In [13]:
# Missing values
to_read.isnull().sum()[(to_read.isnull().sum() > 1)]

Series([], dtype: int64)

In [14]:
# Duplicate values
to_read[to_read.duplicated()]

Unnamed: 0,user_id,book_id


### book tags

In [15]:
# Dimensions
print(f"Number of records: {book_tags.shape[0]}")
print(f"Number of columns: {book_tags.shape[1]}")
book_tags.head(2)

Number of records: 999912
Number of columns: 3


Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174


In [16]:
# Data types
display(book_tags.dtypes.value_counts())
print("\n")
book_tags.dtypes

int64    3
Name: count, dtype: int64





goodreads_book_id    int64
tag_id               int64
count                int64
dtype: object

In [17]:
# Missing values
book_tags.isnull().sum()[(book_tags.isnull().sum() > 1)]

Series([], dtype: int64)

In [18]:
# Duplicate values
book_tags[book_tags.duplicated()]

Unnamed: 0,goodreads_book_id,tag_id,count
159371,22369,25148,4
265128,52629,10094,1
265140,52629,2928,1
265155,52629,13272,1
265187,52629,13322,1
308771,77449,25148,7


In [19]:
# Remove duplicates
book_tags = book_tags.loc[book_tags.duplicated() == False, :]

### tags

In [20]:
# Dimensions
print(f"Number of records: {tags.shape[0]}")
print(f"Number of columns: {tags.shape[1]}")
tags.head(2)

Number of records: 34252
Number of columns: 2


Unnamed: 0,tag_id,tag_name
0,0,-
1,1,--1-


In [21]:
# Data types
display(tags.dtypes.value_counts())
print("\n")
tags.dtypes

int64     1
object    1
Name: count, dtype: int64





tag_id       int64
tag_name    object
dtype: object

In [22]:
# Missing values
tags.isnull().sum()[(tags.isnull().sum() > 1)]

Series([], dtype: int64)

In [23]:
# Duplicate values
tags[tags.duplicated()]

Unnamed: 0,tag_id,tag_name


## Model 1: Item similarity

### Pre-processing

In [24]:
# Preprocess data
books['title'] = books['title'].str.lower()
books['authors'] = books['authors'].str.lower()

### Feature extraction / similarity scores

In [25]:
# Combine relevant features (book title and author) into a single column
books['features'] = books['title'] + ' ' + books['authors']
books.loc[0, 'features']

'the hunger games (the hunger games, #1) suzanne collins'

In [26]:
# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['features'])

# Cosine similarities between books
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

### Recommendation system

In [27]:
def get_content_based_recommendations(book_title, books, similarity_scores, top_k):
    """Function to get content-based book recommendations for a given book title
    """
    # Find the index of the book in the dataframe
    idx = books.index[books['title'] == book_title].tolist()[0]

    # Get the pairwise similarity scores
    book_similarities = list(enumerate(similarity_scores[idx]))

    # Sort books based on similarity scores
    book_similarities = sorted(book_similarities, key=lambda x: x[1], reverse=True)

    # Exclude the book itself (most similar will be the book itself)
    book_similarities = book_similarities[1:]
    
    # Get top k similar books
    similar_books = book_similarities[0:top_k]
    
    # Extract book titles
    recommended_books = books.loc[[i for i, s in similar_books], 'title'].tolist()

    return recommended_books

In [28]:
# Example
book_title = 'the hunger games (the hunger games, #1)'
content_based_recommendations = get_content_based_recommendations(book_title, books, cosine_similarities, 5)

# Display recommendations
print(f"Content-based recommendations for '{book_title}':\n")
for i, title in enumerate(content_based_recommendations):
    print(i+1, title)

Content-based recommendations for 'the hunger games (the hunger games, #1)':

1 the hunger games trilogy boxset (the hunger games, #1-3)
2 catching fire (the hunger games, #2)
3 mockingjay (the hunger games, #3)
4 the world of the hunger games (hunger games trilogy)
5 the hunger games tribute guide


### Evaluation

## Model 2: Item-user profile similarity

### Pre-processing

In [29]:
# Book title lookup
book_lookup = books.groupby("book_id")["title"].first()
book_lookup = book_lookup.reset_index()
# book_lookup.head(2)

# Add book title to ratings df
ratings = ratings.join(book_lookup, on="book_id", how="left", rsuffix="_right")
ratings = ratings.drop(columns=["book_id_right"])
ratings.head(2)

Unnamed: 0,user_id,book_id,rating,title
0,1,258,5,"the scorch trials (maze runner, #2)"
1,2,4081,4,"the mistake (off-campus, #2)"


In [30]:
# Book tag lookup
tags["tag_name"] = tags["tag_name"].str.lower()
tag_lookup = book_tags.join(tags, on="tag_id", how="left", rsuffix="_right")
tag_lookup = tag_lookup.drop(columns=["tag_id_right"])
tag_lookup = tag_lookup.groupby("goodreads_book_id")["tag_name"].apply(list)
tag_lookup = tag_lookup.apply(lambda x: " ".join(list(x))).reset_index()
# tag_lookup.head(5)

# Add tags to books df
books = books.join(tag_lookup, on="goodreads_book_id", how="left", rsuffix="_right")
books = books.drop(columns=["goodreads_book_id_right"])
books.head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,features,tag_name
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,suzanne collins,2008.0,The Hunger Games,...,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"the hunger games (the hunger games, #1) suzann...",
1,2,3,3,4640799,491,439554934,9780440000000.0,"j.k. rowling, mary grandpré",1997.0,Harry Potter and the Philosopher's Stone,...,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,harry potter and the sorcerer's stone (harry p...,favorites fantasy currently-reading young-adul...


### Feature extraction / similarity scores

In [31]:
# Combine relevant features (book title, author, and tags) into a single column
books['features_2'] = books['title'] + ' ' + books['authors'] + ' ' + books['tag_name'].fillna(' ')
books.head(2)

Unnamed: 0,book_id,goodreads_book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url,features,tag_name,features_2
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,suzanne collins,2008.0,The Hunger Games,...,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...,"the hunger games (the hunger games, #1) suzann...",,"the hunger games (the hunger games, #1) suzann..."
1,2,3,3,4640799,491,439554934,9780440000000.0,"j.k. rowling, mary grandpré",1997.0,Harry Potter and the Philosopher's Stone,...,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...,harry potter and the sorcerer's stone (harry p...,favorites fantasy currently-reading young-adul...,harry potter and the sorcerer's stone (harry p...


In [32]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(books['features_2'])

# Calculate cosine similarities between books
cosine_similarities = cosine_similarity(tfidf_matrix, tfidf_matrix)

### Recommender system

In [51]:
def get_content_based_recommendations(user_id, ratings, books, similarity_scores, top_k):
    """Function to get content-based book recommendations for a given user profile
    """
    # Filter the dataframe for the given user's reading history & join into a single string
    user_history_features = " ".join(ratings.loc[ratings["user_id"] == user_id, "title"].values)
    
    # TF-IDF Vectorization for the user's reading history
    user_vector = tfidf_vectorizer.transform([user_history_features])
    print(user_vector)
    
    # Calculate cosine similarities between the user's reading history and all books
    book_user_similarities = list(enumerate(cosine_similarity(user_vector, tfidf_matrix).flatten()))
    print(book_user_similarities)
    
    # Sort books based on similarity scores
    book_user_similarities = sorted(book_user_similarities, key=lambda x: x[1], reverse=True)

    # Exclude the book itself (most similar will be the book itself)
    book_user_similarities = book_user_similarities[1:]
    
    # Get top k similar books
    recommended_books = book_user_similarities[0:top_k]

    return recommended_books

In [52]:
# Example: Get content-based recommendations for a specific user
content_based_recommendations = get_content_based_recommendations(1, ratings, books, cosine_similarities, 5)
# # Display recommendations
# print(f"Content-based recommendations for User {user_id}: {content_based_recommendations}")
# content_based_recommendations

  (0, 16116)	0.09219922856325664
  (0, 15908)	0.04754541313583311
  (0, 15898)	0.1384239174493154
  (0, 15897)	0.05475249343293524
  (0, 15864)	0.046777213968847306
  (0, 15858)	0.03684060155550634
  (0, 15822)	0.036120544494280614
  (0, 15819)	0.039173221265137816
  (0, 15807)	0.04609961428162832
  (0, 15719)	0.039173221265137816
  (0, 15711)	0.039996472986842266
  (0, 15677)	0.07018097863479364
  (0, 15657)	0.040957526460168
  (0, 15531)	0.04122515369415768
  (0, 15496)	0.10950498686587049
  (0, 15486)	0.05076485370089128
  (0, 15482)	0.08917671117363875
  (0, 15478)	0.02753067745228939
  (0, 15457)	0.05076485370089128
  (0, 15377)	0.05475249343293524
  (0, 15160)	0.048432233991259795
  (0, 15092)	0.06780369654211958
  (0, 15034)	0.05475249343293524
  (0, 14971)	0.0389845831173349
  (0, 14939)	0.05475249343293524
  :	:
  (0, 1887)	0.07724981350107306
  (0, 1846)	0.040227717596637264
  (0, 1791)	0.09686446798251959
  (0, 1698)	0.04754541313583311
  (0, 1685)	0.09208012845532973
  (0, 