In [2]:
import os
import html
import re
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Custom libraries - must be in same directory
import xml_to_dict
import get_book_tags
import get_bookid_mapper

In [13]:
def clean_string(s):
    """Strip HTML markup from a text field and lowercase the result.

    Falsy input (``None`` or an empty string) is handed back unchanged,
    because a book is often missing a feature.
    """
    if s:
        # Decode entities first so that escaped markup is stripped as well
        decoded = html.unescape(s)
        return re.sub(r'<[^>]+>', '', decoded).lower()
    return s

In [53]:
# Function that takes book feature similarity matrices as input and returns the titles of the most similar books
def get_recommendations(df, indices, title, similarities, weights, n=20):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with a 'title' column. Rows are selected by position, so
        row order must match the row order of every similarity matrix.
    indices : pandas.Series
        Maps a book title to its book id. The row position is taken to be
        ``id - 1``, i.e. ids are expected to be 1-based and contiguous in
        row order.
    title : str
        Title of the book to find recommendations for.
    similarities : sequence of 2-D array-likes
        Pairwise similarity matrices, one per feature set.
    weights : sequence of numbers
        One weight per similarity matrix, matched by position. Matrices
        beyond ``len(weights)`` are ignored.
    n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``n`` books, highest combined score first. The
        queried book itself is never part of the result.
    """
    book_id = indices[title]
    # A title shared by several books yields a Series; use the first match
    if isinstance(book_id, pd.Series):
        book_id = book_id.iloc[0]
    idx = int(book_id) - 1

    # Get the total number of books
    num_books = len(similarities[0])

    # Weighted sum of this book's similarity row across all feature sets
    combined = np.zeros(num_books, dtype=float)
    for j, weight in enumerate(weights):
        combined += np.asarray(similarities[j][idx], dtype=float) * weight

    # Highest score first; the stable sort keeps tied books in row order
    ranked = np.argsort(-combined, kind='stable')

    # Drop the queried book by position rather than assuming it sorts first
    # (a tie would otherwise drop a different book and keep the query itself)
    ranked = ranked[ranked != idx][:max(n, 0)]

    return df['title'].iloc[ranked]

In [15]:
# Read in book metadata from the XML files and return a list of per-book feature dictionaries
def get_books(metadata_directory, goodreads_to_bookid, book_tags):
    """Build one feature dictionary per file found in ``metadata_directory``.

    Each file is parsed with ``xml_to_dict.dict_from_xml_file``. The resulting
    dictionary holds the mapped book id, the title, the name of the first
    listed author, the cleaned description, and two "soup" strings built
    from the popular-shelf counts and the tag counts.

    Returns the list of dictionaries, in directory-listing order.
    """
    normalizing_value = 5

    def build_soup(counts):
        # Repeat each term once per `normalizing_value` counts, space-prefixed
        return ''.join(
            ' ' + term
            for term, count in counts.items()
            for _ in range(int(count) // normalizing_value)
        )

    books = []
    for entry in os.listdir(metadata_directory):
        filename = metadata_directory + '/' + os.fsdecode(entry)
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)
        record = raw_book['book']

        goodreads_id = record['id']
        book_id = goodreads_to_bookid[goodreads_id]
        title = record['title']

        # A single author arrives as a dict; otherwise keep only the first
        # (main) entry
        author = record['authors']['author']
        if isinstance(author, dict):
            author_name = author['name']
        else:
            author_name = author[0]['name']

        description = clean_string(record['description'])

        # Turn popular shelves, then book tags, into soup
        shelf_soup = build_soup(popular_shelves)
        tag_soup = build_soup(book_tags[goodreads_id])

        books.append({
            'id': book_id,
            'title': title,
            'author': author_name,
            'description': description,
            'popular_shelves': shelf_soup,
            'tags': tag_soup,
        })
    return books

In [18]:
# Read in the id mapper and the book tags via the project-local helper modules.
# get_books indexes goodreads_to_bookid by Goodreads id to obtain the book id,
# and indexes book_tags by Goodreads id to obtain that book's tag -> count pairs.
goodreads_to_bookid = get_bookid_mapper.get_mapper('../../goodbooks-10k/books.csv')
book_tags = get_book_tags.get_tags('../../goodbooks-10k/book_tags_with_bookid.csv', '../../goodbooks-10k/tags.csv')

In [19]:
# Parse every file in the metadata directory into a list of feature
# dictionaries (one dictionary per book)
metadata_directory = '../../goodbooks-10k/books_xml/books_xml'
books = get_books(metadata_directory, goodreads_to_bookid, book_tags)
# Display how many books were loaded
len(books)

10000

In [37]:
# Build the book table: integer ids, rows ordered by id, id used as the index
df = (
    pd.DataFrame(books)
    .astype({'id': int})
    .sort_values(by=['id'])
    .set_index('id')
)
df.head()

Unnamed: 0_level_0,author,description,popular_shelves,tags,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Suzanne Collins,winning will make you famous. losing means cer...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...,"The Hunger Games (The Hunger Games, #1)"
2,J.K. Rowling,harry potter's life is miserable. his parents ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,Harry Potter and the Sorcerer's Stone (Harry P...
3,Stephenie Meyer,about three things i was absolutely positive.f...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...,"Twilight (Twilight, #1)"
4,Harper Lee,the unforgettable novel of a childhood in a sl...,classics classics classics classics classics ...,classics classics classics classics classics ...,To Kill a Mockingbird
5,F. Scott Fitzgerald,"the great gatsby, f. scott fitzgerald’s third ...",classics classics classics classics classics ...,classics classics classics classics classics ...,The Great Gatsby


In [38]:
# Count unique authors (get_books keeps only the first listed author per book)
df['author'].nunique()

3888

In [103]:
# Replace missing descriptions with an empty string before vectorizing
df['description'] = df['description'].fillna('')

# Vectorize the text features with TF-IDF, dropping English stop words.
# NOTE: the same vectorizer object is re-fit by each fit_transform call below,
# so after this cell `tfidf` only reflects the last fit (the tags column).
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
tfidf_matrix_tags = tfidf.fit_transform(df['tags'])

# One-hot encode the author column with pandas and store it as a scipy CSR matrix
count_matrix_author = pd.get_dummies(df['author'])
count_matrix_author = scipy.sparse.csr_matrix(count_matrix_author.values)

# Compute the pairwise cosine similarity of each feature matrix against itself
cosine_sim_description = cosine_similarity(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = cosine_similarity(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_tags = cosine_similarity(tfidf_matrix_tags, tfidf_matrix_tags)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

In [87]:
# Horizontally stack the description, shelves and tags TF-IDF matrices
# (the author matrix is not part of the combined matrix)
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])
feature_matrix.shape

(10000, 82203)

In [88]:
# Reduce the combined feature matrix to 1000 components with truncated SVD
# (random_state is pinned to 42)
svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
feature_matrix_reduced = svd.fit_transform(feature_matrix)

In [104]:
# Compute the pairwise cosine similarity matrix of the combined, reduced
# feature matrix against itself
cosine_sim = cosine_similarity(feature_matrix_reduced, feature_matrix_reduced)

In [90]:
# Construct a reverse map from book title to book id.
# Series.drop_duplicates() compares the values (the ids), so it cannot remove
# repeated titles; de-duplicate on the title index instead and keep the first
# book for each title, so that indices[title] is always a single id.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [105]:
# Get recs by combining the individual feature similarity matrices
title = 'The Great Gatsby'

# Weights line up with `similarities` by position; the author matrix has
# weight 0 here, so it does not contribute to the combined score
similarities = [cosine_sim_description, cosine_sim_shelves, cosine_sim_tags, cosine_sim_author]
weights = [1, 1, 1, 0]
recs = get_recommendations(df, indices, title, similarities, weights)
recs

id
32                                   Of Mice and Men
3255                        The Beautiful and Damned
58                The Adventures of Huckleberry Finn
131                              The Grapes of Wrath
8                             The Catcher in the Rye
130                          The Old Man and the Sea
28                                 Lord of the Flies
160                               Great Expectations
4                              To Kill a Mockingbird
194                          Moby-Dick or, The Whale
730                                        The Pearl
264                               The Sun Also Rises
63                                 Wuthering Heights
225                                     East of Eden
116                     The Adventures of Tom Sawyer
83                              A Tale of Two Cities
559                        Tess of the D'Urbervilles
1094                                      My Ántonia
6155    Heart of Darkness and Selected Shor

In [106]:
# Get recs using only the similarity matrix of the reduced combined features
# (`title` must already be defined)
similarities = [cosine_sim]
weights = [1]
recs = get_recommendations(df, indices, title, similarities, weights)
recs

id
3255                             The Beautiful and Damned
730                                             The Pearl
194                               Moby-Dick or, The Whale
160                                    Great Expectations
58                     The Adventures of Huckleberry Finn
32                                        Of Mice and Men
1094                                           My Ántonia
129                       One Flew Over the Cuckoo's Nest
131                                   The Grapes of Wrath
8                                  The Catcher in the Rye
63                                      Wuthering Heights
301                                     Heart of Darkness
28                                      Lord of the Flies
478                                       Robinson Crusoe
650                                     Uncle Tom's Cabin
225                                          East of Eden
7620                  Ethan Frome and Other Short Fiction
14         

In [93]:
# Save the reduced feature matrix to a .npy file in the working directory
filename = 'feature_matrix_reduced.npy'
np.save(filename, feature_matrix_reduced)
feature_matrix_reduced.shape

(10000, 1000)

In [66]:
# Verify saving and loading works: reload the matrix from `filename` and
# display its shape. This overwrites the in-memory feature_matrix_reduced.
feature_matrix_reduced = np.load(filename)
feature_matrix_reduced.shape

(10000, 1000)