In [127]:
import os
import html
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Custom libraries - must be in same directory
import xml_to_dict

In [126]:
def clean_string(s):
    """Normalize a raw text field: decode HTML entities, strip tags, lowercase.

    Falsy inputs (``None`` or ``''``) are handed back unchanged, because a book
    is frequently missing a given feature.
    """
    if s:
        # Decode entities before stripping so that escaped markup such as
        # ``&lt;b&gt;`` is also removed by the tag pattern.
        decoded = html.unescape(s)
        without_tags = re.sub(r'<[^>]+>', '', decoded)
        return without_tags.lower()

    return s

In [136]:
# Function that takes in a book title as input and outputs the most similar books
def get_recommendations(df, indices, title, cosine_sim_desc, cosine_sim_shelves, cosine_sim_author, top_n=19):
    """Return the titles of the books most similar to ``title``.

    Parameters
    ----------
    df : DataFrame with a ``'title'`` column, indexed positionally by ``.iloc``.
    indices : mapping (e.g. a Series) from book title to row position.
    title : title of the book to find recommendations for.
    cosine_sim_desc, cosine_sim_shelves, cosine_sim_author :
        pairwise similarity matrices; row ``i`` holds the similarity of book
        ``i`` to every book, for the description, shelves and author features.
    top_n : maximum number of recommendations to return. The default of 19
        matches what the previous ``[1:20]`` slice produced.

    Returns
    -------
    Series of titles, ordered from most to least similar. The queried book
    itself is never included.

    Raises
    ------
    KeyError if ``title`` is not present in ``indices``.
    ValueError if ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError('top_n must be non-negative')

    # Get the row position of the book that matches the title
    idx = indices[title]
    if isinstance(idx, pd.Series):
        # Several books share this title, so the lookup returned one entry per
        # match; use the first one rather than failing further down.
        idx = idx.iloc[0]

    # Assign weights to each feature
    weight_desc = 1
    weight_shelves = 1
    weight_author = .25

    # Combine the weighted pairwise similarity scores of the three feature sets
    feature_rows = zip(cosine_sim_desc[idx], cosine_sim_shelves[idx], cosine_sim_author[idx])
    sim_scores = []
    for book_idx, (desc, shelves, author) in enumerate(feature_rows):
        if book_idx == idx:
            # Exclude the queried book explicitly. Relying on it being first
            # after sorting is wrong when another book ties with it.
            continue
        combined = (desc * weight_desc + shelves * weight_shelves + author * weight_author) / 3
        sim_scores.append((book_idx, combined))

    # Sort the books by similarity, best first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the row positions of the top_n most similar books
    book_indices = [book_idx for book_idx, _ in sim_scores[:top_n]]

    return df['title'].iloc[book_indices]

In [116]:
# Read in book metadata and return it as a list of dictionaries (one per book)
def _shelves_to_soup(popular_shelves, normalizing_value=5):
    # Repeat each shelf name once per `normalizing_value` units of its count,
    # each occurrence prefixed by a space, and concatenate the result.
    return ''.join(
        (' ' + shelf) * (int(count) // normalizing_value)
        for shelf, count in popular_shelves.items()
    )


def get_books(metadata_directory):
    """Parse every file in ``metadata_directory`` into a list of book dicts.

    Each file is read with ``xml_to_dict.dict_from_xml_file``, which yields the
    raw book structure and a ``popular_shelves`` mapping (presumably shelf
    name -> count; verify against ``xml_to_dict``).

    Returns a list of dicts with the keys ``'id'``, ``'title'``, ``'author'``
    (name of the single or first listed author), ``'description'`` (cleaned
    with ``clean_string``) and ``'popular_shelves'`` (space-separated "soup" of
    repeated shelf names).
    """
    books = []
    # os.listdir returns entries in arbitrary order; sort so that the row
    # position of each book is reproducible between runs and machines.
    for file in sorted(os.listdir(metadata_directory)):
        filename = os.path.join(metadata_directory, os.fsdecode(file))
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)
        record = raw_book['book']

        # A single author is parsed as a dict, several authors as a list;
        # in the latter case keep only the first one.
        author = record['authors']['author']
        if isinstance(author, dict):
            author_name = author['name']
        else:
            author_name = author[0]['name']

        books.append({
            'id': record['id'],
            'title': record['title'],
            'author': author_name,
            'description': clean_string(record['description']),
            # Turn popular shelves into soup
            'popular_shelves': _shelves_to_soup(popular_shelves),
        })
    return books

In [141]:
# Directory that get_books() reads the book metadata files from.
# Set the BOOKS_XML_DIR environment variable to point somewhere else; when it
# is unset, the original relative path is used.
metadata_directory = os.environ.get('BOOKS_XML_DIR', '../../goodbooks-10k/books_xml/books_xml')
books = get_books(metadata_directory)
len(books)

10000

In [142]:
# Build a DataFrame with one row per book; the columns come from the keys of
# the book dictionaries returned by get_books().
df = pd.DataFrame(books)
# Preview the first rows
df.head()

Unnamed: 0,author,description,id,popular_shelves,title
0,J.K. Rowling,the war against voldemort is not going well: e...,1,fantasy fantasy fantasy fantasy fantasy fanta...,Harry Potter and the Half-Blood Prince (Harry ...
1,J.K. Rowling,"six years of magic, adventure, and mystery mak...",10,favorites favorites favorites favorites favor...,"Harry Potter Collection (Harry Potter, #1-6)"
2,Paul Auster,the discovery of a mysterious notebook turns a...,10006,fiction fiction fiction fiction fiction ficti...,Oracle Night
3,Eleanor H. Porter,when orphaned 11-year-old pollyanna comes to l...,1000751,classics classics classics classics classics ...,"Pollyanna (Pollyanna, #1)"
4,Jussi Adler-Olsen,indtil for blot 40 år siden blev utilpassede e...,10008056,krimi krimi krimi krimi krimi krimi krimi kri...,"Journal 64 (Afdeling Q, #4)"


In [None]:
# Replace missing descriptions with an empty string so they can be vectorized
df['description'] = df['description'].fillna('')

# Vectorize the features
tfidf = TfidfVectorizer(stop_words='english')
count = CountVectorizer(analyzer='word', stop_words='english')

# NOTE: each fit_transform call refits `tfidf`, so after this cell the
# vectorizer holds the vocabulary of the shelves, not of the descriptions.
tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
count_matrix_author = count.fit_transform(df['author'])

# Compute the cosine similarity matrix for the features we want.
# TfidfVectorizer L2-normalizes its rows by default, so the plain dot product
# (linear_kernel) already equals cosine similarity; the raw author counts are
# not normalized and therefore go through cosine_similarity.
cosine_sim_description = linear_kernel(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = linear_kernel(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

# Construct a reverse map from book title to row label.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# title index instead, keeping the first book for each title, so that
# indices[title] is always a single value.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [152]:
# Example query: look up the books most similar to a given title, using the
# description, shelves and author similarity matrices computed above.
title = 'East of Eden'
recs = get_recommendations(df, indices, title, cosine_sim_description, cosine_sim_shelves, cosine_sim_author)
print(recs)

9420                       Of Mice and Men
6486                   The Grapes of Wrath
6729          The Winter of Our Discontent
7051                             The Pearl
2507                         Tortilla Flat
6731                           Cannery Row
9372                          The Red Pony
2238                         Paradise Lost
1107                      The Moon Is Down
1979               Moby-Dick or, The Whale
6100                    The Sun Also Rises
5181    The Adventures of Huckleberry Finn
2613                            My Ántonia
3103                          Oliver Twist
6652                      The Great Gatsby
3448                  A Tale of Two Cities
3818               The Old Man and the Sea
8427                           Lorna Doone
6657                     Uncle Tom's Cabin
Name: title, dtype: object
