In [1]:
import os
import html
import re
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

# Custom libraries - must be in same directory
import xml_to_dict
import get_book_tags

In [2]:
def clean_string(s):
    """Strip HTML markup from a book text field and lower-case it.

    Falsy input (``None`` or an empty string) is returned unchanged, because
    books are frequently missing a feature.
    """
    if s:
        # Decode HTML entities first, then drop anything that looks like a tag
        unescaped = html.unescape(s)
        without_tags = re.sub(r'<[^>]+>', '', unescaped)
        return without_tags.lower()
    return s

In [32]:
# Function that takes in book feature similarity matrices as input and outputs the most similar books
def get_recommendations(df, indices, title, similarities, weights, n=20):
    """Return the titles of the ``n`` books most similar to ``title``.

    Parameters
    ----------
    df : DataFrame with a ``'title'`` column; row positions must line up with
        the rows/columns of every similarity matrix.
    indices : mapping (e.g. a Series indexed by title) from title to row position.
    title : title of the query book; must be a key of ``indices``.
    similarities : sequence of square similarity matrices, one per feature.
    weights : one weight per similarity matrix; each per-feature score is
        multiplied by its weight and the products are summed.
    n : maximum number of recommendations to return (default 20).

    Returns
    -------
    Series of titles, best match first, never containing the query book itself.

    Raises
    ------
    KeyError if ``title`` is not present in ``indices``.
    """
    # Get the index of the book that matches the title
    idx = indices[title]
    # A duplicated title makes the lookup return several positions; use the first
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Get the total number of books
    num_books = len(similarities[0])

    # Row `idx` of each matrix holds the pairwise similarity of every book with the query book
    rows = [similarity[idx] for similarity in similarities]

    # Weighted sum of the per-feature scores gives the overall similarity
    sim_scores = []
    for book_id in range(num_books):
        # Skip the query book by identity: relying on it sorting first breaks
        # when another book ties with it (e.g. identical tag soups)
        if book_id == idx:
            continue

        score = 0
        for j in range(len(weights)):
            score += rows[j][book_id] * weights[j]

        sim_scores.append((book_id, score))

    # Sort the books based on the highest similarity scores first (stable, so ties keep row order)
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Keep the n most similar books
    book_indices = [book_id for book_id, _ in sim_scores[:max(n, 0)]]

    return df['title'].iloc[book_indices]

In [4]:
def _make_soup(counts, normalizing_value):
    """Repeat each key ``int(count) // normalizing_value`` times, each prefixed by a space."""
    return ''.join(
        (' ' + key) * (int(value) // normalizing_value)
        for key, value in counts.items()
    )


# Read in book metadata and store it as a list of dictionaries (one per book)
def get_books(metadata_directory, book_tags_file, tags_file):
    """Load every book file in ``metadata_directory`` into a list of dicts.

    Parameters
    ----------
    metadata_directory : directory holding one metadata file per book, each
        readable by ``xml_to_dict.dict_from_xml_file``.
    book_tags_file, tags_file : paths passed through to ``get_book_tags.get_tags``.

    Returns
    -------
    list of dicts with the keys ``id``, ``title``, ``author``, ``description``,
    ``popular_shelves`` and ``tags``.

    Raises
    ------
    KeyError if a book file lacks an expected field or the book id is missing
    from the tag lookup.
    """
    book_tags = get_book_tags.get_tags(book_tags_file, tags_file)
    # Shelf/tag counts are integer-divided by this value before being repeated in the soup
    normalizing_value = 5

    books = []
    # os.listdir order is arbitrary; sort so row positions are reproducible across runs
    for file in sorted(os.listdir(metadata_directory)):
        filename = os.path.join(metadata_directory, file)
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)
        raw = raw_book['book']

        book = {}
        book['id'] = raw['id']
        book['title'] = raw['title']

        # A single author is parsed as a dict, several authors as a list;
        # if multiple authors, only use first (main) author
        author = raw['authors']['author']
        if isinstance(author, dict):
            book['author'] = author['name']
        else:
            book['author'] = author[0]['name']

        # clean_string passes a missing (None) description through unchanged
        book['description'] = clean_string(raw['description'])

        # Turn popular shelves and book tags into soups
        book['popular_shelves'] = _make_soup(popular_shelves, normalizing_value)
        book['tags'] = _make_soup(book_tags[book['id']], normalizing_value)

        books.append(book)
    return books

In [5]:
# Root of the goodbooks-10k checkout; set the GOODBOOKS_DIR environment variable to override
data_directory = os.environ.get('GOODBOOKS_DIR', '../../goodbooks-10k')
metadata_directory = os.path.join(data_directory, 'books_xml', 'books_xml')
book_tags_file = os.path.join(data_directory, 'book_tags_with_bookid.csv')
tags_file = os.path.join(data_directory, 'tags.csv')

# Parse every book file once; the count is displayed as a sanity check
books = get_books(metadata_directory, book_tags_file, tags_file)
len(books)

{'fantasy': '37174', 'young-adult': '12716', 'fiction': '9954', 'harry-potter': '7169', 'ya': '4364', 'series': '3857', 'magic': '3374', 'childrens': '2408', 'adventure': '1742', 'children': '1560', 'j-k-rowling': '1309', 'children-s': '1095', 'sci-fi-fantasy': '1093', 'childhood': '1027', 'classics': '958', 'reread': '858', 'audiobook': '836', '5-stars': '828', 'children-s-books': '798', 'favorite-books': '792', 'kids': '748', 'novels': '744', 'fantasy-sci-fi': '738', 'middle-grade': '718', 'audiobooks': '701', 'paranormal': '673', 'read-more-than-once': '659', 'my-library': '653', 'ya-fantasy': '612', 'teen': '600', 'witches': '552', 'english': '552', 'urban-fantasy': '547', 'british': '534', 'jk-rowling': '532', 'books': '522', 'read-in-2016': '505', 'supernatural': '484', 're-reads': '474', 'mystery': '465', 'ya-fiction': '464', 'harry-potter-series': '450', 'my-favorites': '449', 'own-it': '446', 'childrens-books': '445', 'library': '440', 'audio': '433', 'young-adult-fiction': '4

10000

In [6]:
# Build one row per book; the columns are the keys of each book dict
df = pd.DataFrame(books)
df.head()

Unnamed: 0,author,description,id,popular_shelves,tags,title
0,J.K. Rowling,the war against voldemort is not going well: e...,1,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,Harry Potter and the Half-Blood Prince (Harry ...
1,J.K. Rowling,"six years of magic, adventure, and mystery mak...",10,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,"Harry Potter Collection (Harry Potter, #1-6)"
2,Paul Auster,the discovery of a mysterious notebook turns a...,10006,fiction fiction fiction fiction fiction ficti...,fiction fiction fiction fiction fiction ficti...,Oracle Night
3,Eleanor H. Porter,when orphaned 11-year-old pollyanna comes to l...,1000751,classics classics classics classics classics ...,classics classics classics classics classics ...,"Pollyanna (Pollyanna, #1)"
4,Jussi Adler-Olsen,indtil for blot 40 år siden blev utilpassede e...,10008056,krimi krimi krimi krimi krimi krimi krimi kri...,krimi krimi krimi krimi krimi krimi krimi kri...,"Journal 64 (Afdeling Q, #4)"


In [7]:
# Count unique authors (only the first listed author of each book is stored)
df['author'].nunique()

3888

In [8]:
# Replace NaN with an empty string so the vectorizer accepts every row
df['description'] = df['description'].fillna('')

# Vectorize the features. The same TfidfVectorizer is re-fitted for each
# column, so every matrix gets its own vocabulary.
tfidf = TfidfVectorizer(stop_words='english')
count = CountVectorizer(analyzer='word', stop_words='english')

tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
tfidf_matrix_tags = tfidf.fit_transform(df['tags'])
count_matrix_author = count.fit_transform(df['author'])

# Compute the cosine similarity matrix for the features we want.
# TfidfVectorizer L2-normalises rows by default, so the plain dot product
# (linear_kernel) already is the cosine similarity; raw counts need cosine_similarity.
cosine_sim_description = linear_kernel(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = linear_kernel(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_tags = linear_kernel(tfidf_matrix_tags, tfidf_matrix_tags)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

# Construct a reverse map of book titles to row positions.
# Series.drop_duplicates() would compare the values (row positions, all unique)
# and remove nothing, so de-duplicate on the title labels instead and keep the
# first row of each title; a title lookup then always yields a single position.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [40]:
# Pair each similarity matrix with its weight, then split the pairs into the
# two parallel lists that get_recommendations expects
similarities, weights = map(list, zip(*[
    (cosine_sim_description, 0),
    (cosine_sim_shelves, 1),
    (cosine_sim_tags, 1),
    (cosine_sim_author, 0),
]))

In [41]:
# The title is used as an exact lookup key into `indices`, so it must match
# df['title'] character for character (note the double space after "of")
title = 'The Innovators: How a Group of  Hackers, Geniuses and Geeks Created the Digital Revolution'
recs = get_recommendations(df, indices, title, similarities, weights)
print(recs)

4007                      Elon Musk: Inventing the Future
9541                    Frida: A Biography of Frida Kahlo
5853                                                 Cash
1108    Elizabeth the Queen: The Life of a Modern Monarch
386                       Einstein: His Life and Universe
6868                Gonzo: The Life of Hunter S. Thompson
1730                                     A Beautiful Mind
2907    Nikola Tesla: Imagination and the Man That Inv...
8668                     Born Standing Up: A Comic's Life
9308                       The Fry Chronicles (Memoir #2)
4328                Rosemary: The Hidden Kennedy Daughter
7010               The Autobiography of Benjamin Franklin
7009    The Wolf of Wall Street (The Wolf of Wall Stre...
385                   Benjamin Franklin: An American Life
558                The Story of My Experiments With Truth
7980                       Moab Is My Washpot (Memoir #1)
278                                    Van Gogh: The Life
3166          