In [73]:
import os
import html
import re
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Custom libraries - must be in same directory
import xml_to_dict
import get_book_tags

In [2]:
def clean_string(s):
    """Strip HTML markup from a text field and lower-case the result.

    Falsy input (``None`` or an empty string) is handed back unchanged,
    because a book is often missing one feature or another.
    """
    if s:
        # Decode entities before removing tags, so that escaped markup such
        # as ``&lt;b&gt;`` is turned into a real tag and stripped as well.
        decoded = html.unescape(s)
        return re.sub(r'<[^>]+>', '', decoded).lower()
    return s

In [3]:
# Function that takes in book feature similarity matrices as input and outputs the most similar books
def get_recommendations(df, indices, title, similarities, weights, top_n=20):
    """Return the titles of the books most similar to ``title``.

    The score of every book is the weighted sum, over the feature sets, of
    its similarity to the query book. Books are ranked by that score
    (highest first; ties keep ascending row order) and the query book
    itself is removed from the ranking.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with a ``title`` column; rows are addressed by position.
    indices : mapping or pandas.Series
        Maps a title to the row position of that book. If the lookup yields
        several positions (duplicated title), the first one is used.
    title : str
        Title of the query book; must be a key of ``indices``.
    similarities : sequence
        One pairwise similarity matrix per feature set; ``similarities[k][i]``
        must be the row of scores of book ``i`` against every book.
    weights : sequence of numbers
        ``weights[k]`` scales ``similarities[k]``. Matrices beyond
        ``len(weights)`` are ignored.
    top_n : int, default 20
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` best-scoring books, best first.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If more weights than similarity matrices are supplied.
    """
    if len(weights) > len(similarities):
        raise ValueError('got more weights than similarity matrices')

    # Get the index of the book that matches the title
    idx = indices[title]
    if np.ndim(idx) > 0:
        # A duplicated title makes the lookup return several positions
        idx = np.asarray(idx).ravel()[0]
    idx = int(idx)

    # Get the total number of books
    num_books = len(similarities[0])

    # Weighted sum of the pairwise similarity rows of the query book
    scores = np.zeros(num_books, dtype=float)
    for similarity, weight in zip(similarities, weights):
        scores += np.asarray(similarity[idx], dtype=float) * weight

    # Sort the books with the highest score first. A stable sort keeps tied
    # books in ascending row order.
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query book explicitly rather than assuming it is ranked first:
    # its self-similarity is not guaranteed to be the strict maximum (for
    # example when one of its feature vectors is all zeros).
    ranked = ranked[ranked != idx][:max(top_n, 0)]

    # Return the titles of the top_n most similar books
    return df['title'].iloc[ranked]

In [4]:
# Turn a {label: count} mapping into a "soup" of repeated labels
def _build_soup(counts, normalizing_value):
    """Repeat every key ``int(count) // normalizing_value`` times.

    Each repetition is prefixed with a single space, so a non-empty result
    starts with a space. Keys whose scaled count is zero or negative
    contribute nothing.
    """
    return ''.join(
        (' ' + key) * (int(value) // normalizing_value)
        for key, value in counts.items()
    )


# Read in book metadata and store it in a list of dictionaries
def get_books(metadata_directory, book_tags_file, tags_file, normalizing_value=5):
    """Load one record per file found in ``metadata_directory``.

    Parameters
    ----------
    metadata_directory : str
        Directory whose entries are each passed to
        ``xml_to_dict.dict_from_xml_file``.
    book_tags_file, tags_file : str
        Forwarded unchanged to ``get_book_tags.get_tags``.
    normalizing_value : int, default 5
        Divisor applied (integer division) to every shelf/tag count before
        the label is repeated in the soup.

    Returns
    -------
    list of dict
        One dict per file with the keys ``id``, ``title``, ``author``,
        ``description``, ``popular_shelves`` and ``tags``. Files are
        processed in sorted name order so that row positions are the same
        on every run and platform.

    Raises
    ------
    KeyError
        If a book id has no entry in the mapping returned by
        ``get_book_tags.get_tags``, or an expected key is missing from the
        parsed metadata.
    """
    book_tags = get_book_tags.get_tags(book_tags_file, tags_file)
    books = []

    # os.listdir returns entries in arbitrary order. Sorting keeps the row
    # order reproducible, which matters because the matrices built from
    # these rows are addressed by position.
    for file in sorted(os.listdir(metadata_directory)):
        filename = os.path.join(metadata_directory, os.fsdecode(file))
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)

        book = {}
        book['id'] = raw_book['book']['id']
        book['title'] = raw_book['book']['title']

        # A single author is a dict, several authors are a sequence of dicts;
        # if multiple authors, only use first (main) author
        author = raw_book['book']['authors']['author']
        if isinstance(author, dict):
            book['author'] = author['name']
        else:
            book['author'] = author[0]['name']

        book['description'] = clean_string(raw_book['book']['description'])

        # Turn popular shelves into soup
        book['popular_shelves'] = _build_soup(popular_shelves, normalizing_value)

        # Turn book tags into soup
        book['tags'] = _build_soup(book_tags[book['id']], normalizing_value)

        books.append(book)
    return books

In [5]:
# Root of the goodbooks-10k data. Set the GOODBOOKS_DIR environment variable
# to point somewhere else; the default is the original relative location.
data_root = os.environ.get('GOODBOOKS_DIR', '../../goodbooks-10k')
metadata_directory = os.path.join(data_root, 'books_xml', 'books_xml')
book_tags_file = os.path.join(data_root, 'book_tags_with_bookid.csv')
tags_file = os.path.join(data_root, 'tags.csv')
books = get_books(metadata_directory, book_tags_file, tags_file)
# Number of book records that were loaded
len(books)

10000

In [6]:
# Build one row per book; the columns come from the keys of each book dict
df = pd.DataFrame(books)
# Preview the first rows as a sanity check
df.head()

Unnamed: 0,author,description,id,popular_shelves,tags,title
0,J.K. Rowling,the war against voldemort is not going well: e...,1,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,Harry Potter and the Half-Blood Prince (Harry ...
1,J.K. Rowling,"six years of magic, adventure, and mystery mak...",10,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,"Harry Potter Collection (Harry Potter, #1-6)"
2,Paul Auster,the discovery of a mysterious notebook turns a...,10006,fiction fiction fiction fiction fiction ficti...,fiction fiction fiction fiction fiction ficti...,Oracle Night
3,Eleanor H. Porter,when orphaned 11-year-old pollyanna comes to l...,1000751,classics classics classics classics classics ...,classics classics classics classics classics ...,"Pollyanna (Pollyanna, #1)"
4,Jussi Adler-Olsen,indtil for blot 40 år siden blev utilpassede e...,10008056,krimi krimi krimi krimi krimi krimi krimi kri...,krimi krimi krimi krimi krimi krimi krimi kri...,"Journal 64 (Afdeling Q, #4)"


In [15]:
# Count unique authors (get_books stores only the first listed author of each book)
df['author'].nunique()

3888

In [32]:
# Replace missing descriptions with an empty string so the vectorizer only sees text
df['description'] = df['description'].fillna('')

# Vectorize the features.
# The same vectorizer object is refit for every column, so once this cell has
# run `tfidf` only holds the vocabulary of the last fit (the tags).
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
tfidf_matrix_tags = tfidf.fit_transform(df['tags'])

# Use pandas and scipy to make a sparse one-hot matrix for authors
count_matrix_author = pd.get_dummies(df['author'])
count_matrix_author = scipy.sparse.csr_matrix(count_matrix_author.values)

# Compute the cosine similarity matrix for the features we want.
# TfidfVectorizer L2-normalises its rows by default, so the plain dot product
# (linear_kernel) already is the cosine similarity for the TF-IDF matrices.
cosine_sim_description = linear_kernel(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = linear_kernel(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_tags = linear_kernel(tfidf_matrix_tags, tfidf_matrix_tags)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

# Construct a reverse map from book title to row position.
# Series.drop_duplicates() compares the values (the row positions, which are
# always unique) and therefore never removed repeated titles. Deduplicate on
# the index instead, so that every title maps to exactly one row (the first).
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [40]:
# Construct list of the similarity matrices and their corresponding weights.
# The two lists are matched by position: weights[k] scales similarities[k]
# when get_recommendations sums the per-feature scores.
similarities = [cosine_sim_description, cosine_sim_shelves, cosine_sim_tags, cosine_sim_author]
weights = [.5, 1, 1, .25]

In [41]:
# Get most similar books to a certain book given title.
# The title is looked up as indices[title], so it has to match the entry in
# df['title'] exactly -- note the double space after "of" in this one.
title = 'The Innovators: How a Group of  Hackers, Geniuses and Geeks Created the Digital Revolution'
recs = get_recommendations(df, indices, title, similarities, weights)
print(recs)

8788    In the Plex: How Google Thinks, Works, and Sha...
7631    The Accidental Billionaires: The Founding of F...
2513    The Google Story: Inside the Hottest Business,...
8928    Where Good Ideas Come From: The Natural Histor...
3385    The World Is Flat: A Brief History of the Twen...
9350        The Information: A History, a Theory, a Flood
4007                      Elon Musk: Inventing the Future
4759    The Tipping Point: How Little Things Can Make ...
8478             At Home: A Short History of Private Life
8059                                 The Wisdom of Crowds
7319                                      The Right Stuff
4733    Hidden Figures: The American Dream and the Unt...
4058    The Residence: Inside the Private World of the...
3002    Creativity, Inc.: Overcoming the Unseen Forces...
3164    Guns, Germs, and Steel: The Fates of Human Soc...
2299           Lean In: Women, Work, and the Will to Lead
90      Ghost in the Wires: My Adventures as the World...
2680          

In [94]:
# Append all feature matrices together column-wise (description, shelves, tags).
# The author matrix is not part of this combined matrix.
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])
# Show (rows, columns) of the combined matrix
feature_matrix.shape

(10000, 82203)

In [95]:
# Reduce the dimensions of the combined feature matrix to 1000 components.
# random_state is pinned, presumably so that reruns give the same projection.
svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
feature_matrix_reduced = svd.fit_transform(feature_matrix)

In [96]:
# Save reduced feature matrix as a .npy file (path is relative to the working directory)
filename = 'feature_matrix_reduced.npy'
np.save(filename, feature_matrix_reduced)
# Show the shape of the array that was written
feature_matrix_reduced.shape

(10000, 1000)

In [97]:
# Verify saving and loading works: read the .npy file back and check its shape.
# Note that this rebinds feature_matrix_reduced to the array read from disk.
feature_matrix_reduced = np.load(filename)
feature_matrix_reduced.shape

(10000, 1000)