In [2]:
import os
import html
import re
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Custom libraries - must be in same directory
import xml_to_dict
import get_book_tags
import get_bookid_mapper

In [195]:
# Root directory for every data file loaded or saved below. Keep the trailing
# slash: file names are appended to it with plain string concatenation.
data_path = '../../goodbooks-10k/'

In [3]:
def clean_string(s):
    """Normalise a free-text book field.

    HTML entities are decoded first, then anything that looks like an HTML
    tag is removed, and finally the text is lower-cased. Falsy input (None or
    an empty string, e.g. when a book is missing the feature) is returned
    unchanged.
    """
    if s:
        decoded = html.unescape(s)
        # Tags are stripped after decoding, so escaped tags are removed too
        untagged = re.sub(r'<[^>]+>', '', decoded)
        return untagged.lower()

    # Nothing to clean: hand the missing value back as-is
    return s

In [124]:
# Rank all books against one title using a weighted sum of per-feature similarity rows
def get_recommendations(df, indices, title, similarities, weights, n=20):
    """Return the titles of the ``n`` books most similar to ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Book table with a 'title' column. Rows are selected by position, so
        they must be in the same order as the rows of the similarity matrices.
    indices : mapping (dict or pandas.Series)
        Maps a title to its 1-based book id; row ``id - 1`` of every
        similarity matrix is used. If the title occurs more than once (a
        Series with a repeated index label), the first match is used.
    title : str
        Title of the query book.
    similarities : sequence of 2-D array-likes
        One books-by-books similarity matrix per feature set.
    weights : sequence of numbers
        ``weights[j]`` scales ``similarities[j]``. Matrices that have no
        corresponding weight are ignored.
    n : int or None, default 20
        How many titles to return; ``None`` returns every book.

    Returns
    -------
    pandas.Series
        Titles ordered from most to least similar. Books with equal scores
        keep their original relative order.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices`` (dict or Series lookup).
    IndexError
        If ``similarities`` is empty or there are more weights than matrices.
    """
    # indices[title] is a scalar for a unique title, but a Series when the
    # title is duplicated; normalise both cases to a single id.
    match = indices[title]
    book_id = np.atleast_1d(np.asarray(match))[0]

    # Book ids are 1-based while matrix rows are 0-based
    idx = int(book_id) - 1

    # Get the total number of books
    num_books = len(similarities[0])

    # Weighted sum of the query book's row from each similarity matrix
    scores = np.zeros(num_books)
    for j, weight in enumerate(weights):
        row = np.asarray(similarities[j][idx])[:num_books]
        scores = scores + row * weight

    # Highest score first; a stable sort keeps ties in their original order
    order = np.argsort(-scores, kind='stable')[:n]

    return df['title'].iloc[order]

In [8]:
# Build a "soup" string in which each label is repeated in proportion to its count
def _build_soup(counts, normalizing_value):
    """Return ' label' repeated ``int(count) // normalizing_value`` times for
    every (label, count) pair of ``counts``, concatenated in iteration order.
    """
    return ''.join(
        (' ' + label) * (int(count) // normalizing_value)
        for label, count in counts.items()
    )


# Read in book metadata and store each book as a dictionary of features
def get_books(metadata_directory, goodreads_to_bookid, book_tags):
    """Parse every file in ``metadata_directory`` into a feature dictionary.

    Parameters
    ----------
    metadata_directory : str
        Directory whose entries are each passed to
        ``xml_to_dict.dict_from_xml_file``.
    goodreads_to_bookid : mapping
        Indexed by the Goodreads id read from the file to get the book id.
    book_tags : mapping
        Indexed by the Goodreads id to get a tag -> count mapping.

    Returns
    -------
    list of dict
        One dict per file with the keys 'id', 'title', 'author',
        'description', 'popular_shelves' and 'tags', in directory listing
        order.
    """
    # A label is repeated once per this many counts when building the soups
    normalizing_value = 5

    books = []
    for file in os.listdir(metadata_directory):
        filename = os.path.join(metadata_directory, os.fsdecode(file))
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)

        raw = raw_book['book']
        goodreads_id = raw['id']
        book_id = goodreads_to_bookid[goodreads_id]

        # A single author is parsed as a dict, several authors as a sequence;
        # if multiple authors, only use first (main) author
        author = raw['authors']['author']
        if isinstance(author, dict):
            author_name = author['name']
        else:
            author_name = author[0]['name']

        books.append({
            'id': book_id,
            'title': raw['title'],
            'author': author_name,
            'description': clean_string(raw['description']),
            'popular_shelves': _build_soup(popular_shelves, normalizing_value),
            'tags': _build_soup(book_tags[goodreads_id], normalizing_value),
        })
    return books

In [9]:
# Read in mapper and book tags.
# get_books indexes goodreads_to_bookid by Goodreads id to obtain the book id,
# and indexes book_tags by Goodreads id to obtain a tag -> count mapping.
goodreads_to_bookid = get_bookid_mapper.get_mapper(data_path + 'books.csv')
book_tags = get_book_tags.get_tags(data_path + 'book_tags_with_bookid.csv', data_path + 'tags.csv')

In [10]:
# Get the books as a list of feature dictionaries, one per file in the
# metadata directory, and show how many were read
metadata_directory = data_path + 'books_xml/books_xml'
books = get_books(metadata_directory, goodreads_to_bookid, book_tags)
len(books)

10000

In [196]:
# Assemble the book records into a frame that is ordered by, and indexed on,
# the integer book id
df = (
    pd.DataFrame(books)
    .astype({'id': int})
    .sort_values(by=['id'])
    .set_index('id')
)
df.head()

Unnamed: 0_level_0,author,description,popular_shelves,tags,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Suzanne Collins,winning will make you famous. losing means cer...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...,"The Hunger Games (The Hunger Games, #1)"
2,J.K. Rowling,harry potter's life is miserable. his parents ...,fantasy fantasy fantasy fantasy fantasy fanta...,fantasy fantasy fantasy fantasy fantasy fanta...,Harry Potter and the Sorcerer's Stone (Harry P...
3,Stephenie Meyer,about three things i was absolutely positive.f...,young-adult young-adult young-adult young-adu...,young-adult young-adult young-adult young-adu...,"Twilight (Twilight, #1)"
4,Harper Lee,the unforgettable novel of a childhood in a sl...,classics classics classics classics classics ...,classics classics classics classics classics ...,To Kill a Mockingbird
5,F. Scott Fitzgerald,"the great gatsby, f. scott fitzgerald’s third ...",classics classics classics classics classics ...,classics classics classics classics classics ...,The Great Gatsby


In [52]:
# Construct a reverse map from book title to book id.
# Series.drop_duplicates() compares the values (the ids), not the titles, so on
# its own it leaves repeated titles in the index and indices[title] can then
# return several ids. Also dedupe on the index; the first occurrence is kept,
# which is the lowest id because df is sorted by id.
indices = pd.Series(df.index, index=df['title']).drop_duplicates()
indices = indices[~indices.index.duplicated(keep='first')]

# Replace NaN with an empty string so the text vectorizer receives only strings
df['description'] = df['description'].fillna('')

In [53]:
# Count unique authors (get_books keeps only the first listed author per book)
df['author'].nunique()

3888

In [54]:
# Vectorize the features.
# One TfidfVectorizer instance is refit for each column; only the returned
# matrices are kept, so after this cell `tfidf` holds the fit on 'tags'.
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
tfidf_matrix_tags = tfidf.fit_transform(df['tags'])

# Use pandas and scipy to make a sparse one-hot matrix for authors.
# NOTE(review): `.values` builds the dense dummy table before the CSR
# conversion, so peak memory is that of the dense table.
count_matrix_author = pd.get_dummies(df['author'])
count_matrix_author = scipy.sparse.csr_matrix(count_matrix_author.values)

# Compute the book-by-book cosine similarity matrix for each feature we want
cosine_sim_description = cosine_similarity(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = cosine_similarity(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_tags = cosine_similarity(tfidf_matrix_tags, tfidf_matrix_tags)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

In [187]:
# Get recs using the separate per-feature similarity matrices
title = 'Charlie and the Great Glass Elevator (Charlie Bucket, #2)'

# weights[j] scales similarities[j]; the final 0 switches author similarity off
similarities = [cosine_sim_description, cosine_sim_shelves, cosine_sim_tags, cosine_sim_author]
weights = [1, 1, 1, 0]
recs = get_recommendations(df, indices, title, similarities, weights)
recs

id
1938    Charlie and the Great Glass Elevator (Charlie ...
416                                           The Witches
1662                                    Fantastic Mr. Fox
335                             James and the Giant Peach
2123                         George's Marvellous Medicine
2741                      Danny the Champion of the World
373                                               The BFG
5311                                            Esio Trot
8192                     The Giraffe and the Pelly and Me
6097    The Magic Finger (Young Puffin Developing Reader)
1497                    The Borrowers (The Borrowers, #1)
1373    The Indian in the Cupboard (The Indian in the ...
158     Charlie and the Chocolate Factory (Charlie Buc...
8753    The Castle in the Attic (The Castle in the Att...
8856    Mrs. Piggle-Wiggle's Magic (Mrs. Piggle Wiggle...
9661    Hello, Mrs. Piggle-Wiggle (Mrs. Piggle Wiggle,...
1545       The House at Pooh Corner (Winnie-the-Pooh, #2)
59         

In [56]:
# Append the description, shelves and tags matrices together column-wise
# (the author matrix is not included)
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])
feature_matrix.shape

(10000, 82203)

In [57]:
# Reduce the combined feature matrix to 1000 components with truncated SVD;
# random_state is fixed so that reruns give the same result
svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
feature_matrix_reduced = svd.fit_transform(feature_matrix)

In [67]:
# Compute the book-by-book cosine similarity matrix for the combined and reduced features
cosine_sim_features = cosine_similarity(feature_matrix_reduced, feature_matrix_reduced)

In [173]:
# Get recs using the reduced combined feature matrix.
# NOTE(review): this cell reads the global `title`. Its saved output was
# produced at execution count 173, before `title` was assigned at count 187,
# so it presumably reflects a different query book - re-run to refresh.
similarities_features = [cosine_sim_features]
weights_features = [1]
recs_features = get_recommendations(df, indices, title, similarities_features, weights_features)
recs_features

id
9670                           Trump: The Art of the Deal
4803    The Everything Store: Jeff Bezos and the Age o...
8855               Screw It, Let's Do It: Lessons In Life
7628    Onward: How Starbucks Fought for Its Life with...
7547    Competitive Strategy: Techniques for Analyzing...
8179    In Search of Excellence: Lessons from America'...
3882    The $100 Startup: Reinvent the Way You Make a ...
3661    The Innovator's Dilemma: The Revolutionary Boo...
2220    Built to Last: Successful Habits of Visionary ...
2442                                              Winning
1926    Made to Stick: Why Some Ideas Survive and Othe...
6583            Shoe Dog: A Memoir by the Creator of NIKE
7693                                     How Google Works
2388    Delivering Happiness: A Path to Profits, Passi...
9529    EntreLeadership: 20 Years of Practical Busines...
5713    Crush It!: Why Now Is the Time to Cash In on Y...
7581         The Personal MBA: Master the Art of Business
2686    Blu

In [197]:
# Save the reduced feature matrix under data_path and show its shape
filename = data_path + 'feature_matrix_reduced.npy'
np.save(filename, feature_matrix_reduced)
feature_matrix_reduced.shape

(10000, 1000)

In [198]:
# Load in item_matrix (concepts and features) and test recs.
# NOTE(review): item_matrix.npy is not written by any cell visible here -
# presumably it is produced elsewhere; confirm it exists before running.
filename = data_path + 'item_matrix.npy'
item_matrix = np.load(filename)
item_matrix.shape

(10000, 1100)

In [190]:
# Compute the book-by-book cosine similarity matrix for the item matrix
cosine_sim_item_matrix = cosine_similarity(item_matrix)

In [180]:
# Save the similarity matrix computed from item_matrix (joined matrices from
# ratings and features) under data_path
filename = data_path + 'similarity_matrix.npy'
np.save(filename, cosine_sim_item_matrix)

In [191]:
# Get recs using the item matrix (concepts and features); a single matrix
# with weight 1 means its similarity row is used as the score directly
similarities_item_matrix = [cosine_sim_item_matrix]
weights_item_matrix = [1]
recs_item_matrix = get_recommendations(df, indices, title, similarities_item_matrix, weights_item_matrix)
recs_item_matrix

id
1938    Charlie and the Great Glass Elevator (Charlie ...
1662                                    Fantastic Mr. Fox
373                                               The BFG
335                             James and the Giant Peach
2123                         George's Marvellous Medicine
416                                           The Witches
2741                      Danny the Champion of the World
8192                     The Giraffe and the Pelly and Me
5311                                            Esio Trot
6097    The Magic Finger (Young Puffin Developing Reader)
6959                   Runaway Ralph (Ralph S. Mouse, #2)
1497                    The Borrowers (The Borrowers, #1)
2976                        Ramona the Brave (Ramona, #3)
1373    The Indian in the Cupboard (The Indian in the ...
8161                              Pippi in the South Seas
9974            Anastasia Krupnik (Anastasia Krupnik, #1)
1916    The Mouse and the Motorcycle (Ralph S. Mouse, #1)
3100       

In [199]:
# Test just the matrix derived from the ratings matrix.
# NOTE(review): assumes the first 100 columns of item_matrix are the
# ratings-derived part - confirm against whatever wrote item_matrix.npy.
item_matrix_test = item_matrix[:,0:100]
item_matrix_test.shape

(10000, 100)

In [200]:
# Compute the book-by-book cosine similarity matrix for the collab filtering matrix
cosine_sim_test = cosine_similarity(item_matrix_test)

In [201]:
# Get recs using only the similarity computed from the first 100 columns of item_matrix
similarities_test = [cosine_sim_test]
weights_test = [1]
recs_test = get_recommendations(df, indices, title, similarities_test, weights_test)
recs_test

id
1938    Charlie and the Great Glass Elevator (Charlie ...
2741                      Danny the Champion of the World
1258                                            The Twits
1662                                    Fantastic Mr. Fox
5311                                            Esio Trot
373                                               The BFG
8192                     The Giraffe and the Pelly and Me
416                                           The Witches
7266      The Wonderful Story of Henry Sugar and Six More
2620                              Boy: Tales of Childhood
6097    The Magic Finger (Young Puffin Developing Reader)
7103                                           Going Solo
2123                         George's Marvellous Medicine
335                             James and the Giant Peach
8857                                     Revolting Rhymes
184                                               Matilda
158     Charlie and the Chocolate Factory (Charlie Buc...
7045       