In [1]:
import os
import html
import re
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Custom libraries - must be in same directory
import xml_to_dict
import get_book_tags
import get_bookid_mapper

In [2]:
# Set this to where you save and load all data
# Keep the trailing slash: file names are appended to this prefix by plain
# string concatenation (e.g. data_path + 'books.csv').
data_path = '../../goodbooks-10k/'

In [3]:
def clean_string(s):
    """Return ``s`` with HTML entities decoded, tags removed and text lower-cased.

    Falsy input (``None`` or an empty string) is handed back unchanged, because
    books are frequently missing a feature.
    """
    if s:
        # Decode entities first so that escaped tags (&lt;b&gt;) are stripped too
        decoded = html.unescape(s)
        without_tags = re.sub(r'<[^>]+>', '', decoded)
        return without_tags.lower()

    # Nothing to clean
    return s

In [9]:
# Function that takes in book feature similarity matrices as input and outputs most similar book
def get_recommendations(df, indices, title, similarities, weights, n=20):
    """Return the titles of the ``n`` books most similar to ``title``.

    Parameters
    ----------
    df : DataFrame with a ``'title'`` column whose row order matches the rows
        of every similarity matrix.
    indices : Series mapping a title to its 1-based book id.
    title : title of the book to find neighbours for.
    similarities : sequence of square, dense similarity matrices (one per
        feature set); row ``i`` holds the similarity of book ``i`` to all books.
    weights : sequence of numbers; ``weights[j]`` scales ``similarities[j]``.
        Matrices beyond ``len(weights)`` are ignored.
    n : number of titles to return (default 20).

    Returns
    -------
    Series of titles ordered from most to least similar. The queried book
    itself is included (it is normally its own best match).

    Raises
    ------
    KeyError if ``title`` is not in ``indices``; IndexError if there are more
    weights than similarity matrices or ``similarities`` is empty.
    """
    # A title shared by several books makes the lookup return several ids;
    # use the first one rather than failing further down.
    book_id = np.atleast_1d(indices[title])[0]

    # Book ids start at 1 while matrix rows start at 0
    idx = int(book_id) - 1

    # Get the total number of books
    num_books = len(similarities[0])

    # Weighted sum of this book's similarity row from each feature set
    total_scores = np.zeros(num_books)
    for j, weight in enumerate(weights):
        total_scores = total_scores + np.asarray(similarities[j][idx], dtype=float) * weight

    # Highest score first; the stable sort keeps tied books in their original order
    top_positions = np.argsort(-total_scores, kind='stable')[:n]

    # Return the n most similar books
    return df['title'].iloc[top_positions]

In [10]:
def _build_soup(counts, normalizing_value):
    """Repeat each key ``int(count) // normalizing_value`` times, each copy preceded by a space."""
    parts = []
    for key, value in counts.items():
        repeats = int(value) // normalizing_value
        if repeats > 0:
            parts.append((' ' + key) * repeats)
    return ''.join(parts)


# Read in book metadata and store in a dictionary
def get_books(metadata_directory, goodreads_to_bookid, book_tags, normalizing_value=5):
    """Build one feature dict per file found in ``metadata_directory``.

    Parameters
    ----------
    metadata_directory : directory whose entries are each parsed with
        ``xml_to_dict.dict_from_xml_file``.
    goodreads_to_bookid : mapping indexed by Goodreads id, giving the book id.
    book_tags : mapping indexed by Goodreads id, giving a ``{tag: count}`` mapping.
    normalizing_value : a shelf/tag name is repeated ``count // normalizing_value``
        times in its soup (default 5).

    Returns
    -------
    list of dicts with keys ``id``, ``title``, ``author``, ``description``,
    ``popular_shelves`` and ``tags``.

    Raises
    ------
    KeyError if a book's Goodreads id is missing from either mapping.
    """
    books = []
    for file in os.listdir(metadata_directory):
        filename = os.path.join(metadata_directory, os.fsdecode(file))
        raw_book, popular_shelves = xml_to_dict.dict_from_xml_file(filename)
        raw = raw_book['book']

        goodreads_id = raw['id']

        # if multiple authors, only use first (main) author
        author = raw['authors']['author']
        if isinstance(author, dict):
            author_name = author['name']
        else:
            author_name = author[0]['name']

        books.append({
            'id': goodreads_to_bookid[goodreads_id],
            'title': raw['title'],
            'author': author_name,
            'description': clean_string(raw['description']),
            # Turn popular shelves into soup
            'popular_shelves': _build_soup(popular_shelves, normalizing_value),
            # Turn book tags into soup
            'tags': _build_soup(book_tags[goodreads_id], normalizing_value),
        })
    return books

In [11]:
# Read in mapper and book tags
# Both objects are indexed by Goodreads id when the books are parsed: the mapper
# yields the book id and book_tags yields a {tag: count} mapping for that book.
goodreads_to_bookid = get_bookid_mapper.get_mapper(data_path + 'books.csv')
book_tags = get_book_tags.get_tags(data_path + 'book_tags_with_bookid.csv', data_path + 'tags.csv')

In [12]:
# Get books as dictionary of all its features
# Every entry of the XML metadata directory is parsed into one feature dict.
metadata_directory = data_path + 'books_xml/books_xml'
books = get_books(metadata_directory, goodreads_to_bookid, book_tags)
# Sanity check: number of books that were parsed
len(books)

10000

In [None]:
# One row per book, ordered and indexed by integer book id
df = (
    pd.DataFrame(books)
    .astype({'id': int})
    .sort_values(by=['id'])
    .set_index('id')
)
df.head()

In [14]:
#Construct a reverse map of indices and book titles
# Maps title -> book id. Deduplicate on the title index (not on the id values,
# which are already unique) so that looking up a title always yields a single
# id; when several books share a title the first one is kept.
indices = pd.Series(df.index, index=df['title'])
indices = indices[~indices.index.duplicated(keep='first')]

#Replace NaN with an empty string
df['description'] = df['description'].fillna('')

In [None]:
# Count unique authors
# (this is also the number of columns of the one-hot author matrix built from get_dummies)
df['author'].nunique()

In [54]:
# Vectorize the features
# A single vectorizer instance is reused: each fit_transform call refits it, so
# after this cell `tfidf` only holds the vocabulary of the last column (tags).
tfidf = TfidfVectorizer(stop_words='english')

tfidf_matrix_description = tfidf.fit_transform(df['description'])
tfidf_matrix_shelves = tfidf.fit_transform(df['popular_shelves'])
tfidf_matrix_tags = tfidf.fit_transform(df['tags'])

# Use pandas and scipy to make sparse sim matrix for authors
# One-hot encode the author column; note that `.values` materializes the full
# dense indicator array before it is converted to CSR.
count_matrix_author = pd.get_dummies(df['author'])
count_matrix_author = scipy.sparse.csr_matrix(count_matrix_author.values)

# Compute the cosine similarity matrix for the features we want
# Each result is a dense (num_books x num_books) array, so this cell is memory heavy.
cosine_sim_description = cosine_similarity(tfidf_matrix_description, tfidf_matrix_description)
cosine_sim_shelves = cosine_similarity(tfidf_matrix_shelves, tfidf_matrix_shelves)
cosine_sim_tags = cosine_similarity(tfidf_matrix_tags, tfidf_matrix_tags)
cosine_sim_author = cosine_similarity(count_matrix_author, count_matrix_author)

In [None]:
# Get recs using separate sim matrices
title = 'The Fellowship of the Ring (The Lord of the Rings, #1)'

# weights[j] scales similarities[j]; the 0 in the last position switches the
# author similarity off for this run.
similarities = [cosine_sim_description, cosine_sim_shelves, cosine_sim_tags, cosine_sim_author]
weights = [1, 1, 1, 0]
recs = get_recommendations(df, indices, title, similarities, weights)
recs

In [56]:
# Append all feature matrices together
# Column-wise stack: each book's row is its description, shelf and tag TF-IDF
# vectors side by side (the author matrix is not included).
feature_matrix = scipy.sparse.hstack([tfidf_matrix_description, tfidf_matrix_shelves, tfidf_matrix_tags])
feature_matrix.shape

(10000, 82203)

In [57]:
# Reduce the dimensions of the combined feature matrix
# Truncated SVD down to 1000 components; random_state is fixed so reruns give the same result.
svd = TruncatedSVD(n_components=1000, n_iter=7, random_state=42)
feature_matrix_reduced = svd.fit_transform(feature_matrix)

In [None]:
# Compute the cosine similarity matrix for the combined and reduced features
# (pairwise between all rows of the reduced matrix, i.e. book against book)
cosine_sim_features = cosine_similarity(feature_matrix_reduced, feature_matrix_reduced)

In [None]:
# Get recs using the reduced combined feature matrix
# Same query `title` as the earlier recommendation cell; a single matrix with weight 1.
similarities_features = [cosine_sim_features]
weights_features = [1]
recs_features = get_recommendations(df, indices, title, similarities_features, weights_features)
recs_features

In [197]:
# Save reduced feature matrix
# Written as a .npy file under data_path; the shape is displayed as a sanity check.
filename = data_path + 'feature_matrix_reduced.npy'
np.save(filename, feature_matrix_reduced)
feature_matrix_reduced.shape

(10000, 1000)

In [96]:
# Load in item_matrix (concepts and features) and test recs
# NOTE(review): item_matrix.npy is not written by any cell visible here —
# presumably it is produced by another notebook; confirm it exists before running.
filename = data_path + 'item_matrix.npy'
item_matrix = np.load(filename)
item_matrix.shape

(10000, 2000)

In [97]:
# Compute the cosine similarity matrix for the item matrix
# (with a single argument, similarities are computed between all rows of item_matrix)
cosine_sim_item_matrix = cosine_similarity(item_matrix)

In [None]:
# Save similarity matrix using the item_matrix (joined matrices from ratings and features)
# Stored as a dense .npy file under data_path.
filename = data_path + 'similarity_matrix.npy'
np.save(filename, cosine_sim_item_matrix)

In [98]:
# Get recs using the item matrix (concepts and features)
# Same query `title` as the earlier recommendation cells; a single matrix with weight 1.
similarities_item_matrix = [cosine_sim_item_matrix]
weights_item_matrix = [1]
recs_item_matrix = get_recommendations(df, indices, title, similarities_item_matrix, weights_item_matrix)
recs_item_matrix

id
19      The Fellowship of the Ring (The Lord of the Ri...
155            The Two Towers (The Lord of the Rings, #2)
161     The Return of the King (The Lord of the Rings,...
7                                              The Hobbit
189     The Lord of the Rings (The Lord of the Rings, ...
1629              The Tombs of Atuan (Earthsea Cycle, #2)
8038    The Belgariad Boxed Set: Pawn of Prophecy / Qu...
5735    Chronicles of the Black Company (The Chronicle...
1812              Magician: Master (The Riftwar Saga, #2)
9343    The Wheel of Time: Boxed Set  (Wheel of Time, ...
5560        The Magic of Recluce (The Saga of Recluce #1)
4200    The Hundred Thousand Kingdoms (Inheritance Tri...
9668                The Broken Kingdoms (Inheritance, #2)
1591              The Farthest Shore (Earthsea Cycle, #3)
5429    The Curse of Chalion (World of the Five Gods, #1)
8614                    Dawn of Wonder (The Wakening, #1)
917                  The Blade Itself (The First Law, #1)
1731       

In [99]:
# Test just the matrix derived from ratings matrix
# Keeps only the first 1000 columns of item_matrix.
# NOTE(review): assumes those columns are the ratings-derived part — confirm
# against how item_matrix was assembled.
item_matrix_test = item_matrix[:,0:1000]
item_matrix_test.shape

(10000, 1000)

In [100]:
# Compute the cosine similarity matrix for the collab filtering matrix
# (pairwise between all rows of the column-sliced item matrix)
cosine_sim_test = cosine_similarity(item_matrix_test)

In [101]:
# Get recs for the same `title` using only the column-sliced item matrix
similarities_test = [cosine_sim_test]
weights_test = [1]
recs_test = get_recommendations(df, indices, title, similarities_test, weights_test)
recs_test

id
19     The Fellowship of the Ring (The Lord of the Ri...
7                                             The Hobbit
155           The Two Towers (The Lord of the Rings, #2)
161    The Return of the King (The Lord of the Rings,...
2      Harry Potter and the Sorcerer's Stone (Harry P...
37     The Lion, the Witch, and the Wardrobe (Chronic...
39        A Game of Thrones (A Song of Ice and Fire, #1)
54     The Hitchhiker's Guide to the Galaxy (Hitchhik...
13                                                  1984
14                                           Animal Farm
62           The Golden Compass (His Dark Materials, #1)
4                                  To Kill a Mockingbird
28                                     Lord of the Flies
53                    Eragon (The Inheritance Cycle, #1)
21     Harry Potter and the Order of the Phoenix (Har...
25     Harry Potter and the Deathly Hallows (Harry Po...
27     Harry Potter and the Half-Blood Prince (Harry ...
18     Harry Potter and the 