In [36]:
import os
import sys
import re
import html
import numpy as np
import pandas as pd
import scipy
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.decomposition import TruncatedSVD
from surprise import Reader, Dataset, SVD, evaluate, dump, accuracy
from collections import defaultdict

# Custom libraries
sys.path.append('../Util')
from loader import get_books, get_book_dataframe, get_book_features, get_mapper, get_tags
from joiner import get_ratings, get_joint, load_amazon, load_goodreads
from reduction import reduce_matrix, get_sparse
import xml_to_dict

In [37]:
# Relative path to the goodbooks-10k dataset root; get_books() reads
# books.csv and the books_xml/ metadata directory from underneath it.
data_path = '../../goodbooks-10k/'

In [38]:
def clean_string(s):
    """Return ``s`` with HTML entities decoded and HTML tags removed.

    Falsy input (``None`` or an empty string) is handed back unchanged,
    because books frequently lack the field being cleaned.
    """
    if s:
        # Decode entities first, then drop anything that looks like a tag.
        decoded = html.unescape(s)
        return re.sub(r'<[^>]+>', '', decoded)
    return s

In [39]:
def get_books(data_path):
    """Build one record per book from the per-book XML metadata files.

    NOTE: this definition shadows the ``get_books`` imported from ``loader``
    in the notebook's import cell.

    Args:
        data_path: Dataset root directory. ``books.csv`` and the
            ``books_xml/books_xml`` directory are read from underneath it.
            A trailing slash is accepted but not required.

    Returns:
        A list of dicts, one per file in the metadata directory (processed in
        sorted filename order), with the keys ``id``, ``title``,
        ``image_url``, ``url``, ``author`` and ``description``. ``id`` is the
        value the ``get_mapper`` result holds for the book's Goodreads id,
        ``author`` is the name of the first listed author only, and
        ``description`` has been passed through ``clean_string``.
    """
    metadata_directory = os.path.join(data_path, 'books_xml', 'books_xml')
    goodreads_to_bookid = get_mapper(os.path.join(data_path, 'books.csv'))
    books = []

    # Sorted so the order of the returned list does not depend on the
    # filesystem's arbitrary directory ordering.
    for entry in sorted(os.listdir(metadata_directory)):
        filename = os.path.join(metadata_directory, os.fsdecode(entry))
        # The second return value (popular shelves) is not needed here.
        raw_book, _ = xml_to_dict.dict_from_xml_file(filename)
        record = raw_book['book']

        # A single author is parsed as a dict, several authors as a sequence;
        # if multiple authors, only use first (main) author.
        authors = record['authors']['author']
        first_author = authors if isinstance(authors, dict) else authors[0]

        books.append({
            'id': goodreads_to_bookid[record['id']],
            'title': record['title'],
            'image_url': record['image_url'],
            'url': record['url'],
            'author': first_author['name'],
            # clean_string passes a missing (falsy) description through as-is.
            'description': clean_string(record['description']),
        })
    return books

In [40]:
# Parse the book XML files under data_path into a list of per-book dicts.
books = get_books(data_path)

In [41]:
# One row per book, indexed by integer book id in ascending order.
df = (
    pd.DataFrame(books)
    .astype({'id': int})
    .sort_values(by=['id'])
    .set_index('id')
)

# Books without a description get an empty string instead of a missing value.
df['description'] = df['description'].fillna('')

In [42]:
# Inspect the remaining columns ('id' is now the index, so it is not listed).
df.columns

Index(['author', 'description', 'image_url', 'title', 'url'], dtype='object')

In [43]:
# Persist the cleaned frame to disk. The target directory is created first so
# that a fresh checkout (where ../.tmp may not exist yet) does not fail.
os.makedirs('../.tmp', exist_ok=True)
df.to_pickle('../.tmp/books_dataframe_reduced')