In [1]:
import pandas as pd
import re 
from tqdm import tqdm
import ast

DATA_PATH = 'data/'

# Preprocess the two book dataframes 
- Load dataframes ('https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home', 'https://github.com/malcolmosh/goodbooks-10k-extended/blob/master/README.md')
- Remove faulty elements of dataframe without the descriptions 
- Remove "additions" to titles in the description dataframes - Example: title (series #1) -> title
- Make a new dataframe that contains "Book ID" (from the dataframe without descriptions), "Book Title" (-||-), "genre" (description dataframe), "description" (description dataframe)
- Store this dataframe

In [2]:
# Load dataframes
descriptions_df = pd.read_csv(DATA_PATH + "books_descriptions.csv")
book_ID_df = pd.read_json(DATA_PATH + 'goodreads_book_works.json', lines=True)

# Remove columns that are not needed in book_ID_df 
book_ID_df = book_ID_df.filter(items=["best_book_id", "original_title", "reviews_count"])


In [3]:
# List amount of books are in each dataframe 
print(f"Amount of books in descriptions_df: {len(descriptions_df)}")
print(f"Amount of books in book_ID_df: {len(book_ID_df)}")

Amount of books in descriptions_df: 10000
Amount of books in book_ID_df: 1521962


In [4]:
# Remove faulty elements from the book_ID_df
book_ID_df = book_ID_df[book_ID_df['original_title'] != '']

print(f"Amount of books in book_ID_df: {len(book_ID_df)}")

Amount of books in book_ID_df: 646906


In [5]:
# Remove "additions" to titles in the descriptions_df
for i, row in tqdm(descriptions_df.iterrows()):
    original_title = row["title"]
    new_title = re.sub(r'\((.*)', '', original_title)
    descriptions_df.at[i, "title"] = new_title.strip()

# Check if it worked -> it did
# descriptions_df.head()

10000it [00:00, 19941.89it/s]


In [6]:
# Lower case all titles to not have confusion in this manner 
descriptions_df['title'] = descriptions_df['title'].str.lower()
book_ID_df['original_title'] = book_ID_df['original_title'].str.lower()

In [7]:
# Find corresponding indexes to merge the dataframes
titles_not_found = []
book_df = pd.DataFrame(columns=['book_id', 'title', 'description', 'genres'])

for i, row in tqdm(descriptions_df.iterrows()):
    title = row['title']
    # Check if the title is in book_ID_df, else append it to the titles_not_found list
    if title in book_ID_df['original_title'].values:
        # Get all rows that have the a matching title as the current row
        temp_df = book_ID_df[book_ID_df['original_title'] == title]
        
        # Get the book id of the book  with the highest amount of reviews
        book_id = temp_df['best_book_id'][temp_df['reviews_count'].idxmax()]
        descriptions = row['description']
        genres = row['genres']
        book_df = book_df.append({'book_id': book_id, 'title': title, 'description': descriptions, 'genres': genres}, ignore_index=True)
    else: 
        titles_not_found.append(title)

# print the amount of elements that are not found
print(f"Out of the 10000 titles {len(titles_not_found)} are not found in the book_ID_df")

10000it [06:38, 25.08it/s]

Out of the 10000 titles 2082 are not found in the book_ID_df





In [8]:
# Save book_df to a csv 
book_df.to_csv(DATA_PATH + 'book_df.csv', index=False)

In [9]:
# TODO - do this better somehow? 
# Ideas, remove all special characters (but then also " ") 
# Convert all to tokens, then make the inner product of the tokens, normalize and take the highest value. 


# # Find corresponding indexes to merge the dataframes 
# matching_idxs = []

# count = 0
# for i, title in enumerate(descriptions_df['title']):
#     title_found_count = 0
#     if title in book_ID_df['original_title'].values:
#         pass
#         # print(f"Found match: '{title}', at index {i}")
        
#         # #Save pair of matching indexes
#         # matching_idxs.append((i, j))
#     else:
        
#         title_found = False
#         first_match = False
#         while True:
#             #Check all original titles for a match
#             for original_title in book_ID_df['original_title']:
#                 if title == original_title[:len(title)]:
#                     title_found = True
#                     if not first_match:
#                         shortest_title = original_title
#                     else:
#                         if len(original_title) < len(shortest_title): 
#                             shortest_title = original_title
#             if title_found:
#                 #print(f"Found match: '{title}' in '{shortest_title}'")
#                 break
#             title = title[:-1]
#             if not len(title):
#                 break


# Preprocess the shelves 
- Use "book_id_map.csv" to find the books we use (ids) and store "new_ids" (the ids we can use to find the relevant shelfs)
- Drop all rows in "goodreads_interactions.csv" that have different ids than "new_ids". 
- Store this dataset.  

In [10]:
# Load dataframes
book_id_map_df = pd.read_csv(DATA_PATH + "book_id_map.csv")
book_df = pd.read_csv(DATA_PATH + "book_df.csv")

In [11]:
# Create map from book_id to book_id_csv
book_id_map = {book_id_map_df['book_id'][i]: book_id_map_df['book_id_csv'][i] for i in range(len(book_id_map_df))}

# Change the book ID in our dataset to match the shelf dataset
remove_list = [] # remove about 15 books that are for inexplicable reasons not in the shelf dataset
for i in range(len(book_df)):
    try:
        book_df["book_id"][i] = book_id_map[book_df["book_id"][i]]
    except:
        remove_list.append(i)
print(f"Out of {i} books {len(remove_list)} are not in the shelf dataset and hence removed")
book_df.drop(remove_list, inplace=True)

# Save the book_df dataframe with the index change 
book_df.to_csv(DATA_PATH + 'book_matching_ids_df.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df["book_id"][i] = book_id_map[book_df["book_id"][i]]


Out of 7917 books 15 are not in the shelf dataset and hence removed


In [12]:
# Load the shelves dataframe (this takes some time and memory)
shelves_df = pd.read_csv(DATA_PATH + "goodreads_interactions.csv")

# Check how many books are on the shelves of ALL users combined 
len(shelves_df)

FileNotFoundError: [Errno 2] No such file or directory: 'data/goodreads_interactions.csv'

In [None]:
# Remove books on the shelves that are not in the book_df
shelves_df = shelves_df[shelves_df['book_id'].isin(book_df['book_id'].tolist())]

# Save the new shelves_df 
shelves_df.to_csv(DATA_PATH + 'shelves_df.csv', index=False)

# Check how many books are on the shelves of ALL users combined - after removal of books not in book_df
print(f"We have {len(shelves_df)} shelves in total, and in these there are {len(set(shelves_df['book_id'].tolist()))} unique books.")

NameError: name 'shelves_df' is not defined

# TF-IDF embeddings 
- Create TF-IDF embeddings 
- Create them for genres aswell 

In [58]:
a_set = set() 
for i in descriptions_df["genres"].to_list():
    a_set = a_set.union(set(ast.literal_eval(i)))

print(len(a_set))

39


In [59]:
a_set

{'art',
 'biography',
 'books',
 'business',
 'chick-lit',
 'christian',
 'classics',
 'comics',
 'contemporary',
 'cookbooks',
 'crime',
 'fantasy',
 'fiction',
 'gay-and-lesbian',
 'graphic-novels',
 'historical-fiction',
 'history',
 'horror',
 'humor-and-comedy',
 'manga',
 'memoir',
 'music',
 'mystery',
 'nonfiction',
 'paranormal',
 'philosophy',
 'poetry',
 'psychology',
 'religion',
 'romance',
 'science',
 'science-fiction',
 'self-help',
 'spirituality',
 'sports',
 'suspense',
 'thriller',
 'travel',
 'young-adult'}