In [1]:
import numpy as np
import pandas as pd
import re 
from tqdm import tqdm
import ast

DATA_PATH = 'data/'

# Preprocess the two book dataframes 
- Load dataframes ('https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home', 'https://github.com/malcolmosh/goodbooks-10k-extended/blob/master/README.md')
- Remove faulty elements of dataframe without the descriptions 
- Remove "additions" to titles in the description dataframes - Example: title (series #1) -> title
- Make a new dataframe that contains "Book ID" (from the dataframe without descriptions), "Book Title" (-||-), "genre" (description dataframe), "description" (description dataframe)
- Store this dataframe

In [2]:
# Load dataframes
descriptions_df = pd.read_csv(DATA_PATH + "books_descriptions.csv")
book_ID_df = pd.read_json(DATA_PATH + 'goodreads_book_works.json', lines=True)

# Remove columns that are not needed in book_ID_df 
book_ID_df = book_ID_df.filter(items=["best_book_id", "original_title", "reviews_count"])


In [3]:
# List amount of books are in each dataframe 
print(f"Amount of books in descriptions_df: {len(descriptions_df)}")
print(f"Amount of books in book_ID_df: {len(book_ID_df)}")

Amount of books in descriptions_df: 10000
Amount of books in book_ID_df: 1521962


In [4]:
# Remove faulty elements from the book_ID_df
book_ID_df = book_ID_df[book_ID_df['original_title'] != '']

print(f"Amount of books in book_ID_df: {len(book_ID_df)}")

Amount of books in book_ID_df: 646906


In [5]:
# Remove faulty elements from the description dataframe
descriptions_df = descriptions_df[descriptions_df['description'].apply(lambda x: isinstance(x, str))]

In [6]:
# Remove "additions" to titles in the descriptions_df
for i, row in tqdm(descriptions_df.iterrows()):
    original_title = row["title"]
    new_title = re.sub(r'\((.*)', '', original_title)
    descriptions_df.at[i, "title"] = new_title.strip()

# Check if it worked -> it did
# descriptions_df.head()

9943it [00:00, 20424.75it/s]


In [7]:
# Lower case all titles to not have confusion in this manner 
descriptions_df['title'] = descriptions_df['title'].str.lower()
book_ID_df['original_title'] = book_ID_df['original_title'].str.lower()

In [8]:
# Find corresponding indexes to merge the dataframes
titles_not_found = []
book_df = pd.DataFrame(columns=['book_id', 'title', 'description', 'genres'])

for i, row in tqdm(descriptions_df.iterrows()):
    title = row['title']
    # Check if the title is in book_ID_df, else append it to the titles_not_found list
    if title in book_ID_df['original_title'].values:
        # Get all rows that have the a matching title as the current row
        temp_df = book_ID_df[book_ID_df['original_title'] == title]
        
        # Get the book id of the book  with the highest amount of reviews
        book_id = temp_df['best_book_id'][temp_df['reviews_count'].idxmax()]
        descriptions = row['description']
        genres = row['genres']
        book_df = book_df.append({'book_id': book_id, 'title': title, 'description': descriptions, 'genres': genres}, ignore_index=True)
    else: 
        titles_not_found.append(title)

# print the amount of elements that are not found
print(f"Out of the 10000 titles {len(titles_not_found)} are not found in the book_ID_df")

9943it [06:18, 26.25it/s]

Out of the 10000 titles 2061 are not found in the book_ID_df





In [9]:
# Somehow the same book appears multiple times, hence we drop the duplicates
book_df.drop_duplicates(subset=['title'], inplace=True)

In [10]:
# Save book_df to a csv 
book_df.to_csv(DATA_PATH + 'book_df.csv', index=False)

In [11]:
# TODO - do this better somehow? 
# Ideas, remove all special characters (but then also " ") 
# Convert all to tokens, then make the inner product of the tokens, normalize and take the highest value. 


# # Find corresponding indexes to merge the dataframes 
# matching_idxs = []

# count = 0
# for i, title in enumerate(descriptions_df['title']):
#     title_found_count = 0
#     if title in book_ID_df['original_title'].values:
#         pass
#         # print(f"Found match: '{title}', at index {i}")
        
#         # #Save pair of matching indexes
#         # matching_idxs.append((i, j))
#     else:
        
#         title_found = False
#         first_match = False
#         while True:
#             #Check all original titles for a match
#             for original_title in book_ID_df['original_title']:
#                 if title == original_title[:len(title)]:
#                     title_found = True
#                     if not first_match:
#                         shortest_title = original_title
#                     else:
#                         if len(original_title) < len(shortest_title): 
#                             shortest_title = original_title
#             if title_found:
#                 #print(f"Found match: '{title}' in '{shortest_title}'")
#                 break
#             title = title[:-1]
#             if not len(title):
#                 break


# Preprocess the shelves 
- Use "book_id_map.csv" to find the books we use (ids) and store "new_ids" (the ids we can use to find the relevant shelfs)
- Drop all rows in "goodreads_interactions.csv" that have different ids than "new_ids". 
- Store this dataset.  

In [12]:
# Load dataframes
book_id_map_df = pd.read_csv(DATA_PATH + "book_id_map.csv")
book_df = pd.read_csv(DATA_PATH + "book_df.csv")

In [13]:
# Create map from book_id to book_id_csv
book_id_map = {book_id_map_df['book_id'][i]: book_id_map_df['book_id_csv'][i] for i in range(len(book_id_map_df))}

# Change the book ID in our dataset to match the shelf dataset
remove_list = [] # remove about 15 books that are for inexplicable reasons not in the shelf dataset
for i in range(len(book_df)):
    try:
        book_df["book_id"][i] = book_id_map[book_df["book_id"][i]]
    except:
        remove_list.append(i)
print(f"Out of {i} books {len(remove_list)} are not in the shelf dataset and hence removed")
book_df.drop(remove_list, inplace=True)

# Save the book_df dataframe with the index change 
book_df.to_csv(DATA_PATH + 'book_matching_ids_df.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df["book_id"][i] = book_id_map[book_df["book_id"][i]]


Out of 7690 books 15 are not in the shelf dataset and hence removed


In [14]:
my_list = book_df["book_id"].to_list()

print(len(my_list))
print(len(set(my_list)))

7676
7676


In [15]:
# Load the shelves dataframe (this takes some time and memory)
shelves_df = pd.read_csv(DATA_PATH + "goodreads_interactions.csv")

# Check how many books are on the shelves of ALL users combined 
len(shelves_df)

228648342

In [16]:
# Remove books on the shelves that are not in the book_df
shelves_df = shelves_df[shelves_df['book_id'].isin(book_df['book_id'].tolist())]

# Save the new shelves_df 
shelves_df.to_csv(DATA_PATH + 'shelves_df.csv', index=False)

# Check how many books are on the shelves of ALL users combined - after removal of books not in book_df
print(f"We have {len(shelves_df)} shelves in total, and in these there are {len(set(shelves_df['book_id'].tolist()))} unique books.")

We have 68176467 shelves in total, and in these there are 7676 unique books.


# TF-IDF embeddings 
- Create TF-IDF embeddings 
- Create them for genres aswell 

In [17]:
# imports for the text analysis
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import ast

# Import the book description dataframe 
book_df = pd.read_csv(DATA_PATH + "book_matching_ids_df.csv") # TODO help me not make a "Unnamed: 0" column... I want to use the book_id as index, but then it creates this column

# Logorithmic scale chosen for IDF 
BASE = 2

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\david\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
# Create tokens from descriptions 
# Function to clean strings (from week 7)
def clean_strings(strings):
    """ Cleans a list of strings by removing URLs, numbers, punctuation and stop words
    
    Args:
    - strings: a list of strings
    
    returns:
    - cleaned_strings: a list of cleaned strings
    """
    cleaned_strings = []
    stop_words = set(stopwords.words('english'))

    for string in strings:
        # Remove URLs
        string = re.sub(r'http\S+', '', string)

        # Remove numbers
        string = re.sub(r'[0-9]', '', string)

        # Keep only what is not punctuation
        string = re.sub(r'[^\w\s]', '', string)

        # Lowercase
        string = string.lower()
        
        # Remove empty strings and remove stop words and moke them
        if len(string) and string not in stop_words and type(string) == str:
            cleaned_strings.append(string)

    return cleaned_strings

# book_df.drop(axis=1, columns=['tokens'], inplace=True)

# If the dataframe has not yet gotten the tokens, do it here and save it
if not 'tokens' in book_df.columns: 
    book_df['tokens'] = None
    for i, row in tqdm(book_df.iterrows()):
        description = row['description']
        # If the description is not a string, it is probably a NaN, so we set it to None
        if type(description) == str:
            tokens = nltk.word_tokenize(description)
            clean_tokens = clean_strings(tokens)
            book_df['tokens'][i] = clean_tokens # str(clean_tokens)

    book_df.to_csv(DATA_PATH + 'book_matching_ids_df.csv', index=False)
else:
    book_df['tokens'] = book_df['tokens'].apply(ast.literal_eval)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  book_df['tokens'][i] = clean_tokens # str(clean_tokens)
7676it [00:11, 675.92it/s]


In [20]:
# Calculate the tf scores for each community (from the previous weeks)
def TF_from_corpus(corpus):
    """ Calculates the TF scores for each word in the corpus

    Args:
        corpus (list): list of lists of words/strings

    Returns:
        TF_df (pandas.DataFrame): Dataframe containing the TF scores for each word in the corpus
    """
    # Create empty dictionary to keep track of word counts
    word_counts = {}
    n_communities = len(corpus) 
    
    # Iterate through all communities
    for i, document in tqdm(enumerate(corpus)):
        # Iterate through each word in the current sublist
        for word in document:
            # If the current word is not in the dictionary, add it with a list of zeros
            if word not in word_counts:
                word_counts[word] = [0] * n_communities
            
            # Increment count for the current word and list index
            word_counts[word][i] += 1
    
    # Create pandas dataframe from the word_counts dictionary
    TF_df = pd.DataFrame.from_dict(word_counts).transpose()
    
    return TF_df

In [21]:
# Function that takes in a TF and an IDF and computes the TF_IDF dataframe (from the previous weeks)
def make_TF_IDF(TF_df, IDF_dict):
    """Multiply the TF and IDF scores to get the TF-IDF scores

    Args:
        TF_df (pandas.DataFrame): Dataframe containing the TF scores for each word in the corpus
        IDF_dict (dict): Dictionary containing the IDF scores for each word in the corpus

    Returns:
        TF_IDF (pandas.DataFrame): Dataframe containing the TF-IDF scores for each word in the corpus
    """
    # Create the TF-IDF dataframe
    TF_IDF = pd.DataFrame(index=TF_df.index, columns=TF_df.columns)

    # iterate over the index of the DataFrame
    for word in tqdm(TF_df.index):
        # multiply the values by the IDF_dict value
        TF_IDF.loc[word] = TF_df.loc[word] * IDF_dict[word]
        
        
    return TF_IDF

In [22]:
# Load the TF dataframe for the corpus if possible, else create it
try:
    TF_book_df = pd.read_csv(DATA_PATH + "TF_book_df.csv", index_col=0)
except:
    # Create it
    TF_book_df = TF_from_corpus(book_df['tokens'])

    # Rename the columns to the book_ids 
    TF_book_df.columns = book_df['book_id'].tolist()

    # Save the TF dataframe 
    TF_book_df.to_csv(DATA_PATH + "TF_book_df.csv")

7676it [00:03, 2441.24it/s]


In [23]:
# Create the total token count "T_all_books" and the IDF score for each book "IDF_book_dict" 
try:
    IDF_dict = np.load(DATA_PATH + 'IDF_dict.npy', allow_pickle=True).item()
except:
    T_all_books = TF_book_df.apply(lambda row: (row != 0).sum(), axis=1)
    # The log BASE is chosen when loading the libraries 
    IDF_dict = {word: np.emath.logn(BASE, len(TF_book_df.columns)/ T_all_books[word]) for word in TF_book_df.index}

    np.save(DATA_PATH + 'IDF_dict.npy', IDF_dict)

In [24]:
# Create the TF-IDF scores for each book "TF_IDF_book_df" if it has not already been made 
try:
    TF_IDF_book_df = pd.read_csv(DATA_PATH + "TF_IDF_book_df.csv", index_col=0)
except:
    # Create the dataframe 
    TF_IDF_book_df = make_TF_IDF(TF_book_df, IDF_dict)

    # Save the dataframe 
    TF_IDF_book_df.to_csv(DATA_PATH + "TF_IDF_book_df.csv")


100%|██████████| 62786/62786 [00:38<00:00, 1639.61it/s]


#### Genre TF and TF_IDF scores
- Do this by having all books with a genre define the "document" for that genre 
- Then compute the "TF_genre_df" dataframe, by summing all books from "TF_book_df" from that genre
- Here we make the decision that the IDF is the same as for the books. 
    - (Alternatively one could have weighed each book and made a new IDF score, however, this weighs a book with twice as many genres twice as large, hence we use the other option)

In [28]:
# Find the set of genres
genres = set() 
for i in book_df["genres"].to_list():
    genres = genres.union(set(ast.literal_eval(i)))


In [48]:
# Try to load the TF_genres_df, else create it
try:
    # Load the TF_genres_df
    TF_genres_df = pd.read_csv(DATA_PATH + "TF_genres_df.csv", index_col=0)
except: 
    # For each genre, sum all TF scores for books in that genre
    TF_genres_df = pd.DataFrame(index=TF_book_df.index, columns=genres)

    TF_genres_df = TF_genres_df.fillna(0)

    # Go through all books and add the TF scores to the genres of the book
    for i, row in tqdm(book_df.iterrows()):
        for genre in ast.literal_eval(row['genres']):
            TF_genres_df[genre] = TF_genres_df[genre] + TF_book_df[row['book_id']]
            
    # Save the genres_TF_df
    TF_genres_df.to_csv(DATA_PATH + "TF_genres_df.csv")

7676it [00:12, 620.36it/s]


In [52]:
# Try to load the TF_IDF_genres_df, else create it
try:
    # Load the TF_IDF_genres_df
    TF_IDF_genres_df = pd.read_csv(DATA_PATH + "TF_IDF_genres_df.csv", index_col=0)
except:
    # Create the dataframe 
    TF_IDF_genres_df = make_TF_IDF(TF_genres_df, IDF_dict)

    # Save the dataframe 
    TF_IDF_genres_df.to_csv(DATA_PATH + "TF_IDF_genres_df.csv")

100%|██████████| 62786/62786 [00:11<00:00, 5576.23it/s]


In [50]:
# Delete
for i in list(TF_genres_df.index):
    if type(i) != str:
        print(i)

In [49]:
# Delete
def find_diff_indexes(list1, list2):
    diff_indexes = []
    for i in range(len(list1)):
        if list1[i] != list2[i]:
            diff_indexes.append(i)
    return diff_indexes

find_diff_indexes(list(TF_genres_df.index), list(TF_book_df.index))


[]

In [53]:
# Delete
TF_IDF_genres_df

Unnamed: 0,art,comics,fantasy,poetry,memoir,horror,thriller,chick-lit,fiction,paranormal,...,psychology,manga,books,classics,biography,historical-fiction,history,cookbooks,humor-and-comedy,philosophy
winning,7.048158,0.0,169.155792,28.192632,35.24079,14.096316,35.24079,42.288948,317.16711,49.337106,...,21.144474,0.0,14.096316,105.72237,91.626054,91.626054,56.385264,0.0,0.0,21.144474
means,9.412933,23.532333,635.372998,14.1194,70.597,98.8358,230.616866,305.920332,1298.984796,357.691466,...,56.4776,14.1194,127.0746,103.542266,84.7164,202.378066,89.422866,9.412933,0.0,47.064667
fame,26.709281,6.67732,93.482484,13.354641,93.482484,26.709281,93.482484,106.837125,387.284578,13.354641,...,20.031961,0.0,60.095883,86.805164,120.191766,100.159805,60.095883,6.67732,0.0,6.67732
fortunelosing,0.0,0.0,12.906139,0.0,0.0,0.0,0.0,0.0,12.906139,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
certain,11.278705,11.278705,360.918557,16.918057,33.836115,56.393525,191.737983,152.262516,772.591286,129.705106,...,28.196762,0.0,73.311582,78.950934,39.475467,163.541221,50.754172,5.639352,0.0,39.475467
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
indistinguishably,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.906139,12.906139,0.0,0.0,0.0
undifferentially,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.906139,12.906139,0.0,0.0,0.0
manatarms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.906139,12.906139,0.0,0.0,0.0
historywith,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,12.906139,12.906139,0.0,0.0,0.0


# Create a genre for each book
- make the inner product which each book and the genre vector (both normed)
- let the largest inner product that the book contains be the genre of the book 

In [57]:
# Inner product function 
def inner_product(v1, v2):
    """Calculates the normed inner product of two vectors

    Args:
        v1 (list): list of numbers
        v2 (list): list of numbers

    Returns:
        inner_product (float): inner product of the two vectors (divided by the product of their norms)
    """
    
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

# Generate genre by taking inner products between each book and each genre of that book, and choosing the maximal
def get_genres(book_df, TF_IDF_genres_df, TF_IDF_book_df): 
    # Create a dictionary to store the genre for each book
    genres = {}
    
    # Go through each book 
    for i, row in tqdm(book_df.iterrows()):
        # Get the TF-IDF scores for the current book
        book_TF_IDF = TF_IDF_book_df[row["book_id"]]
        
        best_genre = (None, 0)
        
        # Go through each genre of the book
        for genre in ast.literal_eval(row['genres']):
            genre_TF_IDF = TF_IDF_genres_df[genre]
            # print(f"inner_product {inner_product(book_TF_IDF, genre_TF_IDF)} {genre}") # Testing
            if inner_product(book_TF_IDF, genre_TF_IDF) > best_genre[1]:
                best_genre = (genre, inner_product(book_TF_IDF, genre_TF_IDF))
        # print(best_genre) # Testing
        # break # Testing
        # Save the best genre 
        genres[row["book_id"]] = best_genre[0]

    return genres

# Get the genres for each book
genres = get_genres(book_df, TF_IDF_genres_df, TF_IDF_book_df)

book_df["top_genre"] = book_df["book_id"].map(genres)


0it [00:00, ?it/s]

inner_product 0.10087184173215952 young-adult
inner_product 0.09227150818438483 fiction
inner_product 0.09639648006828362 fantasy
inner_product 0.10932637605734288 science-fiction
inner_product 0.09280483506292282 romance
('science-fiction', 0.10932637605734288)





In [58]:
# Save the book_df as complete_book_df
book_df.to_csv(DATA_PATH + "complete_book_df.csv")