In [None]:
import copy
import pickle
import pickle as pkl
import random
import re
import warnings
from itertools import combinations

import matplotlib.colors as mcolors
import matplotlib.pyplot as plt
import netwulf
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
#import community

DATA_PATH = 'data/'

# Preprocessing

# Preprocess the two book dataframes
- Load dataframes ('https://sites.google.com/eng.ucsd.edu/ucsdbookgraph/home', 'https://github.com/malcolmosh/goodbooks-10k-extended/blob/master/README.md')
- Remove faulty rows (e.g. rows with an empty title) from the dataframe without the descriptions
- Remove "additions" to titles in the description dataframe - Example: "title (series #1)" -> "title"
- Make a new dataframe that contains "Book ID" (from the dataframe without descriptions), "Book Title" (from the same dataframe), "genre" (description dataframe), "description" (description dataframe)
- Store this dataframe

In [None]:
# Read the book descriptions from disk
descriptions_df = pd.read_csv(DATA_PATH + "books_descriptions.csv")

# Read the Goodreads works and keep only the columns that are needed from them
book_ID_df = pd.read_json(
    DATA_PATH + 'goodreads_book_works.json', lines=True
).filter(items=["best_book_id", "original_title", "reviews_count"])


In [None]:
# Report how many books each dataframe currently holds
for frame_name, frame in (("descriptions_df", descriptions_df), ("book_ID_df", book_ID_df)):
    print(f"Amount of books in {frame_name}: {len(frame)}")

In [None]:
# Drop the works whose original title is an empty string
has_title = book_ID_df['original_title'].ne('')
book_ID_df = book_ID_df[has_title]

print(f"Amount of books in book_ID_df: {len(book_ID_df)}")

In [None]:
# Keep only the rows whose description is an actual string
description_is_text = descriptions_df['description'].map(lambda value: isinstance(value, str))
descriptions_df = descriptions_df[description_is_text]

In [None]:
# Remove "additions" to titles in the descriptions_df, e.g. "title (series #1)" -> "title".
# Everything from the first "(" onwards is dropped and surrounding whitespace is stripped.
# The vectorised .str accessor replaces the row-by-row loop: the loop called re.sub although
# `re` is only imported further down the notebook, and it wrote cell by cell into a frame
# that is a filtered slice of the loaded data.
cleaned_titles = (
    descriptions_df["title"]
    .str.replace(r'\((.*)', '', regex=True)
    .str.strip()
)
descriptions_df = descriptions_df.assign(title=cleaned_titles)

# Check if it worked -> it did
# descriptions_df.head()

In [None]:
# Lower-case the title column of both dataframes so that matching ignores letter case
for frame, title_column in ((descriptions_df, 'title'), (book_ID_df, 'original_title')):
    frame[title_column] = frame[title_column].str.lower()

In [None]:
# Match every description to a Goodreads work through its (lower-cased) title.
# When several works share a title, the one with the highest amount of reviews is used.

# Build the title -> best_book_id lookup once instead of scanning book_ID_df for every row.
# The stable descending sort keeps the first-listed work when review counts are tied.
best_book_id_by_title = (
    book_ID_df
    .dropna(subset=['original_title'])
    .sort_values('reviews_count', ascending=False, kind='stable')
    .drop_duplicates(subset='original_title', keep='first')
    .set_index('original_title')['best_book_id']
)

matched_book_ids = descriptions_df['title'].map(best_book_id_by_title)
title_found = matched_book_ids.notna()

# Titles of descriptions_df that have no counterpart in book_ID_df
titles_not_found = descriptions_df.loc[~title_found, 'title'].tolist()

# Assemble the merged dataframe in one go. DataFrame.append (used before) was removed in
# pandas 2.0 and copied the whole frame for every appended row.
matched_df = descriptions_df.loc[title_found]
book_df = pd.DataFrame({
    'book_id': matched_book_ids[title_found].astype(best_book_id_by_title.dtype),
    'title': matched_df['title'],
    'description': matched_df['description'],
    'genres': matched_df['genres'],
}).reset_index(drop=True)

# print the amount of elements that are not found (relative to the actual number of titles)
print(f"Out of the {len(descriptions_df)} titles {len(titles_not_found)} are not found in the book_ID_df")

In [None]:
# Somehow the same book appears multiple times, hence we drop the duplicates
book_df = book_df.drop_duplicates(subset=['title'])

# Store the merged dataframe: the shelf preprocessing reloads it from "book_df.csv".
# index=False keeps the row index out of the file, so no "Unnamed: 0" column appears on reload.
book_df.to_csv(DATA_PATH + "book_df.csv", index=False)

In [1]:
# TODO - Possibly add the capacity to look through titles and determine whether a shortened version can be found in the other dataset (e.g. "harry potter and the sorcerer's stone" shortened to "harry potter and the", which could then match "harry potter and the philosopher's stone")

# Preprocess the shelves
- Use "book_id_map.csv" to find the books we use (ids) and store "new_ids" (the ids we can use to find the relevant shelves)
- Drop all rows in "goodreads_interactions.csv" that have different ids than "new_ids".
- Store this dataset.

In [None]:
# Read the id mapping and the merged book dataframe from disk
book_id_map_df, book_df = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("book_id_map.csv", "book_df.csv")
)

In [None]:
# Create map from book_id to book_id_csv
book_id_map = dict(zip(book_id_map_df['book_id'], book_id_map_df['book_id_csv']))

# Change the book ID in our dataset to match the shelf dataset.
# Series.map replaces the element-wise chained assignment `book_df["book_id"][i] = ...`
# (which pandas does not guarantee to write into book_df) and the bare `except:` that
# silently swallowed every kind of error, not just a missing id.
mapped_book_ids = book_df["book_id"].map(book_id_map)

# remove about 15 books that are for inexplicable reasons not in the shelf dataset
remove_list = book_df.index[mapped_book_ids.isna()].tolist()
print(f"Out of {len(book_df)} books {len(remove_list)} are not in the shelf dataset and hence removed")

book_df = book_df.drop(remove_list)
# Cast back to the id dtype: the NaNs of the unmapped books made the mapped column float
book_df["book_id"] = mapped_book_ids.drop(remove_list).astype(book_id_map_df['book_id_csv'].dtype)

# Save the book_df dataframe with the index change
book_df.to_csv(DATA_PATH + 'book_matching_ids_df.csv', index=False)

In [None]:
# Read every user-book interaction (this takes some time and memory)
shelves_df = pd.read_csv(DATA_PATH + "goodreads_interactions.csv")

# Number of books on the shelves of ALL users combined
n_shelved_books = len(shelves_df)
print(f'There are {n_shelved_books} books, copies counted aswell')

In [None]:
# Keep only the shelf rows whose book is present in the book_df
kept_book_ids = book_df['book_id'].tolist()
in_book_df = shelves_df['book_id'].isin(kept_book_ids)
shelves_df = shelves_df[in_book_df]

# Persist the reduced shelves_df
shelves_df.to_csv(DATA_PATH + 'shelves_df.csv', index=False)

# Row count and number of distinct books after removing the books that are not in book_df
unique_shelf_books = set(shelves_df['book_id'].tolist())
print(f"We have {len(shelves_df)} shelves in total, and in these there are {len(unique_shelf_books)} unique books.")

# TF-IDF embeddings
- Create TF-IDF embeddings
- Create them for genres as well

In [None]:
# imports for the text analysis
import ast
import re

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK resources requested by this notebook
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Import the book description dataframe
book_df = pd.read_csv(DATA_PATH + "book_matching_ids_df.csv") # TODO help me not make a "Unnamed: 0" column... I want to use the book_id as index, but then it creates this column

# Logarithm base chosen for the IDF scores
BASE = 2

In [2]:
# Function to clean strings (from week 7)
def clean_strings(strings):
    """ Cleans a list of strings by removing URLs, numbers, punctuation and stop words

    Every string has URLs, digits and punctuation removed and is lower-cased. A string is
    kept only if something is left after cleaning and the cleaned string is not an
    English stop word.

    Args:
    - strings: an iterable of strings; entries that are not `str` (e.g. None or NaN) are skipped

    returns:
    - cleaned_strings: a list of cleaned strings
    """
    cleaned_strings = []
    stop_words = set(stopwords.words('english'))

    for string in strings:
        # Skip non-strings before any regex runs. The previous type check came after the
        # substitutions, which already raise TypeError on non-string input.
        if not isinstance(string, str):
            continue

        # Remove URLs
        string = re.sub(r'http\S+', '', string)

        # Remove numbers
        string = re.sub(r'[0-9]', '', string)

        # Keep only what is not punctuation (word characters and whitespace)
        string = re.sub(r'[^\w\s]', '', string)

        # Lowercase
        string = string.lower()

        # Drop strings that became empty and strings that are stop words
        if string and string not in stop_words:
            cleaned_strings.append(string)

    return cleaned_strings


In [None]:
# Create tokens from descriptions
# If the dataframe has not yet gotten the tokens, do it here and save it
if 'tokens' not in book_df.columns:
    # Build the whole column at once instead of the chained assignment
    # `book_df['tokens'][i] = ...`, which pandas does not guarantee to write into book_df.
    # A description that is not a string (probably a NaN) gets an empty token list: unlike
    # None, "[]" survives the CSV round trip and can be parsed again by ast.literal_eval.
    token_lists = [
        clean_strings(nltk.word_tokenize(description)) if isinstance(description, str) else []
        for description in tqdm(book_df['description'], total=book_df.shape[0])
    ]
    book_df['tokens'] = pd.Series(token_lists, index=book_df.index, dtype=object)

    book_df.to_csv(DATA_PATH + 'book_matching_ids_df.csv', index=False)
else:
    # The CSV stores every token list as its string representation; turn it back into a list
    book_df['tokens'] = book_df['tokens'].apply(ast.literal_eval)

In [None]:
# Calculate the tf scores for each community (from the previous weeks)
def TF_from_corpus(corpus):
    """ Counts how often every word occurs in every document of the corpus

    Args:
        corpus (list): list of lists of words/strings, one inner list per document

    Returns:
        TF_df (pandas.DataFrame): one row per word and one column per document (labelled by
            the document's position in the corpus); each cell holds the raw count of that
            word in that document
    """
    n_documents = len(corpus)
    # word -> list with one count per document, in order of first appearance of the word
    counts_by_word = {}

    for doc_position, words in tqdm(enumerate(corpus), total=n_documents):
        for token in words:
            per_document = counts_by_word.get(token)
            if per_document is None:
                # First sighting of this word: start with a zero count for every document
                per_document = counts_by_word[token] = [0] * n_documents

            per_document[doc_position] += 1

    # The dictionary gives one column per word; flip it so that the words become the rows
    documents_by_words = pd.DataFrame.from_dict(counts_by_word)
    TF_df = documents_by_words.T

    return TF_df

In [None]:
# Function that takes in a TF and an IDF and computes the TF_IDF dataframe (from the previous weeks)
def make_TF_IDF(TF_df, IDF_dict):
    """Multiply the TF and IDF scores to get the TF-IDF scores

    Every row of TF_df (one word) is scaled by the IDF score of that word in a single
    vectorised multiplication. The result is a float dataframe, which is also what a
    TF-IDF dataframe looks like after being reloaded from CSV.

    Args:
        TF_df (pandas.DataFrame): Dataframe containing the TF scores for each word in the corpus
            (words as index)
        IDF_dict (dict): Dictionary containing the IDF scores for each word in the corpus

    Returns:
        TF_IDF (pandas.DataFrame): Dataframe containing the TF-IDF scores for each word in the
            corpus, with the same index and columns as TF_df

    Raises:
        KeyError: if a word of TF_df.index has no entry in IDF_dict
    """
    # IDF score of every word in row order; a word missing from IDF_dict raises KeyError
    idf_per_word = np.array([IDF_dict[word] for word in TF_df.index], dtype=float)

    # axis=0 aligns the IDF values with the rows, i.e. each word row is scaled by its own IDF
    TF_IDF = TF_df.mul(idf_per_word, axis=0)

    return TF_IDF

In [None]:
# Load the TF dataframe for the corpus if possible, else create it
try:
    TF_book_df = pd.read_csv(DATA_PATH + "TF_book_df.csv", index_col=0)
    # read_csv returns the header as strings, whereas the freshly created dataframe (below)
    # and the later lookups use the integer book_ids, so restore integer column labels
    TF_book_df.columns = TF_book_df.columns.astype(int)
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error is raised
    # Create it
    TF_book_df = TF_from_corpus(book_df['tokens'])

    # Rename the columns to the book_ids
    TF_book_df.columns = book_df['book_id'].tolist()

    # Save the TF dataframe
    TF_book_df.to_csv(DATA_PATH + "TF_book_df.csv")

In [None]:
# Create the number of books each word occurs in "T_all_books" and the IDF score for each word "IDF_dict"
try:
    # NOTE: allow_pickle=True unpickles the file; only load an IDF_dict.npy written by this notebook
    IDF_dict = np.load(DATA_PATH + 'IDF_dict.npy', allow_pickle=True).item()
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error is raised
    # Per word: the number of books (columns) with a non-zero count
    T_all_books = (TF_book_df != 0).sum(axis=1)
    n_books = len(TF_book_df.columns)

    # IDF = log_BASE(n_books / T_all_books), computed for all words at once
    # The log BASE is chosen when loading the libraries
    IDF_values = np.log(n_books / T_all_books) / np.log(BASE)
    IDF_dict = dict(zip(TF_book_df.index, IDF_values))

    np.save(DATA_PATH + 'IDF_dict.npy', IDF_dict)

In [None]:
# Create the TF-IDF scores for each book "TF_IDF_book_df" if it has not already been made
try:
    TF_IDF_book_df = pd.read_csv(DATA_PATH + "TF_IDF_book_df.csv", index_col=0)
    # read_csv returns the header as strings, whereas the books are looked up by their
    # integer book_id later on, so restore integer column labels
    TF_IDF_book_df.columns = TF_IDF_book_df.columns.astype(int)
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error is raised
    # Create the dataframe
    TF_IDF_book_df = make_TF_IDF(TF_book_df, IDF_dict)

    # Save the dataframe
    TF_IDF_book_df.to_csv(DATA_PATH + "TF_IDF_book_df.csv")

#### Genre TF and TF_IDF scores
- Do this by having all books with a genre define the "document" for that genre
- Then compute the "TF_genre_df" dataframe, by summing all books from "TF_book_df" from that genre
- Here we make the decision that the IDF is the same as for the books.
    - (Alternatively, one could have weighted each book and computed a new IDF score; however, this would give a book with twice as many genres twice the weight, hence we use the other option)

In [None]:
# Find the set of genres
genres = set()
for genre_list_literal in book_df["genres"].to_list():
    # Each entry is the string form of a list of genres; parse it and add its genres
    genres.update(ast.literal_eval(genre_list_literal))

In [None]:
# Try to load the TF_genres_df, else create it
try:
    # Load the TF_genres_df
    TF_genres_df = pd.read_csv(DATA_PATH + "TF_genres_df.csv", index_col=0)
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error is raised
    # For each genre, sum all TF scores for books in that genre.
    # Start every genre at zero. The columns are passed as a sorted list: pandas 2.x raises
    # a ValueError for a set, and a set has no reproducible order for the saved file anyway.
    TF_genres_df = pd.DataFrame(0, index=TF_book_df.index, columns=sorted(genres))

    # Go through all books and add the TF scores to the genres of the book
    for _, row in tqdm(book_df.iterrows(), total=book_df.shape[0]):
        book_TF = TF_book_df[row['book_id']]
        for genre in ast.literal_eval(row['genres']):
            TF_genres_df[genre] = TF_genres_df[genre] + book_TF

    # Save the genres_TF_df
    TF_genres_df.to_csv(DATA_PATH + "TF_genres_df.csv")

In [None]:
# Try to load the TF_IDF_genres_df, else create it
try:
    # Load the TF_IDF_genres_df
    TF_IDF_genres_df = pd.read_csv(DATA_PATH + "TF_IDF_genres_df.csv", index_col=0)
except FileNotFoundError:
    # Only a missing cache file triggers the recomputation; any other error (e.g. a
    # corrupt file) is raised instead of being silently overwritten
    # Create the dataframe
    TF_IDF_genres_df = make_TF_IDF(TF_genres_df, IDF_dict)

    # Save the dataframe
    TF_IDF_genres_df.to_csv(DATA_PATH + "TF_IDF_genres_df.csv")

In [3]:
# Have not added things with # delete

# Create a genre for each book
- compute the inner product between each book vector and each genre vector (both normalised)
- assign each book the genre, among the genres it is tagged with, that gives the largest inner product

In [4]:
# Inner product function
def inner_product(v1, v2):
    """Calculates the normed inner product (cosine similarity) of two vectors

    Args:
        v1 (list): list of numbers
        v2 (list): list of numbers (same length as v1)

    Returns:
        inner_product (float): inner product of the two vectors (divided by the product of their norms);
            0.0 if either vector has norm zero
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)

    # An all-zero vector (e.g. a book without tokens) would give 0/0 = NaN and a RuntimeWarning
    if norm_product == 0:
        return 0.0

    return np.dot(v1, v2) / norm_product

In [5]:
# Generate genre by taking inner products between each book and each genre of that book, and choosing the maximal
def get_genres(book_df, TF_IDF_genres_df, TF_IDF_book_df):
    """Pick, for every book, the one of its own genres whose TF-IDF vector matches it best

    Args:
        book_df (DataFrame): DataFrame with a "book_id" column and a "genres" column holding
            the string form of a list of genres for each book
        TF_IDF_genres_df (DataFrame): DataFrame with the TF_IDF of each genre (one column per genre)
        TF_IDF_book_df (DataFrame): DataFrame with the TF_IDF of each book (one column per book_id)

    Returns:
        genres (dict): Dictionary with book ids as keys and the corresponding genre that matches
            the best as values; the value is None when no genre of the book has a normed
            inner product larger than 0
    """

    # Create a dictionary to store the genre for each book
    genres = {}

    # Go through each book
    for _, row in tqdm(book_df.iterrows(), total=book_df.shape[0]):
        # Get the TF-IDF scores for the current book
        book_TF_IDF = TF_IDF_book_df[row["book_id"]]

        best_genre, best_score = None, 0

        # Go through each genre of the book
        for genre in ast.literal_eval(row['genres']):
            # Compute the similarity once per genre (it was evaluated twice before)
            score = inner_product(book_TF_IDF, TF_IDF_genres_df[genre])
            if score > best_score:
                best_genre, best_score = genre, score

        # Save the best genre
        genres[row["book_id"]] = best_genre

    return genres

In [None]:
# Determine the best-matching genre of every book
genres = get_genres(book_df, TF_IDF_genres_df, TF_IDF_book_df)

# Attach it to the book_df as the "top_genre" column, looked up through the book_id
top_genre_column = book_df["book_id"].map(genres)
book_df["top_genre"] = top_genre_column


In [None]:
# Save the book_df as complete_book_df
# NOTE(review): unlike the other book_df saves in this notebook, no index=False is passed here,
# so the row index is written as an extra unnamed first column - confirm this is intended.
book_df.to_csv(DATA_PATH + "complete_book_df.csv")

# Create network
- Load dataframe with shelves
- Implode dataframe to make shelves into edgelist
- Calculate assortativity
    - Genre
    - Degrees
- Get largest sub network
- Create communities
    - Color according to genre

In [None]:
# load data (only when the shelves are not already in memory from the preprocessing above)
# The check has to use the variable *name*: `shelves_df not in locals()` evaluated the
# variable itself, which raises a NameError when it is undefined and a TypeError
# (a DataFrame is unhashable) when it is defined.
if 'shelves_df' not in globals():
    shelves_df = pd.read_csv(DATA_PATH + 'shelves_df.csv')

In [None]:
# Collapse the shelves to one row per user that holds the list of that user's books
books_per_user = shelves_df.groupby('user_id')['book_id']
imploded_df = books_per_user.apply(list).reset_index()
print(f'The amount of users are: {len(imploded_df)}')

In [None]:
# Create dictionary of edges: each key is an unordered pair of books (a frozenset) and each
# value counts how many times that pair occurs together on a user's shelf
edges = {}

# Go through shelves to get edges
for shelf_books in tqdm(imploded_df['book_id'], total=imploded_df.shape[0]):
    # combinations() yields every pair (earlier book, later book) of the shelf exactly once
    # and nothing at all for shelves with fewer than two books. It replaces the nested index
    # loops, whose inner `i` shadowed the outer loop variable and which looked the shelf up
    # again on every iteration.
    for first_book, second_book in combinations(shelf_books, 2):
        # Create edge
        edge = frozenset([first_book, second_book])
        # Add edge to dictionary
        edges[edge] = edges.get(edge, 0) + 1

print(f'The amount of edges are {len(edges)}')

In [None]:
# TODO: Comment on this being too much in the report

In [None]:
# Save the edges
# np.save stores a plain dict as a pickled object array, so it has to be read back with
# np.load(..., allow_pickle=True).item() - the pattern this notebook uses for IDF_dict.npy.
np.save(DATA_PATH + 'shelf_edges.npy', edges)

In [None]:
# TODO: I don't think this was pushed properly, since it is a pickle and not an npy file, and I have therefore not added the rest of "Generate network" either

In [None]:
with open(DATA_PATH + 'edges.pickle', 'wb') as f:
    # use pickle.dump to serialize the dictionary and save it to the file
    pickle.dump(edges, f)