In [1]:
import networkx as nx
import pandas as pd
import numpy as np
import netwulf as nw
import matplotlib.pyplot as plt
from tqdm import tqdm
import ast


# wordcloud is an optional dependency: the rest of the notebook still runs
# without it, only plot_wordcloud needs it.
try:
    from wordcloud import WordCloud
except Exception:
    # Catch Exception rather than using a bare `except:` so that
    # KeyboardInterrupt / SystemExit are not swallowed, while any failure
    # raised during the import itself is still tolerated.
    print('Did not import wordcloud due to an error')

# Directory (relative to the notebook) holding the graph, community and CSV files.
DATA_PATH = 'data/'

Did not import wordcloud due to an error


# Word clouds

In [None]:
# Used to make all node ids strings
def make_all_nodes_strings(graph):
    """Return a graph whose node ids are all strings.

    Args:
        graph: A networkx graph.

    Returns:
        ``graph`` itself when it has no nodes or every node id is already a
        string; otherwise the result of ``nx.relabel_nodes`` where each node
        id ``n`` is renamed to ``str(n)``.
    """
    # Inspect every id instead of only the first one: an empty graph no longer
    # raises IndexError, and a graph with mixed id types is still converted.
    if all(isinstance(node, str) for node in graph.nodes()):
        return graph

    # Map each node to its string form so it can be renamed in one pass
    node_map = {node: str(node) for node in graph.nodes()}
    return nx.relabel_nodes(graph, node_map)

In [None]:
def _giant_component(graph):
    """Return a view of ``graph`` restricted to its largest connected component."""
    if graph.number_of_nodes() == 0:
        return graph
    # connected_components is not implemented for directed graphs, so fall
    # back to weak connectivity when the GraphML file declares directed edges.
    if graph.is_directed():
        components = nx.weakly_connected_components(graph)
    else:
        components = nx.connected_components(graph)
    # subgraph() returns a view, not a copy: node attributes attached to the
    # parent graph later in the notebook are visible through it.
    return graph.subgraph(max(components, key=len))


shelf_graph = nx.read_graphml(DATA_PATH + 'shelves_graph_04.graphml')
shelf_graph = make_all_nodes_strings(shelf_graph)

# The giant graphs were empty nx.Graph() placeholders, which made every
# degree-based ranking below come back empty. Build them from the loaded graphs.
shelf_giant = _giant_component(shelf_graph)
NLP_graph = nx.read_graphml(DATA_PATH + 'NLP_graph_04.graphml')
NLP_graph = make_all_nodes_strings(NLP_graph)
NLP_giant = _giant_component(NLP_graph)

# NOTE(security): allow_pickle=True unpickles arbitrary Python objects; only
# load community files that were produced locally by a trusted run.
shelf_louvain = np.load(DATA_PATH + 'shelves_communities_04.npy', allow_pickle = True)
NLP_louvain = np.load(DATA_PATH + 'NLP_communities_04.npy', allow_pickle= True)

In [None]:
# Load the per-book metadata table; make_attributes reads its 'book_id',
# 'title', 'top_genre' and 'genres' columns.
complete_book_df = pd.read_csv(DATA_PATH + "complete_book_df.csv")

In [None]:
def make_attributes(df, graph):
    """Collect per-book node attributes for the books that are nodes of ``graph``.

    Args:
        df (pandas.DataFrame): Book table with 'book_id', 'title', 'top_genre'
            and 'genres' columns; 'genres' holds a Python-literal string that
            is parsed with ``ast.literal_eval``.
        graph: Graph whose node ids are the string form of ``book_id``.

    Returns:
        dict: ``str(book_id)`` -> ``{'title', 'genres', 'top_genre'}`` for
        every row whose id is a node of ``graph``.
    """
    graph_nodes = graph.nodes()
    book_attributes = dict()
    for _, book in tqdm(df.iterrows(), total=df.shape[0]):
        # Key by the string id: node ids are strings, and
        # nx.set_node_attributes silently ignores keys that are not nodes, so
        # keying by the raw (e.g. integer) book_id would attach nothing.
        node = str(book['book_id'])
        if node not in graph_nodes:
            continue
        book_attributes[node] = {
            'title': book['title'],
            'genres': ast.literal_eval(book['genres']),
            'top_genre': book['top_genre'],
        }
    return book_attributes

In [None]:
# Build the per-book attribute lookup (title, genres, top_genre), restricted
# to books that are nodes of the shelf graph.
attribute_dict = make_attributes(complete_book_df, shelf_graph)

In [None]:
# Attach title / genres / top_genre to the shelf-graph nodes.
# NOTE(review): only shelf_graph is annotated in this notebook; NLP_graph gets
# no attributes here — confirm its GraphML file already carries the 'title'
# attribute that get_top_3_books reads.
nx.set_node_attributes(shelf_graph, attribute_dict)

In [None]:
# Find top three books for each community according to degree
def get_top_3_books(graph_type, louvain_groups):
    """Find the three highest-degree books of every community.

    Args:
        graph_type: networkx graph whose nodes carry a "title" attribute.
        louvain_groups: Iterable of communities, each an iterable of node ids.

    Returns:
        dict: community index -> list of up to three ``(node id, title)``
        tuples in descending degree order. Empty when there are no
        communities.

    Raises:
        KeyError: If one of the selected nodes has no "title" attribute.
    """
    top_3_books = {}
    for i, community in enumerate(louvain_groups):
        # Rank the community's nodes by their degree in graph_type
        by_degree = sorted(dict(graph_type.degree(community)).items(), key=lambda x: x[1], reverse=True)
        # Look the title up per selected node instead of rebuilding a
        # title map over the whole graph for every community.
        top_3_books[i] = [(node, graph_type.nodes[node]["title"]) for node, _ in by_degree[:3]]
    # Return after the loop so that every community is included, not just the first
    return top_3_books

In [None]:
# Compute the top books per community for the shelf graph and print the raw
# {community index: [(node id, title), ...]} mapping.
shelf_top_3_books = get_top_3_books(shelf_giant, shelf_louvain)
print(shelf_top_3_books)

In [None]:
# Print the top 3 books for the 9 largest communities
shelf_top_3_books = get_top_3_books(shelf_giant, shelf_louvain)
NLP_top_3_books = get_top_3_books(NLP_giant, NLP_louvain)

# Community indices ordered by descending community size, cut to the 9 largest
shelf_idx_9_largest = np.argsort([-len(community) for community in shelf_louvain])[:9]
NLP_idx_9_largest = np.argsort([-len(community) for community in NLP_louvain])[:9]

# One loop for both reports: the shelf graph first, then the NLP graph
for giant, idx_9_largest, top_3 in (
    (shelf_giant, shelf_idx_9_largest, shelf_top_3_books),
    (NLP_giant, NLP_idx_9_largest, NLP_top_3_books),
):
    for i, community in enumerate(idx_9_largest):
        print(f"{i+1}. largest community {community}: ")
        for book in top_3[community]:
            # book is a (node id, title) pair
            print(f"Book: {book[1]}, degree: {giant.degree(book[0])}")
        print()

In [2]:
#Requires TF_IDF for each community

def plot_wordcloud(TF_IDF, community, n_words=100, top_books=None, louvain_groups=None):
    """Make a wordcloud from the TF-IDF scores for a given community
    Args:
        TF_IDF (pandas.DataFrame): Dataframe containing the TF-IDF scores for each word in the corpus
        community (int): The community to make the wordcloud for
        n_words (int, optional): The number of words to include in the wordcloud. Defaults to 100.
        top_books (dict, optional): Mapping community -> list of (node id, title) pairs,
            e.g. the result of get_top_3_books. Defaults to the global ``top_3_books``.
        louvain_groups (sequence, optional): Communities indexed by ``community``; only the
            size of the selected one is used. Defaults to the global ``CSS_louvain_groups``.
    """
    # Fall back to the globals the function originally relied on. Prefer
    # passing the data explicitly (e.g. shelf_top_3_books / shelf_louvain),
    # since a missing global only fails here at call time with a NameError.
    if top_books is None:
        top_books = top_3_books
    if louvain_groups is None:
        louvain_groups = CSS_louvain_groups

    frequencies = TF_IDF[community].nlargest(n_words).to_dict()
    word_cloud = WordCloud().generate_from_frequencies(frequencies).to_image()

    # Second element of each pair is the book title
    titles = [book[1] for book in top_books[community]]

    fig, ax = plt.subplots(figsize=(16, 16))
    ax.set_title(f"Wordcloud for community {community} size {len(louvain_groups[community])}\nTop 3 books: {titles}")
    ax.imshow(word_cloud, interpolation="bilinear")
    ax.axis("off")
    plt.show()