In [1]:
import pandas as pd
import networkx as nx
import pickle
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


In [2]:
df_1 = pd.read_csv("../data/file_part_1_filtered.csv")

In [3]:
df_1.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [4]:
with open('../data/book_id_review_80793.pkl','rb') as b_r:
    book_review = pickle.load(b_r)

In [5]:
df_1 = df_1[~df_1.description.isna()]

In [6]:
df_1 = df_1[~df_1.publication_year.isna()]

In [7]:
df_1["book_id"] = df_1["book_id"].astype(str)

In [8]:
rev_df = pd.DataFrame({"book_id":book_review.keys(),"Review":book_review.values()})

In [9]:
df_1_rev = pd.merge(df_1,rev_df,on=["book_id"],how="left")

In [10]:
df_1_rev.to_csv("../data/file_part_1_with_review.csv",index=False)

In [11]:
# def process_reviews(reviews, book_description, embedding_model, top_k=5):
#     # Embed the book description
#     book_embedding = embedding_model.encode([book_description])[0]
    
#     # Embed all reviews
#     review_embeddings = embedding_model.encode(reviews)
    
#     # Calculate cosine similarity between book description and reviews
#     similarities = cosine_similarity([book_embedding], review_embeddings)[0]
    
#     # Get indices of top-k most similar reviews
#     top_indices = similarities.argsort()[-top_k:][::-1]
    
#     # Return top-k most relevant reviews
#     return [reviews[i] for i in top_indices]

In [12]:
# # Initialize embedding model
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
df_1_top_rev = pd.read_csv("../data/file_part_1_with_top_reviews.csv")

In [12]:
G = nx.Graph()

In [13]:
for _, row in tqdm(df_1_top_rev.iterrows(), total=len(df_1_rev), desc="Processing rows"):
    G.add_node(row['book_id'], type='book', title=row['title'],
                   isbn=row['isbn'], isbn13=row['isbn13'],
                   average_rating=row['average_rating'],
                   ratings_count=row['ratings_count'],
                   text_reviews_count=row['text_reviews_count'],
                   num_pages=row['num_pages'],
                   description=row['description'],
                #    language_code=row['language_code'],
                   country_code=row['country_code'],
                   embedding=None)
    authors = eval(row['authors'])
    for author in authors:
        # print(author)
        G.add_node(author.get('author_id'), type='author')
        G.add_edge(row['book_id'], author.get('author_id'), relation='written_by')
            # Add publisher node and edge
    if pd.notna(row['publisher']):
        G.add_node(row['publisher'], type='publisher')
        G.add_edge(row['book_id'], row['publisher'], relation='published_by')
    if pd.notna(row['publication_year']):
        G.add_node(row['publication_year'], type='year')
        G.add_edge(row['book_id'], row['publication_year'], relation='published_in_year')

    # Add format node and edge
    if pd.notna(row['format']):
        G.add_node(row['format'], type='format')
        G.add_edge(row['book_id'], row['format'], relation='available_in')
        # Add similar books edges
    if len(row['similar_books'])>0:
        similar_books = row['similar_books']
        for similar_book in similar_books:
            if similar_book in df_1['book_id']:
                G.add_edge(row['book_id'], similar_book, relation='similar_to')
            # Process reviews
    if 'Top_Reviews' in row:
        try:
            top_reviews = eval(row["Top_Reviews"])
            for i, review in enumerate(top_reviews):
                review_node = f"{row['book_id']}_review_{i}"
                G.add_node(review_node, type='review', content=review)
                G.add_edge(row['book_id'], review_node, relation='has_review')
        except:
            pass

Processing rows:  66%|██████▌   | 44697/67651 [00:43<00:18, 1215.73it/s]

In [6]:
col_list = ['book_id','isbn','isbn13','average_rating','ratings_count','text_reviews_count','num_pages','description',
            'language_code','country_code','authors','publisher','publication_year','format','similar_books']

In [7]:
import sys

In [8]:
print(f"{sys.getsizeof(df_1[col_list])/1024**2} mb")

126.69052982330322 mb


In [9]:
print(f"{sys.getsizeof(G)/1024**2} mb")

4.57763671875e-05 mb


In [10]:
import pickle

# Saving the graph
def save_graph(G, filename):
    with open(filename, 'wb') as f:
        pickle.dump(G, f)

# Loading the graph
def load_graph(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


In [11]:
# Save the graph
save_graph(G, '../graph-data/my_graph.pkl')



In [12]:
# Load the graph later
loaded_G = load_graph('../graph-data/my_graph.pkl')

In [13]:
print(f"{sys.getsizeof(loaded_G)/1024**2} mb")

4.57763671875e-05 mb


In [19]:
from pyvis.network import Network
import networkx as nx
import hashlib

def hash_complex_object(obj):
    """Create a hash for objects that are not strings or integers."""
    return hashlib.md5(str(obj).encode()).hexdigest()

def node_to_id(node):
    """Convert node to a suitable id for Pyvis."""
    if isinstance(node, (str, int)):
        return str(node)
    else:
        return hash_complex_object(node)

def create_pyvis_graph(G, output_filename='pyvis_graph.html'):
    # Create a Pyvis network
    net = Network(notebook=True, height="750px", width="100%", bgcolor="#222222", font_color="white")
    
    # Create a mapping of original node to its ID in the Pyvis graph
    node_mapping = {}
    
    # Add nodes
    for node, node_attrs in G.nodes(data=True):
        node_type = node_attrs.get('type', 'unknown')
        color = {
            'book': '#FF6B6B',
            'author': '#4ECDC4',
            'publisher': '#45B7D1',
            'year': '#FFA07A',
            'format': '#98D8C8'
        }.get(node_type, '#FFFFFF')
        
        # Convert node to a suitable id
        node_id = node_to_id(node)
        node_mapping[node] = node_id
        
        # Use a truncated version of the node id as the label if it's too long
        label = str(node) if len(str(node)) <= 20 else str(node)[:17] + '...'
        
        net.add_node(node_id, label=label, title=str(node_attrs), color=color)
    
    # Add edges
    for source, target, edge_attrs in G.edges(data=True):
        source_id = node_mapping[source]
        target_id = node_mapping[target]
        net.add_edge(source_id, target_id, title=edge_attrs.get('relation', ''))
    
    # Set physics layout
    net.force_atlas_2based()
    
    # Save and show the graph
    net.show(output_filename)



In [20]:
# Usage
create_pyvis_graph(G)



KeyboardInterrupt: 