In [1]:
import pandas as pd
import networkx as nx
import pickle
import numpy as np

from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm.autonotebook import tqdm, trange

  from tqdm.autonotebook import tqdm, trange


In [2]:
df_1 = pd.read_csv("../data/file_part_1_filtered.csv")

In [3]:
df_1.columns

Index(['isbn', 'text_reviews_count', 'series', 'country_code', 'language_code',
       'popular_shelves', 'asin', 'is_ebook', 'average_rating', 'kindle_asin',
       'similar_books', 'description', 'format', 'link', 'authors',
       'publisher', 'num_pages', 'publication_day', 'isbn13',
       'publication_month', 'edition_information', 'publication_year', 'url',
       'image_url', 'book_id', 'ratings_count', 'work_id', 'title',
       'title_without_series'],
      dtype='object')

In [4]:
with open('../data/book_id_review_80793.pkl','rb') as b_r:
    book_review = pickle.load(b_r)

In [5]:
df_1 = df_1[~df_1.description.isna()]

In [6]:
df_1 = df_1[~df_1.publication_year.isna()]

In [7]:
df_1["book_id"] = df_1["book_id"].astype(str)

In [8]:
rev_df = pd.DataFrame({"book_id":book_review.keys(),"Review":book_review.values()})

In [9]:
df_1_rev = pd.merge(df_1,rev_df,on=["book_id"],how="left")

In [10]:
df_1_rev.to_csv("../data/file_part_1_with_review.csv",index=False)

In [11]:
# def process_reviews(reviews, book_description, embedding_model, top_k=5):
#     # Embed the book description
#     book_embedding = embedding_model.encode([book_description])[0]
    
#     # Embed all reviews
#     review_embeddings = embedding_model.encode(reviews)
    
#     # Calculate cosine similarity between book description and reviews
#     similarities = cosine_similarity([book_embedding], review_embeddings)[0]
    
#     # Get indices of top-k most similar reviews
#     top_indices = similarities.argsort()[-top_k:][::-1]
    
#     # Return top-k most relevant reviews
#     return [reviews[i] for i in top_indices]

In [12]:
# # Initialize embedding model
# embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

In [11]:
df_1_top_rev = pd.read_csv("../data/file_part_1_with_top_reviews.csv")

In [12]:
G = nx.Graph()

In [13]:
for _, row in tqdm(df_1_top_rev.iterrows(), total=len(df_1_rev), desc="Processing rows"):
    G.add_node(row['book_id'], type='book', title=row['title'],
                   isbn=row['isbn'], isbn13=row['isbn13'],
                   average_rating=row['average_rating'],
                   ratings_count=row['ratings_count'],
                   text_reviews_count=row['text_reviews_count'],
                   num_pages=row['num_pages'],
                   description=row['description'],
                #    language_code=row['language_code'],
                   country_code=row['country_code'],
                   embedding=None)
    authors = eval(row['authors'])
    for author in authors:
        # print(author)
        G.add_node(author.get('author_id'), type='author')
        G.add_edge(row['book_id'], author.get('author_id'), relation='written_by')
            # Add publisher node and edge
    if pd.notna(row['publisher']):
        G.add_node(row['publisher'], type='publisher')
        G.add_edge(row['book_id'], row['publisher'], relation='published_by')
    if pd.notna(row['publication_year']):
        G.add_node(row['publication_year'], type='year')
        G.add_edge(row['book_id'], row['publication_year'], relation='published_in_year')

    # Add format node and edge
    if pd.notna(row['format']):
        G.add_node(row['format'], type='format')
        G.add_edge(row['book_id'], row['format'], relation='available_in')
        # Add similar books edges
    if len(row['similar_books'])>0:
        similar_books = row['similar_books']
        for similar_book in similar_books:
            if similar_book in df_1['book_id']:
                G.add_edge(row['book_id'], similar_book, relation='similar_to')
            # Process reviews
    if 'Top_Reviews' in row:
        try:
            top_reviews = eval(row["Top_Reviews"])
            for i, review in enumerate(top_reviews):
                review_node = f"{row['book_id']}_review_{i}"
                G.add_node(review_node, type='review', content=review)
                G.add_edge(row['book_id'], review_node, relation='has_review')
        except:
            pass

Processing rows: 100%|██████████| 67651/67651 [01:01<00:00, 1107.99it/s]


In [22]:
import networkx as nx
import pandas as pd
import random

def validate_graph(G, df):
    """
    Main function to run all validation checks
    """
    print("Starting graph validation...")
    
    check_node_count(G, df)
    check_book_attributes(G, df)
    check_author_connections(G, df)
    check_publisher_connections(G, df)
    check_year_connections(G, df)
    check_format_connections(G, df)
    check_similar_books(G, df)
    check_review_connections(G)
    check_graph_connectivity(G)
    
    print("Graph validation complete.")

def check_node_count(G, df):
    """
    Check if the number of nodes in the graph matches the expected count
    """
    expected_book_count = len(df)
    actual_book_count = len([n for n in G.nodes if G.nodes[n]['type'] == 'book'])
    
    print(f"Expected book count: {expected_book_count}")
    print(f"Actual book count: {actual_book_count}")
    if expected_book_count != actual_book_count:
        print(f"WARNING: Mismatch in book count. Expected {expected_book_count}, got {actual_book_count}")

def check_book_attributes(G, df):
    """
    Check if book nodes have all required attributes
    """
    required_attributes = ['title', 'isbn', 'isbn13', 'average_rating', 'ratings_count', 
                           'text_reviews_count', 'num_pages', 'description', 'country_code']
    
    sample_books = random.sample(list(df['book_id']), min(100, len(df)))
    missing_attributes = {}
    
    for book_id in sample_books:
        node_data = G.nodes[book_id]
        for attr in required_attributes:
            if attr not in node_data:
                if book_id not in missing_attributes:
                    missing_attributes[book_id] = []
                missing_attributes[book_id].append(attr)
    
    if missing_attributes:
        print("WARNING: Some books are missing attributes:")
        for book_id, attrs in missing_attributes.items():
            print(f"Book {book_id} is missing: {', '.join(attrs)}")
    else:
        print("All sampled books have the required attributes.")

def check_author_connections(G, df):
    """
    Check if all books are connected to their authors
    """
    sample_books = random.sample(list(df['book_id']), min(100, len(df)))
    missing_connections = []
    
    for book_id in sample_books:
        authors = eval(df[df['book_id'] == book_id]['authors'].iloc[0])
        for author in authors:
            author_id = author['author_id']
            if not G.has_edge(book_id, author_id):
                missing_connections.append((book_id, author_id))
    
    if missing_connections:
        print("WARNING: Some books are not connected to their authors:")
        for book_id, author_id in missing_connections:
            print(f"Missing edge between book {book_id} and author {author_id}")
    else:
        print("All sampled books are correctly connected to their authors.")

def check_publisher_connections(G, df):
    """
    Check if books are connected to their publishers
    """
    sample_books = random.sample(list(df[df['publisher'].notna()]['book_id']), min(100, len(df)))
    missing_connections = []
    
    for book_id in sample_books:
        publisher = df[df['book_id'] == book_id]['publisher'].iloc[0]
        if not G.has_edge(book_id, publisher):
            missing_connections.append((book_id, publisher))
    
    if missing_connections:
        print("WARNING: Some books are not connected to their publishers:")
        for book_id, publisher in missing_connections:
            print(f"Missing edge between book {book_id} and publisher {publisher}")
    else:
        print("All sampled books are correctly connected to their publishers.")

def check_year_connections(G, df):
    """
    Check if books are connected to their publication years
    """
    sample_books = random.sample(list(df[df['publication_year'].notna()]['book_id']), min(100, len(df)))
    missing_connections = []
    
    for book_id in sample_books:
        year = df[df['book_id'] == book_id]['publication_year'].iloc[0]
        if not G.has_edge(book_id, year):
            missing_connections.append((book_id, year))
    
    if missing_connections:
        print("WARNING: Some books are not connected to their publication years:")
        for book_id, year in missing_connections:
            print(f"Missing edge between book {book_id} and year {year}")
    else:
        print("All sampled books are correctly connected to their publication years.")

def check_format_connections(G, df):
    """
    Check if books are connected to their formats
    """
    sample_books = random.sample(list(df[df['format'].notna()]['book_id']), min(100, len(df)))
    missing_connections = []
    
    for book_id in sample_books:
        format = df[df['book_id'] == book_id]['format'].iloc[0]
        if not G.has_edge(book_id, format):
            missing_connections.append((book_id, format))
    
    if missing_connections:
        print("WARNING: Some books are not connected to their formats:")
        for book_id, format in missing_connections:
            print(f"Missing edge between book {book_id} and format {format}")
    else:
        print("All sampled books are correctly connected to their formats.")

def check_similar_books(G, df):
    """
    Check if books are connected to their similar books
    """
    sample_books = random.sample(list(df[df['similar_books'].apply(lambda x: len(x) > 0)]['book_id']), min(100, len(df)))
    missing_connections = []
    
    for book_id in sample_books:
        similar_books = df[df['book_id'] == book_id]['similar_books'].iloc[0]
        for similar_book in similar_books:
            if similar_book in df['book_id'].values and not G.has_edge(book_id, similar_book):
                missing_connections.append((book_id, similar_book))
    
    if missing_connections:
        print("WARNING: Some books are not connected to their similar books:")
        for book_id, similar_book in missing_connections:
            print(f"Missing edge between book {book_id} and similar book {similar_book}")
    else:
        print("All sampled books are correctly connected to their similar books.")

def check_review_connections(G):
    """
    Check if books with reviews have the correct number of review connections
    """
    book_nodes = [n for n in G.nodes if G.nodes[n]['type'] == 'book']
    sample_books = random.sample(book_nodes, min(100, len(book_nodes)))
    books_with_excess_reviews = []
    
    for book_id in sample_books:
        review_edges = [e for e in G.edges(book_id, data=True) if e[2]['relation'] == 'has_review']
        if len(review_edges) > 5:
            books_with_excess_reviews.append((book_id, len(review_edges)))
    
    if books_with_excess_reviews:
        print("WARNING: Some books have more than 5 review connections:")
        for book_id, review_count in books_with_excess_reviews:
            print(f"Book {book_id} has {review_count} review connections")
    else:
        print("All sampled books have 5 or fewer review connections.")

def check_graph_connectivity(G):
    """
    Check if the graph is connected
    """
    is_connected = nx.is_connected(G)
    print(f"Graph is {'connected' if is_connected else 'not connected'}")
    if not is_connected:
        components = list(nx.connected_components(G))
        print(f"Number of connected components: {len(components)}")
        print(f"Sizes of connected components: {[len(c) for c in components]}")


In [23]:
validate_graph(G, df_1_top_rev)

Starting graph validation...
Expected book count: 67651
Actual book count: 67650
All sampled books have the required attributes.
All sampled books are correctly connected to their authors.
All sampled books are correctly connected to their publishers.
All sampled books are correctly connected to their publication years.
All sampled books are correctly connected to their formats.
All sampled books are correctly connected to their similar books.
All sampled books have 5 or fewer review connections.
Graph is not connected
Number of connected components: 8
Sizes of connected components: [305192, 6, 7, 4, 4, 4, 5, 3]
Graph validation complete.


In [14]:
col_list = ['book_id','isbn','isbn13','average_rating','ratings_count','text_reviews_count','num_pages','description',
            'language_code','country_code','authors','publisher','publication_year','format','similar_books','Top_Reviews']

In [15]:
import sys

In [16]:
print(f"{sys.getsizeof(df_1_top_rev[col_list])/1024**2} mb")

247.40308094024658 mb


In [17]:
print(f"{sys.getsizeof(G)/1024**2} mb")

4.57763671875e-05 mb


In [24]:
import pickle

# Saving the graph
def save_graph(G, filename):
    with open(filename, 'wb') as f:
        pickle.dump(G, f)

# Loading the graph
def load_graph(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)


In [25]:
# Save the graph
save_graph(G, '../graph-data/book_file_with_review_graph.pkl')



In [26]:
# Load the graph later
loaded_G = load_graph('../graph-data/book_file_with_review_graph.pkl')

In [27]:
validate_graph(loaded_G, df_1_top_rev)

Starting graph validation...
Expected book count: 67651
Actual book count: 67650
All sampled books have the required attributes.
All sampled books are correctly connected to their authors.
All sampled books are correctly connected to their publishers.
All sampled books are correctly connected to their publication years.
All sampled books are correctly connected to their formats.
All sampled books are correctly connected to their similar books.
All sampled books have 5 or fewer review connections.
Graph is not connected
Number of connected components: 8
Sizes of connected components: [305192, 6, 7, 4, 4, 4, 5, 3]
Graph validation complete.


In [None]:
## SouravB here!