## Step_1: Loading Neo4j Credentials in Python  
This code imports necessary libraries and reads Neo4j connection details from `credentials.json`.  


In [1]:
from pyspark.sql import SparkSession
import networkx as nx
from tqdm import tqdm
import pickle
import pandas as pd 
import json

# Read the Neo4j connection settings from credentials.json (path is relative
# to the current working directory).
with open('credentials.json', 'r') as file:
    config = json.load(file)

# Database configs: one variable per required key; a missing key raises KeyError.
db_url, db_username, db_password, db_name = (
    config[field]
    for field in ("db_url", "db_username", "db_password", "db_name")
)

## Step_2: Initializing SparkSession for Neo4j  
This segment configures and initializes a SparkSession to connect to Neo4j with specified database credentials and resources.  


In [2]:
# Every Spark / Neo4j connector option in one mapping; dict insertion order is
# preserved, so the options are applied in the order listed here.
spark_settings = {
    "neo4j.url": db_url,
    "spark.network.timeout": "600s",
    "spark.executor.heartbeatInterval": "100s",
    "spark.executor.memory": "4g",
    "spark.executor.cores": "2",
    "spark.driver.memory": "4g",
    # Connector jar path is relative to the working directory.
    "spark.jars": "neo4j-connector-apache-spark_2.12-5.3.2_for_spark_3.jar",
    "neo4j.authentication.basic.username": db_username,
    "neo4j.authentication.basic.password": db_password,
    "neo4j.database": db_name,
}

session_builder = SparkSession.builder
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark_obj = session_builder.getOrCreate()

## Step_3: Loading Graph from Neo4j  
This function queries nodes and edges from Neo4j using Spark, converts them to Pandas DataFrames, and constructs a NetworkX graph object with detailed node attributes and edges.  


In [3]:
def load_graph_from_neo4j(_spark_obj):
    """Load every Paper node and CITES relationship from Neo4j into NetworkX.

    Parameters
    ----------
    _spark_obj : SparkSession
        Session whose reader is used with the ``org.neo4j.spark.DataSource``
        format; connection settings are expected to be configured on it.

    Returns
    -------
    tuple
        ``(nodes, edges, graph_obj)``:

        * ``nodes`` - Pandas DataFrame with columns ``paper_id``, ``venue``,
          ``labels`` and ``authors``.
        * ``edges`` - Pandas DataFrame with columns ``citing`` and ``cited``.
        * ``graph_obj`` - ``nx.Graph`` keyed by ``paper_id`` with node
          attributes ``Venue``, ``Labels`` and ``Authors``.

    Notes
    -----
    ``graph_obj`` is an *undirected* ``nx.Graph``: citation direction is not
    kept and reciprocal or duplicate citations collapse into a single edge, so
    ``number_of_edges()`` can be smaller than ``len(edges)``.
    """

    def _read_query(cypher):
        # Spark DataFrame backed by a Cypher query through the Neo4j connector.
        return (
            _spark_obj.read
            .format("org.neo4j.spark.DataSource")
            .option("query", cypher)
            .load()
        )

    nodes_df = _read_query(
        "MATCH (p:Paper) RETURN p.paper_id as paper_id, p.venue as venue, p.labels as labels, p.authors as authors"
    )
    edges_df = _read_query(
        "MATCH (p1:Paper)-[:CITES]->(p2:Paper) RETURN p1.paper_id as citing, p2.paper_id as cited"
    )

    # Materialise both result sets on the driver.
    nodes = nodes_df.toPandas()
    edges = edges_df.toPandas()

    graph_obj = nx.Graph()

    # Iterate the columns directly and insert in bulk: DataFrame.iterrows()
    # builds a Series per row, which is needlessly slow at this size.
    node_records = zip(nodes['paper_id'], nodes['venue'], nodes['labels'], nodes['authors'])
    graph_obj.add_nodes_from(
        (paper_id, {"Venue": venue, "Labels": labels, "Authors": authors})
        for paper_id, venue, labels, authors in tqdm(
            node_records, total=nodes.shape[0], desc="Loading nodes"
        )
    )

    graph_obj.add_edges_from(
        tqdm(
            zip(edges['citing'], edges['cited']),
            total=edges.shape[0],
            desc="Loading edges",
        )
    )

    print("Graph loaded with", graph_obj.number_of_nodes(), "nodes and", graph_obj.number_of_edges(),"edges.")

    return nodes, edges, graph_obj

In [4]:
# Load the graph once: `nodes` and `edges` are the Pandas DataFrames and `G`
# is the NetworkX graph returned by load_graph_from_neo4j.
nodes, edges, G = load_graph_from_neo4j(spark_obj)

Loading nodes: 100%|██████████| 564340/564340 [00:22<00:00, 24830.16it/s]
Loading edges: 100%|██████████| 957727/957727 [00:35<00:00, 26639.26it/s]


Graph loaded with 564340 nodes and 955748 edges.


## Step_4: Similarity Ranking using SimRank  
This function computes similarity rankings for a query node using SimRank with different importance factors (`c`) on a subgraph extracted from Neo4j. It ranks the top-k similar nodes and prints the results.  


In [5]:
def similarity_ranking(g_obj, _query_node, top_k, _spark_obj, _edges,
                       importance_factors=(0.7, 0.8, 0.9)):
    """Rank the papers most similar to a query paper using SimRank.

    The papers reachable from the query paper through 1 to 3 ``CITES``
    relationships (in either direction) are fetched from Neo4j, a directed
    citation subgraph is built from them, and SimRank is run from the query
    paper once per importance factor. The top ``top_k`` results for each
    factor are printed.

    Parameters
    ----------
    g_obj : object
        Unused; kept so existing callers do not break.
    _query_node : str
        ``paper_id`` of the query paper. It is compared against ``paper_id``
        as a string in the Cypher query, so it is converted with ``str()``.
    top_k : int
        Number of highest-scoring nodes to print per importance factor.
    _spark_obj : SparkSession
        Session used to read from Neo4j via ``org.neo4j.spark.DataSource``.
    _edges : pandas.DataFrame
        All citation edges, with columns ``citing`` and ``cited``.
    importance_factors : iterable of float, optional
        SimRank importance factors ``c`` to evaluate.

    Returns
    -------
    dict
        Maps each importance factor to the ``{node: similarity}`` dict
        returned by ``nx.simrank_similarity`` for the query paper.

    Raises
    ------
    ValueError
        If the Neo4j query returns no papers for ``_query_node``.
    """
    print("Query node:", _query_node)
    query_id = str(_query_node)

    # Escape backslashes and single quotes so the id cannot terminate the
    # Cypher string literal it is interpolated into.
    escaped_id = query_id.replace("\\", "\\\\").replace("'", "\\'")
    cypher_query = f"""
    MATCH (n)-[:CITES*1..3]-(m)
    WHERE n.paper_id = '{escaped_id}'
    RETURN DISTINCT m.paper_id as paper_id, m.authors as authors, m.labels as labels, m.venue as venue
    """
    subgraph_df = (
        _spark_obj.read
        .format("org.neo4j.spark.DataSource")
        .option("query", cypher_query)
        .load()
        .toPandas()
    )

    if subgraph_df.empty:
        raise ValueError(
            f"No papers found within 3 CITES hops of paper_id {query_id!r}; "
            "check that the id exists and has citations."
        )

    # The Cypher result lists the neighbours `m`; the query paper itself is
    # only returned when a cycle leads back to it. Include it explicitly so
    # its own citation edges are kept and SimRank can use it as the source.
    valid_ids = set(subgraph_df['paper_id'])
    valid_ids.add(query_id)
    filtered_edges = _edges[
        _edges['citing'].isin(valid_ids) & _edges['cited'].isin(valid_ids)
    ]

    subgraph_obj = nx.DiGraph()
    subgraph_obj.add_nodes_from(
        (paper_id, {"Authors": authors, "Labels": labels, "Venue": venue})
        for paper_id, authors, labels, venue in zip(
            subgraph_df['paper_id'],
            subgraph_df['authors'],
            subgraph_df['labels'],
            subgraph_df['venue'],
        )
    )
    if query_id not in subgraph_obj:
        # Added after the neighbours so existing node order is unchanged.
        subgraph_obj.add_node(query_id)
    subgraph_obj.add_edges_from(zip(filtered_edges['citing'], filtered_edges['cited']))

    # Use the function argument as the source, not a notebook-level global.
    sim_matrix = {
        factor: nx.simrank_similarity(
            subgraph_obj,
            source=query_id,
            importance_factor=factor,
            tolerance=0.000005,
        )
        for factor in importance_factors
    }

    for factor, similarities in sim_matrix.items():
        ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)[:top_k]
        print(f"Top {top_k} nodes for c={factor}:")
        for node, similarity in ranked:
            print(f"Node: {node}, Similarity: {similarity}")
        print()

    return sim_matrix


## SimRank for query_1 : 2982615777 for c = 0.7,0.8,0.9


In [8]:
# Read the query paper id and strip stray whitespace (e.g. from a pasted id),
# which would otherwise make the exact-match lookup on paper_id fail.
# NOTE(review): interactive input makes this cell non-reproducible under
# "Run All"; consider moving the id and k into a config cell.
query_node = input("Enter Query node: ").strip()
query = str(query_node)  # input() already returns str; kept in case a later cell reads `query`
k = int(input("Enter Top k values: "))
similarity_matrix = similarity_ranking(
    G,
    query_node,
    top_k=k,
    _spark_obj=spark_obj,
    _edges=edges,
)

Query node: 2982615777
Top 10 nodes for c=0.7:
Node: 2982615777, Similarity: 1.0
Node: 2604445413, Similarity: 0.0
Node: 1487773734, Similarity: 0.0
Node: 2573184146, Similarity: 0.0
Node: 2129988636, Similarity: 0.0
Node: 2410108711, Similarity: 0.0
Node: 1172733195, Similarity: 0.0
Node: 162515996, Similarity: 0.0
Node: 2605269223, Similarity: 0.0
Node: 1999984505, Similarity: 0.0

Top 10 nodes for c=0.8:
Node: 2982615777, Similarity: 1.0
Node: 2604445413, Similarity: 0.0
Node: 1487773734, Similarity: 0.0
Node: 2573184146, Similarity: 0.0
Node: 2129988636, Similarity: 0.0
Node: 2410108711, Similarity: 0.0
Node: 1172733195, Similarity: 0.0
Node: 162515996, Similarity: 0.0
Node: 2605269223, Similarity: 0.0
Node: 1999984505, Similarity: 0.0

Top 10 nodes for c=0.9:
Node: 2982615777, Similarity: 1.0
Node: 2604445413, Similarity: 0.0
Node: 1487773734, Similarity: 0.0
Node: 2573184146, Similarity: 0.0
Node: 2129988636, Similarity: 0.0
Node: 2410108711, Similarity: 0.0
Node: 1172733195, Sim

## SimRank for query_2 : 1556418098 for c = 0.7,0.8,0.9

In [7]:
# Read the query paper id and strip stray whitespace (e.g. from a pasted id),
# which would otherwise make the exact-match lookup on paper_id fail.
# NOTE(review): interactive input makes this cell non-reproducible under
# "Run All"; consider moving the id and k into a config cell.
query_node = input("Enter Query node: ").strip()
query = str(query_node)  # input() already returns str; kept in case a later cell reads `query`
k = int(input("Enter Top k values: "))
similarity_matrix = similarity_ranking(
    G,
    query_node,
    top_k=k,
    _spark_obj=spark_obj,
    _edges=edges,
)

Query node: 1556418098
Top 10 nodes for c=0.7:
Node: 1556418098, Similarity: 1.0
Node: 2158129819, Similarity: 0.06364215473841438
Node: 2133689609, Similarity: 0.06364215473841438
Node: 1831222313, Similarity: 0.06363636363636363
Node: 2197996701, Similarity: 0.04454545454545454
Node: 2031218379, Similarity: 0.03182366214524881
Node: 1994772839, Similarity: 0.0318218388447181
Node: 2139975922, Similarity: 0.031818264886868466
Node: 2154083146, Similarity: 0.02969696969696969
Node: 1930723014, Similarity: 0.027840909090909083

Top 10 nodes for c=0.8:
Node: 1556418098, Similarity: 1.0
Node: 2158129819, Similarity: 0.07273647419987554
Node: 2133689609, Similarity: 0.07273647419987554
Node: 1831222313, Similarity: 0.07272727272727274
Node: 2197996701, Similarity: 0.058181818181818196
Node: 2154083146, Similarity: 0.038787878787878795
Node: 2031218379, Similarity: 0.03637234578764584
Node: 1994772839, Similarity: 0.036369562453467286
Node: 2139975922, Similarity: 0.036363798426147634
Node: