Feature: Compare Node Counts by Source Across All Labels

Imports, Config, Logging Setup

In [22]:
import os
import logging
import configparser
import pandas as pd
from neo4j import GraphDatabase

# Setup
current_path = os.getcwd()

config = configparser.ConfigParser()
config.read(f"{current_path}/config.ini")

neo4j_uri_2023 = config['DEFAULT']['Neo4j-Uri-2023']
neo4j_uri_2024 = config['DEFAULT']['Neo4j-Uri-2024']
username = config['DEFAULT']['Neo4j-Username']
password_2023 = config['DEFAULT']['Neo4j-Password-2023']
password_2024 = config['DEFAULT']['Neo4j-Password-2024']
output_folder = config['DEFAULT']['Output-Folder']

logging.basicConfig(
    filename=f"{current_path}/node_source_diff_all.log",
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s | %(message)s'
)

logging.info("Starting node source comparison for all labels")
print("Comparing node counts by source for all labels...")

Comparing node counts by source for all labels...


 Neo4j Query Helpers

In [23]:
def run_query(driver, query):
    with driver.session() as session:
        result = session.run(query)
        return pd.DataFrame([dict(r) for r in result])

def get_source_counts(driver, label):
    query = f"""
    MATCH (n:`{label}`)
    RETURN n.source AS source, count(*) AS count
    """
    return run_query(driver, query)

Compare All Labels 

In [24]:
# List of node labels to compare
labels = ["researcher", "publication", "dataset", "grant", "organisation"]
all_comparisons = []

# Connect to both graphs
driver_2023 = GraphDatabase.driver(neo4j_uri_2023, auth=(username, password_2023))
driver_2024 = GraphDatabase.driver(neo4j_uri_2024, auth=(username, password_2024))

for label in labels:
    try:
        df_2023 = get_source_counts(driver_2023, label)
        df_2024 = get_source_counts(driver_2024, label)

        merged_df = pd.merge(
            df_2023,
            df_2024,
            on="source",
            how="outer",
            suffixes=("_2023", "_2024")
        )

        merged_df.fillna(0, inplace=True)
        merged_df["diff"] = merged_df["count_2024"] - merged_df["count_2023"]
        merged_df["label"] = label

        merged_df = merged_df[["label", "source", "count_2023", "count_2024", "diff"]]
        all_comparisons.append(merged_df)

        logging.info(f"Compared: {label}")
    except Exception as e:
        logging.error(f"Failed to compare {label}: {e}")

driver_2023.close()
driver_2024.close()

# Combine all results
final_df = pd.concat(all_comparisons)
final_df.sort_values(by=["label", "diff"], ascending=[True, False], inplace=True)
final_df.reset_index(drop=True, inplace=True)

final_df

Unnamed: 0,label,source,count_2023,count_2024,diff
0,dataset,datacite.org,1454,8521.0,7067.0
1,dataset,orcid.org,16061,17439.0,1378.0
2,dataset,crossref.org,2031,2275.0,244.0
3,grant,nhmrc.org,14790,28547.0,13757.0
4,grant,arc.gov.au,20352,32338.0,11986.0
5,grant,orcid.org,25700,29781.0,4081.0
6,grant,crossref.org,86,49.0,-37.0
7,organisation,orcid.org,173825,242839.0,69014.0
8,organisation,ror.org,105294,107824.0,2530.0
9,organisation,crossref.org,5509,7856.0,2347.0


Save CSV Report 

In [26]:
output_file = f"{output_folder}/node_source_diff_all_labels.csv"

try:
    final_df.to_csv(output_file, index=False)
    logging.info(f"Saved node source diff for all labels to {output_file}")
    print(f"Saved node source diff for all labels to {output_file}")
except Exception as e:
    logging.error(f"Failed to write final CSV: {e}")
    print(f"Error writing CSV: {e}")


Saved node source diff for all labels to /Users/sriharshithathoram/Desktop/neo4j-is/node_source_diff_all_labels.csv
