# Feature: Compare Node Counts by Source Across All Labels

## Imports

In [None]:
import os
import logging
import configparser
import pandas as pd
from neo4j import GraphDatabase

## Get Current Path

In [None]:
# Setup
# The notebook name is only an anchor: a relative name is resolved against the
# working directory, and the folder part of that absolute path is kept.
# config.ini and the log file are looked up inside this folder.
notebook_file = os.path.realpath("nodes.ipynb")
current_path = os.path.dirname(notebook_file)

## Read Config File 

In [None]:
# Load the connection settings for both graphs and the report folder from
# config.ini inside current_path.
config = configparser.ConfigParser(inline_comment_prefixes=())
config_file = f"{current_path}/config.ini"

# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Without this check a missing config.ini would
# only surface below as a confusing KeyError on the first option lookup.
if not config.read(config_file):
    raise FileNotFoundError(f"Config file not found or unreadable: {config_file}")

neo4j_uri_old = config['DEFAULT']['Neo4j-Uri-old']
neo4j_uri_new = config['DEFAULT']['Neo4j-Uri-new']
username_new = config['DEFAULT']['Neo4j-Username-new']
username_old = config['DEFAULT']['Neo4j-Username-old']
# NOTE(review): only the old password has surrounding double quotes stripped;
# the new one is used verbatim. Presumably this mirrors how the two values are
# written in config.ini -- confirm before making them consistent.
password_old = config['DEFAULT']['Neo4j-Password-old'].strip('"')
password_new = config['DEFAULT']['Neo4j-Password-new']
output_folder = config['DEFAULT']['Output-Folder']

## Set Up Logging

In [None]:
# Route INFO-and-above records to a log file inside current_path, one line per
# record formatted as "timestamp | level | message".
log_file = f"{current_path}/node_source_diff_all.log"
log_format = '%(asctime)s | %(levelname)s | %(message)s'
logging.basicConfig(filename=log_file, format=log_format, level=logging.INFO)

# Announce the start of the run both in the log and in the cell output.
logging.info("Starting node source comparison for all labels")
print("Comparing node counts by source for all labels...")

## Neo4j Query Helpers

In [None]:
def get_graph_version(driver):
    """Return the version recorded in the graph, or "unknown" if unavailable.

    Looks up a single ``version`` node and reads its ``version`` property from
    the first row of the query result.

    Args:
        driver: Neo4j driver handed on to ``run_query``.

    Returns:
        The ``version`` value of the first returned row. Any failure while
        querying or reading that row (including an empty result) is logged as
        a warning and the string ``"unknown"`` is returned instead.
    """
    cypher = "MATCH (n:version) RETURN n.version AS version LIMIT 1"
    try:
        versions = run_query(driver, cypher)
        first_row = versions.iloc[0]
        return first_row['version']
    except Exception as err:
        # Best effort only: the version is used for labelling, so never abort.
        logging.warning(f"Couldn't fetch version info: {err}")
    return "unknown"

def run_query(driver, query):
    """Run a Cypher query and return all of its records as a DataFrame.

    Args:
        driver: Neo4j driver; only ``driver.session()`` is used.
        query: Cypher text passed unchanged to ``session.run``.

    Returns:
        pandas.DataFrame with one row per record and one column per key the
        query returns. A query that matches nothing yields a zero-row frame
        that still carries those columns.
    """
    with driver.session() as session:
        result = session.run(query)
        # Take the column names from the result itself. Building the frame
        # from the record dicts alone gives a column-less DataFrame when no
        # records come back, which breaks callers that merge on a column.
        columns = list(result.keys())
        return pd.DataFrame([dict(record) for record in result], columns=columns)

def get_source_counts(driver, label):
    """Count the nodes carrying ``label``, grouped by their ``source`` property.

    Args:
        driver: Neo4j driver handed on to ``run_query``.
        label: Node label; it is placed inside backticks in the query text.

    Returns:
        Whatever ``run_query`` returns for the query, whose result columns
        are ``source`` and ``count``.
    """
    # Assembled piecewise; the resulting text is the same multi-line query.
    cypher = (
        "\n"
        f"    MATCH (n:`{label}`)\n"
        "    RETURN n.source AS source, count(*) AS count\n"
        "    "
    )
    return run_query(driver, cypher)

## Compare All Labels 

In [None]:
# List of node labels to compare
labels = ["researcher", "publication", "dataset", "grant", "organisation"]
all_comparisons = []

# Connect to both graphs. Both names start as None so the finally clause can
# close whichever drivers were actually opened, even if a later step raises.
driver_old = None
driver_new = None
try:
    driver_old = GraphDatabase.driver(neo4j_uri_old, auth=(username_old, password_old))
    driver_new = GraphDatabase.driver(neo4j_uri_new, auth=(username_new, password_new))

    # Dynamically get version numbers
    version_old = get_graph_version(driver_old)
    version_new = get_graph_version(driver_new)

    # Name the two count columns after the graph versions. When both graphs
    # report the same version (for example both fall back to "unknown") the
    # names would collide, so add an explicit old/new marker in that case.
    if version_old == version_new:
        col_old = f"count_{version_old}_old"
        col_new = f"count_{version_new}_new"
    else:
        col_old = f"count_{version_old}"
        col_new = f"count_{version_new}"

    for label in labels:
        try:
            df_old = get_source_counts(driver_old, label)
            df_new = get_source_counts(driver_new, label)

            # A label with no nodes in one graph may come back without any
            # columns; give it the expected ones so the merge still works and
            # the label shows up in the report with zero counts on that side.
            if df_old.empty:
                df_old = pd.DataFrame(columns=["source", "count"])
            if df_new.empty:
                df_new = pd.DataFrame(columns=["source", "count"])

            merged_df = pd.merge(
                df_old.rename(columns={"count": col_old}),
                df_new.rename(columns={"count": col_new}),
                on="source",
                how="outer",
            )

            if merged_df.empty:
                logging.warning(f"No {label} nodes found in either graph")
                continue

            # Fill only the count columns: a blanket fillna(0) would also turn
            # a missing `source` into 0. The outer merge makes counts float,
            # so cast them back to integers after filling.
            for col in (col_old, col_new):
                merged_df[col] = pd.to_numeric(merged_df[col]).fillna(0).astype("int64")

            merged_df["diff"] = merged_df[col_new] - merged_df[col_old]
            merged_df["label"] = label

            merged_df = merged_df[["label", "source", col_old, col_new, "diff"]]
            all_comparisons.append(merged_df)

            logging.info(f"Compared: {label}")
        except Exception as e:
            # Best effort per label: record the failure and keep going.
            logging.error(f"Failed to compare {label}: {e}")
finally:
    if driver_old is not None:
        driver_old.close()
    if driver_new is not None:
        driver_new.close()

# Combine all results
if not all_comparisons:
    # pd.concat would raise the same exception type with a less helpful message.
    raise ValueError("No label could be compared; see the log for details")

# Within each label, list the largest increases first.
final_df = (
    pd.concat(all_comparisons)
    .sort_values(by=["label", "diff"], ascending=[True, False])
    .reset_index(drop=True)
)

final_df

## Save CSV Report 

In [None]:
# Write the combined comparison to the output folder. Success and failure are
# each reported both in the log and in the cell output; a failed write does
# not raise.
try:
    report_path = f"{output_folder}/node_source_diff_all_labels.csv"
    final_df.to_csv(report_path, index=False)
    saved_message = f"Saved node source diff for all labels to {report_path}"
    logging.info(saved_message)
    print(saved_message)
except Exception as err:
    logging.error(f"Failed to write final CSV file: {err}")
    print(f"Error writing CSV file: {err}")
