In [1]:
import sys
import os
import time
import psutil
import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Database connection setup
uri = "bolt://localhost:7687"
username = "neo4j" # username for Neo4j, most probably is Neo4j, if you did not change while installing Neo4j Desktop
password = "eigen1234" # Password for database.
database_name = "d4.citeseer"  # Database name

driver = GraphDatabase.driver(uri, auth=(username, password))

In [3]:
def check_connection():
    try:
        # Establish a session with the specified database
        # with driver.session(database=database_name) as session:
        with driver.session() as session:
            # Run a simple query to check the connection
            result = session.run("RETURN 'Connection to database successful' AS message")
            for record in result:
                print(record["message"])
    except Exception as e:
        print("Error connecting to the database:", e)

# Call the check_connection function
check_connection()

Connection to database successful


In [4]:
# Function to run a query and measure performance metrics
def run_query(driver, query, parameters):
    # Start time and resources
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu_times = process.cpu_times()
    start_mem = process.memory_info().rss  # Resident Set Size

    # with driver.session(database=database_name) as session:
    with driver.session() as session:
        result = session.run(query, parameters)
        record = result.single()
        data = record.data() if record else None

    # End time and resources
    end_time = time.time()
    end_cpu_times = process.cpu_times()
    end_mem = process.memory_info().rss

    # Calculations
    duration = end_time - start_time
    cpu_used = (end_cpu_times.user + end_cpu_times.system) - (start_cpu_times.user + start_cpu_times.system)
    memory_used = (end_mem - start_mem) / (1024 ** 2)  # Convert to MB

    return data, duration, memory_used, cpu_used

In [5]:
# Predefined experiments configurations with all combinations for each sub-dataset
experiments = []

# Define graph types and Laplacian methods
laplacian_types = ["sym", "rw", "ad"]

# Define number of eigenvectors and silhouette usage (common for all experiments)
number_of_eigenvectors = 6

for laplacian_type in laplacian_types:
    experiments.append({
        "node_label": "citeseer",
        "is_feature_based": False,
        "laplacian_type": laplacian_type,
        "number_of_eigenvectors": number_of_eigenvectors,
        "use_kmean_for_silhouette": True
    })

# Print or analyze the configurations to ensure correctness
# for experiment in experiments:
#     print(experiment)

In [6]:
# Main function to run experiments
def run_experiments(driver):
    results = []
    total_experiments = len(experiments)
    for idx, config in enumerate(experiments, 1):
        query = """
        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: "full",
            parameter: "3",
            remove_columns: "id,label",
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: "100",
            distance_measure_kmean: "euclidean",
            target_column: "label",
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index,
               result.total_time AS total_time,
               result.affinity_time AS affinity_time,
               result.laplacian_time AS laplacian_time,
               result.clustering_time AS clustering_time,
               result.adjusted_rand_index_time AS adjusted_rand_index_time
        """
        
        # Measure Memory and CPU Usage Before Query Execution
        memory_before = psutil.virtual_memory().used
        cpu_before = psutil.cpu_percent(interval=None)

        # Execute the Query and Measure Time
        start_time = time.perf_counter()
        data, _, _, _ = run_query(driver, query, config)
        elapsed_time = time.perf_counter() - start_time

        # Measure Memory and CPU Usage After Query Execution
        memory_after = psutil.virtual_memory().used
        cpu_after = psutil.cpu_percent(interval=None)

        # Calculate Metrics
        memory_used = (memory_after - memory_before) / 1e6  # Memory in MB
        cpu_used = (cpu_before + cpu_after) / 2  # Average CPU usage

        # Extract Results from Java
        silhouette_score = data['silhouette_score'] if data else None
        rand_index = data['rand_index'] if data else None
        total_time = data['total_time'] if data else None
        affinity_time = data['affinity_time'] if data else None
        laplacian_time = data['laplacian_time'] if data else None
        clustering_time = data['clustering_time'] if data else None
        adjusted_rand_index_time = data['adjusted_rand_index_time'] if data else None

        # Save Results
        results.append({
            **config,
            "silhouette_score": silhouette_score,
            "rand_index": rand_index,
            "total_time": total_time or elapsed_time,
            "affinity_time": affinity_time,
            "laplacian_time": laplacian_time,
            "clustering_time": clustering_time,
            "adjusted_rand_index_time": adjusted_rand_index_time,
            "memory_used": memory_used,
            "cpu_used": cpu_used
        })

        print(f"Completed experiment {idx}/{total_experiments} with config: {config}")

    driver.close()
    return results

In [7]:
# # Example scheduling and execution
# def job():
#     print("Running experiments...")
#     result_data = run_experiments(driver)
#     # Save to DataFrame and then to CSV
#     df = pd.DataFrame(result_data)
#     df.to_csv("iris_results.csv", index=False)
#     print("Experiments completed and saved.")

# schedule.every().day.at("01:00").do(job)

# # Loop to keep the scheduler running
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

print("Running experiments...")
result_data = run_experiments(driver)
# Save to DataFrame and then to CSV
df = pd.DataFrame(result_data)
df.to_csv("citeseer_results.csv", index=False)
print("Experiments completed and saved.")

Running experiments...
Completed experiment 1/3 with config: {'node_label': 'citeseer', 'is_feature_based': False, 'laplacian_type': 'sym', 'number_of_eigenvectors': 6, 'use_kmean_for_silhouette': True}
Completed experiment 2/3 with config: {'node_label': 'citeseer', 'is_feature_based': False, 'laplacian_type': 'rw', 'number_of_eigenvectors': 6, 'use_kmean_for_silhouette': True}
Completed experiment 3/3 with config: {'node_label': 'citeseer', 'is_feature_based': False, 'laplacian_type': 'ad', 'number_of_eigenvectors': 6, 'use_kmean_for_silhouette': True}
Experiments completed and saved.


In [8]:
result_data

[{'node_label': 'citeseer',
  'is_feature_based': False,
  'laplacian_type': 'sym',
  'number_of_eigenvectors': 6,
  'use_kmean_for_silhouette': True,
  'silhouette_score': 0.6213422957524163,
  'rand_index': 0.015934734913958054,
  'total_time': 700419.770208,
  'affinity_time': 742.46375,
  'laplacian_time': 536236.839416,
  'clustering_time': 162556.077625,
  'adjusted_rand_index_time': 882.6305,
  'memory_used': 409.141248,
  'cpu_used': 27.549999999999997},
 {'node_label': 'citeseer',
  'is_feature_based': False,
  'laplacian_type': 'rw',
  'number_of_eigenvectors': 6,
  'use_kmean_for_silhouette': True,
  'silhouette_score': 0.9391537350018979,
  'rand_index': -0.0005830685107247707,
  'total_time': 657604.712541,
  'affinity_time': 826.094417,
  'laplacian_time': 517475.940084,
  'clustering_time': 138422.057916,
  'adjusted_rand_index_time': 879.988375,
  'memory_used': -312.5248,
  'cpu_used': 12.6},
 {'node_label': 'citeseer',
  'is_feature_based': False,
  'laplacian_type': 