In [1]:
import sys
import os
import time
import psutil
import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Database connection setup
uri = "bolt://localhost:7687"
user = "neo4j"
password = "eigen1234"
database_name="d2.madelon"
driver = GraphDatabase.driver(uri, auth=(user, password))

In [3]:
def check_connection():
    try:
        # Establish a session with the specified database
        # with driver.session(database=database_name) as session:
        with driver.session() as session:
            # Run a simple query to check the connection
            result = session.run("RETURN 'Connection to database successful' AS message")
            for record in result:
                print(record["message"])
    except Exception as e:
        print("Error connecting to the database:", e)

# Call the check_connection function
check_connection()

Connection to database successful


In [4]:
# Function to run a query and measure performance metrics
def run_query(driver, query, parameters):
    # Start time and resources
    process = psutil.Process(os.getpid())
    start_time = time.time()
    start_cpu_times = process.cpu_times()
    start_mem = process.memory_info().rss  # Resident Set Size

    # with driver.session(database=database_name) as session:
    with driver.session() as session:
        result = session.run(query, parameters)
        record = result.single()
        data = record.data() if record else None

    # End time and resources
    end_time = time.time()
    end_cpu_times = process.cpu_times()
    end_mem = process.memory_info().rss

    # Calculations
    duration = end_time - start_time
    cpu_used = (end_cpu_times.user + end_cpu_times.system) - (start_cpu_times.user + start_cpu_times.system)
    memory_used = (end_mem - start_mem) / (1024 ** 2)  # Convert to MB

    return data, duration, memory_used, cpu_used

In [5]:
# Predefined experiments configurations with all combinations for each sub-dataset
experiments = []

# Define graph types and Laplacian methods
graph_types = ["full", "eps", "knn", "mknn"]
laplacian_types = ["sym", "rw", "ad"]

# Define parameters for each graph type per sub-dataset
parameters = {
    "madelon_33": {"full": "26", "eps": "4.667", "knn": "138", "mknn": "55"}
}

# Define number of eigenvectors and silhouette usage (common for all experiments)
number_of_eigenvectors = 2
use_kmean_for_silhouette = False

# Generate experiment configurations for each sub-dataset
sub_datasets = ["madelon_33"]
for sub_dataset in sub_datasets:
    for graph_type in graph_types:
        for laplacian_type in laplacian_types:
            experiments.append({
                "node_label": sub_dataset,
                "is_feature_based": True,
                "graph_type": graph_type,
                "parameter": parameters[sub_dataset][graph_type],
                "laplacian_type": laplacian_type,
                "number_of_eigenvectors": number_of_eigenvectors,
                "use_kmean_for_silhouette": use_kmean_for_silhouette
            })

# Print or analyze the configurations to ensure correctness
# for experiment in experiments:
#     print(experiment)

In [6]:
def run_experiments(driver):
    results = []
    total_experiments = len(experiments)

    for idx, config in enumerate(experiments, 1):
        query = """
        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: "Index,target",
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: "100",
            distance_measure_kmean: "euclidean",
            target_column: "target",
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index,
               result.total_time AS total_time,
               result.affinity_time AS affinity_time,
               result.laplacian_time AS laplacian_time,
               result.clustering_time AS clustering_time,
               result.adjusted_rand_index_time AS adjusted_rand_index_time
        """

        # Measure Memory and CPU Usage Before Query Execution
        memory_before = psutil.virtual_memory().used
        cpu_before = psutil.cpu_percent(interval=None)

        # Execute the Query and Measure Time
        start_time = time.perf_counter()
        data, _, _, _ = run_query(driver, query, config)
        elapsed_time = time.perf_counter() - start_time

        # Measure Memory and CPU Usage After Query Execution
        memory_after = psutil.virtual_memory().used
        cpu_after = psutil.cpu_percent(interval=None)

        # Calculate Metrics
        memory_used = (memory_after - memory_before) / 1e6  # Memory in MB
        cpu_used = (cpu_before + cpu_after) / 2  # Average CPU usage

        # Extract Results from Java
        silhouette_score = data['silhouette_score'] if data else None
        rand_index = data['rand_index'] if data else None
        total_time = data['total_time'] if data else None
        affinity_time = data['affinity_time'] if data else None
        laplacian_time = data['laplacian_time'] if data else None
        clustering_time = data['clustering_time'] if data else None
        adjusted_rand_index_time = data['adjusted_rand_index_time'] if data else None

        # Save Results
        results.append({
            **config,
            "silhouette_score": silhouette_score,
            "rand_index": rand_index,
            "total_time": total_time or elapsed_time,
            "affinity_time": affinity_time,
            "laplacian_time": laplacian_time,
            "clustering_time": clustering_time,
            "adjusted_rand_index_time": adjusted_rand_index_time,
            "memory_used": memory_used,
            "cpu_used": cpu_used
        })

        print(f"Completed experiment {idx}/{total_experiments} with config: {config}")

    driver.close()
    return results

In [7]:
# # Example scheduling and execution
# def job():
#     print("Running experiments...")
#     result_data = run_experiments(driver)
#     # Save to DataFrame and then to CSV
#     df = pd.DataFrame(result_data)
#     df.to_csv("iris_results.csv", index=False)
#     print("Experiments completed and saved.")

# schedule.every().day.at("01:00").do(job)

# # Loop to keep the scheduler running
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

print("Running experiments...")
result_data1 = run_experiments(driver)
# Save to DataFrame and then to CSV
df1 = pd.DataFrame(result_data1)
df1.to_csv("madelon_results1.csv", index=False)
print("Experiments completed and saved.")

Running experiments...
Completed experiment 1/12 with config: {'node_label': 'madelon_33', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '26', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 2/12 with config: {'node_label': 'madelon_33', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '26', 'laplacian_type': 'rw', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 3/12 with config: {'node_label': 'madelon_33', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '26', 'laplacian_type': 'ad', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 4/12 with config: {'node_label': 'madelon_33', 'is_feature_based': True, 'graph_type': 'eps', 'parameter': '4.667', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 5/12 with config: {'node_label': 'madelon_33', 'is_feature_b

In [8]:
result_data1

[{'node_label': 'madelon_33',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '26',
  'laplacian_type': 'sym',
  'number_of_eigenvectors': 2,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.010866799180726533,
  'rand_index': 0.7884990231139244,
  'total_time': 1401431.415459,
  'affinity_time': 25758.569375,
  'laplacian_time': 1142844.359709,
  'clustering_time': 232472.184042,
  'adjusted_rand_index_time': 354.548459,
  'memory_used': -600.80128,
  'cpu_used': 33.0},
 {'node_label': 'madelon_33',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '26',
  'laplacian_type': 'rw',
  'number_of_eigenvectors': 2,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.011074421861220774,
  'rand_index': 0.7435624212240732,
  'total_time': 1481497.774,
  'affinity_time': 22882.101709,
  'laplacian_time': 1225331.831209,
  'clustering_time': 232948.432083,
  'adjusted_rand_index_time': 334.941958,
  'memory_used': -274.644992,
  'cpu_use

In [9]:
# Predefined experiments configurations with all combinations for each sub-dataset
experiments = []

# Define graph types and Laplacian methods
graph_types = ["full", "eps", "knn", "mknn"]
laplacian_types = ["sym", "rw", "ad"]

# Define parameters for each graph type per sub-dataset
parameters = {
    "madelon_66": {"full": "33", "eps": "4.669", "knn": "271", "mknn": "77"}
}

# Define number of eigenvectors and silhouette usage (common for all experiments)
number_of_eigenvectors = 2
use_kmean_for_silhouette = False

# Generate experiment configurations for each sub-dataset
sub_datasets = ["madelon_66"]
for sub_dataset in sub_datasets:
    for graph_type in graph_types:
        for laplacian_type in laplacian_types:
            experiments.append({
                "node_label": sub_dataset,
                "is_feature_based": True,
                "graph_type": graph_type,
                "parameter": parameters[sub_dataset][graph_type],
                "laplacian_type": laplacian_type,
                "number_of_eigenvectors": number_of_eigenvectors,
                "use_kmean_for_silhouette": use_kmean_for_silhouette
            })

# Print or analyze the configurations to ensure correctness
# for experiment in experiments:
#     print(experiment)

In [10]:
def run_experiments(driver):
    results = []
    total_experiments = len(experiments)

    for idx, config in enumerate(experiments, 1):
        query = """
        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: "Index,target",
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: "100",
            distance_measure_kmean: "euclidean",
            target_column: "target",
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index,
               result.total_time AS total_time,
               result.affinity_time AS affinity_time,
               result.laplacian_time AS laplacian_time,
               result.clustering_time AS clustering_time,
               result.adjusted_rand_index_time AS adjusted_rand_index_time
        """

        # Measure Memory and CPU Usage Before Query Execution
        memory_before = psutil.virtual_memory().used
        cpu_before = psutil.cpu_percent(interval=None)

        # Execute the Query and Measure Time
        start_time = time.perf_counter()
        data, _, _, _ = run_query(driver, query, config)
        elapsed_time = time.perf_counter() - start_time

        # Measure Memory and CPU Usage After Query Execution
        memory_after = psutil.virtual_memory().used
        cpu_after = psutil.cpu_percent(interval=None)

        # Calculate Metrics
        memory_used = (memory_after - memory_before) / 1e6  # Memory in MB
        cpu_used = (cpu_before + cpu_after) / 2  # Average CPU usage

        # Extract Results from Java
        silhouette_score = data['silhouette_score'] if data else None
        rand_index = data['rand_index'] if data else None
        total_time = data['total_time'] if data else None
        affinity_time = data['affinity_time'] if data else None
        laplacian_time = data['laplacian_time'] if data else None
        clustering_time = data['clustering_time'] if data else None
        adjusted_rand_index_time = data['adjusted_rand_index_time'] if data else None

        # Save Results
        results.append({
            **config,
            "silhouette_score": silhouette_score,
            "rand_index": rand_index,
            "total_time": total_time or elapsed_time,
            "affinity_time": affinity_time,
            "laplacian_time": laplacian_time,
            "clustering_time": clustering_time,
            "adjusted_rand_index_time": adjusted_rand_index_time,
            "memory_used": memory_used,
            "cpu_used": cpu_used
        })

        print(f"Completed experiment {idx}/{total_experiments} with config: {config}")

    driver.close()
    return results

In [11]:
# # Example scheduling and execution
# def job():
#     print("Running experiments...")
#     result_data = run_experiments(driver)
#     # Save to DataFrame and then to CSV
#     df = pd.DataFrame(result_data)
#     df.to_csv("iris_results.csv", index=False)
#     print("Experiments completed and saved.")

# schedule.every().day.at("01:00").do(job)

# # Loop to keep the scheduler running
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

print("Running experiments...")
result_data2 = run_experiments(driver)
# Save to DataFrame and then to CSV
df2 = pd.DataFrame(result_data2)
df2.to_csv("madelon_results2.csv", index=False)
print("Experiments completed and saved.")

Running experiments...


  with driver.session() as session:


Completed experiment 1/12 with config: {'node_label': 'madelon_66', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '33', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 2/12 with config: {'node_label': 'madelon_66', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '33', 'laplacian_type': 'rw', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 3/12 with config: {'node_label': 'madelon_66', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '33', 'laplacian_type': 'ad', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 4/12 with config: {'node_label': 'madelon_66', 'is_feature_based': True, 'graph_type': 'eps', 'parameter': '4.669', 'laplacian_type': 'sym', 'number_of_eigenvectors': 2, 'use_kmean_for_silhouette': False}
Completed experiment 5/12 with config: {'node_label': 'madelon_66', 'is_feature_based': True, 'graph_typ

In [12]:
result_data2

[{'node_label': 'madelon_66',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '33',
  'laplacian_type': 'sym',
  'number_of_eigenvectors': 2,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.01097172906627428,
  'rand_index': 0.9200323990515031,
  'total_time': 11236400.154916,
  'affinity_time': 48319.491958,
  'laplacian_time': 10659360.593792,
  'clustering_time': 528278.89975,
  'adjusted_rand_index_time': 438.781875,
  'memory_used': 165.888,
  'cpu_used': 12.899999999999999},
 {'node_label': 'madelon_66',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '33',
  'laplacian_type': 'rw',
  'number_of_eigenvectors': 2,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.01103436279420762,
  'rand_index': 0.8714716730086772,
  'total_time': 13221426.579791,
  'affinity_time': 56689.322958,
  'laplacian_time': 12604134.02725,
  'clustering_time': 560139.751417,
  'adjusted_rand_index_time': 462.799042,
  'memory_used': -247.1362

In [5]:
# Predefined experiments configurations with all combinations for each sub-dataset
experiments = []

# Define graph types and Laplacian methods
graph_types = ["full", "eps", "knn", "mknn"]
laplacian_types = ["sym", "rw", "ad"]

# Define parameters for each graph type per sub-dataset
parameters = {
    "madelon_full": {"full": "45", "eps": "4.669", "knn": "419", "mknn": "117"}
}

# Define number of eigenvectors and silhouette usage (common for all experiments)
number_of_eigenvectors = 3
use_kmean_for_silhouette = False

# Generate experiment configurations for each sub-dataset
sub_datasets = ["madelon_full"]
for sub_dataset in sub_datasets:
    for graph_type in graph_types:
        for laplacian_type in laplacian_types:
            experiments.append({
                "node_label": sub_dataset,
                "is_feature_based": True,
                "graph_type": graph_type,
                "parameter": parameters[sub_dataset][graph_type],
                "laplacian_type": laplacian_type,
                "number_of_eigenvectors": number_of_eigenvectors,
                "use_kmean_for_silhouette": use_kmean_for_silhouette
            })

# Print or analyze the configurations to ensure correctness
# for experiment in experiments:
#     print(experiment)

In [6]:
def run_experiments(driver):
    results = []
    total_experiments = len(experiments)

    for idx, config in enumerate(experiments, 1):
        query = """
        WITH simkit.experimental_spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: "Index,target",
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: "100",
            distance_measure_kmean: "euclidean",
            target_column: "target",
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42
        }) AS result
        RETURN result.silhouette_score AS silhouette_score, 
               result.rand_index AS rand_index,
               result.total_time AS total_time,
               result.affinity_time AS affinity_time,
               result.laplacian_time AS laplacian_time,
               result.clustering_time AS clustering_time,
               result.adjusted_rand_index_time AS adjusted_rand_index_time
        """

        # Measure Memory and CPU Usage Before Query Execution
        memory_before = psutil.virtual_memory().used
        cpu_before = psutil.cpu_percent(interval=None)

        # Execute the Query and Measure Time
        start_time = time.perf_counter()
        data, _, _, _ = run_query(driver, query, config)
        elapsed_time = time.perf_counter() - start_time

        # Measure Memory and CPU Usage After Query Execution
        memory_after = psutil.virtual_memory().used
        cpu_after = psutil.cpu_percent(interval=None)

        # Calculate Metrics
        memory_used = (memory_after - memory_before) / 1e6  # Memory in MB
        cpu_used = (cpu_before + cpu_after) / 2  # Average CPU usage

        # Extract Results from Java
        silhouette_score = data['silhouette_score'] if data else None
        rand_index = data['rand_index'] if data else None
        total_time = data['total_time'] if data else None
        affinity_time = data['affinity_time'] if data else None
        laplacian_time = data['laplacian_time'] if data else None
        clustering_time = data['clustering_time'] if data else None
        adjusted_rand_index_time = data['adjusted_rand_index_time'] if data else None

        # Save Results
        results.append({
            **config,
            "silhouette_score": silhouette_score,
            "rand_index": rand_index,
            "total_time": total_time or elapsed_time,
            "affinity_time": affinity_time,
            "laplacian_time": laplacian_time,
            "clustering_time": clustering_time,
            "adjusted_rand_index_time": adjusted_rand_index_time,
            "memory_used": memory_used,
            "cpu_used": cpu_used
        })

        print(f"Completed experiment {idx}/{total_experiments} with config: {config}")

    driver.close()
    return results

In [7]:
# # Example scheduling and execution
# def job():
#     print("Running experiments...")
#     result_data = run_experiments(driver)
#     # Save to DataFrame and then to CSV
#     df = pd.DataFrame(result_data)
#     df.to_csv("iris_results.csv", index=False)
#     print("Experiments completed and saved.")

# schedule.every().day.at("01:00").do(job)

# # Loop to keep the scheduler running
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

print("Running experiments...")
result_data3 = run_experiments(driver)
# Save to DataFrame and then to CSV
df3 = pd.DataFrame(result_data3)
df3.to_csv("madelon_results3.csv", index=False)
print("Experiments completed and saved.")

Running experiments...


Failed to read from defunct connection IPv4Address(('localhost', 7687)) (ResolvedIPv4Address(('127.0.0.1', 7687)))


KeyboardInterrupt: 

In [None]:
result_data1