In [1]:
import sys
import os
import time
import psutil
import pandas as pd
from neo4j import GraphDatabase

In [2]:
# Database connection setup
# Each setting can be overridden with an environment variable so that the
# endpoint and credentials do not have to be edited into the notebook itself.
# The literal fallbacks reproduce the original local-development values.
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
user = os.environ.get("NEO4J_USER", "neo4j")
# FIXME(review): a password literal is still kept here as a fallback for
# backward compatibility; prefer setting NEO4J_PASSWORD and removing it.
password = os.environ.get("NEO4J_PASSWORD", "eigen1234")
# Name of the database that run_query opens its sessions against.
database_name = os.environ.get("NEO4J_DATABASE", "d2.madelon")
# Shared driver handed to run_experiments in the execution cell.
driver = GraphDatabase.driver(uri, auth=(user, password))

In [3]:
# Neo4j connection
def create_session(uri, user, password):
    """Build a Neo4j driver for ``uri`` authenticated as ``user``.

    Despite the name, the value returned is the driver object produced by
    ``GraphDatabase.driver`` (not a session).

    Args:
        uri: Connection URI passed straight to ``GraphDatabase.driver``.
        user: User name, first element of the ``auth`` tuple.
        password: Password, second element of the ``auth`` tuple.

    Returns:
        Whatever ``GraphDatabase.driver`` returns for these arguments.
    """
    credentials = (user, password)
    return GraphDatabase.driver(uri, auth=credentials)

# Function to run a query and measure performance metrics
def run_query(driver, query, parameters, database=None):
    """Run a query expected to yield a single record and measure the call.

    Args:
        driver: Object whose ``session(database=...)`` returns a context
            manager exposing ``run(query, parameters)``.
        query: Query text forwarded to ``session.run``.
        parameters: Query parameters forwarded to ``session.run``.
        database: Database to open the session against. When omitted, the
            module-level ``database_name`` is used.

    Returns:
        A tuple ``(data, duration, memory_used, cpu_used)`` where

        * ``data`` is ``record.data()`` for the record returned by
          ``result.single()``, or ``None`` when that record is falsy;
        * ``duration`` is the elapsed time in seconds;
        * ``memory_used`` is the change in resident set size, in MB;
        * ``cpu_used`` is the user + system CPU time consumed, in seconds.

    Note:
        The CPU and memory figures are read from ``psutil.Process(os.getpid())``,
        i.e. the Python process running this notebook. ``memory_used`` is a
        plain end-minus-start difference and can therefore be negative.
    """
    if database is None:
        database = database_name

    # Snapshot of the current Python process before the call.
    process = psutil.Process(os.getpid())
    start_cpu_times = process.cpu_times()
    start_mem = process.memory_info().rss  # Resident Set Size, in bytes
    # perf_counter is monotonic, so the duration cannot be distorted by
    # system clock adjustments the way a time.time() difference can.
    start_time = time.perf_counter()

    with driver.session(database=database) as session:
        result = session.run(query, parameters)
        record = result.single()
        # Read the record inside the session block, before it is closed.
        data = record.data() if record else None

    # Snapshot after the call.
    duration = time.perf_counter() - start_time
    end_cpu_times = process.cpu_times()
    end_mem = process.memory_info().rss

    cpu_used = (end_cpu_times.user + end_cpu_times.system) - (
        start_cpu_times.user + start_cpu_times.system
    )
    memory_used = (end_mem - start_mem) / (1024 ** 2)  # bytes -> MB

    return data, duration, memory_used, cpu_used

In [4]:
# Experiment grid: one configuration per (sub-dataset, graph type, Laplacian)
# combination.

# Graph construction strategies and Laplacian variants that get crossed.
graph_types = ["full", "eps", "knn", "mknn"]
laplacian_types = ["sym", "rw", "ad"]

# Parameter value (kept as a string) to use for each graph type, per sub-dataset.
parameters = {
    "madelon_33": dict(full="9", eps="1.141", knn="20", mknn="24"),
    "madelon_66": dict(full="5", eps="1.143", knn="14", mknn="16"),
    "madelon_full": dict(full="11", eps="1.161", knn="28", mknn="29"),
}

# Settings shared by every experiment.
number_of_eigenvectors = 3
use_kmean_for_silhouette = False

# Sub-datasets, in the order their experiments are generated.
sub_datasets = ["madelon_33", "madelon_66", "madelon_full"]

# The Laplacian type varies fastest, then the graph type, then the sub-dataset.
experiments = [
    {
        "node_label": sub_dataset,
        "is_feature_based": True,
        "graph_type": graph_type,
        "parameter": parameters[sub_dataset][graph_type],
        "laplacian_type": laplacian_type,
        "number_of_eigenvectors": number_of_eigenvectors,
        "use_kmean_for_silhouette": use_kmean_for_silhouette,
    }
    for sub_dataset in sub_datasets
    for graph_type in graph_types
    for laplacian_type in laplacian_types
]

# Print or analyze the configurations to ensure correctness
# for experiment in experiments:
#     print(experiment)

In [5]:
# Main function to run experiments
def run_experiments(driver, configs=None):
    """Run every experiment configuration and collect scores plus metrics.

    For each configuration a single query is sent through ``run_query``; the
    configuration dict itself is passed as the query parameters. The query
    calls ``simkit.spectralClustering`` (aliased ``silhouette_score``) and
    ``simkit.adjustedRandIndex`` (aliased ``rand_index``).

    Args:
        driver: Driver forwarded to ``run_query``. It is closed before this
            function returns or raises, so it cannot be reused afterwards.
        configs: Sized iterable of configuration dicts. When omitted, the
            module-level ``experiments`` list is used.

    Returns:
        A list with one dict per configuration: the configuration keys
        followed by ``silhouette_score``, ``rand_index``, ``duration``,
        ``memory_used`` and ``cpu_used``. The two scores are ``None`` when
        the query produced no data.

    Raises:
        Any exception raised by ``run_query`` propagates unchanged, after the
        driver has been closed.
    """
    if configs is None:
        configs = experiments

    # The query text is the same for every run; only the bound parameters vary.
    query = """
        WITH simkit.spectralClustering({
            node_label: $node_label,
            is_feature_based: $is_feature_based,
            distance_measure: "euclidean",
            graph_type: $graph_type,
            parameter: $parameter,
            remove_columns: "index,target",
            laplacian_type: $laplacian_type,
            number_of_eigenvectors: $number_of_eigenvectors,
            number_of_iterations: "100",
            distance_measure_kmean: "euclidean",
            target_column: "target",
            use_kmean_for_silhouette: $use_kmean_for_silhouette,
            seed: 42
        }) AS silhouette_score
        WITH silhouette_score, simkit.adjustedRandIndex({
            nodeSet: $node_label,
            trueLabels: "target"
        }) AS rand_index
        RETURN silhouette_score, rand_index
        """

    results = []
    total_experiments = len(configs)
    try:
        for idx, config in enumerate(configs, 1):
            # Run the combined query and collect metrics
            data, duration, memory_used, cpu_used = run_query(driver, query, config)
            silhouette_score = data['silhouette_score'] if data else None
            rand_index = data['rand_index'] if data else None

            # Save results
            results.append({
                **config,
                "silhouette_score": silhouette_score,
                "rand_index": rand_index,
                "duration": duration,
                "memory_used": memory_used,
                "cpu_used": cpu_used
            })
            # Print progress after each experiment
            print(f"Completed experiment {idx}/{total_experiments} with config: {config}")
    finally:
        # Close the driver even when an experiment fails, so the connection
        # is not leaked by an aborted run.
        driver.close()

    return results

In [6]:
# # Example scheduling and execution (uses the `schedule` package, which this notebook does not import)
# def job():
#     print("Running experiments...")
#     result_data = run_experiments(driver)
#     # Save to DataFrame and then to CSV
#     df = pd.DataFrame(result_data)
#     df.to_csv("madelon_results.csv", index=False)
#     print("Experiments completed and saved.")

# schedule.every().day.at("01:00").do(job)

# # Loop to keep the scheduler running
# while True:
#     schedule.run_pending()
#     time.sleep(60)  # wait one minute

# Execute the whole experiment grid with the shared driver.
print("Running experiments...")
result_data = run_experiments(driver)

# Tabulate the per-experiment dicts and write them out without the index column.
df = pd.DataFrame(data=result_data)
df.to_csv(path_or_buf="madelon_results.csv", index=False)
print("Experiments completed and saved.")

Running experiments...
Completed experiment 1/36 with config: {'node_label': 'iris_full', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '11', 'laplacian_type': 'sym', 'number_of_eigenvectors': 3, 'use_kmean_for_silhouette': False}
Completed experiment 2/36 with config: {'node_label': 'iris_full', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '11', 'laplacian_type': 'rw', 'number_of_eigenvectors': 3, 'use_kmean_for_silhouette': False}
Completed experiment 3/36 with config: {'node_label': 'iris_full', 'is_feature_based': True, 'graph_type': 'full', 'parameter': '11', 'laplacian_type': 'ad', 'number_of_eigenvectors': 3, 'use_kmean_for_silhouette': False}
Completed experiment 4/36 with config: {'node_label': 'iris_full', 'is_feature_based': True, 'graph_type': 'eps', 'parameter': '1.161', 'laplacian_type': 'sym', 'number_of_eigenvectors': 3, 'use_kmean_for_silhouette': False}
Completed experiment 5/36 with config: {'node_label': 'iris_full', 'is_feature_based'

In [7]:
# Display the list of per-experiment result dicts returned by run_experiments.
result_data

[{'node_label': 'iris_full',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '11',
  'laplacian_type': 'sym',
  'number_of_eigenvectors': 3,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.5055000468668748,
  'rand_index': 0.7880443567941549,
  'duration': 99.7205159664154,
  'memory_used': -96.421875,
  'cpu_used': 0.02040614399999985},
 {'node_label': 'iris_full',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '11',
  'laplacian_type': 'rw',
  'number_of_eigenvectors': 3,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.5066439011368349,
  'rand_index': 0.7733799163281198,
  'duration': 131.64908695220947,
  'memory_used': 2.4375,
  'cpu_used': 0.021565567999999757},
 {'node_label': 'iris_full',
  'is_feature_based': True,
  'graph_type': 'full',
  'parameter': '11',
  'laplacian_type': 'ad',
  'number_of_eigenvectors': 3,
  'use_kmean_for_silhouette': False,
  'silhouette_score': 0.46741633379813646,
  'rand_index': 0.9