In [9]:
import os
import glob

def read_cluster_data(file_path):
    with open(file_path, 'r') as file:
        next(file)  # Skip the header
        data = {}
        for line in file:
            x, y, cluster_id = line.strip().split(',')
            point = (float(x), float(y))
            if cluster_id in data:
                data[cluster_id].add(point)
            else:
                data[cluster_id] = {point}
    return data

def compare_clusters(base_data, compare_data):
    # Convert sets of points into a list of frozensets for comparison
    base_clusters = [frozenset(points) for points in base_data.values()]
    comp_clusters = [frozenset(points) for points in compare_data.values()]

    # Ensure each set from base_clusters has exactly one matching set in comp_clusters
    matched = True
    for base_cluster in base_clusters:
        match = any(base_cluster == comp_cluster for comp_cluster in comp_clusters)
        matched &= match
        if not match:
            print(f"No match found for cluster with points: {base_cluster}")
            break

    return matched

# Path to the directory containing output files
directory_path = 'outputs/sanity_checks'

# Find all files in the directory
file_paths = glob.glob(os.path.join(directory_path, '*.txt'))

# Use the first file as the base for comparison
base_file = file_paths[0]
base_data = read_cluster_data(base_file)

# Compare each file to the base
for file_path in file_paths[1:]:
    compare_data = read_cluster_data(file_path)
    if compare_clusters(base_data, compare_data):
        print(f"{os.path.basename(file_path)} matches {os.path.basename(base_file)}")
    else:
        print(f"{os.path.basename(file_path)} does NOT match {os.path.basename(base_file)}")

No match found for cluster with points: frozenset({(270304.0, 3712.0), (285418.0, 28445.0), (297736.0, 64525.0), (266914.0, 40079.0), (230619.0, 29043.0), (327958.0, 3317.0), (264239.0, 80373.0), (324571.0, 73078.0), (289230.0, 94587.0), (311181.0, 21847.0), (254061.0, 4096.0), (265485.0, 10120.0), (328376.0, 18068.0), (277507.0, 60388.0), (276554.0, 51565.0), (255967.0, 39187.0), (282086.0, 34021.0), (238115.0, 6149.0), (301006.0, 8337.0), (237882.0, 91829.0), (283458.0, 81512.0), (251922.0, 63217.0), (247163.0, 83546.0), (287975.0, 5825.0), (278991.0, 12027.0), (325402.0, 42795.0), (217233.0, 51121.0), (309220.0, 60662.0), (315464.0, 18895.0), (329745.0, 30975.0), (317846.0, 46767.0), (240853.0, 26836.0), (221694.0, 11060.0), (244544.0, 53497.0), (324986.0, 53182.0), (307254.0, 33149.0), (300058.0, 75344.0), (226455.0, 37095.0), (318974.0, 2335.0), (309575.0, 29349.0), (245496.0, 59807.0), (268464.0, 78949.0), (304757.0, 55424.0), (294879.0, 42956.0), (220506.0, 46204.0), (232109.0, 

In [8]:
import os
import glob

def read_cluster_data(file_path):
    with open(file_path, 'r') as file:
        next(file)  # Skip the header
        data = []
        for line in file:
            x, y, cluster_id = line.strip().split(',')
            point = (float(x), float(y))
            data.append((point, int(cluster_id)))
        return data

def create_cluster_groups(data):
    clusters = {}
    for point, cluster_id in data:
        if cluster_id in clusters:
            clusters[cluster_id].add(point)
        else:
            clusters[cluster_id] = {point}
    return clusters

def compare_clusters(data1, data2):
    groups1 = [frozenset(group) for group in create_cluster_groups(data1).values()]
    groups2 = [frozenset(group) for group in create_cluster_groups(data2).values()]

    if len(groups1) != len(groups2):
        return False

    for group in groups1:
        if group not in groups2:
            return False
    return True

def main(directory_path):
    # Path pattern to match parallel output files
    file_pattern = os.path.join(directory_path, 'parallel_clusters_*.txt')
    file_paths = glob.glob(file_pattern)

    # Read data from the first file to compare others against it
    base_data = read_cluster_data(file_paths[0])

    print(f"Base file for comparison: {os.path.basename(file_paths[0])}")

    # Compare each file to the base file
    for file_path in file_paths[1:]:
        compare_data = read_cluster_data(file_path)
        if compare_clusters(base_data, compare_data):
            print(f"{os.path.basename(file_path)} matches {os.path.basename(file_paths[0])}")
        else:
            print(f"{os.path.basename(file_path)} does NOT match {os.path.basename(file_paths[0])}")

directory_path = 'outputs/sanity_checks'
main(directory_path)

Base file for comparison: parallel_clusters_50000_100_100_8.txt
parallel_clusters_50000_100_100_2.txt does NOT match parallel_clusters_50000_100_100_8.txt
parallel_clusters_50000_100_100_4.txt does NOT match parallel_clusters_50000_100_100_8.txt
parallel_clusters_50000_100_100_10.txt does NOT match parallel_clusters_50000_100_100_8.txt
parallel_clusters_50000_100_100_6.txt does NOT match parallel_clusters_50000_100_100_8.txt
