In [1]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score

In [2]:
# Step 1: Load Subdatasets
def load_subdataset(file_path):
    return pd.read_csv(file_path)

data = load_subdataset('madelon_full.csv')
data

Unnamed: 0,Index,0,1,2,3,4,5,6,7,8,...,491,492,493,494,495,496,497,498,499,target
0,0,0.561,0.429,0.588,0.394,0.280,0.261,0.567,0.455,0.385,...,0.529,0.478,0.468,0.579,0.429,0.558,0.44,0.399,0.489,-1
1,1,0.512,0.348,0.317,0.515,0.692,0.348,0.693,0.727,0.470,...,0.441,0.533,0.288,0.587,0.432,0.605,0.57,0.523,0.600,-1
2,2,0.610,0.708,0.454,0.227,0.268,0.261,0.390,0.636,0.427,...,0.529,0.561,0.670,0.559,0.481,0.535,0.54,0.484,0.500,-1
3,3,0.439,0.489,0.493,0.485,0.412,0.283,0.300,0.273,0.615,...,0.500,0.461,0.574,0.348,0.377,0.419,0.47,0.466,0.305,1
4,4,0.537,0.536,0.556,0.545,0.323,0.478,0.245,0.636,0.487,...,0.471,0.339,0.406,0.478,0.503,0.558,0.69,0.470,0.568,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,2595,0.756,0.348,0.468,0.379,0.479,0.435,0.498,0.636,0.120,...,0.353,0.522,0.416,0.603,0.432,0.395,0.54,0.509,0.542,1
2596,2596,0.463,0.459,0.391,0.561,0.271,0.478,0.480,0.636,0.333,...,0.647,0.650,0.607,0.526,0.565,0.558,0.55,0.327,0.253,1
2597,2597,0.561,0.464,0.563,0.409,0.256,0.609,0.462,0.364,0.675,...,0.324,0.617,0.324,0.344,0.419,0.488,0.46,0.431,0.600,1
2598,2598,0.366,0.395,0.556,0.485,0.375,0.217,0.534,0.545,0.547,...,0.382,0.344,0.655,0.417,0.601,0.302,0.55,0.890,0.468,1


In [3]:
labels = data["target"].tolist()
unique_labels = set(labels)
print(unique_labels)

{1, -1}


In [4]:
data_cleaned = data.drop(["Index","target"], axis=1)
data_cleaned

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.561,0.429,0.588,0.394,0.280,0.261,0.567,0.455,0.385,0.36,...,0.493,0.529,0.478,0.468,0.579,0.429,0.558,0.44,0.399,0.489
1,0.512,0.348,0.317,0.515,0.692,0.348,0.693,0.727,0.470,0.28,...,0.394,0.441,0.533,0.288,0.587,0.432,0.605,0.57,0.523,0.600
2,0.610,0.708,0.454,0.227,0.268,0.261,0.390,0.636,0.427,0.44,...,0.563,0.529,0.561,0.670,0.559,0.481,0.535,0.54,0.484,0.500
3,0.439,0.489,0.493,0.485,0.412,0.283,0.300,0.273,0.615,0.42,...,0.592,0.500,0.461,0.574,0.348,0.377,0.419,0.47,0.466,0.305
4,0.537,0.536,0.556,0.545,0.323,0.478,0.245,0.636,0.487,0.26,...,0.570,0.471,0.339,0.406,0.478,0.503,0.558,0.69,0.470,0.568
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,0.756,0.348,0.468,0.379,0.479,0.435,0.498,0.636,0.120,0.44,...,0.437,0.353,0.522,0.416,0.603,0.432,0.395,0.54,0.509,0.542
2596,0.463,0.459,0.391,0.561,0.271,0.478,0.480,0.636,0.333,0.56,...,0.697,0.647,0.650,0.607,0.526,0.565,0.558,0.55,0.327,0.253
2597,0.561,0.464,0.563,0.409,0.256,0.609,0.462,0.364,0.675,0.78,...,0.246,0.324,0.617,0.324,0.344,0.419,0.488,0.46,0.431,0.600
2598,0.366,0.395,0.556,0.485,0.375,0.217,0.534,0.545,0.547,0.42,...,0.465,0.382,0.344,0.655,0.417,0.601,0.302,0.55,0.890,0.468


In [5]:
def spectral_clustering(dataframe, labels, similarity_graph, laplacian, number_of_clusters, local_sigma = None, epsilon = None, k_knn = None, k_mknn = None):

    # Pairwise distances
    dimension = dataframe.shape[0]
    dist_mat = squareform(pdist(dataframe))

    if similarity_graph == "full":

        #calculate local sigma
        sigmas = np.zeros(dimension)
        for i in tqdm(range(len(dist_mat))):
            sigmas[i] = sorted(dist_mat[i])[local_sigma]

        # Adjaceny matrix with optimal sigma
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension)):
            for j in range(i+1, dimension):
                d = np.exp(-1*dist_mat[i,j]**2/(sigmas[i]*sigmas[j]))
                adjacency_matrix[i,j] = d
                adjacency_matrix[j,i] = d


    elif similarity_graph == "eps":

        # Adjaceny matrix with epsilon threshold
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            for j in range(i+1, dimension):
                if dist_mat[i,j] < epsilon:
                    d = 1
                else:
                    d = 0
                adjacency_matrix[i,j] = d
                adjacency_matrix[j,i] = d


    elif similarity_graph == "knn":

        # Adjaceny matrix with k-neighbours
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            # Sort distances for node i and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_knn+1]  # Exclude the node itself

            # Update the adjacency matrix
            adjacency_matrix[i, k_nearest_indices] = 1


    else:

        # Adjaceny matrix with mutual k-neighbours
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            # Sort distances for node i and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_mknn+1]  # Exclude the node itself

            for neighbor in k_nearest_indices:
                # Check if node i is also among the k-nearest neighbors of the current neighbor
                neighbor_sorted_indices = np.argsort(dist_mat[neighbor])
                if i in neighbor_sorted_indices[1:k_mknn+1]:
                    # Connect nodes if they are mutual k-nearest neighbors
                    adjacency_matrix[i, neighbor] = 1
                    adjacency_matrix[neighbor, i] = 1

    # Calculate degree matrix
    degrees = np.sum(adjacency_matrix, axis=1)
    degree_matrix = np.diag(degrees)

    if laplacian == "sym":

        # Normalized Symmetric laplacian matrix
        d_inv_sqrt = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv_sqrt[nonzero] = 1.0 / np.sqrt(degrees[nonzero])
        d_half = np.diag(d_inv_sqrt)
        laplacian_matrix_normalized = d_half @ adjacency_matrix @ d_half

    if laplacian == "rw":

        # Normalized Random Walk laplacian matrix
        d_inv = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv[nonzero] = 1.0 / degrees[nonzero]
        d_inverse = np.diag(d_inv)
        laplacian_matrix_normalized = d_inverse @ adjacency_matrix

    if laplacian == "ad":

        # Adaptive Laplacian matrix
        D_local = np.zeros_like(degrees)
        for i in range(len(degrees)):
            neighbors = np.where(adjacency_matrix[i] > 0)[0]
            if len(neighbors) > 0 and degrees[i] > 0:
                D_local[i] = np.sum(degrees[neighbors]) / degrees[i]
            else:
                D_local[i] = 0
        D_local_inv_sqrt = np.zeros_like(D_local)
        nonzero = D_local > 0
        D_local_inv_sqrt[nonzero] = 1.0 / np.sqrt(D_local[nonzero])
        D_local_inv = np.diag(D_local_inv_sqrt)
        laplacian_matrix_normalized = D_local_inv @ adjacency_matrix @ D_local_inv

    if check_symmetric(laplacian_matrix_normalized) :
        # Calculating eigenvalues and eigenvectors for symmetric matrix
        e, v = np.linalg.eigh(laplacian_matrix_normalized)
    else:
        # Calculating eigenvalues and eigenvectors for non-symmetric matrix
        e, v = np.linalg.eig(laplacian_matrix_normalized)
        idx = np.argsort(np.real(e))
        e = np.real(e[idx])
        v = np.real(v[:, idx])

    # Calculate eigengap
    eigengap = np.diff(e)
    optimal_number_of_clusters = np.argmax(eigengap[:10]) + 1

    if number_of_clusters != None:
        # First case: k
        n_clusters = max(number_of_clusters,2)
    else:
        # Second case: optimal number of clusters from eigengap
        n_clusters = max(optimal_number_of_clusters,2)

    results = []
    # adj_filename, laplacian_filename, X_filename = save_matrices(similarity_graph,laplacian, adjacency_matrix, laplacian_matrix_normalized, X)

    # KMeans clustering
    X = v[:, -n_clusters:]
    clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=100)
    cluster_labels = clustering.fit_predict(X)

    # Calculate evaluation metrics
    sil_score = silhouette_score(dataframe, cluster_labels)
    ar_score = adjusted_rand_score(labels, cluster_labels)

    results.append((sil_score, ar_score, n_clusters,cluster_labels))
    # results.append((sil_score, ar_score, n_clusters,cluster_labels, adj_filename, laplacian_filename, X_filename))

    return results

In [6]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [7]:
# # Function to calculate dynamic ranges based on data size and pairwise distances
# def get_dynamic_search_space(data):
#     # Number of points in the dataset
#     n = data.shape[0]

#     # Compute pairwise distances
#     dist_mat = squareform(pdist(data))
#     flat_distances = dist_mat[np.tril_indices(n, -1)]

#     # Dynamic range for local_sigma (based on square root of n)
#     local_sigma_min = max(1, int(np.sqrt(n) / 2))
#     local_sigma_max = int(np.sqrt(n))
    
#     # Dynamic range for epsilon (based on distance percentiles)
#     epsilon_min = np.percentile(flat_distances, 80)  # 80th percentile
#     epsilon_max = np.percentile(flat_distances, 95)  # 95th percentile
    
#     # Dynamic range for k (based on number of data points)
#     k_min = max(5, int(0.01 * n))  # 1% of dataset size, but at least 5
#     k_max = min(int(0.2 * n), n - 1)  # 20% of dataset size, but never more than n-1
    
#     # Ensure k_min does not exceed k_max
#     if k_min > k_max:
#         k_min = max(5, int(0.01 * n))  # Keep dynamic range based on percentage but within limits
    
#     return (local_sigma_min, local_sigma_max), (epsilon_min, epsilon_max), (k_min, k_max)

# # Optimization functions for each parameter

# # Optimize local_sigma for "full" graph
# def optimize_local_sigma(data, labels, laplacians, number_of_clusters):
#     (local_sigma_min, local_sigma_max), _, _ = get_dynamic_search_space(data)

#     def objective_local_sigma(local_sigma):
#         silhouette_scores = []
#         local_sigma = int(local_sigma[0])
#         try:
#             for laplacian in laplacians:
#                 results = spectral_clustering(data, labels, similarity_graph="full", laplacian=laplacian, number_of_clusters=number_of_clusters, local_sigma=local_sigma)
#                 silhouette_scores.append(results[0][0])
#             return -np.mean(silhouette_scores)
#         except (ValueError, np.linalg.LinAlgError) as e:
#             print(f"Skipping local_sigma={local_sigma} due to error: {e}")
#             return 1e6  # Return a large value to penalize the failed set of hyperparameters

#     result = gp_minimize(objective_local_sigma, [(local_sigma_min, local_sigma_max)], n_calls=20, n_random_starts=10, random_state=42)

#     if result.fun < 1e6:
#         best_local_sigma = result.x[0]
#         print(f"Best local sigma: {best_local_sigma}")
#         return result
#     else:
#         print("No valid local_sigma found.")
#         return None


# # Optimize epsilon for "eps" graph
# def optimize_epsilon(data, labels, laplacians, number_of_clusters):
#     _, (epsilon_min, epsilon_max), _ = get_dynamic_search_space(data)

#     def objective_epsilon(epsilon):
#         silhouette_scores = []
#         epsilon = float(epsilon[0])
#         try:
#             for laplacian in laplacians:
#                 results = spectral_clustering(data, labels, similarity_graph="eps", laplacian=laplacian, number_of_clusters=number_of_clusters, epsilon=epsilon)
#                 silhouette_scores.append(results[0][0])
#             return -np.mean(silhouette_scores)
#         except (ValueError, np.linalg.LinAlgError) as e:
#             print(f"Skipping epsilon={epsilon} due to error: {e}")
#             return 1e6  # Return a large value to penalize the failed set of hyperparameters

#     result = gp_minimize(objective_epsilon, [(epsilon_min, epsilon_max)], n_calls=20, n_random_starts=10, random_state=42)

#     if result.fun < 1e6:
#         epsilon = result.x[0]
#         print(f"Best epsilon: {epsilon}")
#         return result
#     else:
#         print("No valid epsilon found.")
#         return None


# # Optimize k for "knn" graph
# def optimize_k_knn(data, labels, laplacians, number_of_clusters):
#     _, _, (k_min, k_max) = get_dynamic_search_space(data)

#     def objective_k_knn(k):
#         silhouette_scores = []
#         k = int(k[0])
#         try:
#             for laplacian in laplacians:
#                 results = spectral_clustering(data, labels, similarity_graph="knn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_knn=k)
#                 silhouette_scores.append(results[0][0])
#             return -np.mean(silhouette_scores)
#         except (ValueError, np.linalg.LinAlgError) as e:
#             print(f"Skipping k={k} due to error: {e}")
#             return 1e6  # Return a large value to penalize the failed set of hyperparameters

#     result = gp_minimize(objective_k_knn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

#     if result.fun < 1e6:
#         k_knn = result.x[0]
#         print(f"Best k for knn: {k_knn}")
#         return result
#     else:
#         print("No valid k for knn found.")
#         return None


# # Optimize k for "mknn" graph
# def optimize_k_mknn(data, labels, laplacians, number_of_clusters):
#     _, _, (k_min, k_max) = get_dynamic_search_space(data)

#     def objective_k_mknn(k):
#         silhouette_scores = []
#         k = int(k[0])
#         try:
#             for laplacian in laplacians:
#                 results = spectral_clustering(data, labels, similarity_graph="mknn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_mknn=k)
#                 silhouette_scores.append(results[0][0])
#             return -np.mean(silhouette_scores)
#         except (ValueError, np.linalg.LinAlgError) as e:
#             print(f"Skipping k={k} due to error: {e}")
#             return 1e6  # Return a large value to penalize the failed set of hyperparameters

#     result = gp_minimize(objective_k_mknn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

#     if result.fun < 1e6:
#         k_mknn = result.x[0]
#         print(f"Best k for mknn: {k_mknn}")
#         return result
#     else:
#         print("No valid k for mknn found.")
#         return None

In [8]:
# # Call the optimization functions
# laplacian_methods = ["sym", "rw", "ad"]
# number_of_clusters = 2

# # Optimize local_sigma for "full" graph
# result_local_sigma = optimize_local_sigma(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
# best_local_sigma = result_local_sigma.x[0]

# # Optimize epsilon for "eps" graph
# result_epsilon = optimize_epsilon(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
# best_epsilon = round(result_epsilon.x[0], 3)

# # Optimize k for "knn" graph
# result_k_knn = optimize_k_knn(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
# best_k_knn = result_k_knn.x[0]

# # Optimize k for "mknn" graph
# result_k_mknn = optimize_k_mknn(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
# best_k_mknn = result_k_mknn.x[0]

In [9]:
# Function to calculate dynamic search spaces
def get_dynamic_search_space(data):
    n = data.shape[0]
    dist_mat = squareform(pdist(data))
    flat_distances = dist_mat[np.tril_indices(n, -1)]

    local_sigma_min = max(1, int(np.sqrt(n) / 2))
    local_sigma_max = int(np.sqrt(n))
    epsilon_min = np.percentile(flat_distances, 70)
    epsilon_max = np.percentile(flat_distances, 95)
    k_min = max(5, int(0.01 * n))
    k_max = min(int(0.2 * n), n - 1)

    return (local_sigma_min, local_sigma_max), (epsilon_min, epsilon_max), (k_min, k_max)


def optimize_and_validate_parameter(data, labels, laplacians, number_of_clusters, parameter_name, similarity_graph, dynamic_ranges):
    (local_sigma_range, epsilon_range, k_range) = dynamic_ranges
    param_range = {
        "local_sigma": local_sigma_range,
        "epsilon": epsilon_range,
        "k_knn": k_range,
        "k_mknn": k_range
    }[parameter_name]

    def objective(param_value):
        param_value = param_value[0]
        all_scores = []
        valid_laplacians = True

        for laplacian in laplacians:
            try:
                results = spectral_clustering(
                    data, labels, similarity_graph=similarity_graph, laplacian=laplacian,
                    number_of_clusters=number_of_clusters, **{parameter_name: param_value}
                )
                all_scores.append(results[0][0])  # Silhouette score
            except Exception as e:
                print(f"Parameter {parameter_name}={param_value} failed for Laplacian {laplacian}: {e}")
                valid_laplacians = False
                break

        if not valid_laplacians or len(all_scores) == 0:
            return 1e6  # Penalize invalid parameter values

        return -np.mean(all_scores)  # Minimize negative mean silhouette score

    # Perform optimization
    result = gp_minimize(objective, [param_range], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        best_value = result.x[0]
        print(f"Initial best {parameter_name}: {best_value}")

        # Validate the best parameter dynamically
        if validate_parameter_across_laplacians(parameter_name, best_value, data, laplacians, similarity_graph):
            return best_value
        else:
            print(f"Re-optimizing {parameter_name} due to validation failure.")
            return secondary_optimization(data, labels, laplacians, number_of_clusters, parameter_name, similarity_graph, param_range)

    else:
        print(f"No valid {parameter_name} found in primary optimization.")
        return secondary_optimization(data, labels, laplacians, number_of_clusters, parameter_name, similarity_graph, param_range)

def validate_parameter_across_laplacians(param_name, value, data, laplacians, similarity_graph):
    for laplacian in laplacians:
        try:
            spectral_clustering(
                data, None, similarity_graph=similarity_graph, laplacian=laplacian,
                number_of_clusters=3, **{param_name: value}
            )
        except Exception as e:
            print(f"Validation failed for {param_name}={value} on Laplacian {laplacian}: {e}")
            return False
    return True

def secondary_optimization(data, labels, laplacians, number_of_clusters, parameter_name, similarity_graph, param_range):
    def objective(param_value):
        param_value = param_value[0]
        scores = []
        for laplacian in laplacians:
            try:
                results = spectral_clustering(
                    data, labels, similarity_graph=similarity_graph, laplacian=laplacian,
                    number_of_clusters=number_of_clusters, **{parameter_name: param_value}
                )
                scores.append(results[0][0])
            except Exception:
                continue
        if len(scores) == 0:
            return 1e6  # Penalize if all attempts fail
        return -np.mean(scores)

    result = gp_minimize(objective, [param_range], n_calls=10, n_random_starts=5, random_state=42)

    if result.fun < 1e6:
        print(f"Fallback optimization for {parameter_name}: {result.x[0]}")
        return result.x[0]
    else:
        print(f"Fallback failed for {parameter_name}. Returning mid-range value.")
        return np.mean(param_range)  # Safe fallback


In [10]:
laplacians = ["sym", "rw", "ad"]
number_of_clusters = 2

# Optimize parameters for dataset
dynamic_ranges = get_dynamic_search_space(data_cleaned)

best_local_sigma = optimize_and_validate_parameter(
    data_cleaned, labels, laplacians, number_of_clusters, "local_sigma", "full", dynamic_ranges
)
best_epsilon = optimize_and_validate_parameter(
    data_cleaned, labels, laplacians, number_of_clusters, "epsilon", "eps", dynamic_ranges
)
best_k_knn = optimize_and_validate_parameter(
    data_cleaned, labels, laplacians, number_of_clusters, "k_knn", "knn", dynamic_ranges
)
best_k_mknn = optimize_and_validate_parameter(
    data_cleaned, labels, laplacians, number_of_clusters, "k_mknn", "mknn", dynamic_ranges
)


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1485.85it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 602.97it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1488.58it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 610.72it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1485.11it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 617.69it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1501.24it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 617.76it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1498.60it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 619.31it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1498.79it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 617.51it/s]
100%|███████████████████████

Initial best local_sigma: 45


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1497.91it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 610.98it/s]


Validation failed for local_sigma=45 on Laplacian sym: The 'labels_true' parameter of adjusted_rand_score must be an array-like. Got None instead.
Re-optimizing local_sigma due to validation failure.


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1496.85it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 601.18it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1495.94it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 602.88it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1495.37it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 612.70it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1496.31it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 617.40it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1497.24it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 608.55it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1491.45it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 616.96it/s]
100%|███████████████████████

Fallback optimization for local_sigma: 45


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1937.94it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1932.22it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1955.03it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1929.89it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1906.06it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1913.94it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1954.11it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1932.02it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1921.64it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1919.39it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1909.96it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1931.36it/s]
100%|███████████████████████

Initial best epsilon: 4.669420600683419


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1934.11it/s]


Validation failed for epsilon=4.669420600683419 on Laplacian sym: The 'labels_true' parameter of adjusted_rand_score must be an array-like. Got None instead.
Re-optimizing epsilon due to validation failure.


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1913.66it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1921.68it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1927.59it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1938.51it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1937.28it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1926.78it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1971.81it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1962.27it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1958.14it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1952.42it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1940.33it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1956.13it/s]
100%|███████████████████████

Fallback optimization for epsilon: 4.669447617224119


100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4865.10it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4845.67it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4857.75it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4980.27it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4886.49it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4884.09it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4812.74it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4840.16it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4852.86it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4838.34it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4945.81it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4859.56it/s]
100%|███████████████████████

Initial best k_knn: 461


100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4882.46it/s]


Validation failed for k_knn=461 on Laplacian sym: The 'labels_true' parameter of adjusted_rand_score must be an array-like. Got None instead.
Re-optimizing k_knn due to validation failure.


100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4884.21it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4882.06it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4879.77it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4908.09it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4893.98it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4892.80it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4860.07it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4886.15it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4858.83it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4879.67it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4878.49it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4889.14it/s]
100%|███████████████████████

Fallback optimization for k_knn: 419


100%|███████████████████████████████████████| 2600/2600 [03:45<00:00, 11.51it/s]
100%|███████████████████████████████████████| 2600/2600 [03:46<00:00, 11.50it/s]
100%|███████████████████████████████████████| 2600/2600 [03:45<00:00, 11.50it/s]
100%|███████████████████████████████████████| 2600/2600 [01:03<00:00, 41.15it/s]
100%|███████████████████████████████████████| 2600/2600 [01:03<00:00, 41.25it/s]
100%|███████████████████████████████████████| 2600/2600 [01:03<00:00, 41.16it/s]
100%|███████████████████████████████████████| 2600/2600 [03:41<00:00, 11.74it/s]
100%|███████████████████████████████████████| 2600/2600 [03:41<00:00, 11.74it/s]
100%|███████████████████████████████████████| 2600/2600 [03:41<00:00, 11.74it/s]
100%|███████████████████████████████████████| 2600/2600 [02:52<00:00, 15.08it/s]
100%|███████████████████████████████████████| 2600/2600 [02:52<00:00, 15.07it/s]
100%|███████████████████████████████████████| 2600/2600 [02:52<00:00, 15.08it/s]
100%|███████████████████████

Initial best k_mknn: 112


100%|███████████████████████████████████████| 2600/2600 [00:59<00:00, 44.00it/s]


Validation failed for k_mknn=112 on Laplacian sym: The 'labels_true' parameter of adjusted_rand_score must be an array-like. Got None instead.
Re-optimizing k_mknn due to validation failure.


100%|███████████████████████████████████████| 2600/2600 [03:39<00:00, 11.84it/s]
100%|███████████████████████████████████████| 2600/2600 [03:39<00:00, 11.84it/s]
100%|███████████████████████████████████████| 2600/2600 [03:42<00:00, 11.71it/s]
100%|███████████████████████████████████████| 2600/2600 [01:02<00:00, 41.80it/s]
100%|███████████████████████████████████████| 2600/2600 [01:03<00:00, 40.89it/s]
100%|███████████████████████████████████████| 2600/2600 [01:03<00:00, 41.19it/s]
100%|███████████████████████████████████████| 2600/2600 [03:38<00:00, 11.89it/s]
100%|███████████████████████████████████████| 2600/2600 [03:35<00:00, 12.08it/s]
100%|███████████████████████████████████████| 2600/2600 [03:33<00:00, 12.15it/s]
100%|███████████████████████████████████████| 2600/2600 [02:49<00:00, 15.37it/s]
100%|███████████████████████████████████████| 2600/2600 [02:49<00:00, 15.31it/s]
100%|███████████████████████████████████████| 2600/2600 [01:41<00:00, 25.50it/s]
100%|███████████████████████

Fallback optimization for k_mknn: 117


In [11]:
similarity_graphs = ["full", "eps", "knn", "mknn"]
laplacian_methods = ["sym", "rw","ad"]
number_of_clusters = 2
# best_local_sigma = 5
# best_epsilon = 1.138
# best_k_knn = 8
# best_k_mknn = 6

silhouette_scores = []
adjusted_rand_scores = []
clusters = []
sim_graph = []
laplacian = []
cluster_labels = []
hyperparameters = []
adj_files = []
laplacian_files = []
X_files = []
dist_files = []
eigenval = []
eigenvec = []

In [12]:
for graph in similarity_graphs:

    for laplace in laplacian_methods:
        metrics = spectral_clustering(data_cleaned, labels, graph, laplace, number_of_clusters, best_local_sigma, best_epsilon, best_k_knn, best_k_mknn)

        # for si, ar, cl, l, af, lf, xf, dm,e,v in metrics:
        for si, ar, cl, l in metrics:
            sim_graph.append(graph)
            laplacian.append(laplace)
            silhouette_scores.append(si)
            adjusted_rand_scores.append(ar)
            clusters.append(cl)
            cluster_labels.append(l)
            # Append consolidated hyperparameters for each similarity graph type
            if graph == "full":
                hyperparameters.append(f"local_sigma={best_local_sigma}")
            elif graph == "eps":
                hyperparameters.append(f"epsilon={best_epsilon}")
            elif graph == "knn":
                hyperparameters.append(f"k_nn={best_k_knn}")
            elif graph == "mknn":
                hyperparameters.append(f"k_mknn={best_k_mknn}")
            else:
                hyperparameters.append("None")
            # adj_files.append(af)
            # laplacian_files.append(lf)
            # X_files.append(xf)
            # eigenval.append(e)
            # eigenvec.append(v)

100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 2222.48it/s]
100%|██████████████████████████████████████| 2600/2600 [00:02<00:00, 884.54it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 2086.55it/s]
100%|██████████████████████████████████████| 2600/2600 [00:03<00:00, 850.88it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 2059.56it/s]
100%|██████████████████████████████████████| 2600/2600 [00:03<00:00, 856.33it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 2640.20it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 2625.49it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 2625.32it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 6652.25it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 6712.84it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 6720.93it/s]
100%|███████████████████████

In [13]:
experiment_madelon = pd.DataFrame(list(zip(sim_graph,laplacian,silhouette_scores,adjusted_rand_scores,clusters, hyperparameters, cluster_labels)),
             columns= ["graph","laplacian", "silhouette", "adjusted_rand","number_of_clusters","hyperparameters", "cluster_labels"])
experiment_madelon["graph_laplacian"] = experiment_madelon["graph"] + "_" + experiment_madelon["laplacian"]
experiment_madelon

Unnamed: 0,graph,laplacian,silhouette,adjusted_rand,number_of_clusters,hyperparameters,cluster_labels,graph_laplacian
0,full,sym,0.010964,0.027233,2,local_sigma=45,"[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, ...",full_sym
1,full,rw,0.010964,0.027233,2,local_sigma=45,"[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, ...",full_rw
2,full,ad,0.010964,0.027233,2,local_sigma=45,"[1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, ...",full_ad
3,eps,sym,0.014542,0.000827,2,epsilon=4.669447617224119,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, ...",eps_sym
4,eps,rw,0.016198,0.000265,2,epsilon=4.669447617224119,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",eps_rw
5,eps,ad,0.0148,0.000884,2,epsilon=4.669447617224119,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",eps_ad
6,knn,sym,0.010907,0.026471,2,k_nn=419,"[0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_sym
7,knn,rw,0.010907,0.026471,2,k_nn=419,"[0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_rw
8,knn,ad,0.010907,0.026471,2,k_nn=419,"[0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_ad
9,mknn,sym,0.039701,1e-06,2,k_mknn=117,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",mknn_sym


In [14]:
# Save the DataFrame to a CSV file
experiment_madelon.to_csv('experiment_madelon_full.csv', index=False)

# File is now saved in the current working directory
print("CSV file saved as 'experiment_madelon_full.csv'")

CSV file saved as 'experiment_madelon_full.csv'
