In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import math
import seaborn as sns

from sklearn import datasets

from scipy.spatial.distance import pdist, squareform
from scipy import linalg

import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical
from sklearn.preprocessing import MinMaxScaler

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, adjusted_rand_score, adjusted_mutual_info_score

In [2]:
madelon_train = 'data/madelon_train.data'
madelon_train_labels = 'data/madelon_train.labels'
madelon_valid = 'data/madelon_valid.data'
madelon_valid_labels = 'data/madelon_valid.labels'

In [3]:
# Load madelon train data
madelon_train_df = pd.read_csv(madelon_train, delimiter=' ', header=None)
madelon_train_labels_df = pd.read_csv(madelon_train_labels, delimiter=' ', header=None, names=['target'])

madelon_train_withlabels = pd.concat([madelon_train_df, madelon_train_labels_df], axis=1)
madelon_train_withlabels.drop([500], axis=1, inplace=True)
madelon_train_withlabels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,-1
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,-1
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,-1
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,1
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,490,505,503,474,463,461,519,476,518,467,...,479,449,588,499,506,475,463,507,501,1
1996,480,475,476,480,495,482,515,479,480,484,...,474,473,424,454,570,476,493,465,485,-1
1997,480,517,631,470,485,474,535,476,493,466,...,483,479,687,488,488,483,500,523,481,-1
1998,484,481,505,478,542,477,518,477,510,472,...,483,526,750,486,529,484,473,527,485,1


In [4]:
# Load madelon validation data
madelon_valid_df = pd.read_csv(madelon_valid, delimiter=' ', header=None)
madelon_valid_label_df = pd.read_csv(madelon_valid_labels, delimiter=' ', header=None, names=['target'])

madelon_valid_withlabels = pd.concat([madelon_valid_df, madelon_valid_label_df], axis=1)
madelon_valid_withlabels.drop([500], axis=1, inplace=True)
madelon_valid_withlabels

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,483,454,513,495,523,469,453,477,506,479,...,480,543,259,413,520,485,498,523,510,-1
1,485,508,493,487,478,472,504,476,479,475,...,480,535,534,514,452,484,495,548,477,-1
2,483,521,507,475,493,486,421,475,496,483,...,476,498,495,508,528,486,465,508,503,-1
3,474,504,576,480,553,483,524,478,483,483,...,475,470,463,509,525,479,467,552,517,1
4,495,474,523,479,495,488,485,476,497,478,...,471,522,343,509,520,475,493,506,491,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,493,458,503,478,517,479,472,478,444,477,...,475,485,443,517,486,474,489,506,506,1
596,481,484,481,490,449,481,467,478,469,483,...,485,508,599,498,527,481,490,455,451,1
597,485,485,530,480,444,487,462,475,509,494,...,474,502,368,453,482,478,481,484,517,1
598,477,469,528,485,483,469,482,477,494,476,...,476,453,638,471,538,470,490,613,492,1


In [5]:
# Combine madelon training and validation data
data = pd.concat([madelon_train_withlabels, madelon_valid_withlabels], axis=0)
data.reset_index(drop=True, inplace=True)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,485,477,537,479,452,471,491,476,475,473,...,481,477,485,511,485,481,479,475,496,-1
1,483,458,460,487,587,475,526,479,485,469,...,478,487,338,513,486,483,492,510,517,-1
2,487,542,499,468,448,471,442,478,480,477,...,481,492,650,506,501,480,489,499,498,-1
3,480,491,510,485,495,472,417,474,502,476,...,480,474,572,454,469,475,482,494,461,1
4,484,502,528,489,466,481,402,478,487,468,...,479,452,435,486,508,481,504,495,511,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,493,458,503,478,517,479,472,478,444,477,...,475,485,443,517,486,474,489,506,506,1
2596,481,484,481,490,449,481,467,478,469,483,...,485,508,599,498,527,481,490,455,451,1
2597,485,485,530,480,444,487,462,475,509,494,...,474,502,368,453,482,478,481,484,517,1
2598,477,469,528,485,483,469,482,477,494,476,...,476,453,638,471,538,470,490,613,492,1


In [6]:
# csv_file_path = "data/madelon_full.csv"
# data.to_csv(csv_file_path, index=False)

In [7]:
data_df = data.drop(["target"], axis=1)
data_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,485,477,537,479,452,471,491,476,475,473,...,477,481,477,485,511,485,481,479,475,496
1,483,458,460,487,587,475,526,479,485,469,...,463,478,487,338,513,486,483,492,510,517
2,487,542,499,468,448,471,442,478,480,477,...,487,481,492,650,506,501,480,489,499,498
3,480,491,510,485,495,472,417,474,502,476,...,491,480,474,572,454,469,475,482,494,461
4,484,502,528,489,466,481,402,478,487,468,...,488,479,452,435,486,508,481,504,495,511
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,493,458,503,478,517,479,472,478,444,477,...,469,475,485,443,517,486,474,489,506,506
2596,481,484,481,490,449,481,467,478,469,483,...,506,485,508,599,498,527,481,490,455,451
2597,485,485,530,480,444,487,462,475,509,494,...,442,474,502,368,453,482,478,481,484,517
2598,477,469,528,485,483,469,482,477,494,476,...,473,476,453,638,471,538,470,490,613,492


In [8]:
# Create a scaler object
scaler = MinMaxScaler()

# Normalize the dataset (excluding the target column)
data_normalized = scaler.fit_transform(data_df)

# Convert it back to a DataFrame for convenience
data_normalized_df = pd.DataFrame(data_normalized, columns=data_df.columns)
data_normalized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
0,0.560976,0.429185,0.588028,0.393939,0.280488,0.260870,0.566787,0.454545,0.384615,0.36,...,0.492958,0.529412,0.477778,0.467564,0.578947,0.428571,0.558140,0.44,0.398577,0.489474
1,0.512195,0.347639,0.316901,0.515152,0.692073,0.347826,0.693141,0.727273,0.470085,0.28,...,0.394366,0.441176,0.533333,0.287638,0.587045,0.431818,0.604651,0.57,0.523132,0.600000
2,0.609756,0.708155,0.454225,0.227273,0.268293,0.260870,0.389892,0.636364,0.427350,0.44,...,0.563380,0.529412,0.561111,0.669523,0.558704,0.480519,0.534884,0.54,0.483986,0.500000
3,0.439024,0.489270,0.492958,0.484848,0.411585,0.282609,0.299639,0.272727,0.615385,0.42,...,0.591549,0.500000,0.461111,0.574051,0.348178,0.376623,0.418605,0.47,0.466192,0.305263
4,0.536585,0.536481,0.556338,0.545455,0.323171,0.478261,0.245487,0.636364,0.487179,0.26,...,0.570423,0.470588,0.338889,0.406365,0.477733,0.503247,0.558140,0.69,0.469751,0.568421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,0.756098,0.347639,0.468310,0.378788,0.478659,0.434783,0.498195,0.636364,0.119658,0.44,...,0.436620,0.352941,0.522222,0.416157,0.603239,0.431818,0.395349,0.54,0.508897,0.542105
2596,0.463415,0.459227,0.390845,0.560606,0.271341,0.478261,0.480144,0.636364,0.333333,0.56,...,0.697183,0.647059,0.650000,0.607099,0.526316,0.564935,0.558140,0.55,0.327402,0.252632
2597,0.560976,0.463519,0.563380,0.409091,0.256098,0.608696,0.462094,0.363636,0.675214,0.78,...,0.246479,0.323529,0.616667,0.324357,0.344130,0.418831,0.488372,0.46,0.430605,0.600000
2598,0.365854,0.394850,0.556338,0.484848,0.375000,0.217391,0.534296,0.545455,0.547009,0.42,...,0.464789,0.382353,0.344444,0.654835,0.417004,0.600649,0.302326,0.55,0.889680,0.468421


In [9]:
# Add the target column back for visualization
data_normalized_df['target'] = data['target']
data_normalized_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,491,492,493,494,495,496,497,498,499,target
0,0.560976,0.429185,0.588028,0.393939,0.280488,0.260870,0.566787,0.454545,0.384615,0.36,...,0.529412,0.477778,0.467564,0.578947,0.428571,0.558140,0.44,0.398577,0.489474,-1
1,0.512195,0.347639,0.316901,0.515152,0.692073,0.347826,0.693141,0.727273,0.470085,0.28,...,0.441176,0.533333,0.287638,0.587045,0.431818,0.604651,0.57,0.523132,0.600000,-1
2,0.609756,0.708155,0.454225,0.227273,0.268293,0.260870,0.389892,0.636364,0.427350,0.44,...,0.529412,0.561111,0.669523,0.558704,0.480519,0.534884,0.54,0.483986,0.500000,-1
3,0.439024,0.489270,0.492958,0.484848,0.411585,0.282609,0.299639,0.272727,0.615385,0.42,...,0.500000,0.461111,0.574051,0.348178,0.376623,0.418605,0.47,0.466192,0.305263,1
4,0.536585,0.536481,0.556338,0.545455,0.323171,0.478261,0.245487,0.636364,0.487179,0.26,...,0.470588,0.338889,0.406365,0.477733,0.503247,0.558140,0.69,0.469751,0.568421,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,0.756098,0.347639,0.468310,0.378788,0.478659,0.434783,0.498195,0.636364,0.119658,0.44,...,0.352941,0.522222,0.416157,0.603239,0.431818,0.395349,0.54,0.508897,0.542105,1
2596,0.463415,0.459227,0.390845,0.560606,0.271341,0.478261,0.480144,0.636364,0.333333,0.56,...,0.647059,0.650000,0.607099,0.526316,0.564935,0.558140,0.55,0.327402,0.252632,1
2597,0.560976,0.463519,0.563380,0.409091,0.256098,0.608696,0.462094,0.363636,0.675214,0.78,...,0.323529,0.616667,0.324357,0.344130,0.418831,0.488372,0.46,0.430605,0.600000,1
2598,0.365854,0.394850,0.556338,0.484848,0.375000,0.217391,0.534296,0.545455,0.547009,0.42,...,0.382353,0.344444,0.654835,0.417004,0.600649,0.302326,0.55,0.889680,0.468421,1


In [10]:
csv_file_path = "data/madelon_full.csv"
data_normalized_df.index.name = "index"
data_normalized_df.to_csv(csv_file_path, index=True)

In [11]:
# Get labels and map them to a set for unique class identification
true_labels = data["target"].tolist()
unique_labels = set(true_labels)
print(unique_labels)
print(true_labels)

{1, -1}
[-1, -1, -1, 1, 1, 1, 1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, -1, -1, -1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, 1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, -1, 1, -1, -1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, 1, 1, -1, -1, -1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 1, 1, 1, 1

In [12]:
# # Create a scatter plot
# plt.figure(figsize=(10, 6))
# sns.scatterplot(data=data_df, x=data_df.columns[0], y=data_df.columns[1], hue=labels, palette='viridis', style=labels, markers=["s", "D"], alpha=0.7)
# plt.title("Scatter Plot of Madelon Dataset")
# plt.xlabel(data_df.columns[0])
# plt.ylabel(data_df.columns[1])
# plt.legend(title='Labels', labels=unique_labels)
# plt.show()


# sns.pairplot(data, hue='target', vars=data.columns[:-1])
# plt.show()

In [13]:
data_cleaned = data_normalized_df.drop(["target"], axis=1)
data_cleaned

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,490,491,492,493,494,495,496,497,498,499
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.560976,0.429185,0.588028,0.393939,0.280488,0.260870,0.566787,0.454545,0.384615,0.36,...,0.492958,0.529412,0.477778,0.467564,0.578947,0.428571,0.558140,0.44,0.398577,0.489474
1,0.512195,0.347639,0.316901,0.515152,0.692073,0.347826,0.693141,0.727273,0.470085,0.28,...,0.394366,0.441176,0.533333,0.287638,0.587045,0.431818,0.604651,0.57,0.523132,0.600000
2,0.609756,0.708155,0.454225,0.227273,0.268293,0.260870,0.389892,0.636364,0.427350,0.44,...,0.563380,0.529412,0.561111,0.669523,0.558704,0.480519,0.534884,0.54,0.483986,0.500000
3,0.439024,0.489270,0.492958,0.484848,0.411585,0.282609,0.299639,0.272727,0.615385,0.42,...,0.591549,0.500000,0.461111,0.574051,0.348178,0.376623,0.418605,0.47,0.466192,0.305263
4,0.536585,0.536481,0.556338,0.545455,0.323171,0.478261,0.245487,0.636364,0.487179,0.26,...,0.570423,0.470588,0.338889,0.406365,0.477733,0.503247,0.558140,0.69,0.469751,0.568421
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2595,0.756098,0.347639,0.468310,0.378788,0.478659,0.434783,0.498195,0.636364,0.119658,0.44,...,0.436620,0.352941,0.522222,0.416157,0.603239,0.431818,0.395349,0.54,0.508897,0.542105
2596,0.463415,0.459227,0.390845,0.560606,0.271341,0.478261,0.480144,0.636364,0.333333,0.56,...,0.697183,0.647059,0.650000,0.607099,0.526316,0.564935,0.558140,0.55,0.327402,0.252632
2597,0.560976,0.463519,0.563380,0.409091,0.256098,0.608696,0.462094,0.363636,0.675214,0.78,...,0.246479,0.323529,0.616667,0.324357,0.344130,0.418831,0.488372,0.46,0.430605,0.600000
2598,0.365854,0.394850,0.556338,0.484848,0.375000,0.217391,0.534296,0.545455,0.547009,0.42,...,0.464789,0.382353,0.344444,0.654835,0.417004,0.600649,0.302326,0.55,0.889680,0.468421


In [14]:
def spectral_clustering(dataframe, labels, similarity_graph, laplacian, number_of_clusters, local_sigma = None, epsilon = None, k_knn = None, k_mknn = None):

    # Pairwise distances
    dimension = dataframe.shape[0]
    dist_mat = squareform(pdist(dataframe))

    if similarity_graph == "full":

        #calculate local sigma
        sigmas = np.zeros(dimension)
        for i in tqdm(range(len(dist_mat))):
            sigmas[i] = sorted(dist_mat[i])[local_sigma]

        # Adjaceny matrix with optimal sigma
        adjacency_matrix = np.zeros([dimension, dimension])
        for i in tqdm(range(dimension)):
            for j in range(i+1, dimension):
                d = np.exp(-1*dist_mat[i,j]**2/(sigmas[i]*sigmas[j]))
                adjacency_matrix[i,j] = d
                adjacency_matrix[j,i] = d


    elif similarity_graph == "eps":

        # Adjaceny matrix with epsilon threshold
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            for j in range(i+1, dimension):
                if dist_mat[i,j] < epsilon:
                    d = 1
                else:
                    d = 0
                adjacency_matrix[i,j] = d
                adjacency_matrix[j,i] = d


    elif similarity_graph == "knn":

        # Adjaceny matrix with k-neighbours
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            # Sort distances for node i and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_knn+1]  # Exclude the node itself

            # Update the adjacency matrix
            adjacency_matrix[i, k_nearest_indices] = 1


    else:

        # Adjaceny matrix with mutual k-neighbours
        adjacency_matrix = np.zeros([dimension, dimension])

        for i in tqdm(range(dimension)):
            # Sort distances for node i and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_mknn+1]  # Exclude the node itself

            for neighbor in k_nearest_indices:
                # Check if node i is also among the k-nearest neighbors of the current neighbor
                neighbor_sorted_indices = np.argsort(dist_mat[neighbor])
                if i in neighbor_sorted_indices[1:k_mknn+1]:
                    # Connect nodes if they are mutual k-nearest neighbors
                    adjacency_matrix[i, neighbor] = 1
                    adjacency_matrix[neighbor, i] = 1

    # Calculate degree matrix
    degrees = np.sum(adjacency_matrix, axis=1)
    degree_matrix = np.diag(degrees)

    if laplacian == "sym":

        # Normalized Symmetric laplacian matrix
        d_inv_sqrt = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv_sqrt[nonzero] = 1.0 / np.sqrt(degrees[nonzero])
        d_half = np.diag(d_inv_sqrt)
        laplacian_matrix_normalized = d_half @ adjacency_matrix @ d_half

    if laplacian == "rw":

        # Normalized Random Walk laplacian matrix
        d_inv = np.zeros_like(degrees)
        nonzero = degrees > 0
        d_inv[nonzero] = 1.0 / degrees[nonzero]
        d_inverse = np.diag(d_inv)
        laplacian_matrix_normalized = d_inverse @ adjacency_matrix

    if laplacian == "ad":

        # Adaptive Laplacian matrix
        D_local = np.zeros_like(degrees)
        for i in range(len(degrees)):
            neighbors = np.where(adjacency_matrix[i] > 0)[0]
            if len(neighbors) > 0 and degrees[i] > 0:
                D_local[i] = np.sum(degrees[neighbors]) / degrees[i]
            else:
                D_local[i] = 0
        D_local_inv_sqrt = np.zeros_like(D_local)
        nonzero = D_local > 0
        D_local_inv_sqrt[nonzero] = 1.0 / np.sqrt(D_local[nonzero])
        D_local_inv = np.diag(D_local_inv_sqrt)
        laplacian_matrix_normalized = D_local_inv @ adjacency_matrix @ D_local_inv

    if check_symmetric(laplacian_matrix_normalized) :
        # Calculating eigenvalues and eigenvectors for symmetric matrix
        e, v = np.linalg.eigh(laplacian_matrix_normalized)
    else:
        # Calculating eigenvalues and eigenvectors for non-symmetric matrix
        e, v = np.linalg.eig(laplacian_matrix_normalized)
        idx = np.argsort(np.real(e))
        e = np.real(e[idx])
        v = np.real(v[:, idx])

    # Calculate eigengap
    eigengap = np.diff(e)
    optimal_number_of_clusters = np.argmax(eigengap[:10]) + 1

    if number_of_clusters != None:
        # First case: k
        n_clusters = number_of_clusters
    else:
        # Second case: optimal number of clusters from eigengap
        n_clusters = optimal_number_of_clusters

    results = []

    n_clusters = max(n_clusters,2)
    X = v[:, -n_clusters:]

    # adj_filename, laplacian_filename, X_filename = save_matrices(similarity_graph,laplacian, adjacency_matrix, laplacian_matrix_normalized, X)

    # KMeans clustering
    clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=100)
    cluster_labels = clustering.fit_predict(X)

    # Calculate evaluation metrics
    sil_score = silhouette_score(dataframe, cluster_labels)
    ar_score = adjusted_rand_score(labels, cluster_labels)

    results.append((sil_score, ar_score, n_clusters,cluster_labels))

    return results
    # return results, adj_filename, laplacian_filename, X_filename

In [15]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [16]:
def save_matrices(sim_graph, laplace, adjacency_matrix, laplacian_matrix, X_matrix):
    # Unique filenames for matrices based on parameters
    adj_filename = f"adjacency_{sim_graph}_{laplace}.csv"
    laplacian_filename = f"laplacian_{sim_graph}_{laplace}.csv"
    X_filename = f"X_{sim_graph}_{laplace}.csv"

    # Save matrices
    pd.DataFrame(adjacency_matrix).to_csv(adj_filename, index=False)
    pd.DataFrame(laplacian_matrix).to_csv(laplacian_filename, index=False)
    pd.DataFrame(X_matrix).to_csv(X_filename, index=False)

    return adj_filename, laplacian_filename, X_filename

In [17]:
# Load saved matrices dynamically
def load_matrices(similarity_graphs, laplacian_methods, directory="./"):
    saved_matrices = {}
    for graph in similarity_graphs:
        for laplace in laplacian_methods:
            adj_file = os.path.join(directory, f"adjacency_{graph}_{laplace}.csv")
            lap_file = os.path.join(directory, f"laplacian_{graph}_{laplace}.csv")
            X_file = os.path.join(directory, f"X_{graph}_{laplace}.csv")
            matrices = {}
            if os.path.exists(adj_file):
                matrices["adjacency"] = pd.read_csv(adj_file).to_numpy()
            if os.path.exists(lap_file):
                matrices["laplacian"] = pd.read_csv(lap_file).to_numpy()
            if os.path.exists(X_file):
                matrices["X"] = pd.read_csv(X_file).to_numpy()
            saved_matrices[f"{graph}_{laplace}"] = matrices
    return saved_matrices

In [18]:
# Function to calculate dynamic ranges based on data size and pairwise distances
def get_dynamic_search_space(data):
    # Number of points in the dataset
    n = data.shape[0]

    # Compute pairwise distances
    dist_mat = squareform(pdist(data))
    flat_distances = dist_mat[np.tril_indices(n, -1)]

    # Dynamic range for local_sigma (based on square root of n)
    local_sigma_min = max(1, int(np.sqrt(n) / 2))
    local_sigma_max = int(np.sqrt(n))
    
    # Dynamic range for epsilon (based on distance percentiles)
    epsilon_min = np.percentile(flat_distances, 80)  # 80th percentile
    epsilon_max = np.percentile(flat_distances, 95)  # 95th percentile
    
    # Dynamic range for k (based on number of data points)
    k_min = max(5, int(0.01 * n))  # 1% of dataset size, but at least 5
    k_max = min(int(0.2 * n), n - 1)  # 20% of dataset size, but never more than n-1
    
    # Ensure k_min does not exceed k_max
    if k_min > k_max:
        k_min = max(5, int(0.01 * n))  # Keep dynamic range based on percentage but within limits
    
    return (local_sigma_min, local_sigma_max), (epsilon_min, epsilon_max), (k_min, k_max)

# Optimization functions for each parameter

# Optimize local_sigma for "full" graph
def optimize_local_sigma(data, labels, laplacians, number_of_clusters):
    (local_sigma_min, local_sigma_max), _, _ = get_dynamic_search_space(data)

    def objective_local_sigma(local_sigma):
        silhouette_scores = []
        local_sigma = int(local_sigma[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="full", laplacian=laplacian, number_of_clusters=number_of_clusters, local_sigma=local_sigma)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping local_sigma={local_sigma} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_local_sigma, [(local_sigma_min, local_sigma_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        best_local_sigma = result.x[0]
        print(f"Best local sigma: {best_local_sigma}")
        return result
    else:
        print("No valid local_sigma found.")
        return None


# Optimize epsilon for "eps" graph
def optimize_epsilon(data, labels, laplacians, number_of_clusters):
    _, (epsilon_min, epsilon_max), _ = get_dynamic_search_space(data)

    def objective_epsilon(epsilon):
        silhouette_scores = []
        epsilon = float(epsilon[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="eps", laplacian=laplacian, number_of_clusters=number_of_clusters, epsilon=epsilon)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping epsilon={epsilon} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_epsilon, [(epsilon_min, epsilon_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        epsilon = result.x[0]
        print(f"Best epsilon: {epsilon}")
        return result
    else:
        print("No valid epsilon found.")
        return None


# Optimize k for "knn" graph
def optimize_k_knn(data, labels, laplacians, number_of_clusters):
    _, _, (k_min, k_max) = get_dynamic_search_space(data)

    def objective_k_knn(k):
        silhouette_scores = []
        k = int(k[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="knn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_knn=k)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping k={k} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_k_knn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        k_knn = result.x[0]
        print(f"Best k for knn: {k_knn}")
        return result
    else:
        print("No valid k for knn found.")
        return None


# Optimize k for "mknn" graph
def optimize_k_mknn(data, labels, laplacians, number_of_clusters):
    _, _, (k_min, k_max) = get_dynamic_search_space(data)

    def objective_k_mknn(k):
        silhouette_scores = []
        k = int(k[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="mknn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_mknn=k)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping k={k} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_k_mknn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        k_mknn = result.x[0]
        print(f"Best k for mknn: {k_mknn}")
        return result
    else:
        print("No valid k for mknn found.")
        return None


In [19]:
# Call the optimization functions
laplacian_methods = ["sym", "rw", "ad"]
number_of_clusters = 2

# Optimize local_sigma for "full" graph
result_local_sigma = optimize_local_sigma(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
best_local_sigma = result_local_sigma.x[0]

# Optimize epsilon for "eps" graph
result_epsilon = optimize_epsilon(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
best_epsilon = round(result_epsilon.x[0], 3)

# Optimize k for "knn" graph
result_k_knn = optimize_k_knn(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
best_k_knn = result_k_knn.x[0]

# Optimize k for "mknn" graph
result_k_mknn = optimize_k_mknn(data_cleaned, true_labels, laplacian_methods, number_of_clusters)
best_k_mknn = result_k_mknn.x[0]

100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1499.12it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 596.78it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1501.58it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 618.42it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1500.24it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 620.20it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1500.11it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 620.40it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1499.67it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 615.57it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1512.16it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 615.94it/s]
100%|███████████████████████

Best local sigma: 45


100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1940.05it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1951.50it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1953.38it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1920.28it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1945.93it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1939.86it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1941.23it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1963.21it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1961.04it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1960.00it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1962.07it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1956.43it/s]
100%|███████████████████████

Best epsilon: 4.66939274873593


100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4896.47it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4969.41it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4873.53it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4862.47it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4875.36it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4812.82it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4869.57it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 3800.37it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4910.34it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4894.62it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4883.96it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4004.93it/s]
100%|███████████████████████

Best k for knn: 471


100%|███████████████████████████████████████| 2600/2600 [03:46<00:00, 11.50it/s]
100%|███████████████████████████████████████| 2600/2600 [03:45<00:00, 11.55it/s]
100%|███████████████████████████████████████| 2600/2600 [03:44<00:00, 11.56it/s]
100%|███████████████████████████████████████| 2600/2600 [01:02<00:00, 41.36it/s]
100%|███████████████████████████████████████| 2600/2600 [01:02<00:00, 41.54it/s]
100%|███████████████████████████████████████| 2600/2600 [01:02<00:00, 41.45it/s]
100%|███████████████████████████████████████| 2600/2600 [03:40<00:00, 11.81it/s]
100%|███████████████████████████████████████| 2600/2600 [03:39<00:00, 11.83it/s]
100%|█████████████████████████████████████| 2600/2600 [1:35:03<00:00,  2.19s/it]
100%|███████████████████████████████████████| 2600/2600 [02:55<00:00, 14.79it/s]
100%|███████████████████████████████████████| 2600/2600 [02:54<00:00, 14.90it/s]
100%|███████████████████████████████████████| 2600/2600 [02:52<00:00, 15.04it/s]
100%|███████████████████████

Best k for mknn: 53


In [20]:
similarity_graphs = ["full", "eps", "knn", "mknn"]
laplacian_methods = ["sym", "rw","ad"]
number_of_clusters = 3
# best_local_sigma = 11
# best_epsilon = 5
# best_k_knn = 29
# best_k_mknn = 28

silhouette_scores = []
adjusted_rand_scores = []
clusters = []
sim_graph = []
laplacian = []
cluster_labels = []
hyperparameters = []

In [21]:
for graph in similarity_graphs:

    for laplace in laplacian_methods:
        metrics = spectral_clustering(data_cleaned, true_labels, graph, laplace, number_of_clusters, best_local_sigma, best_epsilon, best_k_knn, best_k_mknn)
        # metrics, adj_file, lap_file, X_file = spectral_clustering(data_df, true_labels, graph, laplace, number_of_clusters, best_local_sigma, best_epsilon, best_k_knn, best_k_mknn)

        for si, ar, cl, l in metrics:
            sim_graph.append(graph)
            laplacian.append(laplace)
            silhouette_scores.append(si)
            adjusted_rand_scores.append(ar)
            clusters.append(cl)
            cluster_labels.append(l)
            # Append consolidated hyperparameters for each similarity graph type
            if graph == "full":
                hyperparameters.append(f"local_sigma={best_local_sigma}")
            elif graph == "eps":
                hyperparameters.append(f"epsilon={best_epsilon}")
            elif graph == "knn":
                hyperparameters.append(f"k_nn={best_k_knn}")
            elif graph == "mknn":
                hyperparameters.append(f"k_mknn={best_k_mknn}")
            else:
                hyperparameters.append("None")

100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1481.29it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 595.63it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1467.42it/s]
100%|██████████████████████████████████████| 2600/2600 [00:04<00:00, 599.78it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1501.50it/s]
100%|██████████████████████████████████████| 2600/2600 [00:05<00:00, 499.89it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1939.87it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1924.58it/s]
100%|█████████████████████████████████████| 2600/2600 [00:01<00:00, 1836.64it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4896.02it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4871.13it/s]
100%|█████████████████████████████████████| 2600/2600 [00:00<00:00, 4881.15it/s]
100%|███████████████████████

In [22]:
experiment_iris = pd.DataFrame(list(zip(sim_graph,laplacian,silhouette_scores,adjusted_rand_scores,clusters, hyperparameters, cluster_labels)),
             columns= ["graph","laplacian", "silhouette", "adjusted_rand","number_of_clusters","hyperparameters", "cluster_labels"])
experiment_iris["graph_laplacian"] = experiment_iris["graph"] + "_" + experiment_iris["laplacian"]
experiment_iris

Unnamed: 0,graph,laplacian,silhouette,adjusted_rand,number_of_clusters,hyperparameters,cluster_labels,graph_laplacian
0,full,sym,0.008349,0.021962,3,local_sigma=45,"[2, 0, 1, 1, 0, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, ...",full_sym
1,full,rw,0.008349,0.021962,3,local_sigma=45,"[2, 0, 1, 1, 0, 1, 2, 1, 1, 1, 2, 2, 2, 2, 1, ...",full_rw
2,full,ad,0.008361,0.021447,3,local_sigma=45,"[2, 1, 0, 0, 1, 0, 2, 0, 0, 0, 2, 2, 2, 2, 0, ...",full_ad
3,eps,sym,0.010486,0.001419,3,epsilon=4.669,"[1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 0, 1, 1, ...",eps_sym
4,eps,rw,0.010953,0.000459,3,epsilon=4.669,"[1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 1, 1, ...",eps_rw
5,eps,ad,0.010567,0.001404,3,epsilon=4.669,"[2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, ...",eps_ad
6,knn,sym,0.008348,0.019085,3,k_nn=471,"[0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_sym
7,knn,rw,0.008348,0.019085,3,k_nn=471,"[0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_rw
8,knn,ad,0.008348,0.019085,3,k_nn=471,"[0, 2, 1, 1, 2, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, ...",knn_ad
9,mknn,sym,0.007839,1e-06,3,k_mknn=53,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",mknn_sym


In [23]:
# Finding the best results for each metric and their indices
best_silhouette_index = experiment_madelon['silhouette'].idxmax()
best_adjusted_rand_index = experiment_madelon['adjusted_rand'].idxmax()
adjusted_mutual_info_index = experiment_madelon['adjusted_mutual_info'].idxmax()

best_silhouette = experiment_madelon.loc[best_silhouette_index]
best_adjusted_rand = experiment_madelon.loc[best_adjusted_rand_index]
best_adjusted_mutual_info = experiment_madelon.loc[adjusted_mutual_info_index]

# Create a clearer DataFrame with the best results and their indices
best_results = pd.DataFrame({
    'Metric': ['Best Silhouette', 'Best Adjusted Rand', 'Best Adjusted Mutual Info'],
    'Index': [
        best_silhouette_index,
        best_adjusted_rand_index,
        adjusted_mutual_info_index
    ],
    'Graph': [
        best_silhouette['graph'],
        best_adjusted_rand['graph'],
        best_adjusted_mutual_info['graph']
    ],
    'Laplacian': [
        best_silhouette['laplacian'],
        best_adjusted_rand['laplacian'],
        best_adjusted_mutual_info['laplacian']
    ],
    'Silhouette Score': [
        best_silhouette['silhouette'],
        None,  # Placeholder for clarity
        None   # Placeholder for clarity
    ],
    'Adjusted Rand Index': [
        None,  # Placeholder for clarity
        best_adjusted_rand['adjusted_rand'],
        None   # Placeholder for clarity
    ],
    'Adjusted Mutual Info': [
        None,  # Placeholder for clarity
        None,  # Placeholder for clarity
        best_adjusted_mutual_info['adjusted_mutual_info']
    ],
    'Clusters': [
        best_silhouette['clusters'],
        best_adjusted_rand['clusters'],
        best_adjusted_mutual_info['clusters']
    ]
})

# Display the best results
best_results

NameError: name 'experiment_madelon' is not defined

In [None]:
# Create a figure and axis
fig, ax = plt.subplots(figsize=(20, 6))

# Define an offset to avoid overlap
offset = 0.05

# Plot silhouette coefficient with an offset
ax.plot(experiment_madelon['graph_laplacian'], 
        experiment_madelon['silhouette'] + offset, 
        marker='o', color='b', label='Silhouette Coefficient', markersize=6)

# Plot Adjusted Rand Index with an offset
ax.plot(experiment_madelon['graph_laplacian'], 
        experiment_madelon['adjusted_rand'], 
        marker='x', color='r', label='Adjusted Rand Index', markersize=6)

# Plot Adjusted Mutual Information with an offset
ax.plot(experiment_madelon['graph_laplacian'], 
        experiment_madelon['adjusted_mutual_info'] - offset, 
        marker='s', color='g', label='Adjusted Mutual Info', markersize=6)

# Set labels
ax.set_xlabel('Graph Laplacian')
ax.set_ylabel('Metric Values')

# Adjust y-axis limits dynamically
ax.set_ylim(-1, 1)  # Set limits to fit the offsets

# Add a title
plt.title('Experiment: Test Dataset')

# Add a legend
ax.legend(loc='upper left')

# Show grid for better readability
plt.grid()

# Show the plot
plt.show()

In [None]:
# Visualization function
def visualize_clusters(data, cluster_labels, title):
    """
    Visualize clustered data using PCA.
    
    Parameters:
    data (DataFrame): The original data used for clustering.
    cluster_labels (ndarray): Cluster labels from the clustering algorithm.
    title (str): Title of the plot.
    """
    # Reduce data to 2D using PCA if necessary
    if data.shape[1] > 2:
        pca = PCA(n_components=2)
        reduced_data = pca.fit_transform(data)
    else:
        reduced_data = data.values  # If data is already 2D
    
    # Create scatter plot with clusters
    plt.figure(figsize=(6, 4))
    scatter = plt.scatter(reduced_data[:, 0], reduced_data[:, 1], c=cluster_labels, cmap='viridis', alpha=0.7)
    
    # Adding title and labels
    plt.title(f'Cluster Visualization - {title}')
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.colorbar(scatter, label='Cluster Label')
    plt.show()

In [None]:
# Extracting cluster labels for best silhouette, adjusted rand, and adjusted mutual info indices
best_silhouette_clusters = cluster_labels[best_silhouette_index]
best_adjusted_rand_clusters = cluster_labels[best_adjusted_rand_index]
best_adjusted_mutual_info_clusters = cluster_labels[adjusted_mutual_info_index]

# Visualizing the best clusters for each metric
visualize_clusters(data_df, best_silhouette_clusters, 'Best Silhouette')
visualize_clusters(data_df, best_adjusted_rand_clusters, 'Best Adjusted Rand Index')
visualize_clusters(data_df, best_adjusted_mutual_info_clusters, 'Best Adjusted Mutual Info')