In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import math
import seaborn as sns

from scipy.spatial.distance import pdist, squareform,cdist
from scipy import linalg
import scipy.sparse as sp

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score,adjusted_rand_score
from sklearn.metrics.pairwise import cosine_distances

from skopt import gp_minimize
from skopt.space import Real, Integer, Categorical

In [2]:
# Define paths to your CSV files
path_20newsgroups_66 = "20newsgroups_66.csv"

# Read the CSV files into pandas DataFrames
newsgroups_66 = pd.read_csv(path_20newsgroups_66)
labels = newsgroups_66["target"] 
data_df = newsgroups_66.drop(columns=["Index","target"])  # Replace "target" with your actual target column name

In [3]:
set(labels)

{0, 1, 2}

In [4]:
# Function to calculate dynamic ranges based on data size and pairwise distances
def get_dynamic_search_space(data):
    # Number of points in the dataset
    n = data.shape[0]

    # Compute pairwise distances
    dist_mat = squareform(pdist(data))
    flat_distances = dist_mat[np.tril_indices(n, -1)]

    # Dynamic range for local_sigma (based on square root of n)
    local_sigma_min = max(1, int(np.sqrt(n) / 2))
    local_sigma_max = int(np.sqrt(n))
    
    # Dynamic range for epsilon (based on distance percentiles)
    epsilon_min = np.percentile(flat_distances, 80)  # 80th percentile
    epsilon_max = np.percentile(flat_distances, 95)  # 95th percentile
    
    # Dynamic range for k (based on number of data points)
    k_min = max(5, int(0.01 * n))  # 1% of dataset size, but at least 5
    k_max = min(int(0.2 * n), n - 1)  # 20% of dataset size, but never more than n-1
    
    # Ensure k_min does not exceed k_max
    if k_min > k_max:
        k_min = max(5, int(0.01 * n))  # Keep dynamic range based on percentage but within limits
    
    return (local_sigma_min, local_sigma_max), (epsilon_min, epsilon_max), (k_min, k_max)

# Optimization functions for each parameter

# Optimize local_sigma for "full" graph
def optimize_local_sigma(data, labels, laplacians, number_of_clusters):
    (local_sigma_min, local_sigma_max), _, _ = get_dynamic_search_space(data)

    def objective_local_sigma(local_sigma):
        silhouette_scores = []
        local_sigma = int(local_sigma[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="full", laplacian=laplacian, number_of_clusters=number_of_clusters, local_sigma=local_sigma)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping local_sigma={local_sigma} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_local_sigma, [(local_sigma_min, local_sigma_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        best_local_sigma = result.x[0]
        print(f"Best local sigma: {best_local_sigma}")
        return result
    else:
        print("No valid local_sigma found.")
        return None


# Optimize epsilon for "eps" graph
def optimize_epsilon(data, labels, laplacians, number_of_clusters):
    _, (epsilon_min, epsilon_max), _ = get_dynamic_search_space(data)

    def objective_epsilon(epsilon):
        silhouette_scores = []
        epsilon = float(epsilon[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="eps", laplacian=laplacian, number_of_clusters=number_of_clusters, epsilon=epsilon)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping epsilon={epsilon} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_epsilon, [(epsilon_min, epsilon_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        epsilon = result.x[0]
        print(f"Best epsilon: {epsilon}")
        return result
    else:
        print("No valid epsilon found.")
        return None


# Optimize k for "knn" graph
def optimize_k_knn(data, labels, laplacians, number_of_clusters):
    _, _, (k_min, k_max) = get_dynamic_search_space(data)

    def objective_k_knn(k):
        silhouette_scores = []
        k = int(k[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="knn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_knn=k)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping k={k} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_k_knn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        k_knn = result.x[0]
        print(f"Best k for knn: {k_knn}")
        return result
    else:
        print("No valid k for knn found.")
        return None


# Optimize k for "mknn" graph
def optimize_k_mknn(data, labels, laplacians, number_of_clusters):
    _, _, (k_min, k_max) = get_dynamic_search_space(data)

    def objective_k_mknn(k):
        silhouette_scores = []
        k = int(k[0])
        try:
            for laplacian in laplacians:
                results = spectral_clustering(data, labels, similarity_graph="mknn", laplacian=laplacian, number_of_clusters=number_of_clusters, k_mknn=k)
                silhouette_scores.append(results[0][0])
            return -np.mean(silhouette_scores)
        except (ValueError, np.linalg.LinAlgError) as e:
            print(f"Skipping k={k} due to error: {e}")
            return 1e6  # Return a large value to penalize the failed set of hyperparameters

    result = gp_minimize(objective_k_mknn, [(k_min, k_max)], n_calls=20, n_random_starts=10, random_state=42)

    if result.fun < 1e6:
        k_mknn = result.x[0]
        print(f"Best k for mknn: {k_mknn}")
        return result
    else:
        print("No valid k for mknn found.")
        return None


In [5]:
def spectral_clustering(dataframe, labels, similarity_graph, laplacian, number_of_clusters, local_sigma = None, epsilon = None, k_knn = None, k_mknn = None):
    
    # Pairwise distances
    dimension = dataframe.shape[0]
    sparse_dataframe = sp.csr_matrix(dataframe)
    dist_mat = cosine_distances(sparse_dataframe)

    sample_size = dist_mat.shape[0]
    
    rows, cols, weights = [], [], []  # Accumulate edges and weights
    
    if similarity_graph == "full":

        #calculate local sigma
        sigmas = np.zeros(dimension)
        for i in tqdm(range(len(dist_mat))):
            sigmas[i] = np.partition(dist_mat[i], local_sigma)[local_sigma]

        # Adjacency matrix with local sigma
        for i in tqdm(range(dimension), desc='Building full adjacency matrix'):
            for j in range(i + 1, dimension):  
                d = np.exp(-1 * dist_mat[i, j] ** 2 / (sigmas[i] * sigmas[j]))
                if d > 0:  # Only store non-zero weights
                    rows.append(i)
                    cols.append(j)
                    weights.append(d)
                    rows.append(j)
                    cols.append(i)
                    weights.append(d)
    
    elif similarity_graph == "eps":

        # Adjacency matrix with epsilon threshold
        for i in tqdm(range(dimension), desc='Building epsilon adjacency matrix'):
            for j in range(i + 1, dimension):
                if dist_mat[i, j] < epsilon:
                    rows.append(i)
                    cols.append(j)
                    rows.append(j)
                    cols.append(i)
        weights = np.ones(len(rows))  # All edges have equal weight (binary adjacency)

    elif similarity_graph == "knn":

        # Adjaceny matrix with k-neighbours
        for i in tqdm(range(dimension), desc='Building k-NN adjacency matrix'):
            # Sort distances and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_knn + 1]  # Exclude the node itself

            # Add directed edges from i to its k-nearest neighbors
            for neighbor in k_nearest_indices:
                rows.append(i)
                cols.append(neighbor)
                weights.append(1)


    elif similarity_graph == "mknn":

        # Adjaceny matrix with mutual k-neighbours
        for i in tqdm(range(dimension), desc='Building mutual k-NN adjacency matrix'):
            # Sort distances and get indices of the k nearest neighbors
            sorted_indices = np.argsort(dist_mat[i])
            k_nearest_indices = sorted_indices[1:k_mknn + 1]  # Exclude the node itself
        
            for neighbor in k_nearest_indices:
                # Only add edge if mutual (both are in each other's KNN)
                neighbor_sorted_indices = np.argsort(dist_mat[neighbor])
                if i in neighbor_sorted_indices[1:k_mknn + 1]:
                    rows.append(i)
                    cols.append(neighbor)
                    rows.append(neighbor)
                    cols.append(i)
                    weights.append(1)
                    weights.append(1)


    adjacency_matrix = sp.csr_matrix((weights, (rows, cols)), shape=(dimension, dimension))

    # Degree matrix (sparse)
    degree_vector = np.ravel(adjacency_matrix.sum(axis=1))
    degree_matrix = sp.diags(degree_vector)  # Sparse diagonal matrix

    # # Identify isolated nodes (degree = 0)
    # isolated_nodes = np.where(degree_matrix.diagonal() == 0)[0]
    # non_isolated_mask = np.isin(np.arange(dimension), isolated_nodes, invert=True)

    # # Handle non-isolated nodes only for clustering
    # adjacency_matrix_non_isolated = adjacency_matrix[non_isolated_mask][:, non_isolated_mask]
    
    # degree_vector_non_isolated = np.ravel(adjacency_matrix_non_isolated.sum(axis=1))

    # if laplacian == "sym":

    #     # Normalized Symmetric laplacian matrix
    #     d_half = sp.diags(1.0 / np.sqrt(degree_vector_non_isolated))
    #     laplacian_matrix_normalized = d_half @ adjacency_matrix_non_isolated @ d_half
        
    # if laplacian == "rw":

    #     # Normalized Random Walk laplacian matrix
    #     d_inverse = sp.diags(1.0 / degree_vector_non_isolated)
    #     laplacian_matrix_normalized = d_inverse @ adjacency_matrix_non_isolated

    # if laplacian == "ad":
        
    #     # Adaptive Laplacian matrix
    #     D_local = np.zeros_like(degree_vector_non_isolated)
        
    #     for i in range(len(degree_vector_non_isolated)):
    #         neighbors = adjacency_matrix_non_isolated[i].nonzero()[1]
    #         if len(neighbors) > 0:
    #             D_local[i] = adjacency_matrix_non_isolated[neighbors].sum() / degree_vector_non_isolated[i]
    #         else:
    #             D_local[i] = 0
        
    #     D_local_inv = sp.diags(1.0 / np.sqrt(D_local))
    #     laplacian_matrix_normalized = D_local_inv @ adjacency_matrix_non_isolated @ D_local_inv

    # laplacian_matrix_normalized_densed = laplacian_matrix_normalized.toarray()

    if laplacian == "sym":

        # Normalized Symmetric Laplacian matrix
        d_inv_sqrt = np.zeros_like(degree_vector, dtype=float)
        nonzero = degree_vector > 0
        d_inv_sqrt[nonzero] = 1.0 / np.sqrt(degree_vector[nonzero])
        # Create sparse diagonal matrix
        d_half = sp.diags(d_inv_sqrt)
        laplacian_matrix_normalized = d_half @ adjacency_matrix @ d_half
    
    elif laplacian == "rw":
    
        # Normalized Random Walk Laplacian matrix
        d_inv = np.zeros_like(degree_vector, dtype=float)
        nonzero = degree_vector > 0
        d_inv[nonzero] = 1.0 / degree_vector[nonzero]
        # Create sparse diagonal matrix
        d_inverse = sp.diags(d_inv)
        laplacian_matrix_normalized = d_inverse @ adjacency_matrix
    
    elif laplacian == "ad":
    
        # Adaptive Laplacian matrix
        D_local = np.zeros_like(degree_vector, dtype=float)
        for i in range(len(degree_vector)):
            neighbors = adjacency_matrix[i].nonzero()[1]
            if len(neighbors) > 0 and degree_vector[i] > 0:
                D_local[i] = degree_vector[neighbors].sum() / degree_vector[i]
            else:
                D_local[i] = 0.0
        D_local_inv_sqrt = np.zeros_like(D_local, dtype=float)
        nonzero = D_local > 0
        D_local_inv_sqrt[nonzero] = 1.0 / np.sqrt(D_local[nonzero])
        # Create sparse diagonal matrix
        D_local_inv = sp.diags(D_local_inv_sqrt)
        laplacian_matrix_normalized = D_local_inv @ adjacency_matrix @ D_local_inv

    laplacian_matrix_normalized_densed = laplacian_matrix_normalized.toarray()
    
    if check_symmetric(laplacian_matrix_normalized_densed) :
        # Calculating eigenvalues and eigenvectors for symmetric matrix
        e, v = np.linalg.eigh(laplacian_matrix_normalized_densed)
    else:
        # Calculating eigenvalues and eigenvectors for non-symmetric matrix
        e, v = np.linalg.eig(laplacian_matrix_normalized_densed)
        idx = np.argsort(np.real(e))
        e = np.real(e[idx])
        v = np.real(v[:, idx]) 

    # Calculate eigengap
    eigengap = np.diff(e)
    optimal_number_of_clusters = np.argmax(eigengap[:10]) + 1   

    if number_of_clusters != None:
        # First case: k
        n_clusters = max(number_of_clusters,2)
    else:
        # Second case: optimal number of clusters from eigengap
        n_clusters = max(optimal_number_of_clusters,2)
    
    results = []

    # KMeans clustering
    X = v[:, -n_clusters:]
    clustering = KMeans(n_clusters=n_clusters, random_state=42, n_init=100)
    cluster_labels = clustering.fit_predict(X)
    
    sil_score = silhouette_score(dataframe, cluster_labels)
    ar_score = adjusted_rand_score(labels, cluster_labels)

    results.append((sil_score, ar_score, n_clusters,cluster_labels))

    return results
    
    # for i, current_k in enumerate(k_values):
    #     if i == 0:
    #         # First case: k
    #         X = v[:, -number_of_clusters:]
    #         n_clusters = number_of_clusters
    #     elif i == 1:
    #         # Second case: 2k
    #         X = v[:, -(2*number_of_clusters):]
    #         n_clusters = number_of_clusters
    #     else:
    #         # Third case: optimal number of clusters from eigengap
    #         X = v[:, -optimal_number_of_clusters:]
    #         n_clusters = optimal_number_of_clusters
    
    #     # KMeans clustering
    #     clustering = KMeans(n_clusters=n_clusters, random_state=42)
    #     non_isolated_labels = clustering.fit_predict(X)

    #     # Full label assignment
    #     full_labels = np.full(dimension, -1)
    #     full_labels[non_isolated_mask] = non_isolated_labels
    
    #     # Assign isolated nodes based on nearest non-isolated neighbors
    #     if len(isolated_nodes) > 0:
    #         isolated_points = dataframe.iloc[isolated_nodes]
    
    #         # Calculate distances to non-isolated points in original space
    #         non_isolated_points = dataframe.iloc[non_isolated_mask]
    #         distances_to_non_isolated = cdist(isolated_points, non_isolated_points, metric='cosine')
    
    #         # Assign labels based on the closest non-isolated point
    #         nearest_non_isolated_indices = np.argmin(distances_to_non_isolated, axis=1)
    #         isolated_labels = full_labels[non_isolated_mask][nearest_non_isolated_indices]
    #         full_labels[isolated_nodes] = isolated_labels

    #     # Calculate evaluation metrics
    #     sil_score = silhouette_coefficient(dataframe, full_labels)[3]
    #     ar_score = adjusted_rand_score(labels, full_labels)
    #     am_score =  adjusted_mutual_info_score(labels, full_labels)

    #     results.append((sil_score, ar_score, am_score, current_k,full_labels))

    # return results

In [6]:
def check_symmetric(a, rtol=1e-05, atol=1e-08):
    return np.allclose(a, a.T, rtol=rtol, atol=atol)

In [7]:
# # Call the optimization functions
laplacian_methods = ["sym", "rw", "ad"]
number_of_clusters = 3

# Optimize local_sigma for "full" graph
result_local_sigma = optimize_local_sigma(data_df, labels, laplacian_methods, number_of_clusters)
best_local_sigma = result_local_sigma.x[0]

# Optimize epsilon for "eps" graph
result_epsilon = optimize_epsilon(data_df, labels, laplacian_methods, number_of_clusters)
best_epsilon = round(result_epsilon.x[0], 3)

# Optimize k for "knn" graph
result_k_knn = optimize_k_knn(data_df, labels, laplacian_methods, number_of_clusters)
best_k_knn = result_k_knn.x[0]

# Optimize k for "mknn" graph
result_k_mknn = optimize_k_mknn(data_df, labels, laplacian_methods, number_of_clusters)
best_k_mknn = result_k_mknn.x[0]

100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 59390.23it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 840.26it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 53991.63it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 835.89it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 53566.91it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 847.30it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 56373.33it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 833.29it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 56439.89it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 834.50it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 53867.96it/s]
Building full adjacency matrix: 100%|██████| 1755/1755 [00:02<00:00, 838.20it/s]
100%|███████████████████████

Best local sigma: 37


Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3689.24it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3538.52it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3559.32it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3520.80it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3523.12it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3556.01it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3552.76it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3519.19it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3545.80it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3517.46it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3511.56it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 3538.90it/s]
Building epsilon adjacency m

Best epsilon: 1283.042013607922


Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 7197.72it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 6865.79it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 6968.74it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 8111.23it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 8459.39it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 8327.88it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 6894.14it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 6738.88it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 6744.19it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 7316.13it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 7301.95it/s]
Building k-NN adjacency matrix: 100%|█████| 1755/1755 [00:00<00:00, 7331.11it/s]
Building k-NN adjacency matr

Best k for knn: 50


Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [1:44:50<00:00,  3.58s/
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:49<00:00, 35.77it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:48<00:00, 35.87it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:13<00:00, 133.11it/
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:13<00:00, 132.33it/
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:13<00:00, 131.25it/
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:48<00:00, 36.31it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:47<00:00, 36.88it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:48<00:00, 36.47it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:37<00:00, 47.05it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:36<00:00, 47.99it/s
Building mutual k-NN adjacency matrix: 100%|█| 1755/1755 [00:36<00:00, 47.95it/s
Building mutual k-NN adjacen

Best k for mknn: 50


In [8]:
similarity_graphs = ["full", "eps", "knn", "mknn"]
laplacian_methods = ["sym", "rw","ad"]
number_of_clusters = 3
# best_local_sigma = 9
# best_epsilon = 1.141
# best_k_knn = 20
# best_k_mknn = 24

silhouette_scores = []
adjusted_rand_scores = []
clusters = []
sim_graph = []
laplacian = []
cluster_labels = []
hyperparameters = []
adj_files = []
laplacian_files = []
X_files = []

In [9]:
for graph in similarity_graphs:

    for laplace in laplacian_methods:
        metrics = spectral_clustering(data_df, labels, graph, laplace, number_of_clusters, best_local_sigma, best_epsilon, best_k_knn, best_k_mknn)

        for si, ar, cl, l in metrics:
            sim_graph.append(graph)
            laplacian.append(laplace)
            silhouette_scores.append(si)
            adjusted_rand_scores.append(ar)
            clusters.append(cl)
            cluster_labels.append(l)
            # Append consolidated hyperparameters for each similarity graph type
            if graph == "full":
                hyperparameters.append(f"local_sigma={best_local_sigma}")
            elif graph == "eps":
                hyperparameters.append(f"epsilon={best_epsilon}")
            elif graph == "knn":
                hyperparameters.append(f"k_nn={best_k_knn}")
            elif graph == "mknn":
                hyperparameters.append(f"k_mknn={best_k_mknn}")
            else:
                hyperparameters.append("None")

100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 46730.30it/s]
Building full adjacency matrix: 100%|█████| 1755/1755 [00:01<00:00, 1362.50it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 73816.72it/s]
Building full adjacency matrix: 100%|█████| 1755/1755 [00:01<00:00, 1407.07it/s]
100%|████████████████████████████████████| 1755/1755 [00:00<00:00, 67983.70it/s]
Building full adjacency matrix: 100%|█████| 1755/1755 [00:01<00:00, 1426.15it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 5154.85it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 4929.00it/s]
Building epsilon adjacency matrix: 100%|██| 1755/1755 [00:00<00:00, 5131.11it/s]
Building k-NN adjacency matrix: 100%|████| 1755/1755 [00:00<00:00, 11991.96it/s]
Building k-NN adjacency matrix: 100%|████| 1755/1755 [00:00<00:00, 11899.21it/s]
Building k-NN adjacency matrix: 100%|████| 1755/1755 [00:00<00:00, 11584.69it/s]
Building mutual k-NN adjacen

In [10]:
experiment_20newsgroups = pd.DataFrame(list(zip(sim_graph,laplacian,silhouette_scores,adjusted_rand_scores,clusters, hyperparameters, cluster_labels)),
             columns= ["graph","laplacian", "silhouette", "adjusted_rand","number_of_clusters","hyperparameters", "cluster_labels"])
experiment_20newsgroups["graph_laplacian"] = experiment_20newsgroups["graph"] + "_" + experiment_20newsgroups["laplacian"]
experiment_20newsgroups

Unnamed: 0,graph,laplacian,silhouette,adjusted_rand,number_of_clusters,hyperparameters,cluster_labels,graph_laplacian
0,full,sym,0.239562,0.031459,3,local_sigma=37,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, ...",full_sym
1,full,rw,0.179558,9e-05,3,local_sigma=37,"[1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, ...",full_rw
2,full,ad,0.200517,0.156528,3,local_sigma=37,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",full_ad
3,eps,sym,0.174639,1e-06,3,epsilon=1283.042,"[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",eps_sym
4,eps,rw,0.174639,1e-06,3,epsilon=1283.042,"[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",eps_rw
5,eps,ad,0.174639,1e-06,3,epsilon=1283.042,"[0, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",eps_ad
6,knn,sym,0.063477,0.02486,3,k_nn=50,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",knn_sym
7,knn,rw,0.063477,0.02486,3,k_nn=50,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",knn_rw
8,knn,ad,0.063477,0.02486,3,k_nn=50,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...",knn_ad
9,mknn,sym,-0.153034,0.000147,3,k_mknn=50,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",mknn_sym


In [None]:
# Save the DataFrame to a CSV file
experiment_20newsgroups.to_csv('experiment_20newsgroups_66.csv', index=False)

# File is now saved in the current working directory
print("CSV file saved as 'experiment_20newsgroups_66.csv'")

In [11]:
# Finding the best results for each metric and their indices
best_silhouette_index = experiment_20newsgroups['silhouette'].idxmax()
best_adjusted_rand_index = experiment_20newsgroups['adjusted_rand'].idxmax()

best_silhouette = experiment_20newsgroups.loc[best_silhouette_index]
best_adjusted_rand = experiment_20newsgroups.loc[best_adjusted_rand_index]

# Create a clearer DataFrame with the best results and their indices
best_results = pd.DataFrame({
    'Metric': ['Best Silhouette', 'Best Adjusted Rand'],
    'Index': [
        best_silhouette_index,
        best_adjusted_rand_index
    ],
    'Graph': [
        best_silhouette['graph'],
        best_adjusted_rand['graph']
    ],
    'Laplacian': [
        best_silhouette['laplacian'],
        best_adjusted_rand['laplacian']
    ],
    'Silhouette Score': [
        best_silhouette['silhouette'],
        None,  # Placeholder for clarity
    ],
    'Adjusted Rand Index': [
        None,  # Placeholder for clarity
        best_adjusted_rand['adjusted_rand']
    ]
})

# Display the best results
best_results

Unnamed: 0,Metric,Index,Graph,Laplacian,Silhouette Score,Adjusted Rand Index
0,Best Silhouette,0,full,sym,0.239562,
1,Best Adjusted Rand,2,full,ad,,0.156528
