In [None]:
# FCM

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from algorithms.fcm import FCM
from algorithms.gk import GK
from algorithms.nPyFCM import nPyFMC
from algorithms.nPyGK import nPyGK
from tqdm import tqdm

from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,

)

def fuzzy_indices(X, membership, centers, m=2):
    """Compute three internal validity indices for a fuzzy partition.

    Parameters
    ----------
    X : ndarray
        Data matrix, one row per sample.
    membership : ndarray
        Fuzzy membership matrix, indexed here as ``membership[:, k]``
        (one column per cluster).
    centers : ndarray
        Cluster centers, one row per cluster.
    m : float, default 2
        Fuzzifier exponent applied to the memberships in the compactness
        and Fukuyama-Sugeno terms.

    Returns
    -------
    tuple
        ``(pc, xbi, fsi)``: partition coefficient, Xie-Beni index and
        Fukuyama-Sugeno index.
    """
    sample_count = X.shape[0]
    cluster_count = centers.shape[0]

    # Partition coefficient: mean squared membership per sample.
    partition_coefficient = np.sum(membership ** 2) / sample_count

    # Fuzzy within-cluster scatter, accumulated cluster by cluster.
    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[:, k] ** m
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    # Smallest squared distance between any two distinct centers.
    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xie_beni = compactness / (sample_count * min_gap)
    else:
        # Coincident centers: the index is unbounded.
        xie_beni = np.inf

    # Fukuyama-Sugeno: compactness minus weighted spread of centers
    # around the global data centroid.
    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[:, k] ** m)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fukuyama_sugeno = compactness - np.sum(spread_terms)

    return partition_coefficient, xie_beni, fukuyama_sugeno


def hard_indices(X, labels, true_labels):
    """Score a crisp clustering with internal and label-based indices.

    The hard labels are turned into a 0/1 membership matrix so that the
    partition coefficient, Xie-Beni and Fukuyama-Sugeno indices can be
    computed with the same formulas as for fuzzy partitions (exponent 2).

    Returns
    -------
    tuple or list
        ``(pc, xbi, fsi, ss, ars, ami, h, c, v)``. A list of nine NaNs is
        returned when fewer than two clusters are present or when the
        noise label ``-1`` appears in ``labels``.
    """
    sample_count = X.shape[0]
    cluster_ids = np.unique(labels)

    has_noise = -1 in cluster_ids
    if len(cluster_ids) < 2 or has_noise:
        return [np.nan] * 9

    cluster_count = len(cluster_ids)

    # Centroid of each cluster, in the sorted order of the cluster ids.
    centroid_rows = []
    for cluster_id in cluster_ids:
        centroid_rows.append(X[labels == cluster_id].mean(axis=0))
    centers = np.array(centroid_rows)

    # One-hot membership: row i has a 1 in the column of its cluster id.
    membership = (np.asarray(labels)[:, None] == cluster_ids[None, :]).astype(float)

    pc = np.sum(membership ** 2) / sample_count

    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[:, k] ** 2
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xbi = compactness / (sample_count * min_gap)
    else:
        xbi = np.inf

    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[:, k] ** 2)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fsi = compactness - np.sum(spread_terms)

    # Silhouette is internal; the remaining scores compare against true_labels.
    ss = silhouette_score(X, labels)
    ars = adjusted_rand_score(true_labels, labels)
    ami = adjusted_mutual_info_score(true_labels, labels)
    h, c, v = homogeneity_completeness_v_measure(true_labels, labels)

    return pc, xbi, fsi, ss, ars, ami, h, c, v

In [None]:
def bootstrap_clustering(X, y, n_clusters=3, n_iter=100):
    """Bootstrap-evaluate FCM clustering and summarise nine validity scores.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample.
    y : ndarray
        Integer-encoded ground-truth labels aligned with ``X``.
    n_clusters : int, default 3
        Number of clusters requested from FCM.
    n_iter : int, default 100
        Number of bootstrap resamples (sampling rows with replacement).

    Returns
    -------
    dict
        ``{'FCM': {'mean': ..., 'std': ...}}`` where each array holds, in
        order, PC, XBI, FSI, silhouette, ARI, AMI, homogeneity,
        completeness and V-measure, rounded to 4 decimals.
    """
    results = {name: [] for name in ['FCM']}

    for _ in tqdm(range(n_iter)):
        idx = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[idx]
        y_sample = y[idx]

        # FCM: honour the requested cluster count instead of a hard-coded 3.
        fcm = FCM(n_clusters=n_clusters, m=2, max_iter=300)
        # The return value of fit() is used as the cluster centers and
        # predict() as the membership matrix (one column per cluster,
        # since the hard labels are taken with argmax over the last axis).
        fcm_centers = fcm.fit(X_sample)
        fcm_membership = fcm.predict(X_sample)
        pc, xbi, fsi = fuzzy_indices(X_sample, fcm_membership, fcm_centers)
        labels = np.argmax(fcm_membership, axis=-1)
        # silhouette_score raises when only one cluster is populated; record
        # NaN for that replicate (the summary below is NaN-aware) rather
        # than aborting the whole bootstrap run.
        if len(np.unique(labels)) > 1:
            ss = silhouette_score(X_sample, labels)
        else:
            ss = np.nan
        ars = adjusted_rand_score(y_sample, labels)
        ami = adjusted_mutual_info_score(y_sample, labels)
        h, c, v = homogeneity_completeness_v_measure(y_sample, labels)
        results['FCM'].append((pc, xbi, fsi, ss, ars, ami, h, c, v))

    # Summarize results
    summary = {}
    for method, scores in results.items():
        scores = np.array(scores, dtype=np.float64)
        summary[method] = {
            'mean': np.nanmean(scores, axis=0).round(4),
            'std': np.nanstd(scores, axis=0).round(4),
        }
    return summary

# Run the evaluation
# Load the iris CSV: every column except the last is used as a feature,
# the last column is used as the class label.
df = pd.read_csv("Data/iris.csv")
X_data = df.iloc[:, :-1].values  # Features
true_labels = df.iloc[:, -1].values  # True labels

# Encode the class labels as integers for the label-based scores.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(true_labels)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X_data)
# 100 bootstrap resamples with 3 clusters; the bare `result` below makes the
# summary dict the displayed output of the cell.
result = bootstrap_clustering(X, y, n_clusters=3, n_iter=100)
result

In [None]:
# Clustering methods other than fuzzy c-means, evaluated on the iris dataset

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.cluster import KMeans,DBSCAN, SpectralClustering, AgglomerativeClustering
from algorithms.gk import GK
from algorithms.nPyFCM import nPyFCM
from algorithms.nPyGK import nPyGK
from tqdm import tqdm

from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,

)

def fuzzy_indices(X, membership, centers, m=2):
    """Compute three internal validity indices for a fuzzy partition.

    Parameters
    ----------
    X : ndarray
        Data matrix, one row per sample.
    membership : ndarray
        Fuzzy membership matrix, indexed here as ``membership[k, :]``
        (one row per cluster).
    centers : ndarray
        Cluster centers, one row per cluster.
    m : float, default 2
        Fuzzifier exponent applied to the memberships in the compactness
        and Fukuyama-Sugeno terms.

    Returns
    -------
    tuple
        ``(pc, xbi, fsi)``: partition coefficient, Xie-Beni index and
        Fukuyama-Sugeno index.
    """
    sample_count = X.shape[0]
    cluster_count = centers.shape[0]

    # Partition coefficient: mean squared membership per sample.
    partition_coefficient = np.sum(membership ** 2) / sample_count

    # Fuzzy within-cluster scatter, accumulated cluster by cluster.
    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[k, :] ** m
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    # Smallest squared distance between any two distinct centers.
    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xie_beni = compactness / (sample_count * min_gap)
    else:
        # Coincident centers: the index is unbounded.
        xie_beni = np.inf

    # Fukuyama-Sugeno: compactness minus weighted spread of centers
    # around the global data centroid.
    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[k, :] ** m)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fukuyama_sugeno = compactness - np.sum(spread_terms)

    return partition_coefficient, xie_beni, fukuyama_sugeno


def hard_indices(X, labels, true_labels):
    """Score a crisp clustering with internal and label-based indices.

    The hard labels are turned into a 0/1 membership matrix so that the
    partition coefficient, Xie-Beni and Fukuyama-Sugeno indices can be
    computed with the same formulas as for fuzzy partitions (exponent 2).

    Returns
    -------
    tuple or list
        ``(pc, xbi, fsi, ss, ars, ami, h, c, v)``. A list of nine NaNs is
        returned when fewer than two clusters are present or when the
        noise label ``-1`` appears in ``labels``.
    """
    sample_count = X.shape[0]
    cluster_ids = np.unique(labels)

    has_noise = -1 in cluster_ids
    if len(cluster_ids) < 2 or has_noise:
        return [np.nan] * 9

    cluster_count = len(cluster_ids)

    # Centroid of each cluster, in the sorted order of the cluster ids.
    centroid_rows = []
    for cluster_id in cluster_ids:
        centroid_rows.append(X[labels == cluster_id].mean(axis=0))
    centers = np.array(centroid_rows)

    # One-hot membership: row i has a 1 in the column of its cluster id.
    membership = (np.asarray(labels)[:, None] == cluster_ids[None, :]).astype(float)

    pc = np.sum(membership ** 2) / sample_count

    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[:, k] ** 2
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xbi = compactness / (sample_count * min_gap)
    else:
        xbi = np.inf

    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[:, k] ** 2)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fsi = compactness - np.sum(spread_terms)

    # Silhouette is internal; the remaining scores compare against true_labels.
    ss = silhouette_score(X, labels)
    ars = adjusted_rand_score(true_labels, labels)
    ami = adjusted_mutual_info_score(true_labels, labels)
    h, c, v = homogeneity_completeness_v_measure(true_labels, labels)

    return pc, xbi, fsi, ss, ars, ami, h, c, v



def _score_fuzzy_partition(X_sample, y_sample, membership, centers):
    """Return the nine scores for one fuzzy partition (membership indexed [cluster, sample])."""
    pc, xbi, fsi = fuzzy_indices(X_sample, membership, centers)
    labels = np.argmax(membership, axis=0)
    # silhouette_score raises when only one cluster is populated; record NaN
    # for that replicate instead of aborting the whole bootstrap run.
    if len(np.unique(labels)) > 1:
        ss = silhouette_score(X_sample, labels)
    else:
        ss = np.nan
    ars = adjusted_rand_score(y_sample, labels)
    ami = adjusted_mutual_info_score(y_sample, labels)
    h, c, v = homogeneity_completeness_v_measure(y_sample, labels)
    return (pc, xbi, fsi, ss, ars, ami, h, c, v)


def bootstrap_clustering(X, y, n_clusters=3, n_iter=100):
    """Bootstrap-evaluate several clustering methods on resamples of ``X``.

    Each iteration draws ``len(X)`` rows with replacement and scores GK,
    nPyFCM, nPyGK (fuzzy), DBSCAN, spectral and agglomerative clustering
    on that same resample.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample.
    y : ndarray
        Integer-encoded ground-truth labels aligned with ``X``.
    n_clusters : int, default 3
        Number of clusters requested from every method that takes one
        (DBSCAN does not).
    n_iter : int, default 100
        Number of bootstrap resamples.

    Returns
    -------
    dict
        ``{method: {'mean': ndarray, 'std': ndarray}}`` with, in order,
        PC, XBI, FSI, silhouette, ARI, AMI, homogeneity, completeness and
        V-measure. NaN replicates are ignored by the NaN-aware statistics.
    """
    results = {name: [] for name in ['GK','DBSCAN', 'Spectral', 'Hierarchical','nPyFCM','nPyGK']}

    # (result key, estimator class, method-specific keyword arguments).
    # Models are built lazily inside the loop so construction and fitting
    # keep the original GK -> nPyFCM -> nPyGK order.
    fuzzy_specs = [
        ('GK', GK, {'max_iter': 300}),
        ('nPyFCM', nPyFCM, {'max_iter': 100, 'n_pyth': 3, 'alpha': 0.514286}),
        ('nPyGK', nPyGK, {'max_iter': 100, 'n_pyth': 5, 'alpha': 1.8}),
    ]

    for _ in tqdm(range(n_iter)):
        idx = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[idx]
        y_sample = y[idx]

        # Fuzzy methods: fit() is used as the centers, predict() as memberships.
        for name, model_cls, extra in fuzzy_specs:
            model = model_cls(n_clusters=n_clusters, m=2, **extra)
            centers = model.fit(X_sample)
            membership = model.predict(X_sample)
            results[name].append(
                _score_fuzzy_partition(X_sample, y_sample, membership, centers)
            )

        # DBSCAN: cluster the resample itself. The original predicted on the
        # full X, so labels and noise mask did not correspond to X_sample.
        labels = DBSCAN(eps=0.6, min_samples=4).fit_predict(X_sample)
        # Filter noise
        mask = labels != -1
        results['DBSCAN'].append(
            hard_indices(X_sample[mask], labels[mask], y_sample[mask])
        )

        # Spectral Clustering
        sc = SpectralClustering(n_clusters=n_clusters, affinity='nearest_neighbors').fit(X_sample)
        results['Spectral'].append(hard_indices(X_sample, sc.labels_, y_sample))

        # Hierarchical
        ac = AgglomerativeClustering(n_clusters=n_clusters).fit(X_sample)
        results['Hierarchical'].append(hard_indices(X_sample, ac.labels_, y_sample))

    # Summarize results. The print precision is a global NumPy setting, so
    # it is set once rather than on every pass of the loop below.
    np.set_printoptions(precision=4)
    summary = {}
    for method, scores in results.items():
        scores = np.array(scores, dtype=np.float64)
        summary[method] = {
            'mean': np.nanmean(scores, axis=0),
            'std': np.nanstd(scores, axis=0),
        }
    return summary

# Run the evaluation
# Load the iris CSV: every column except the last is used as a feature,
# the last column is used as the class label.
df = pd.read_csv("Data/iris.csv")
X_data = df.iloc[:, :-1].values  # Features
true_labels = df.iloc[:, -1].values  # True labels

# Encode the class labels as integers for the label-based scores.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(true_labels)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X_data)
# 100 bootstrap resamples with 3 clusters; the bare `result` below makes the
# per-method summary dict the displayed output of the cell.
result = bootstrap_clustering(X, y, n_clusters=3, n_iter=100)
result

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from sklearn.cluster import DBSCAN
from tqdm import tqdm

from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,
)



def hard_indices(X, labels, true_labels):
    """Score a crisp clustering with internal and label-based indices.

    The hard labels are turned into a 0/1 membership matrix so that the
    partition coefficient, Xie-Beni and Fukuyama-Sugeno indices can be
    computed with the same formulas as for fuzzy partitions (exponent 2).

    Returns
    -------
    tuple or list
        ``(pc, xbi, fsi, ss, ars, ami, h, c, v)``. A list of nine NaNs is
        returned when fewer than two clusters are present or when the
        noise label ``-1`` appears in ``labels``.
    """
    sample_count = X.shape[0]
    cluster_ids = np.unique(labels)

    has_noise = -1 in cluster_ids
    if len(cluster_ids) < 2 or has_noise:
        return [np.nan] * 9

    cluster_count = len(cluster_ids)

    # Centroid of each cluster, in the sorted order of the cluster ids.
    centroid_rows = []
    for cluster_id in cluster_ids:
        centroid_rows.append(X[labels == cluster_id].mean(axis=0))
    centers = np.array(centroid_rows)

    # One-hot membership: row i has a 1 in the column of its cluster id.
    membership = (np.asarray(labels)[:, None] == cluster_ids[None, :]).astype(float)

    pc = np.sum(membership ** 2) / sample_count

    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[:, k] ** 2
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xbi = compactness / (sample_count * min_gap)
    else:
        xbi = np.inf

    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[:, k] ** 2)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fsi = compactness - np.sum(spread_terms)

    # Silhouette is internal; the remaining scores compare against true_labels.
    ss = silhouette_score(X, labels)
    ars = adjusted_rand_score(true_labels, labels)
    ami = adjusted_mutual_info_score(true_labels, labels)
    h, c, v = homogeneity_completeness_v_measure(true_labels, labels)

    return pc, xbi, fsi, ss, ars, ami, h, c, v


# Load the iris CSV: every column except the last is used as a feature,
# the last column is used as the class label.
df = pd.read_csv("Data/iris.csv")
X_data = df.iloc[:, :-1].values  # Features
true_labels = df.iloc[:, -1].values  # True labels

# Encode the class labels as integers for the label-based scores.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(true_labels)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X_data)


def bootstrap_clustering(X, y, n_clusters=3, n_iter=100):
    """Bootstrap-evaluate DBSCAN and summarise nine validity scores.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample.
    y : ndarray
        Integer-encoded ground-truth labels aligned with ``X``.
    n_clusters : int, default 3
        Accepted for signature parity with the other evaluation cells;
        DBSCAN does not take a cluster count, so it is not used here.
    n_iter : int, default 100
        Number of bootstrap resamples (sampling rows with replacement).

    Returns
    -------
    dict
        ``{'DBSCAN': {'mean': ..., 'std': ...}}`` over all resamples, with
        scores in the order returned by ``hard_indices`` and rounded to 4
        decimals. Replicates for which ``hard_indices`` returns NaNs are
        ignored by the NaN-aware statistics.
    """
    results = {name: [] for name in ['DBSCAN']}

    for _ in tqdm(range(n_iter)):
        idx = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[idx]
        y_sample = y[idx]

        # DBSCAN: this block must run once per resample. It was previously
        # dedented out of the loop, so only the last resample was scored,
        # and it predicted on the full X instead of the resample.
        labels = DBSCAN(eps=0.6, min_samples=4).fit_predict(X_sample)

        # Filter noise
        mask = labels != -1
        X_filtered = X_sample[mask]
        labels_filtered = labels[mask]
        y_filtered = y_sample[mask]
        results['DBSCAN'].append(hard_indices(X_filtered, labels_filtered, y_filtered))

    summary = {}
    for method, scores in results.items():
        scores = np.array(scores, dtype=np.float64)
        summary[method] = {
            'mean': np.nanmean(scores, axis=0).round(4),
            'std': np.nanstd(scores, axis=0).round(4),
        }
    return summary

# Run the DBSCAN evaluation on the standardized iris features.
df = pd.read_csv("Data/iris.csv")
X_data = df.iloc[:, :-1].values  # Features
true_labels = df.iloc[:, -1].values  # True labels

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(true_labels)

# Standardize features. The result must be bound to `X`, the name passed to
# bootstrap_clustering below; it was previously stored in `X_sample`, so the
# call silently relied on an `X` left over from earlier in the cell.
scaler = StandardScaler()
X = scaler.fit_transform(X_data)

result = bootstrap_clustering(X, y, n_clusters=3, n_iter=100)
result

In [None]:
# nPyGK evaluation on the iris dataset (bootstrap scoring and parameter-sweep helpers)

import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from algorithms.nPyGK import nPyGK
from tqdm import tqdm

from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,

)

def fuzzy_indices(X, membership, centers, m=2):
    """Compute three internal validity indices for a fuzzy partition.

    Parameters
    ----------
    X : ndarray
        Data matrix, one row per sample.
    membership : ndarray
        Fuzzy membership matrix, indexed here as ``membership[k, :]``
        (one row per cluster).
    centers : ndarray
        Cluster centers, one row per cluster.
    m : float, default 2
        Fuzzifier exponent applied to the memberships in the compactness
        and Fukuyama-Sugeno terms.

    Returns
    -------
    tuple
        ``(pc, xbi, fsi)``: partition coefficient, Xie-Beni index and
        Fukuyama-Sugeno index.
    """
    sample_count = X.shape[0]
    cluster_count = centers.shape[0]

    # Partition coefficient: mean squared membership per sample.
    partition_coefficient = np.sum(membership ** 2) / sample_count

    # Fuzzy within-cluster scatter, accumulated cluster by cluster.
    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[k, :] ** m
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    # Smallest squared distance between any two distinct centers.
    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xie_beni = compactness / (sample_count * min_gap)
    else:
        # Coincident centers: the index is unbounded.
        xie_beni = np.inf

    # Fukuyama-Sugeno: compactness minus weighted spread of centers
    # around the global data centroid.
    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[k, :] ** m)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fukuyama_sugeno = compactness - np.sum(spread_terms)

    return partition_coefficient, xie_beni, fukuyama_sugeno


def hard_indices(X, labels, true_labels):
    """Score a crisp clustering with internal and label-based indices.

    The hard labels are turned into a 0/1 membership matrix so that the
    partition coefficient, Xie-Beni and Fukuyama-Sugeno indices can be
    computed with the same formulas as for fuzzy partitions (exponent 2).

    Returns
    -------
    tuple or list
        ``(pc, xbi, fsi, ss, ars, ami, h, c, v)``. A list of nine NaNs is
        returned when fewer than two clusters are present or when the
        noise label ``-1`` appears in ``labels``.
    """
    sample_count = X.shape[0]
    cluster_ids = np.unique(labels)

    has_noise = -1 in cluster_ids
    if len(cluster_ids) < 2 or has_noise:
        return [np.nan] * 9

    cluster_count = len(cluster_ids)

    # Centroid of each cluster, in the sorted order of the cluster ids.
    centroid_rows = []
    for cluster_id in cluster_ids:
        centroid_rows.append(X[labels == cluster_id].mean(axis=0))
    centers = np.array(centroid_rows)

    # One-hot membership: row i has a 1 in the column of its cluster id.
    membership = (np.asarray(labels)[:, None] == cluster_ids[None, :]).astype(float)

    pc = np.sum(membership ** 2) / sample_count

    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[:, k] ** 2
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xbi = compactness / (sample_count * min_gap)
    else:
        xbi = np.inf

    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[:, k] ** 2)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fsi = compactness - np.sum(spread_terms)

    # Silhouette is internal; the remaining scores compare against true_labels.
    ss = silhouette_score(X, labels)
    ars = adjusted_rand_score(true_labels, labels)
    ami = adjusted_mutual_info_score(true_labels, labels)
    h, c, v = homogeneity_completeness_v_measure(true_labels, labels)

    return pc, xbi, fsi, ss, ars, ami, h, c, v



def bootstrap_clustering(X, y, n_clusters=3, n_iter=100):
    """Bootstrap-evaluate nPyGK clustering and summarise nine validity scores.

    Parameters
    ----------
    X : ndarray
        Feature matrix, one row per sample.
    y : ndarray
        Integer-encoded ground-truth labels aligned with ``X``.
    n_clusters : int, default 3
        Number of clusters requested from nPyGK.
    n_iter : int, default 100
        Number of bootstrap resamples (sampling rows with replacement).

    Returns
    -------
    dict
        ``{'nPyGK': {'mean': ndarray, 'std': ndarray}}`` with, in order,
        PC, XBI, FSI, silhouette, ARI, AMI, homogeneity, completeness and
        V-measure.
    """
    # Only nPyGK is evaluated in this cell. Registering methods that never
    # run produced empty score lists, i.e. NaN summaries and "mean of empty
    # slice" warnings for GK/DBSCAN/Spectral/Hierarchical/nPyFCM.
    results = {name: [] for name in ['nPyGK']}

    for _ in tqdm(range(n_iter)):
        idx = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[idx]
        y_sample = y[idx]

        # nPyGK: honour the requested cluster count instead of a hard-coded 3.
        nPygk = nPyGK(n_clusters=n_clusters, m=2, max_iter=100, n_pyth=5, alpha=1.8)
        # fit() is used as the centers, predict() as the membership matrix
        # (indexed [cluster, sample], hence argmax over axis 0).
        nPygk_centers = nPygk.fit(X_sample)
        nPygk_membership = nPygk.predict(X_sample)
        pc, xbi, fsi = fuzzy_indices(X_sample, nPygk_membership, nPygk_centers)
        labels = np.argmax(nPygk_membership, axis=0)
        # silhouette_score raises when only one cluster is populated; record
        # NaN for that replicate instead of aborting the whole run.
        if len(np.unique(labels)) > 1:
            ss = silhouette_score(X_sample, labels)
        else:
            ss = np.nan
        ars = adjusted_rand_score(y_sample, labels)
        ami = adjusted_mutual_info_score(y_sample, labels)
        h, c, v = homogeneity_completeness_v_measure(y_sample, labels)
        results['nPyGK'].append((pc, xbi, fsi, ss, ars, ami, h, c, v))

    # Summarize results. The print precision is a global NumPy setting, so
    # it is set once rather than inside the loop.
    np.set_printoptions(precision=4)
    summary = {}
    for method, scores in results.items():
        scores = np.array(scores, dtype=np.float64)
        summary[method] = {
            'mean': np.nanmean(scores, axis=0),
            'std': np.nanstd(scores, axis=0),
        }
    return summary

def generate_dataframe(X,true_label_encoded, number_of_clusters, m, MAX_ITER):
    """Sweep nPyGK over a grid of (n_pyth, alpha) and tabulate the scores.

    For n_pyth in 1..5, alpha takes 50 evenly spaced values from 0.1 to
    n_pyth. Each setting is fitted once on the full ``X`` and scored with
    ``calculate_indices`` (not defined in this cell; it must already
    exist in the notebook namespace when this function is called).

    Returns
    -------
    pandas.DataFrame
        One row per grid point with ``n_pyth``, ``alpha`` and one column
        per entry of the dict returned by ``calculate_indices``.
    """
    # Enumerate the grid up front, in the same order as nested loops would.
    grid = [
        (n_pyth, alpha)
        for n_pyth in np.arange(1, 6)
        for alpha in np.linspace(0.1, n_pyth, 50)
    ]

    rows = []
    for n_pyth, alpha in grid:
        model = nPyGK(n_clusters=number_of_clusters, m=m, max_iter=MAX_ITER, n_pyth=n_pyth, alpha=alpha)
        centers = model.fit(X)
        memberships = model.predict(X)
        scores = calculate_indices(X, memberships, centers, true_label_encoded)

        row = {'n_pyth': n_pyth, 'alpha': alpha}
        row.update(scores)
        rows.append(row)

    return pd.DataFrame(rows)




def find_max_indices_for_n_pyth(df, unique_output_dir,m):
    """Pick the best alpha per (n_pyth, metric) and save the table as LaTeX.

    Rows containing NaN are dropped first. Every column other than
    ``n_pyth`` and ``alpha`` is treated as a metric. "xie beni index" and
    "fukuyama sugeno index" are minimised; all other metrics are
    maximised. The resulting table is written to
    ``<unique_output_dir>/Clustering_Metrics_Table_maximum.tex``.

    ``m`` is accepted but not used by this function. Nothing is returned.
    """
    lower_is_better = ["xie beni index","fukuyama sugeno index"]

    clean = df.dropna()
    metric_names = [name for name in clean.columns if name not in ["n_pyth", "alpha"]]

    records = []
    for level in clean["n_pyth"].unique():
        subset = clean[clean["n_pyth"] == level]

        for name in metric_names:
            column = subset[name].dropna()
            # Index label of the best row for this metric.
            if name in lower_is_better:
                best_label = column.idxmin()
            else:
                best_label = column.idxmax()
            chosen = subset.loc[best_label]
            records.append({
                "n_pyth": level,
                "metric": name,
                "value": chosen[name],
                "alpha": chosen["alpha"]
            })

    results_df = pd.DataFrame(records)

    # Render the whole result table as LaTeX (no index column).
    latex_table = results_df.to_latex(
        index=False,
        caption="Best Values of Different Indices for Values of n_pyth and Alpha",
        label="tab 1: Max indices"
    )

    with open(f"{unique_output_dir}/Clustering_Metrics_Table_maximum.tex", "w") as file:
        file.write(latex_table)

    print(f"LaTeX table saved to {unique_output_dir}/Clustering_Metrics_Table_maximum.tex")

# Run the evaluation
# Load the iris CSV: every column except the last is used as a feature,
# the last column is used as the class label.
df = pd.read_csv("Data/iris.csv")
X_data = df.iloc[:, :-1].values  # Features
true_labels = df.iloc[:, -1].values  # True labels

# Encode the class labels as integers for the label-based scores.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(true_labels)

# Standardize features
scaler = StandardScaler()
X = scaler.fit_transform(X_data)
# 100 bootstrap resamples with 3 clusters; the bare `result` below makes the
# summary dict the displayed output of the cell.
result = bootstrap_clustering(X, y, n_clusters=3, n_iter=100)
result

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler

from algorithms.nPyGK import nPyGK
from tqdm import tqdm

from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,

)


def fuzzy_indices(X, membership, centers, m=2.1):
    """Compute three internal validity indices for a fuzzy partition.

    Parameters
    ----------
    X : ndarray
        Data matrix, one row per sample.
    membership : ndarray
        Fuzzy membership matrix, indexed here as ``membership[k, :]``
        (one row per cluster).
    centers : ndarray
        Cluster centers, one row per cluster.
    m : float, default 2.1
        Fuzzifier exponent applied to the memberships in the compactness
        and Fukuyama-Sugeno terms.

    Returns
    -------
    tuple
        ``(pc, xbi, fsi)``: partition coefficient, Xie-Beni index and
        Fukuyama-Sugeno index.
    """
    sample_count = X.shape[0]
    cluster_count = centers.shape[0]

    # Partition coefficient: mean squared membership per sample.
    partition_coefficient = np.sum(membership ** 2) / sample_count

    # Fuzzy within-cluster scatter, accumulated cluster by cluster.
    scatter_terms = []
    for k in range(cluster_count):
        weights = membership[k, :] ** m
        squared_dist = np.linalg.norm(X - centers[k], axis=1) ** 2
        scatter_terms.append(np.sum(weights * squared_dist))
    compactness = np.sum(scatter_terms)

    # Smallest squared distance between any two distinct centers.
    center_gaps = []
    for i in range(len(centers)):
        for j in range(i + 1, len(centers)):
            center_gaps.append(np.linalg.norm(centers[i] - centers[j]) ** 2)
    min_gap = np.min(center_gaps)

    if min_gap > 0:
        xie_beni = compactness / (sample_count * min_gap)
    else:
        # Coincident centers: the index is unbounded.
        xie_beni = np.inf

    # Fukuyama-Sugeno: compactness minus weighted spread of centers
    # around the global data centroid.
    grand_mean = np.mean(X, axis=0)
    spread_terms = []
    for k in range(len(centers)):
        cluster_mass = np.sum(membership[k, :] ** m)
        spread_terms.append(cluster_mass * np.linalg.norm(centers[k] - grand_mean) ** 2)
    fukuyama_sugeno = compactness - np.sum(spread_terms)

    return partition_coefficient, xie_beni, fukuyama_sugeno

def bootstrap_clustering_analysis(X, y, n_clusters=3, m=2, n_iter=100, n_pyth_range=range(1,6), alpha_points=25):
    """
    Perform bootstrap clustering analysis with varying n_pyth and alpha values.

    For every bootstrap resample of ``X`` (rows drawn with replacement),
    nPyGK is fitted for each ``n_pyth`` in ``n_pyth_range`` and each of
    ``alpha_points`` evenly spaced ``alpha`` values from 0.2 to ``n_pyth``.
    A parameter combination that raises is reported and skipped.

    Parameters:
        X, y: feature matrix and integer-encoded true labels (aligned rows)
        n_clusters: number of clusters passed to nPyGK
        m: fuzzifier, passed both to nPyGK and to fuzzy_indices
        n_iter: number of bootstrap resamples
        n_pyth_range: iterable of n_pyth values to try
        alpha_points: number of alpha values per n_pyth

    Returns:
        (raw_df, summary_df)
        - raw_df: one row per successful (iteration, n_pyth, alpha) fit
        - summary_df: mean and std of every column per (n_pyth, alpha),
          with flattened names such as ``PC_mean`` / ``PC_std``

    Raises:
        ValueError: if no fit succeeded, so the expected columns are missing
    """
    raw_results = []

    # A named loop variable is used because the index is stored in each row.
    for iteration in tqdm(range(n_iter), desc="Bootstrap iterations"):
        idx = np.random.choice(len(X), len(X), replace=True)
        X_sample = X[idx]
        y_sample = y[idx]

        for n_pyth in n_pyth_range:
            alpha_values = np.linspace(0.2, n_pyth, alpha_points)

            for alpha in alpha_values:
                try:
                    nPygk = nPyGK(n_clusters=n_clusters, m=m, max_iter=100,
                                 n_pyth=n_pyth, alpha=alpha)
                    centers = nPygk.fit(X_sample)
                    predicted_labels = nPygk.predict(X_sample)

                    # Calculate all metrics. The fuzzifier is forwarded so the
                    # indices use the same m as the fitted model rather than
                    # the default of fuzzy_indices.
                    pc, xbi, fsi = fuzzy_indices(X_sample, predicted_labels, centers, m=m)
                    labels = np.argmax(predicted_labels, axis=0)
                    ss = silhouette_score(X_sample, labels)
                    ars = adjusted_rand_score(y_sample, labels)
                    ami = adjusted_mutual_info_score(y_sample, labels)
                    h, c, v = homogeneity_completeness_v_measure(y_sample, labels)

                    raw_results.append({
                        'n_pyth': n_pyth,
                        'alpha': alpha,
                        'PC': pc,
                        'XBI': xbi,
                        'FSI': fsi,
                        'Silhouette': ss,
                        'ARS': ars,
                        'AMI': ami,
                        'Homogeneity': h,
                        'Completeness': c,
                        'V-measure': v,
                        'iteration': iteration
                    })
                except Exception as e:
                    # Deliberate best-effort: report the failing combination
                    # and carry on with the rest of the grid.
                    print(f"Failed for n_pyth={n_pyth}, alpha={alpha}: {str(e)}")
                    continue

    # Build the frame BEFORE validating it; the check used to reference
    # raw_df ahead of its assignment and always raised.
    raw_df = pd.DataFrame(raw_results)

    # Verify we have the expected columns (they are absent when every fit failed)
    required_columns = {'n_pyth', 'alpha'}
    if not required_columns.issubset(raw_df.columns):
        missing = required_columns - set(raw_df.columns)
        raise ValueError(f"Missing required columns: {missing}")

    # Now safe to groupby
    summary_df = raw_df.groupby(['n_pyth', 'alpha']).agg(['mean', 'std']).reset_index()
    # Flatten the (column, statistic) MultiIndex into "<column>_<statistic>".
    summary_df.columns = ['_'.join(col).strip() for col in summary_df.columns.values]
    summary_df = summary_df.rename(columns={'n_pyth_': 'n_pyth', 'alpha_': 'alpha'})

    # The caller unpacks two values; previously nothing was returned.
    return raw_df, summary_df

def find_best_parameters(summary_df, output_dir=None):
    """
    For each n_pyth, find the alpha that gives the best average value for each metric.

    Parameters:
        summary_df: DataFrame with ``n_pyth``, ``alpha`` and, for every
            metric key below, ``<key>_mean`` and ``<key>_std`` columns
        output_dir: optional directory; when given, the result is also
            written there as ``best_parameters.csv`` and
            ``best_parameters.tex`` (the directory is created if missing)

    Returns:
        DataFrame with best parameters for each n_pyth and metric
        (columns: n_pyth, metric, best_alpha, mean_value, std_value,
        optimization)
    """
    # Define optimization direction for each metric
    metric_info = {
        'PC': {'direction': 'max', 'full_name': 'Partition Coefficient'},
        'XBI': {'direction': 'min', 'full_name': 'Xie-Beni Index'},
        'FSI': {'direction': 'min', 'full_name': 'Fukuyama-Sugeno Index'},
        'Silhouette': {'direction': 'max', 'full_name': 'Silhouette Score'},
        'ARS': {'direction': 'max', 'full_name': 'Adjusted Rand Score'},
        'AMI': {'direction': 'max', 'full_name': 'Adjusted Mutual Info'},
        'Homogeneity': {'direction': 'max', 'full_name': 'Homogeneity'},
        'Completeness': {'direction': 'max', 'full_name': 'Completeness'},
        'V-measure': {'direction': 'max', 'full_name': 'V-measure'}
    }

    best_params = []

    for n_pyth in summary_df['n_pyth'].unique():
        n_pyth_subset = summary_df[summary_df['n_pyth'] == n_pyth]

        for metric, info in metric_info.items():
            mean_col = f'{metric}_mean'
            std_col = f'{metric}_std'

            # Row whose mean is best in the metric's optimisation direction.
            if info['direction'] == 'max':
                best_row = n_pyth_subset.loc[n_pyth_subset[mean_col].idxmax()]
            else:
                best_row = n_pyth_subset.loc[n_pyth_subset[mean_col].idxmin()]

            best_params.append({
                'n_pyth': n_pyth,
                'metric': info['full_name'],
                'best_alpha': best_row['alpha'],
                'mean_value': best_row[mean_col],
                'std_value': best_row[std_col],
                'optimization': info['direction']
            })

    best_df = pd.DataFrame(best_params)

    if output_dir:
        import os

        # Writing into a directory that does not exist yet would fail, so
        # create it first (no-op when it is already there).
        os.makedirs(output_dir, exist_ok=True)

        # Save results
        best_df.to_csv(f"{output_dir}/best_parameters.csv", index=False)

        # Generate LaTeX table. With index=False there is exactly one
        # column spec per DataFrame column.
        latex_table = best_df.to_latex(
            index=False,
            caption="Best parameters from bootstrap analysis",
            label="tab:best_params",
            float_format="%.4f",
            column_format='l' * len(best_df.columns)
        )

        with open(f"{output_dir}/best_parameters.tex", "w") as f:
            f.write(latex_table)

    return best_df

# Example usage
if __name__ == "__main__":
    # Load and prepare data: all columns but the last are features, the
    # last column is the class label.
    df = pd.read_csv("Data/iris.csv")
    X = df.iloc[:, :-1].values
    y = LabelEncoder().fit_transform(df.iloc[:, -1].values)
    X = StandardScaler().fit_transform(X)

    # Run bootstrap analysis
    raw_results, summary_stats = bootstrap_clustering_analysis(
        X, y,
        n_clusters=3,
        m=2,
        n_iter=100,
        n_pyth_range=range(1,6),
        alpha_points=50
    )

    # Find best parameters
    best_params = find_best_parameters(summary_stats, "results")

    # Print results
    print("Best parameters for each n_pyth:")
    print(best_params)

    # Additional analysis: Best overall parameters
    # The 'metric' column of best_params holds the full metric names, so the
    # short keys are translated before filtering. Substring matching on the
    # short key ('PC', 'ARS') never matched 'Partition Coefficient' or
    # 'Adjusted Rand Score' and printed empty frames.
    full_names = {
        'PC': 'Partition Coefficient',
        'XBI': 'Xie-Beni Index',
        'FSI': 'Fukuyama-Sugeno Index',
        'Silhouette': 'Silhouette Score',
        'ARS': 'Adjusted Rand Score',
        'V-measure': 'V-measure',
    }
    print("\nBest overall parameters across all n_pyth values:")
    for metric in ['PC', 'Silhouette', 'ARS', 'V-measure']:  # Example metrics to highlight
        metric_rows = best_params[best_params['metric'] == full_names[metric]]
        if metric in ['XBI', 'FSI']:
            # Lower is better for these two indices.
            best_overall = metric_rows.nsmallest(1, 'mean_value')
        else:
            best_overall = metric_rows.nlargest(1, 'mean_value')
        print(best_overall.to_string(index=False))

In [6]:
import pandas as pd
import numpy as np
import datetime
from sklearn.preprocessing import LabelEncoder, StandardScaler
# MinMaxScaler
from sklearn.metrics import (
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_completeness_v_measure,
    silhouette_score,

)
# from algorithms.nPyGK import nPyGK
from algorithms.nPyGK import nPyGK
from tqdm import tqdm

def load_and_preprocess_data(input_csv):
    """Read a CSV and return standardized features and encoded labels.

    All columns except the last are taken as features and the last column
    as the class label.

    Returns
    -------
    tuple
        ``(X, true_labels_encoded)``: the features after
        ``StandardScaler.fit_transform`` and the labels after
        ``LabelEncoder.fit_transform``.
    """
    frame = pd.read_csv(input_csv)
    raw_features = frame.iloc[:, :-1].values
    raw_labels = frame.iloc[:, -1].values

    # Encode labels
    encoded_labels = LabelEncoder().fit_transform(raw_labels)

    # Standardize features
    scaled_features = StandardScaler().fit_transform(raw_features)

    return scaled_features, encoded_labels

def calculate_indices(X, predicted_labels, cluster_centers,true_label_encoded):
    """Compute internal and label-based validity scores for a fuzzy partition.

    Parameters
    ----------
    X : ndarray
        Data matrix, one row per sample.
    predicted_labels : ndarray
        Fuzzy membership matrix, indexed here as ``predicted_labels[k, :]``
        (one row per cluster). Hard labels are taken with argmax over axis 0.
    cluster_centers : ndarray
        Cluster centers, one row per cluster.
    true_label_encoded : ndarray
        Integer-encoded ground-truth labels aligned with the rows of ``X``.

    Returns
    -------
    dict
        Keys: "partition coefficient", "fukuyama sugeno index",
        "xie beni index", "silhouette score", "adjusted rand score",
        "adjusted mutual info score", "homogeneity", "completeness",
        "v measure". The last six are NaN when the hard assignment uses a
        single cluster.
    """
    # Number of samples and clusters
    n_samples = X.shape[0]
    n_clusters = len(cluster_centers)
    # Partition Coefficient (PC)
    pc = np.sum(predicted_labels**2) / n_samples

    # Fuzzy within-cluster scatter (memberships squared, i.e. fuzzifier 2).
    cluster_dists = [
        np.sum((predicted_labels[k,:]**2) * np.linalg.norm(X - cluster_centers[k], axis=1) ** 2)
        for k in range(n_clusters)
    ]
    compactness = np.sum(cluster_dists)

    # Minimum separation distance (between-cluster dispersion)
    separation = np.min([
        np.linalg.norm(cluster_centers[i] - cluster_centers[j])**2
        for i in range(n_clusters)
        for j in range(i + 1, n_clusters)
    ])

    # Xie-Beni Index
    xb = compactness / (n_samples * separation) if separation > 0 else np.inf
    # NOTE(review): the index is capped at 1 here, so all partitions with
    # XB >= 1 (including coincident centers) become indistinguishable -
    # kept as in the original; confirm this is intended.
    xb = np.minimum(xb,1)

    # Global centroid for Fukuyama-Sugeno Index
    global_centroid = np.mean(X, axis=0)

    # Separation term for Fukuyama-Sugeno Index
    separation_fs = np.sum([
        np.sum(predicted_labels[k,:]**2) * np.linalg.norm(cluster_centers[k] - global_centroid) ** 2
        for k in range(n_clusters)
    ])

    # Fukuyama-Sugeno Index
    se = compactness - separation_fs

    # Harden the partition: each sample goes to its highest-membership cluster.
    hard_labels = np.argmax(predicted_labels,axis=0)
    if len(np.unique(hard_labels)) > 1:
        ss = silhouette_score(X,hard_labels)
        # scikit-learn's signature is (labels_true, labels_pred). ARI and AMI
        # are symmetric, but homogeneity and completeness are not: passing
        # the predictions first swapped the two reported values.
        ars = adjusted_rand_score(true_label_encoded, hard_labels)
        amis = adjusted_mutual_info_score(true_label_encoded, hard_labels)
        hh, cc, vv = homogeneity_completeness_v_measure(true_label_encoded, hard_labels)
    else:
        # Silhouette is undefined for a single cluster; all label-based
        # scores are reported as NaN for such degenerate partitions.
        ss = np.nan
        ars = np.nan
        amis = np.nan
        hh, cc, vv = np.nan,np.nan,np.nan
    return {
        "partition coefficient": pc,
        "fukuyama sugeno index": se,
        "xie beni index": xb,
        "silhouette score": ss,
        "adjusted rand score": ars,
        "adjusted mutual info score": amis,
        "homogeneity": hh,
        "completeness": cc,
        "v measure": vv,
    }


def generate_dataframe(X, true_label_encoded, number_of_clusters, m, MAX_ITER, n_iter=100, random_state=None):
    """Sweep the nPyGK hyper-parameters and summarise validity indices per setting.

    For every ``n_pyth`` in 1..5 and each of 50 ``alpha`` values evenly spaced
    on ``[0.2, n_pyth]``, the data are bootstrap-resampled ``n_iter`` times, an
    ``nPyGK`` model is fitted on each resample, and ``calculate_indices`` is
    evaluated on it. The mean and standard deviation (ignoring NaNs) of every
    index over the resamples form one row of the result.

    Parameters
    ----------
    X : ndarray
        Data matrix; rows are resampled with integer-array indexing.
    true_label_encoded : ndarray
        Ground-truth labels aligned with the rows of ``X``.
    number_of_clusters : int
        Passed to ``nPyGK`` as ``n_clusters``.
    m : float
        Passed to ``nPyGK`` as ``m``.
    MAX_ITER : int
        Passed to ``nPyGK`` as ``max_iter``.
    n_iter : int, default 100
        Number of bootstrap resamples per ``(n_pyth, alpha)`` pair; must be >= 1.
    random_state : int or None, default None
        Seed for the bootstrap index draws. ``None`` keeps drawing from the
        global NumPy RNG, as before. NOTE(review): this does not control any
        randomness inside ``nPyGK`` itself -- confirm how it initialises.

    Returns
    -------
    pandas.DataFrame
        One row per ``(n_pyth, alpha)`` with columns ``n_pyth``, ``alpha`` and
        ``<metric>_mean`` / ``<metric>_std`` for every key returned by
        ``calculate_indices``.

    Raises
    ------
    ValueError
        If ``n_iter`` is smaller than 1 (previously an IndexError on the empty
        metrics list).
    """
    if n_iter < 1:
        raise ValueError("n_iter must be at least 1")

    # Private generator when seeded; otherwise the global numpy.random module,
    # so unseeded calls behave exactly as they used to.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    all_data = []

    # Loop through n_pyth and alpha values
    for n_pyth in np.arange(1, 6):
        for alpha in np.linspace(0.2, n_pyth, 50):
            metrics_list = []
            for _ in tqdm(range(n_iter), desc=f"n_pyth={n_pyth}, alpha={alpha:.4f}"):
                # Bootstrap: resample rows with replacement, keeping labels aligned.
                idx = rng.choice(len(X), len(X), replace=True)
                X_sample = X[idx]
                y_sample = true_label_encoded[idx]

                nPygk = nPyGK(n_clusters=number_of_clusters, m=m, max_iter=MAX_ITER, n_pyth=n_pyth, alpha=alpha)
                # fit() is used for its return value (the centers) and predict()
                # for the memberships that calculate_indices consumes.
                cluster_centers = nPygk.fit(X_sample)
                predicted_labels = nPygk.predict(X_sample)

                metrics = calculate_indices(X_sample, predicted_labels, cluster_centers, y_sample)
                metrics_list.append(metrics)

            # Compute NaN-aware mean and std for all metrics over the n_iter resamples
            metric_names = list(metrics_list[0].keys())
            mean_metrics = {k + "_mean": np.nanmean([d[k] for d in metrics_list]) for k in metric_names}
            std_metrics = {k + "_std": np.nanstd([d[k] for d in metrics_list]) for k in metric_names}

            # Append row with means and stds for this (n_pyth, alpha) pair
            all_data.append({'n_pyth': n_pyth, 'alpha': alpha, **mean_metrics, **std_metrics})

    df = pd.DataFrame(all_data)
    return df



def find_max_indices_for_n_pyth(df, unique_output_dir, m):
    df = df.dropna()

    # Identify only mean metric columns (exclude std columns)
    metric_columns = [col for col in df.columns if col.endswith("_mean")]
    
    results = []

    for n_pyth in df["n_pyth"].unique():
        filtered_df = df[df["n_pyth"] == n_pyth]

        for col in metric_columns:
            metric_name = col.replace("_mean", "")
            std_col = metric_name + "_std"

            if metric_name in ["xie beni index", "fukuyama sugeno index"]:
                # For these, we want to find the minimum mean
                best_row = filtered_df.loc[filtered_df[col].idxmin()]
            else:
                # For others, we want the maximum mean
                best_row = filtered_df.loc[filtered_df[col].idxmax()]

            results.append({
                "n_pyth": n_pyth,
                "metric": metric_name,
                "mean_value": best_row[col],
                "std_value": best_row[std_col],
                "alpha": best_row["alpha"]
            })

    # Create DataFrame
    results_df = pd.DataFrame(results)

    # Save LaTeX table
    latex_table = results_df.to_latex(
        index=False,
        float_format="%.4f",
        caption="Best Mean Values and Corresponding Standard Deviations for Each Metric and n_pyth",
        label="tab:max_indices_with_std"
    )

    with open(f"{unique_output_dir}/Clustering_Metrics_Table_maximum_with_std.tex", "w") as f:
        f.write(latex_table)

    print(f"LaTeX table saved to {unique_output_dir}/Clustering_Metrics_Table_maximum_with_std.tex")

    return results_df


    
    
# Load the feature matrix and encoded ground-truth labels.
# NOTE(review): load_and_preprocess_data is not defined in this part of the
# notebook -- it must already exist in the kernel when this cell runs.
X, y_encoded = load_and_preprocess_data("Data/synthetic_data.csv")

# Run the full sweep: n_pyth in 1..5, 50 alpha values each, and n_iter
# bootstrap fits per (n_pyth, alpha) pair. One summary row is produced per pair.
results_df = generate_dataframe(X, y_encoded, number_of_clusters=3, m=2.1, MAX_ITER=150, n_iter=100)

# Optional: persist the sweep so the table below can be rebuilt without refitting.
# results_df.to_csv("clustering_metrics_summary.csv", index=False)

# Select the best alpha per metric and n_pyth and write the LaTeX table under
# "results/". The returned DataFrame is the last expression in the cell, so it
# is also displayed as the cell output.
find_max_indices_for_n_pyth(results_df, unique_output_dir="results", m=2.1)



n_pyth=1, alpha=0.2000: 100%|██████████| 100/100 [00:00<00:00, 111.87it/s]
n_pyth=1, alpha=0.2163: 100%|██████████| 100/100 [00:00<00:00, 114.64it/s]
n_pyth=1, alpha=0.2327: 100%|██████████| 100/100 [00:00<00:00, 111.85it/s]
n_pyth=1, alpha=0.2490: 100%|██████████| 100/100 [00:00<00:00, 106.90it/s]
n_pyth=1, alpha=0.2653: 100%|██████████| 100/100 [00:00<00:00, 101.52it/s]
n_pyth=1, alpha=0.2816: 100%|██████████| 100/100 [00:01<00:00, 99.69it/s]
n_pyth=1, alpha=0.2980: 100%|██████████| 100/100 [00:01<00:00, 93.92it/s]
n_pyth=1, alpha=0.3143: 100%|██████████| 100/100 [00:01<00:00, 91.85it/s]
n_pyth=1, alpha=0.3306: 100%|██████████| 100/100 [00:01<00:00, 89.44it/s]
n_pyth=1, alpha=0.3469: 100%|██████████| 100/100 [00:01<00:00, 95.32it/s]
n_pyth=1, alpha=0.3633: 100%|██████████| 100/100 [00:01<00:00, 88.44it/s]
n_pyth=1, alpha=0.3796: 100%|██████████| 100/100 [00:01<00:00, 87.35it/s]
n_pyth=1, alpha=0.3959: 100%|██████████| 100/100 [00:01<00:00, 69.89it/s]
n_pyth=1, alpha=0.4122: 100%|████

LaTeX table saved to results/Clustering_Metrics_Table_maximum_with_std.tex





Unnamed: 0,n_pyth,metric,mean_value,std_value,alpha
0,1,partition coefficient,2.998208,1.22829e-15,0.2
1,1,fukuyama sugeno index,-455.384687,28.47413,1.0
2,1,xie beni index,0.116106,0.009137613,1.0
3,1,silhouette score,0.51214,0.01692608,0.510204
4,1,adjusted rand score,0.784676,0.03661461,0.967347
5,1,adjusted mutual info score,0.746542,0.03316814,0.967347
6,1,homogeneity,0.747683,0.0330501,0.967347
7,1,completeness,0.747271,0.03306254,0.967347
8,1,v measure,0.747476,0.03304616,0.967347
9,2,partition coefficient,3.0,0.0,0.2
