In [23]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from algorithms.new_euclidean_npy import nPyGK

# Generate a dataset with overlapping clusters
def generate_overlapping_data(n_samples=500, centers=3, cluster_std=5, random_state=42):
    """
    Build a synthetic blob dataset whose clusters are wide enough to overlap.

    Every argument is forwarded, by keyword, to ``make_blobs``.

    Parameters:
        n_samples (int): Total number of samples.
        centers (int): Number of cluster centers.
        cluster_std (float): Standard deviation of each cluster; larger values
            make the clusters overlap more.
        random_state (int): Seed passed on for reproducibility.

    Returns:
        tuple: ``(X, y_true)`` - the feature data followed by the true
        cluster labels.
    """
    blob_options = {
        "n_samples": n_samples,
        "centers": centers,
        "cluster_std": cluster_std,
        "random_state": random_state,
    }
    features, labels = make_blobs(**blob_options)
    return features, labels


# # Visualize the dataset
# plt.scatter(X[:, 0], X[:, 1], c=y_true, cmap="viridis", s=30)
# plt.title("Synthetic Overlapping Data")
# plt.xlabel("Feature 1")
# plt.ylabel("Feature 2")
# plt.show()

# # Apply K-Means clustering
# kmeans = KMeans(n_clusters=3, random_state=42)

# kmeans_labels = kmeans.fit_predict(X)
# kmeans_center = kmeans.cluster_centers_

# # Apply Fuzzy C-Means clustering

# # npygk_labels =  np.argmax(npygk_pred,axis=0) 


# print(f"Silhouette Score for K-Means: {kmeans_score:.3f}")
# print(f"Silhouette Score for Fuzzy C-Means: {npygk_score:.3f}")

# # Plot K-Means clustering results
# plt.scatter(X[:, 0], X[:, 1], c=kmeans_labels, cmap="viridis", s=30)
# plt.title("K-Means Clustering")
# plt.xlabel("Feature 1")
# plt.ylabel("Feature 2")
# plt.show()

# # Plot Fuzzy C-Means clustering results
# plt.scatter(X[:, 0], X[:, 1], c=npygk_labels, cmap="viridis", s=30)
# plt.title("nPyGK Clustering")
# plt.xlabel("Feature 1")
# plt.ylabel("Feature 2")
# plt.show()

# Separation Index (SE)
# Generate data
X, y_true = generate_overlapping_data()

# Single source of truth for the cluster count: the model and both
# comprehensions below read it, so they cannot drift apart.
N_CLUSTERS = 3

npygk = nPyGK(n_clusters=N_CLUSTERS, n_pyth=2, alpha=1.8)
npygk_centers = npygk.fit(X)  # Fit the model; its return value is used as the cluster centers
npygk_pred = npygk.predict(X)

# Per cluster: squared membership times squared Euclidean distance to that
# cluster's center, summed over all samples.
# NOTE(review): indexing npygk_pred[:, k] treats columns as clusters - confirm
# against nPyGK.predict.
cluster_dist = [
    np.sum(npygk_pred[:, k] ** 2 * np.linalg.norm(X - npygk_centers[k], axis=1) ** 2)
    for k in range(N_CLUSTERS)
]

# Smallest squared distance between any two distinct centers.
min_distance = np.min([
    np.linalg.norm(npygk_centers[i] - npygk_centers[j]) ** 2
    for i in range(N_CLUSTERS)
    for j in range(i + 1, N_CLUSTERS)
])

# Normalise by the actual number of samples rather than a hard-coded 500.
se = np.sum(cluster_dist) / (min_distance * X.shape[0])
se

0.20564807838338534

In [None]:

def xie_beni_index(centers, predict, n_clusters, data=None):
    """
    Compute the Xie-Beni validity index of a fuzzy partition.

    The numerator sums, over every cluster ``k``, the squared memberships
    ``predict[:, k] ** 2`` weighted by the squared Euclidean distance of each
    sample to ``centers[k]``. The denominator is the number of samples times
    the smallest *squared* distance between two distinct centers, floored at
    ``10e-5`` so that coinciding centers do not divide by zero.

    Parameters:
        centers: Cluster centers, indexed as ``centers[k]``; each entry must
            broadcast against the rows of ``data``.
        predict (array): Membership values, indexed as ``predict[:, k]``
            (one column per cluster, one row per sample).
        n_clusters (int): Number of clusters to evaluate; at least 2.
        data (array, optional): Samples the partition was computed on. When
            omitted, the notebook-level ``X`` is used, which is what this
            function always read before the parameter existed.

    Returns:
        float: The index value.

    Raises:
        ValueError: If ``n_clusters`` is smaller than 2 (no pair of centers
            exists to measure separation).
    """
    if n_clusters < 2:
        raise ValueError("xie_beni_index needs at least two clusters to measure separation")
    if data is None:
        data = X  # fall back to the notebook-level dataset for existing callers

    cluster_dists = [
        np.sum(predict[:, k] ** 2 * np.linalg.norm(data - centers[k], axis=1) ** 2)
        for k in range(n_clusters)
    ]

    # Squared distance for every unordered pair of centers; the pair loop
    # follows n_clusters instead of assuming exactly three clusters.
    min_distances = np.min([
        np.linalg.norm(centers[i] - centers[j]) ** 2
        for i in range(n_clusters)
        for j in range(i + 1, n_clusters)
    ])

    n_samples = len(data)
    # 10e-5 (i.e. 1e-4) keeps the denominator strictly positive.
    xb = np.sum(cluster_dists) / np.maximum(n_samples * min_distances, 10e-5)
    return xb


def best_xie_beni_index(X, n_clusters, n_pyth_values=None, n_alphas=50, alpha_min=0.5):
    """
    For each ``n_pyth``, find the ``alpha`` giving the smallest Xie-Beni index.

    For every ``n_pyth`` an ``nPyGK`` model is fitted once per alpha in
    ``np.linspace(alpha_min, n_pyth, n_alphas)``, and the index is evaluated
    on the same ``X`` that was passed in.

    Parameters:
        X (array): Samples to cluster.
        n_clusters (int): Number of clusters given to every model.
        n_pyth_values (iterable, optional): ``n_pyth`` values to try.
            Defaults to ``range(1, 100)``.
        n_alphas (int): Number of alpha candidates per ``n_pyth``. Default 50.
        alpha_min (float): Lower end of the alpha grid. Default 0.5.

    Returns:
        list: One ``(n_pyth, best_alpha)`` tuple per ``n_pyth``, in iteration
        order. ``best_alpha`` is ``None`` when no candidate produced an index
        below infinity.
    """
    if n_pyth_values is None:
        n_pyth_values = range(1, 100)

    results = []  # best alpha found for each n_pyth

    for n_pyth in n_pyth_values:
        best_alpha = None
        best_index = float('inf')  # any finite index beats this

        for alpha in np.linspace(alpha_min, n_pyth, n_alphas):
            npygk = nPyGK(n_clusters=n_clusters, n_pyth=n_pyth, alpha=alpha)
            npygk_centers = npygk.fit(X)  # Fit the model
            npygk_pred = npygk.predict(X)  # Get cluster predictions

            # Score against the X passed to this function, not a global.
            index = xie_beni_index(
                npygk_centers, npygk_pred, n_clusters=n_clusters, data=X
            )

            # Strict comparison: the first alpha reaching the minimum wins.
            if index < best_index:
                best_index = index
                best_alpha = alpha

        results.append((n_pyth, best_alpha))

    return results

# Default sweep: 99 n_pyth values x 50 alphas, one model fit per combination.
best_xie_beni_index(X, n_clusters=3)

In [5]:
import pandas as pd

# Function to extract and parse LaTeX table from a .tex file
def tex_to_dataframe(file_path):
    """
    Parse the rows of a LaTeX table stored in a .tex file into a DataFrame.

    A line is treated as a table row when it contains at least one unescaped
    ``&`` (the LaTeX column separator). An escaped ``\\&`` is ordinary text,
    so it neither marks a row nor splits a cell. Lines starting with ``%``
    are LaTeX comments and are skipped. The row terminator ``\\\\`` is
    removed and every cell is stripped of surrounding whitespace.

    Parameters:
        file_path (str): Path of the .tex file to read.

    Returns:
        pandas.DataFrame: The first table row supplies the column labels and
        the remaining rows the data. All cells are kept as strings.

    Raises:
        ValueError: If the file contains no table rows.
    """
    import re  # local import so the cell does not depend on another cell's imports

    # "&" not preceded by a backslash, i.e. a real column separator.
    column_separator = re.compile(r"(?<!\\)&")

    data = []
    with open(file_path, 'r') as file:
        for line in file:
            if line.lstrip().startswith("%"):
                continue  # LaTeX comment, never table content

            cells = column_separator.split(line.replace("\\\\", "").strip())
            if len(cells) < 2:
                continue  # no column separator: markup, caption, rule, ...

            data.append([cell.strip() for cell in cells])

    if not data:
        raise ValueError(f"No LaTeX table rows found in {file_path!r}")

    # Use the first row as the header, remaining as data
    return pd.DataFrame(data[1:], columns=data[0])


# Previously exported metrics table for the Wine data.
input_tex = "Data_Results/Wine_data_validity/clustering_metrics_table.tex"
df = tex_to_dataframe(file_path=input_tex)

# Positions 9, 19, 29, ...: every tenth row, starting with the tenth.
rows_to_select = range(9, df.shape[0], 10)
df_new = df.iloc[rows_to_select]

In [6]:
# Create latex table
def create_latex_table(df, unique_output_dir):
    """
    Render selected clustering-metric columns as a LaTeX table and save it.

    The table is written to ``<unique_output_dir>/clustering_metrics_table.tex``.
    The output directory is created when it does not exist yet, and a
    confirmation line with the path is printed afterwards.

    Parameters:
        df (pandas.DataFrame): Source frame; must contain every column listed
            in ``selected_columns`` below.
        unique_output_dir (str): Directory that receives the .tex file.

    Returns:
        None
    """
    import os  # local import so the cell does not depend on another cell's imports

    selected_columns = [
        "n_pyth",
        "alpha",
        "partition coefficient",
        "fukuyama sugeno index",
        "xie beni index",
        "silhouette score",
        "adjusted rand score",
        "adjusted mutual info score",
        "homogeneity",
        "completeness",
        "v measure"
    ]

    # Rows with a missing value in any selected column are left out.
    df_selected = df[selected_columns].dropna()

    # NOTE(review): the caption holds a raw underscore ("n_pyth"); LaTeX
    # normally needs it escaped in text mode - confirm how the table is compiled.
    latex_table = df_selected.to_latex(
        index=False,
        caption="Clustering Metrics for Different Values of n_pyth",
        label="tab:clustering_metrics"
    )

    # Build the path once so the file written and the message always agree.
    output_path = f"{unique_output_dir}/clustering_metrics_table.tex"

    # Create the target directory on demand; rendering happens first so a
    # failing table does not leave an empty directory behind.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with open(output_path, "w") as file:
        file.write(latex_table)

    print(f"LaTeX table saved to {output_path}")

# Save the every-tenth-row selection under Data_Results/.
create_latex_table(df=df_new, unique_output_dir='Data_Results')

LaTeX table saved to Data_Results/clustering_metrics_table.tex
