In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import davies_bouldin_score, silhouette_score, calinski_harabasz_score
from scipy.spatial.distance import cdist
import time

In [2]:
def initialize_membership(n_samples, n_clusters):
    u = np.random.rand(n_samples, n_clusters)
    return u / np.sum(u, axis=1, keepdims=True)

In [3]:
def compute_centroids(data, membership, m):
    um = membership ** m
    return (um.T @ data) / np.sum(um.T, axis=1, keepdims=True)

In [4]:
def update_membership(data, centroids, m):
    distances = cdist(data, centroids, metric='euclidean')
    distances = np.fmax(distances, 1e-10)
    inv_distances = 1.0 / distances
    exponent = 2.0 / (m - 1)
    return (inv_distances / np.sum(inv_distances[:, :, None] ** exponent, axis=1)).squeeze()

In [5]:
def create_u_bar(labels, n_clusters, supervised_ratio):
    n_samples = len(labels)
    supervised_count = int(n_samples * supervised_ratio)
    supervised_indices = np.random.choice(n_samples, supervised_count, replace=False)
    
    u_bar = np.zeros((n_samples, n_clusters))
    le = LabelEncoder()
    int_labels = le.fit_transform(labels)
    u_bar[supervised_indices, int_labels[supervised_indices]] = 1.0
    
    return u_bar

In [6]:
def cfcm_clustering(data, labels, n_clusters, m=2, epsilon=1e-5, max_iter=300, supervised_ratio=0.3):
    n_samples = data.shape[0]
    membership_matrix = initialize_membership(n_samples, n_clusters)
    u_bar = create_u_bar(labels, n_clusters, supervised_ratio)
    
    for _ in range(max_iter):
        prev_membership = membership_matrix.copy()
        centroids = compute_centroids(data, membership_matrix - u_bar, m)
        membership_matrix = update_membership(data, centroids, m)
        membership_matrix = u_bar + (1 - np.sum(u_bar, axis=1, keepdims=True)) * membership_matrix
        
        if np.linalg.norm(membership_matrix - prev_membership) < epsilon:
            break
    
    return centroids, membership_matrix

In [7]:
def partition_coefficient(U):
    return np.sum(U**2) / U.shape[0]

def classification_entropy(U):
    return -np.sum(U * np.log(U + 1e-10)) / U.shape[0]

def separation(X, U, V, M):
    distances = np.linalg.norm(X[:, np.newaxis, :] - V, axis=2) ** 2  # Shape (n_samples, n_clusters)
    weighted_distances = (U**M) * distances  # Cùng shape (n_samples, n_clusters)
    return np.sum(weighted_distances)  # Tổng khoảng cách có trọng số


def hypervolume(U, M):
    return np.sum(U**M)

def cs(X, U, V, M):
    return np.sum(U**M * np.linalg.norm(X[:, np.newaxis, :] - V, axis=2) ** 2)

In [8]:
import time

def evaluate_and_print_results(title, X, U, V, process_time, step):
    M = 2  # Hệ số fuzziness

    # Làm tròn giá trị hiển thị
    def wdvl(val):
        return str(round(val, 4))

    labels_pred = np.argmax(U, axis=1)

    results = [
        title.ljust(8),                      # Tên thuật toán
        wdvl(process_time).ljust(6),         # Thời gian chạy
        str(step).ljust(6),                  # Số bước lặp
        wdvl(davies_bouldin_score(X, labels_pred)).ljust(6),  # DB index
        wdvl(partition_coefficient(U)).ljust(6),  # PC
        wdvl(classification_entropy(U)).ljust(6),  # CE
        wdvl(separation(X, U, V, M)).ljust(8),  # S
        wdvl(calinski_harabasz_score(X, labels_pred)).ljust(10),  # CH
        wdvl(silhouette_score(X, labels_pred)).ljust(6),  # SI
        wdvl(hypervolume(U, M)).ljust(6),  # FHV
        wdvl(cs(X, U, V, M)).ljust(6),  # CS
    ]
    
    print("  ".join(results))  # In theo cột


In [None]:
# In tiêu đề bảng
print("Alg      Time   Step   DB-    PC-    CE-    S-      CH+        SI+    FHV+   CS-")
print("-" * 80)
datasets = [ "D:/Truong_NCKH/Data/Iris.csv", "D:/Truong_NCKH/Data/Wine.csv"]
# Chạy thuật toán và in kết quả
for dataset in datasets:
    df = pd.read_csv(dataset)
    labels = df.iloc[:, -1].values
    data = StandardScaler().fit_transform(df.iloc[:, :-1].values)
    
    n_clusters = len(np.unique(labels))
    
    start_time = time.perf_counter()  
    centroids, membership_matrix = cfcm_clustering(data, labels, n_clusters)
    process_time = time.perf_counter() - start_time  

    evaluate_and_print_results("CFCM", data, membership_matrix, centroids, process_time=process_time, step=5)


Alg      Time   Step   DB-    PC-    CE-    S-      CH+        SI+    FHV+   CS-
--------------------------------------------------------------------------------
CFCM      0.0841  5       1.2422  0.3753  8.7195  911.2534  2.8809      -0.3225  66.4267  911.2534
