In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Load the space-separated dataset (no header row) and drop its last column.
file = 'datasetTC4.dat'
data = pd.read_csv(file, header=None, sep=' ').iloc[:, :-1]

# Normalize the remaining columns with StandardScaler before clustering.
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)

# Dunn index and the pairwise-distance helper it relies on
def _pairwise_distance_extreme(A, B, reducer, chunk_size=512):
    """Reduce every Euclidean distance between rows of A and rows of B.

    ``reducer`` is ``np.min`` or ``np.max``. A is processed ``chunk_size``
    rows at a time so the temporary distance block stays bounded.
    """
    partial = [
        reducer(
            np.linalg.norm(
                A[start:start + chunk_size, np.newaxis, :] - B[np.newaxis, :, :],
                axis=-1,
            )
        )
        for start in range(0, len(A), chunk_size)
    ]
    return reducer(partial)


def dunn_index(X, labels, centroids):
    """Return the Dunn index of a clustering (higher is better).

    The index is the smallest distance between two points that belong to
    different clusters, divided by the largest cluster diameter (the greatest
    distance between two points of the same cluster).

    Args:
        X: Samples as a 2-D array-like or DataFrame, one row per sample.
        labels: Cluster label of each row of X; labels are matched against
            the integers 0 .. len(centroids) - 1.
        centroids: Cluster centres. Only their count is used, to enumerate
            the cluster labels; the argument is kept for interface
            compatibility.

    Returns:
        The Dunn index, or ``np.nan`` when it is undefined: fewer than two
        non-empty clusters, or every cluster has zero diameter.

    Note:
        The cost is quadratic in the number of samples.
    """
    X_array = np.asarray(X, dtype=float)
    labels = np.asarray(labels)

    # Group the samples per cluster, ignoring clusters that received no point.
    clusters = [X_array[labels == k] for k in range(len(centroids))]
    clusters = [points for points in clusters if len(points) > 0]
    if len(clusters) < 2:
        return np.nan

    # Denominator: the LARGEST intra-cluster diameter.
    max_diameter = max(
        _pairwise_distance_extreme(points, points, np.max) for points in clusters
    )
    if max_diameter == 0:
        return np.nan

    # Numerator: the SMALLEST distance between points of different clusters.
    min_intercluster_distance = min(
        _pairwise_distance_extreme(first, second, np.min)
        for i, first in enumerate(clusters)
        for second in clusters[i + 1:]
    )

    return min_intercluster_distance / max_diameter

# Davies-Bouldin index computation
def davies_bouldin_index(X, labels, centroids):
    """Return the Davies-Bouldin index of a clustering (lower is better).

    For every cluster i the scatter S_i is the mean distance of its points to
    its centroid. The similarity of clusters i and j is
    R_ij = (S_i + S_j) / d_ij, where d_ij is the distance between their
    centroids. The index is the mean over i of max_{j != i} R_ij.

    Args:
        X: Samples as a 2-D array-like or DataFrame, one row per sample.
        labels: Cluster label of each row of X; labels are matched against
            the integers 0 .. len(centroids) - 1.
        centroids: Cluster centres, one row per cluster.

    Returns:
        The Davies-Bouldin index as a float.

    Raises:
        ValueError: If fewer than two centroids are given.
    """
    X_array = np.asarray(X, dtype=float)
    labels = np.asarray(labels)
    centroids = np.asarray(centroids, dtype=float)
    num_clusters = len(centroids)
    if num_clusters < 2:
        raise ValueError("Davies-Bouldin index requires at least 2 clusters")

    # S_k: mean distance of the cluster's points to its centroid.
    # A cluster without points keeps a scatter of 0.
    scatter = np.zeros(num_clusters)
    for k in range(num_clusters):
        cluster_points = X_array[labels == k]
        if len(cluster_points) > 0:
            scatter[k] = np.mean(
                np.linalg.norm(cluster_points - centroids[k], axis=-1)
            )

    # d_ij: distance between every pair of centroids.
    centroid_distances = np.linalg.norm(
        centroids[:, np.newaxis, :] - centroids[np.newaxis, :, :], axis=-1
    )
    # A zero distance (the diagonal, or coincident centroids) is mapped to
    # infinity so the corresponding ratio is 0 instead of a division by zero.
    centroid_distances[centroid_distances == 0] = np.inf

    similarity = (scatter[:, np.newaxis] + scatter[np.newaxis, :]) / centroid_distances
    return np.mean(np.max(similarity, axis=1))

# Calinski-Harabasz index computation
def calinski_harabasz_index(X, labels, centroids):
    """Return the Calinski-Harabasz index of a clustering.

    The index is the between-cluster sum of squares divided by
    (number of clusters - 1), over the within-cluster sum of squares divided
    by (number of samples - number of clusters).

    Args:
        X: Samples as a 2-D NumPy array or DataFrame, one row per sample.
        labels: NumPy array with the cluster label of each row of X; labels
            are matched against the integers 0 .. len(centroids) - 1.
        centroids: Cluster centres. Only their count is used; cluster means
            are recomputed from the samples.

    Returns:
        The Calinski-Harabasz index.
    """
    if isinstance(X, pd.DataFrame):
        samples = X.values
    else:
        samples = X

    n_samples, _ = samples.shape
    n_clusters = len(centroids)
    grand_mean = np.mean(samples, axis=0)

    between_ss = 0
    within_ss = 0
    for cluster_id in range(n_clusters):
        members = samples[labels == cluster_id]
        center = np.mean(members, axis=0)

        # Squared offset of the cluster mean from the global mean,
        # weighted by the number of points in the cluster.
        offset_sq = np.sum((center - grand_mean) ** 2)
        between_ss += len(members) * offset_sq

        # Squared distances of the cluster's points to the cluster mean.
        per_point_sq = ((members - center) ** 2).sum(axis=1)
        within_ss += np.sum(per_point_sq)

    between_term = between_ss / (n_clusters - 1)
    within_term = within_ss / (n_samples - n_clusters)
    return between_term / within_term

# Run the analysis for K = 2 .. Kmax
Kmax = 10

# Index values collected for each K, in increasing order of K
dunn_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []

for k in range(2, Kmax + 1):
    # NOTE: the seed is drawn from NumPy's global RNG on every iteration, so
    # results vary between runs unless that RNG is seeded beforehand.
    kmeans = KMeans(n_clusters=k, random_state=np.random.randint(1000))
    labels = kmeans.fit_predict(data_normalized)
    centroids = kmeans.cluster_centers_

    # Evaluate the three validity indices for this K
    dunn_scores.append(dunn_index(data_normalized, labels, centroids))
    davies_bouldin_scores.append(davies_bouldin_index(data_normalized, labels, centroids))
    calinski_harabasz_scores.append(calinski_harabasz_index(data_normalized, labels, centroids))

# Pick the best K for each index. NaN scores (undefined index) are replaced
# by the worst possible value first: np.argmax / np.argmin would otherwise
# return the position of a NaN entry and report that K as optimal.
dunn_array = np.asarray(dunn_scores, dtype=float)
davies_bouldin_array = np.asarray(davies_bouldin_scores, dtype=float)
calinski_harabasz_array = np.asarray(calinski_harabasz_scores, dtype=float)

# The "+ 2" maps a list position back to K, because the sweep starts at K=2.
optimal_k_dunn = np.argmax(np.where(np.isnan(dunn_array), -np.inf, dunn_array)) + 2
optimal_k_davies_bouldin = np.argmin(
    np.where(np.isnan(davies_bouldin_array), np.inf, davies_bouldin_array)
) + 2
optimal_k_calinski_harabasz = np.argmax(
    np.where(np.isnan(calinski_harabasz_array), -np.inf, calinski_harabasz_array)
) + 2

# Print the results
print("Resultado do índice de Dunn:")
print(" - Valor ótimo de K:", optimal_k_dunn)
print(" - Índices de Dunn para diferentes valores de K:", dunn_scores)

print("\nResultado do índice de Davies-Bouldin:")
print(" - Valor ótimo de K:", optimal_k_davies_bouldin)
print(" - Índices de Davies-Bouldin para diferentes valores de K:", davies_bouldin_scores)

print("\nResultado do índice de Calinski-Harabasz:")
print(" - Valor ótimo de K:", optimal_k_calinski_harabasz)
print(" - Índices de Calinski-Harabasz para diferentes valores de K:", calinski_harabasz_scores)
