In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [2]:
file = 'datasetTC4.dat'
data = pd.read_csv(file, header=None, sep=' ')
data = data.iloc[:, :-1]
data.head()

Unnamed: 0,0,1,2,3,4,5
0,63.03,22.55,39.61,40.48,98.67,-0.25
1,39.06,10.06,25.02,29.0,114.41,4.56
2,68.83,22.22,50.09,46.61,105.99,-3.53
3,69.3,24.65,44.31,44.64,101.87,11.21
4,49.71,9.65,28.32,40.06,108.17,7.92


In [3]:
# Passo 1 - Normalizar os dados
scaler = StandardScaler()
data_normalized = scaler.fit_transform(data)
data_normalized

array([[ 0.14722652,  0.50111133, -0.66512805, -0.18460234, -1.44783071,
        -0.70794606],
       [-1.24570706, -0.74889057, -1.45276272, -1.04124965, -0.26402779,
        -0.57967342],
       [ 0.48427345,  0.46808485, -0.0993699 ,  0.27282344, -0.89729467,
        -0.79541679],
       ...,
       [ 0.05541029,  0.51512256, -0.31098936, -0.31369641,  0.58283504,
        -0.77354911],
       [-0.88599664, -0.88600047, -0.55877847, -0.47711606,  0.04734096,
        -0.69567882],
       [-1.54904929, -1.24829085, -0.82546218, -1.05841244,  0.45347411,
        -0.70661266]])

## Questão Única: Implementar os algoritmos K-médias e K-medianas usando o conjunto de dados disponibilizado no SIGAA (datasetTC4.dat).

### 1.1 Usando o algoritmo K-médias, estimar o número de agrupamentos através dos ı́ndices de validação:

## 1 - Dunn

In [4]:
# Função para calcular o índice de Dunn
def dunn_index(X, labels, centroids):
    X_array = X.values if isinstance(X, pd.DataFrame) else X
    dist_matrix = np.linalg.norm(X_array - centroids[labels], axis=-1)

    min_diameter = np.inf
    for k in range(len(centroids)):
        cluster_points = X_array[labels == k]
        cluster_diameter = np.max(np.linalg.norm(cluster_points - centroids[k], axis=-1))
        min_diameter = min(min_diameter, cluster_diameter)

    mask = labels != labels[:, np.newaxis]
    min_intercluster_distance = np.min(dist_matrix[mask.any(axis=1)])

    if min_diameter == 0:
        return np.nan
    else:
        return min_intercluster_distance / min_diameter

## 2 - Davies-Bouldin

In [5]:
# Função para calcular o índice de Davies-Bouldin
def davies_bouldin_index(X, labels, centroids):
    X_array = X.values if isinstance(X, pd.DataFrame) else X
    num_clusters = len(centroids)
    distances_matrix = np.zeros((num_clusters, num_clusters))

    for i in range(num_clusters):
        for j in range(i + 1, num_clusters):
            distances_matrix[i, j] = np.linalg.norm(centroids[i] - centroids[j])
            distances_matrix[j, i] = distances_matrix[i, j]

    max_cluster_distances = np.zeros(num_clusters)

    for k in range(num_clusters):
        cluster_points = X_array[labels == k]
        mean_distance = np.mean(np.linalg.norm(cluster_points - centroids[k], axis=-1))
        max_cluster_distances[k] = max([distances_matrix[k, j] for j in range(num_clusters) if j != k])

    return np.sum(max_cluster_distances) / num_clusters

## 3 - Calinski-Harabasz

In [6]:
# Função para calcular a métrica de Calinski-Harabasz
def calinski_harabasz_index(X, labels, centroids):
    X_array = X.values if isinstance(X, pd.DataFrame) else X
    num_samples, num_features = X_array.shape
    num_clusters = len(centroids)

    overall_mean = np.mean(X_array, axis=0)
    overall_ss = np.sum(np.sum((X_array - overall_mean) ** 2, axis=1))

    between_cluster_ss = 0
    within_cluster_ss = 0

    for k in range(num_clusters):
        cluster_points = X_array[labels == k]
        cluster_size = len(cluster_points)

        cluster_mean = np.mean(cluster_points, axis=0)
        between_cluster_ss += cluster_size * np.sum((cluster_mean - overall_mean) ** 2)

        within_cluster_ss += np.sum(np.sum((cluster_points - cluster_mean) ** 2, axis=1))

    return (between_cluster_ss / (num_clusters - 1)) / (within_cluster_ss / (num_samples - num_clusters))

In [7]:
# Realizar a análise para diferentes valores de K
Kmax = 10

# Inicializar listas para armazenar os resultados dos índices
dunn_scores = []
davies_bouldin_scores = []
calinski_harabasz_scores = []

for k in range(2, Kmax + 1):
    kmeans = KMeans(n_clusters=k, random_state=np.random.randint(1000))
    labels = kmeans.fit_predict(data_normalized)
    centroids = kmeans.cluster_centers_

    # Calcular os índices para cada valor de K
    dunn_scores.append(dunn_index(data_normalized, labels, centroids))
    davies_bouldin_scores.append(davies_bouldin_index(data_normalized, labels, centroids))
    calinski_harabasz_scores.append(calinski_harabasz_index(data_normalized, labels, centroids))

# Encontrar o valor ótimo de K para cada índice
optimal_k_dunn = np.argmax(dunn_scores) + 2  # Adiciona 2 porque começamos com K=2
optimal_k_davies_bouldin = np.argmin(davies_bouldin_scores) + 2
optimal_k_calinski_harabasz = np.argmax(calinski_harabasz_scores) + 2

# Imprimir os resultados
print("Resultado do índice de Dunn:")
print(" - Valor ótimo de K:", optimal_k_dunn)
print(" - Índices de Dunn para diferentes valores de K:", dunn_scores)

print("\nResultado do índice de Davies-Bouldin:")
print(" - Valor ótimo de K:", optimal_k_davies_bouldin)
print(" - Índices de Davies-Bouldin para diferentes valores de K:", davies_bouldin_scores)

print("\nResultado do índice de Calinski-Harabasz:")
print(" - Valor ótimo de K:", optimal_k_calinski_harabasz)
print(" - Índices de Calinski-Harabasz para diferentes valores de K:", calinski_harabasz_scores)

Resultado do índice de Dunn:
 - Valor ótimo de K: 4
 - Índices de Dunn para diferentes valores de K: [0.11159227241921335, 0.055987098703777854, nan, nan, nan, nan, nan, nan, nan]

Resultado do índice de Davies-Bouldin:
 - Valor ótimo de K: 2
 - Índices de Davies-Bouldin para diferentes valores de K: [3.077053165076801, 3.665931438483645, 12.871727821274801, 12.807325112711743, 12.733667168941963, 12.817792421129942, 12.837443300069733, 12.763230074239752, 12.816821790291181]

Resultado do índice de Calinski-Harabasz:
 - Valor ótimo de K: 2
 - Índices de Calinski-Harabasz para diferentes valores de K: [189.336302284451, 153.57004061168072, 134.11615615633877, 123.03593251654657, 118.47732266440362, 112.4223433739261, 106.81011901133024, 102.15576662798888, 97.88932855233584]


## (i) Qual valor para o número de agrupamentos foi sugerido por cada técnica de validação?

## (ii) Se houve divergência entre os resultados sugeridos pelos ı́ndices, o que justifica tal divergência?

## 1.2 Rodar o algoritmo K-medianas usando o número de agrupamentos escolhidos no Subitem 1