In [None]:
!pip install numpy matplotlib scipy scikit-learn pandas seaborn

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import pdist, squareform
from collections import defaultdict
import multiprocessing as mp

def compute_delta_pair(args):
    """Return the smallest distance between the points of two labelled groups.

    The arguments arrive packed in one tuple so the function can be handed
    directly to ``Pool.map``.

    Parameters
    ----------
    args : tuple
        ``(X, labels, i, j)`` where ``X`` holds the samples, ``labels`` the
        label of each sample, and ``i`` / ``j`` are the two label values
        whose groups are compared.

    Returns
    -------
    The minimum entry of the pairwise-distance matrix between the samples
    labelled ``i`` and the samples labelled ``j``.
    """
    points, assignments, label_a, label_b = args
    members_a = points[assignments == label_a]
    members_b = points[assignments == label_b]
    cross_distances = pairwise_distances(members_a, members_b)
    return cross_distances.min()

if __name__ == "__main__":
    # Dunn Index Calculation - Serial
    def dunn_index_serial(X, labels):
        """Compute the Dunn index of a labelled dataset in a single process.

        The index is the smallest distance between two points that belong to
        different clusters divided by the largest distance between two points
        that belong to the same cluster.

        Parameters
        ----------
        X : array-like, 2-D (one row per sample)
            Sample coordinates.
        labels : array-like, 1-D
            Cluster label of each row of ``X``.

        Returns
        -------
        float
            ``min(inter-cluster distance) / max(cluster diameter)``. When every
            cluster has diameter 0 the result is ``inf`` (or ``nan`` if the
            minimum separation is also 0).

        Raises
        ------
        ValueError
            If ``labels`` contains fewer than two distinct values.
        """
        X = np.asarray(X)
        labels = np.asarray(labels)
        unique_labels = np.unique(labels)
        if len(unique_labels) < 2:
            raise ValueError("Dunn index requires at least two clusters")

        # Slice every cluster once instead of once per pair.
        clusters = [X[labels == label] for label in unique_labels]

        # Separation: closest pair of points for each pair of clusters.
        deltas = [
            np.min(pairwise_distances(clusters[a], clusters[b]))
            for a in range(len(clusters))
            for b in range(a + 1, len(clusters))
        ]

        # Diameter: farthest pair of points inside each cluster (0 for singletons).
        diameters = [
            np.max(pdist(cluster)) if len(cluster) > 1 else 0.0
            for cluster in clusters
        ]

        # All-singleton clusterings have a zero denominator; return inf/nan
        # silently instead of emitting a RuntimeWarning.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.min(deltas) / np.max(diameters)

    def dunn_index_parallel(X, labels):
        """Compute the Dunn index, evaluating cluster pairs in worker processes.

        Each pair of clusters becomes one task for ``compute_delta_pair``; the
        cluster diameters are computed in the calling process.

        Parameters
        ----------
        X : array-like, 2-D (one row per sample)
            Sample coordinates.
        labels : array-like, 1-D
            Cluster label of each row of ``X``.

        Returns
        -------
        float
            ``min(inter-cluster distance) / max(cluster diameter)``; ``inf`` or
            ``nan`` when every cluster has diameter 0.

        Raises
        ------
        ValueError
            If ``labels`` contains fewer than two distinct values.

        Notes
        -----
        Workers started with the "spawn" or "forkserver" methods re-import the
        main module to find ``compute_delta_pair``. In a notebook that lookup
        fails (``AttributeError: Can't get attribute 'compute_delta_pair'``)
        and the pool hangs. The "fork" method is therefore preferred when the
        platform offers it, because forked workers inherit the parent's
        namespace. Where "fork" is unavailable (e.g. Windows), move
        ``compute_delta_pair`` into an importable ``.py`` module.
        """
        X = np.asarray(X)
        labels = np.asarray(labels)
        unique_labels = np.unique(labels)
        if len(unique_labels) < 2:
            # Fail before paying for a process pool.
            raise ValueError("Dunn index requires at least two clusters")

        args_list = [(X, labels, unique_labels[i], unique_labels[j])
                     for i in range(len(unique_labels))
                     for j in range(i + 1, len(unique_labels))]

        if "fork" in mp.get_all_start_methods():
            ctx = mp.get_context("fork")
        else:
            ctx = mp.get_context()

        # Never start more workers than there are cluster pairs to process.
        n_workers = min(mp.cpu_count(), len(args_list))
        with ctx.Pool(processes=n_workers) as pool:
            deltas = pool.map(compute_delta_pair, args_list)

        # Diameter: farthest pair of points inside each cluster (0 for singletons).
        diameters = []
        for k in unique_labels:
            cluster_k = X[labels == k]
            diameters.append(np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0.0)

        # All-singleton clusterings have a zero denominator; return inf/nan
        # silently instead of emitting a RuntimeWarning.
        with np.errstate(divide="ignore", invalid="ignore"):
            return np.min(deltas) / np.max(diameters)

    # Data - toy dataset
    data = load_iris()
    X = data.data
    labels = data.target

    # Repeat the experiment, keeping only subsets that contain at least two clusters
    times_serial = []
    times_parallel = []
    valid_sizes = []
    sizes = list(range(10, 151, 10))

    for size in sizes:
        X_subset = X[:size]
        labels_subset = labels[:size]

        if len(np.unique(labels_subset)) < 2:
            continue  # skip subsets with fewer than two clusters

        valid_sizes.append(size)

        # perf_counter() is the clock intended for measuring short intervals;
        # time.time() is wall-clock time and can be coarse or adjusted mid-run.
        start = time.perf_counter()
        dunn_index_serial(X_subset, labels_subset)
        times_serial.append(time.perf_counter() - start)

        # The parallel measurement includes creating the process pool.
        start = time.perf_counter()
        dunn_index_parallel(X_subset, labels_subset)
        times_parallel.append(time.perf_counter() - start)

    # Updated plot
    plt.figure(figsize=(10, 6))
    plt.plot(valid_sizes, times_serial, label="Serial", marker='o')
    plt.plot(valid_sizes, times_parallel, label="Paralelo (Multiprocessing)", marker='x')
    plt.xlabel("Tamanho do dataset (n amostras)")
    plt.ylabel("Tempo de execução (s)")
    plt.title("Comparação de desempenho: Serial vs Paralelo (Índice de Dunn)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()


Process SpawnPoolWorker-1:
Traceback (most recent call last):
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
AttributeError: Can't get attribute 'compute_delta_pair' on <module '__main__' (built-in)>
Process SpawnPoolWorker-6:
Process SpawnPoolWorker-2:
Process SpawnPoolWorker-4:
Process SpawnPoolWorker-9:
Process SpawnPoolWorker-5:
Process SpawnPoolWorker-3:
Process SpawnPoolWorker-8:
Process SpawnPoolWorker-7:
Traceback (most recent call last):
  File "/Users/macbookpro/miniconda3/envs/cad/

KeyboardInterrupt: 

**TODO:** testar na GPU de casa (a célula abaixo requer uma GPU NVIDIA com CUDA e o pacote `numba`).

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist, squareform
import multiprocessing as mp
from numba import cuda, float32
cuda.select_device(0)

# ---------- SERIAL VERSION ----------
def dunn_index_serial(X, labels):
    """Compute the Dunn index of a labelled dataset in a single process.

    The index is the smallest distance between two points that belong to
    different clusters divided by the largest distance between two points
    that belong to the same cluster.

    Parameters
    ----------
    X : array-like, 2-D (one row per sample)
        Sample coordinates.
    labels : array-like, 1-D
        Cluster label of each row of ``X``.

    Returns
    -------
    float
        ``min(inter-cluster distance) / max(cluster diameter)``. When every
        cluster has diameter 0 the result is ``inf`` (or ``nan`` if the
        minimum separation is also 0).

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct values.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        raise ValueError("Dunn index requires at least two clusters")

    # Slice every cluster once instead of once per pair.
    clusters = [X[labels == label] for label in unique_labels]

    # Separation: closest pair of points for each pair of clusters.
    deltas = [
        np.min(pairwise_distances(clusters[a], clusters[b]))
        for a in range(len(clusters))
        for b in range(a + 1, len(clusters))
    ]

    # Diameter: farthest pair of points inside each cluster (0 for singletons).
    diameters = [
        np.max(pdist(cluster)) if len(cluster) > 1 else 0.0
        for cluster in clusters
    ]

    # All-singleton clusterings have a zero denominator; return inf/nan
    # silently instead of emitting a RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.min(deltas) / np.max(diameters)

# ---------- PARALLEL CPU VERSION ----------
def compute_delta_pair(args):
    """Worker task: minimum distance between the samples of two labels.

    ``args`` is the tuple ``(X, labels, i, j)``; it is passed as a single
    object so the function can be mapped over a list of tasks by a pool.
    Returns the smallest value of the pairwise-distance matrix between the
    rows of ``X`` labelled ``i`` and the rows labelled ``j``.
    """
    X, labels, i, j = args
    # Select both groups with the same boolean-mask lookup.
    first_group, second_group = (X[labels == tag] for tag in (i, j))
    return pairwise_distances(first_group, second_group).min()

def dunn_index_parallel(X, labels):
    """Compute the Dunn index, evaluating cluster pairs in worker processes.

    Each pair of clusters becomes one task for ``compute_delta_pair``; the
    cluster diameters are computed in the calling process.

    Parameters
    ----------
    X : array-like, 2-D (one row per sample)
        Sample coordinates.
    labels : array-like, 1-D
        Cluster label of each row of ``X``.

    Returns
    -------
    float
        ``min(inter-cluster distance) / max(cluster diameter)``; ``inf`` or
        ``nan`` when every cluster has diameter 0.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct values.

    Notes
    -----
    Workers started with the "spawn" or "forkserver" methods re-import the
    main module to find ``compute_delta_pair``. In a notebook that lookup
    fails (``AttributeError: Can't get attribute 'compute_delta_pair'``) and
    the pool hangs. The "fork" method is therefore preferred when the platform
    offers it, because forked workers inherit the parent's namespace. Where
    "fork" is unavailable (e.g. Windows), move ``compute_delta_pair`` into an
    importable ``.py`` module.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        # Fail before paying for a process pool.
        raise ValueError("Dunn index requires at least two clusters")

    args_list = [(X, labels, unique_labels[i], unique_labels[j])
                 for i in range(len(unique_labels))
                 for j in range(i + 1, len(unique_labels))]

    if "fork" in mp.get_all_start_methods():
        ctx = mp.get_context("fork")
    else:
        ctx = mp.get_context()

    # Never start more workers than there are cluster pairs to process.
    n_workers = min(mp.cpu_count(), len(args_list))
    with ctx.Pool(processes=n_workers) as pool:
        deltas = pool.map(compute_delta_pair, args_list)

    # Diameter: farthest pair of points inside each cluster (0 for singletons).
    diameters = []
    for k in unique_labels:
        cluster_k = X[labels == k]
        diameters.append(np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0.0)

    # All-singleton clusterings have a zero denominator; return inf/nan
    # silently instead of emitting a RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.min(deltas) / np.max(diameters)

# ---------- GPU VERSION ----------
@cuda.jit
def euclidean_distance_kernel(a, b, result):
    """CUDA kernel: store the Euclidean distance between rows of ``a`` and ``b``.

    The thread at 2-D grid position ``(row, col)`` writes the distance between
    ``a[row]`` and ``b[col]`` into ``result[row, col]``. Threads whose position
    falls outside ``(a.shape[0], b.shape[0])`` do nothing.
    """
    row, col = cuda.grid(2)
    # Guard: the launch grid may be larger than the output matrix.
    if row >= a.shape[0] or col >= b.shape[0]:
        return

    squared_sum = 0.
    for dim in range(a.shape[1]):
        delta = a[row, dim] - b[col, dim]
        squared_sum += delta * delta
    result[row, col] = squared_sum ** 0.5

def gpu_pairwise_distances(cluster_i, cluster_j):
    """Compute the Euclidean distance matrix between two point sets on the GPU.

    Parameters
    ----------
    cluster_i, cluster_j : array-like, 2-D (one row per point)
        Point sets with the same number of columns. Both are converted to
        ``float32`` before being copied to the device.

    Returns
    -------
    numpy.ndarray of float32, shape ``(len(cluster_i), len(cluster_j))``
        Entry ``[r, c]`` is the distance between ``cluster_i[r]`` and
        ``cluster_j[c]``. If either input has no rows, an empty array of the
        matching shape is returned without touching the GPU.
    """
    # ascontiguousarray skips the copy when the input is already float32.
    a = np.ascontiguousarray(cluster_i, dtype=np.float32)
    b = np.ascontiguousarray(cluster_j, dtype=np.float32)
    out_shape = (a.shape[0], b.shape[0])

    # Nothing to compute; also avoids launching a kernel with an empty grid.
    if out_shape[0] == 0 or out_shape[1] == 0:
        return np.empty(out_shape, dtype=np.float32)

    d_a = cuda.to_device(a)
    d_b = cuda.to_device(b)
    # Allocate the output directly on the device; every in-range element is
    # written by the kernel, so no host-side template buffer is needed.
    d_result = cuda.device_array(out_shape, dtype=np.float32)

    # Ceil-divide so the grid covers the whole output matrix.
    threadsperblock = (16, 16)
    blockspergrid_x = (out_shape[0] + threadsperblock[0] - 1) // threadsperblock[0]
    blockspergrid_y = (out_shape[1] + threadsperblock[1] - 1) // threadsperblock[1]
    blockspergrid = (blockspergrid_x, blockspergrid_y)

    euclidean_distance_kernel[blockspergrid, threadsperblock](d_a, d_b, d_result)
    return d_result.copy_to_host()

def dunn_index_cuda(X, labels):
    """Compute the Dunn index using the GPU for the inter-cluster distances.

    The distance matrix of every pair of clusters is produced by
    ``gpu_pairwise_distances``; the cluster diameters are computed on the CPU
    with ``pdist``.

    Parameters
    ----------
    X : array-like, 2-D (one row per sample)
        Sample coordinates.
    labels : array-like, 1-D
        Cluster label of each row of ``X``.

    Returns
    -------
    float
        ``min(inter-cluster distance) / max(cluster diameter)``; ``inf`` or
        ``nan`` when every cluster has diameter 0.

    Raises
    ------
    ValueError
        If ``labels`` contains fewer than two distinct values.
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    unique_labels = np.unique(labels)
    if len(unique_labels) < 2:
        raise ValueError("Dunn index requires at least two clusters")

    # Slice every cluster once instead of once per pair.
    clusters = [X[labels == label] for label in unique_labels]

    # Separation: closest pair of points for each pair of clusters.
    deltas = []
    for a in range(len(clusters)):
        for b in range(a + 1, len(clusters)):
            distances = gpu_pairwise_distances(clusters[a], clusters[b])
            deltas.append(np.min(distances))

    # Diameter: farthest pair of points inside each cluster (0 for singletons).
    diameters = [
        np.max(pdist(cluster)) if len(cluster) > 1 else 0.0
        for cluster in clusters
    ]

    # All-singleton clusterings have a zero denominator; return inf/nan
    # silently instead of emitting a RuntimeWarning.
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.min(deltas) / np.max(diameters)

# ---------- EXECUTION ----------
# Load the toy dataset used for the benchmark.
data = load_iris()
X = data.data
labels = data.target

sizes = list(range(30, 151, 30))
times_serial = []
times_parallel = []
times_cuda = []
valid_sizes = []

for size in sizes:
    X_subset = X[:size]
    labels_subset = labels[:size]
    # The Dunn index needs at least two clusters; skip prefixes with one label.
    if len(np.unique(labels_subset)) < 2:
        continue

    valid_sizes.append(size)

    # perf_counter() is the clock intended for measuring short intervals;
    # time.time() is wall-clock time and can be coarse or adjusted mid-run.
    start = time.perf_counter()
    dunn_index_serial(X_subset, labels_subset)
    times_serial.append(time.perf_counter() - start)

    # The parallel measurement includes creating the process pool.
    start = time.perf_counter()
    dunn_index_parallel(X_subset, labels_subset)
    times_parallel.append(time.perf_counter() - start)

    # NOTE(review): the first CUDA measurement presumably also includes the
    # kernel's JIT compilation - confirm, and consider a warm-up call.
    try:
        start = time.perf_counter()
        dunn_index_cuda(X_subset.astype(np.float32), labels_subset)
        times_cuda.append(time.perf_counter() - start)
    except cuda.cudadrv.error.CudaSupportError:
        # Record the failed run as missing so the lists stay aligned with valid_sizes.
        times_cuda.append(None)

# ---------- PLOT ----------
plt.figure(figsize=(10, 6))
plt.plot(valid_sizes, times_serial, label="Serial (CPU)", marker='o')
plt.plot(valid_sizes, times_parallel, label="Paralelo (Multiprocessing)", marker='x')
# Draw the CUDA curve only if at least one GPU run succeeded.
if any(t is not None for t in times_cuda):
    # Failed runs become NaN so matplotlib leaves a gap; plotting them as 0
    # would show a failure as an instantaneous run.
    cuda_times_cleaned = [t if t is not None else np.nan for t in times_cuda]
    plt.plot(valid_sizes, cuda_times_cleaned, label="CUDA (GPU)", marker='^')
plt.xlabel("Tamanho do dataset (n amostras)")
plt.ylabel("Tempo de execução (s)")
plt.title("Comparação de desempenho: Serial vs Paralelo CPU vs CUDA (Índice de Dunn)")
plt.legend()
plt.grid(True)
plt.tight_layout()
# Save before show(): some backends clear the figure once it has been shown.
plt.savefig("comparacao_cuda_dunn.png")
plt.show()