## DATASET TOY

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.datasets import load_iris
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import LabelEncoder
from scipy.spatial.distance import pdist, squareform
from collections import defaultdict
import multiprocessing as mp

def compute_delta_pair(args):
    X, labels, i, j = args
    cluster_i = X[labels == i]
    cluster_j = X[labels == j]
    return np.min(pairwise_distances(cluster_i, cluster_j))

if __name__ == "__main__":
    def dunn_index_serial(X, labels):
        distances = squareform(pdist(X))
        unique_labels = np.unique(labels)
        deltas = []
        diameters = []

        for i in range(len(unique_labels)):
            for j in range(i + 1, len(unique_labels)):
                cluster_i = X[labels == unique_labels[i]]
                cluster_j = X[labels == unique_labels[j]]
                delta = np.min(pairwise_distances(cluster_i, cluster_j))
                deltas.append(delta)

        for k in unique_labels:
            cluster_k = X[labels == k]
            diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
            diameters.append(diameter)

        return np.min(deltas) / np.max(diameters)

    def dunn_index_parallel(X, labels):
        distances = squareform(pdist(X))
        unique_labels = np.unique(labels)
        args_list = [(X, labels, unique_labels[i], unique_labels[j]) 
                    for i in range(len(unique_labels)) 
                    for j in range(i + 1, len(unique_labels))]

        with mp.Pool(processes=mp.cpu_count()) as pool:
            deltas = pool.map(compute_delta_pair, args_list)

        diameters = []
        for k in unique_labels:
            cluster_k = X[labels == k]
            diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
            diameters.append(diameter)

        return np.min(deltas) / np.max(diameters)

    data = load_iris()
    X = data.data
    labels = data.target

    times_serial = []
    times_parallel = []
    valid_sizes = []
    sizes = list(range(10, 151, 10))

    for size in sizes:
        X_subset = X[:size]
        labels_subset = labels[:size]

        if len(np.unique(labels_subset)) < 2:
            continue  

        valid_sizes.append(size)

        start = time.time()
        dunn_index_serial(X_subset, labels_subset)
        times_serial.append(time.time() - start)

        start = time.time()
        dunn_index_parallel(X_subset, labels_subset)
        times_parallel.append(time.time() - start)

    # Gráfico atualizado
    plt.figure(figsize=(10, 6))
    plt.plot(valid_sizes, times_serial, label="Serial", marker='o')
    plt.plot(valid_sizes, times_parallel, label="Paralelo (Multiprocessing)", marker='x')
    plt.xlabel("Tamanho do dataset (n amostras)")
    plt.ylabel("Tempo de execução (s)")
    plt.title("Comparação de desempenho: Serial vs Paralelo (Índice de Dunn)")
    plt.legend()
    plt.grid(True)
    plt.tight_layout()
    plt.show()

## DATASET KAGGLE - MID LENGTH - Customer Segmentation Dataset

In [None]:
import pandas as pd
df = pd.read_excel("data/datamid.xlsx")  
df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680.0,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680.0,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680.0,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680.0,France


In [5]:
df.dropna(subset=["CustomerID"], inplace=True)
df = df[df["Quantity"] > 0]
df = df[df["UnitPrice"] > 0]

df["TotalPrice"] = df["Quantity"] * df["UnitPrice"]

In [6]:
import numpy as np

df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
data_ref = df["InvoiceDate"].max() + pd.Timedelta(days=1)

rfm = df.groupby("CustomerID").agg({
    "InvoiceDate": lambda x: (data_ref - x.max()).days,  
    "InvoiceNo": "nunique",                              
    "TotalPrice": "sum"                                  
})

rfm.columns = ["Recency", "Frequency", "Monetary"]
rfm.reset_index(inplace=True)

from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(rfm[["Recency", "Frequency", "Monetary"]])

In [7]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=4, random_state=42)
labels = kmeans.fit_predict(X)

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist, squareform
from collections import Counter
import multiprocessing as mp

def dunn_index_serial(X, labels):
    print("🔁 [Serial] Iniciando cálculo do índice de Dunn...")
    start_time = time.time()

    distances = squareform(pdist(X))
    unique_labels = np.unique(labels)
    deltas = []
    diameters = []

    for i in range(len(unique_labels)):
        for j in range(i + 1, len(unique_labels)):
            cluster_i = X[labels == unique_labels[i]]
            cluster_j = X[labels == unique_labels[j]]
            delta = np.min(pairwise_distances(cluster_i, cluster_j))
            print(f"[Serial] Δ({unique_labels[i]},{unique_labels[j]}): {delta:.4f}")
            deltas.append(delta)

    for k in unique_labels:
        cluster_k = X[labels == k]
        diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
        print(f"[Serial] Diâmetro do cluster {k}: {diameter:.4f}")
        diameters.append(diameter)

    dunn = np.min(deltas) / np.max(diameters)
    print(f"✅ [Serial] Dunn index: {dunn:.4f} | Tempo: {time.time() - start_time:.2f}s\n")
    return dunn

In [None]:
from joblib import Parallel, delayed
from sklearn.metrics import pairwise_distances

def compute_delta_pair(args):
    X, labels, i, j = args
    cluster_i = X[labels == i]
    cluster_j = X[labels == j]
    result = np.min(pairwise_distances(cluster_i, cluster_j))
    print(f"[Δ] Distância mínima entre cluster {i} e {j}: {result:.4f}")
    return result

def dunn_index_parallel_joblib(X, labels):
    print("🔁 [Joblib] Iniciando cálculo do índice de Dunn com joblib...")
    start_time = time.time()

    unique_labels = np.unique(labels)
    args_list = [(X, labels, unique_labels[i], unique_labels[j]) 
                 for i in range(len(unique_labels)) 
                 for j in range(i + 1, len(unique_labels))]

    def _wrap_compute(args):
        i, j = args[2], args[3]
        res = compute_delta_pair(args)
        print(f"[Joblib] Δ({i},{j}) = {res:.4f}")
        return res

    deltas = Parallel(n_jobs=-1)(delayed(_wrap_compute)(args) for args in args_list)

    diameters = []
    for k in unique_labels:
        cluster_k = X[labels == k]
        diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
        print(f"[Joblib] Diâmetro do cluster {k}: {diameter:.4f}")
        diameters.append(diameter)

    dunn = np.min(deltas) / np.max(diameters)
    print(f"✅ [Joblib] Dunn index: {dunn:.4f} | Tempo: {time.time() - start_time:.2f}s\n")
    return dunn


In [22]:
if __name__ == "__main__":
    print("Serial:", dunn_index_serial(X, labels))
    print("Paralelo:", dunn_index_parallel(X, labels))


🔁 [Serial] Iniciando cálculo do índice de Dunn...
[Serial] Δ(0,1): 0.0113
[Serial] Δ(0,2): 8.4028
[Serial] Δ(0,3): 0.0982
[Serial] Δ(1,2): 10.0066
[Serial] Δ(1,3): 1.2900
[Serial] Δ(2,3): 2.4775
[Serial] Diâmetro do cluster 0: 2.7327
[Serial] Diâmetro do cluster 1: 2.3107
[Serial] Diâmetro do cluster 2: 32.6226
[Serial] Diâmetro do cluster 3: 14.3445
✅ [Serial] Dunn index: 0.0003 | Tempo: 0.14s

Serial: 0.000345168368830084
🔁 [Parallel] Iniciando cálculo do índice de Dunn com multiprocessing...


Process SpawnPoolWorker-43:
Process SpawnPoolWorker-44:
Traceback (most recent call last):
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/pool.py", line 114, in worker
    task = get()
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/queues.py", line 367, in get
    return _ForkingPickler.loads(res)
Traceback (most recent call last):
AttributeError: Can't get attribute 'compute_delta_pair' on <module '__main__' (built-in)>
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/Users/macbookpro/miniconda3/envs/cad/lib/python3.10/multiprocessing/process.py", line 108, in r

KeyboardInterrupt: 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.metrics import pairwise_distances
from scipy.spatial.distance import pdist, squareform
from collections import Counter
import multiprocessing as mp

# Função para a versão paralela
def compute_delta_pair(args):
    X, labels, i, j = args
    cluster_i = X[labels == i]
    cluster_j = X[labels == j]
    return np.min(pairwise_distances(cluster_i, cluster_j))

# Versão serial do índice de Dunn
def dunn_index_serial(X, labels):
    distances = squareform(pdist(X))
    unique_labels = np.unique(labels)
    deltas = []
    diameters = []

    for i in range(len(unique_labels)):
        for j in range(i + 1, len(unique_labels)):
            cluster_i = X[labels == unique_labels[i]]
            cluster_j = X[labels == unique_labels[j]]
            delta = np.min(pairwise_distances(cluster_i, cluster_j))
            deltas.append(delta)

    for k in unique_labels:
        cluster_k = X[labels == k]
        diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
        diameters.append(diameter)

    return np.min(deltas) / np.max(diameters)

# Versão paralela
def dunn_index_parallel(X, labels):
    unique_labels = np.unique(labels)
    args_list = [(X, labels, unique_labels[i], unique_labels[j]) 
                 for i in range(len(unique_labels)) 
                 for j in range(i + 1, len(unique_labels))]

    with mp.Pool(processes=mp.cpu_count()) as pool:
        deltas = pool.map(compute_delta_pair, args_list)

    diameters = []
    for k in unique_labels:
        cluster_k = X[labels == k]
        diameter = np.max(pdist(cluster_k)) if len(cluster_k) > 1 else 0
        diameters.append(diameter)

    return np.min(deltas) / np.max(diameters)

# ============================
# LEITURA DO DATASET DO KAGGLE
# ============================

df = pd.read_xlsx("data/datamid.xlsx")  # nome do arquivo após upload

# Assume colunas: 'f1', 'f2', ..., 'label' (ou similares)
features = df.drop(columns=['label']).values
labels = df['label'].values

print(f"✅ Dataset carregado com {features.shape[0]} amostras e {features.shape[1]} features.")
print(f"Clusters encontrados: {np.unique(labels)}")
print(f"Tamanho dos clusters: {Counter(labels)}")

# ========================
# CÁLCULO DOS ÍNDICES
# ========================

start = time.time()
dunn_s = dunn_index_serial(features, labels)
tempo_serial = time.time() - start

start = time.time()
dunn_p = dunn_index_parallel(features, labels)
tempo_parallel = time.time() - start

print("\n📊 Resultados:")
print(f"Índice de Dunn (Serial): {dunn_s:.4f} | Tempo: {tempo_serial:.2f} s")
print(f"Índice de Dunn (Paralelo): {dunn_p:.4f} | Tempo: {tempo_parallel:.2f} s")
print(f"Ganho de desempenho: {tempo_serial / tempo_parallel:.2f}x mais rápido\n")
