In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np 
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import pairwise_distances
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

## Importing the dataset


In [2]:
df = pd.read_csv("Data.csv")

In [3]:
X = np.array(df.drop("Diagnosis", axis=1))

# AGNES clustering

In [4]:
# Instantiate the model
clustering = AgglomerativeClustering(linkage='ward', n_clusters=2)
# Fit model
clustering.fit(X)

### Initial Silhouette Coefficient

In [5]:
# Extract the cluster labels
labels = clustering.labels_

In [6]:
# Calculate the Silhouette Coefficient
silhouette_avg = silhouette_score(X, labels)
print("Silhouette Coefficient: ", silhouette_avg)

Silhouette Coefficient:  0.5479462758760916


## Hyper-parameter tunning

#### a useful function for calculating silhouette score

In [7]:
def compute_silhouette_score(X, linkage):
    clustering = AgglomerativeClustering( n_clusters=2, linkage=linkage)
    cluster_labels = clustering.fit_predict(X)
    score = silhouette_score(X, cluster_labels)
    return score

#### Searching for the best hyper-parameter

In [8]:
linkages = ["ward", "complete", "average", "single"]
best_linkage = None
best_score = -1

for linkage in linkages:
    try:
        score = compute_silhouette_score(X, linkage)
        print(f"Silhouette score for linkage={linkage}: {score}")
        if score > best_score:
            best_score = score
            best_linkage = linkage
    except Exception as e:
        print("Failed")

Silhouette score for linkage=ward: 0.5479462758760916
Silhouette score for linkage=complete: 0.6166354826398824
Silhouette score for linkage=average: 0.5986986692237101
Silhouette score for linkage=single: 0.3742447061332927


In [9]:
print(f"Best linkage: {best_linkage}")
print(f"Best silhouette score: {best_score}")

Best linkage: complete
Best silhouette score: 0.6166354826398824


### Try clustering with tuned hyper-parameter

In [10]:
# Instantiate the model
clustering = AgglomerativeClustering(linkage='complete', n_clusters=2)
# Fit model
clustering.fit(X)

## Evaluation metrics: <br>
We will use 1-average within cluster distance , 2-Between-Cluster Distance and 3-Silhouette Coefficient

### 1-Average within cluster distance

In [11]:
# Extract the cluster labels
labels = clustering.labels_

In [12]:
# Function to calculate the average within-cluster distance
def average_within_cluster_distance(X, labels):
    unique_labels = np.unique(labels)
    average_distances = []
    
    for label in unique_labels:
        cluster_points = X[labels == label]
        # Calculate pairwise distances within the cluster
        if len(cluster_points) > 1:  # Ensure there are at least two points to calculate distance
            distances = pairwise_distances(cluster_points)
            avg_distance = np.sum(distances) / (2 * len(cluster_points))
            average_distances.append(avg_distance)
        else:
            average_distances.append(0)  # If only one point, distance is zero

    # Calculate the average of the averages for a global measure
    overall_average = np.mean(average_distances)
    return overall_average

# Calculate the average within-cluster distance
avg_distance = average_within_cluster_distance(X, labels)
print("Average Within-Cluster Distance:", avg_distance)


Average Within-Cluster Distance: 13064.78367175092


### 2-Between-Cluster Distance

In [13]:
# Calculate cluster centroids
def calculate_centroids(X, labels):
    unique_labels = np.unique(labels)
    centroids = []
    for label in unique_labels:
        centroids.append(np.mean(X[labels == label], axis=0))
    return np.array(centroids)

# Calculate centroids
centroids = calculate_centroids(X, labels)

# Calculate the minimum distance between any two centroids
def min_inter_cluster_distance(centroids):
    dist_matrix = pairwise_distances(centroids)
    np.fill_diagonal(dist_matrix, np.inf)  # Fill diagonal with infinite to ignore zero distance of clusters to themselves
    return np.min(dist_matrix)

# Get the minimum inter-cluster distance
min_distance = min_inter_cluster_distance(centroids)
print("Minimum Between-Cluster Distance:", min_distance)

Minimum Between-Cluster Distance: 278.0000296564606


### 3-Silhouette Coefficient

In [14]:
# Calculate the Silhouette Coefficient
silhouette_avg = silhouette_score(X, labels)
print("Silhouette Coefficient: ", silhouette_avg)

Silhouette Coefficient:  0.6166354826398824


## K-means clustering

In [15]:
# Instantiate the model
clustering = KMeans(n_clusters=2, init='k-means++', max_iter=300, n_init=10, random_state=0)
# Fit model
clustering.fit(X)



### Initial Silhouette Coefficient

In [16]:
# Extract the cluster labels
labels = clustering.labels_

In [17]:
# Calculate the Silhouette Coefficient
silhouette_avg = silhouette_score(X, labels)
print("Silhouette Coefficient: ", silhouette_avg)

Silhouette Coefficient:  0.6253209536141464


## Hyper-parameter tunning

#### The useful function for calculating silhouette score

In [18]:
def compute_silhouette_score(X, n_init, max_iter, tol):
    clustering = KMeans(n_clusters=2, n_init=n_init, max_iter=max_iter, tol=tol, random_state=0)
    cluster_labels = clustering.fit_predict(X)
    score = silhouette_score(X, cluster_labels)
    return score

#### Searching for the best hyper-parameter

In [19]:
n_init_values = [10, 20, 30]
max_iter_values = [300, 400, 500]
tol_values = [1e-4, 1e-3, 1e-2]

best_params = None
best_score = -1

for n_init in n_init_values:
    for max_iter in max_iter_values:
        for tol in tol_values:
            score = compute_silhouette_score(X, n_init, max_iter, tol)
            print(f"Silhouette score for n_init={n_init}, max_iter={max_iter}, tol={tol}: {score}")
            if score > best_score:
                best_score = score
                best_params = (n_init, max_iter, tol)




Silhouette score for n_init=10, max_iter=300, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=10, max_iter=300, tol=0.001: 0.6253126682897711
Silhouette score for n_init=10, max_iter=300, tol=0.01: 0.6253126682897711
Silhouette score for n_init=10, max_iter=400, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=10, max_iter=400, tol=0.001: 0.6253126682897711




Silhouette score for n_init=10, max_iter=400, tol=0.01: 0.6253126682897711
Silhouette score for n_init=10, max_iter=500, tol=0.0001: 0.6253209536141464




Silhouette score for n_init=10, max_iter=500, tol=0.001: 0.6253126682897711
Silhouette score for n_init=10, max_iter=500, tol=0.01: 0.6253126682897711
Silhouette score for n_init=20, max_iter=300, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=20, max_iter=300, tol=0.001: 0.6253126682897711




Silhouette score for n_init=20, max_iter=300, tol=0.01: 0.6253126682897711




Silhouette score for n_init=20, max_iter=400, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=20, max_iter=400, tol=0.001: 0.6253126682897711
Silhouette score for n_init=20, max_iter=400, tol=0.01: 0.6253126682897711
Silhouette score for n_init=20, max_iter=500, tol=0.0001: 0.6253209536141464




Silhouette score for n_init=20, max_iter=500, tol=0.001: 0.6253126682897711
Silhouette score for n_init=20, max_iter=500, tol=0.01: 0.6253126682897711
Silhouette score for n_init=30, max_iter=300, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=30, max_iter=300, tol=0.001: 0.6253126682897711




Silhouette score for n_init=30, max_iter=300, tol=0.01: 0.6253126682897711
Silhouette score for n_init=30, max_iter=400, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=30, max_iter=400, tol=0.001: 0.6253126682897711
Silhouette score for n_init=30, max_iter=400, tol=0.01: 0.6253126682897711




Silhouette score for n_init=30, max_iter=500, tol=0.0001: 0.6253209536141464
Silhouette score for n_init=30, max_iter=500, tol=0.001: 0.6253126682897711
Silhouette score for n_init=30, max_iter=500, tol=0.01: 0.6253126682897711


In [20]:
print(f"Best parameters: n_init={best_params[0]}, max_iter={best_params[1]}, tol={best_params[2]}")
print(f"Best silhouette score: {best_score}")

Best parameters: n_init=10, max_iter=300, tol=0.0001
Best silhouette score: 0.6253209536141464


### Try clustering with tuned hyper-parameter

In [21]:
# Instantiate the model
clustering = KMeans(n_clusters=2, n_init=10, max_iter=300, tol=0.0001, random_state=0)
# Fit model
clustering.fit(X)



## Evaluation metrics: <br>
We will use 1-average within cluster distance 2-Compactness, 3-Between-Cluster Distance and 4-Silhouette Coefficient

### 1-Average within cluster distance

In [22]:
# Extract the cluster labels
labels = clustering.labels_

In [24]:
# Function to calculate the average within-cluster distance
def average_within_cluster_distance(X, labels):
    unique_labels = np.unique(labels)
    average_distances = []
    
    for label in unique_labels:
        cluster_points = X[labels == label]
        # Calculate pairwise distances within the cluster
        if len(cluster_points) > 1:  # Ensure there are at least two points to calculate distance
            distances = pairwise_distances(cluster_points)
            avg_distance = np.sum(distances) / (2 * len(cluster_points))
            average_distances.append(avg_distance)
        else:
            average_distances.append(0)  # If only one point, distance is zero

    # Calculate the average of the averages for a global measure
    overall_average = np.mean(average_distances)
    return overall_average

# Calculate the average within-cluster distance
avg_distance = average_within_cluster_distance(X, labels)
print("Average Within-Cluster Distance:", avg_distance)


Average Within-Cluster Distance: 12883.301690276458


### 2-Compactness

In [25]:
clustering.inertia_

3581110.8163670027

### 3-Between-Cluster Distance

In [26]:
# Calculate cluster centroids
def calculate_centroids(X, labels):
    unique_labels = np.unique(labels)
    centroids = []
    for label in unique_labels:
        centroids.append(np.mean(X[labels == label], axis=0))
    return np.array(centroids)

# Calculate centroids
centroids = calculate_centroids(X, labels)

# Calculate the minimum distance between any two centroids
def min_inter_cluster_distance(centroids):
    dist_matrix = pairwise_distances(centroids)
    np.fill_diagonal(dist_matrix, np.inf)  # Fill diagonal with infinite to ignore zero distance of clusters to themselves
    return np.min(dist_matrix)

# Get the minimum inter-cluster distance
min_distance = min_inter_cluster_distance(centroids)
print("Minimum Between-Cluster Distance:", min_distance)

Minimum Between-Cluster Distance: 278.00007876369637


### 4-Silhouette Coefficient

In [27]:
# Calculate the Silhouette Coefficient
silhouette_avg = silhouette_score(X, labels)
print("Silhouette Coefficient: ", silhouette_avg)

Silhouette Coefficient:  0.6253209536141464
