## Agglomerative Hierarchy Algorithm Implementation

In [13]:

# Add Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering as SklearnAgglomerative
from sklearn.metrics import adjusted_rand_score
from scipy.spatial.distance import euclidean

In [20]:
dataset_path = "./../datasets"

iris_dataset_path = dataset_path + "/iris.csv"                                         
ai_global_index_path = dataset_path + "/AI_index_db.csv"
global_earthquake_data_path = dataset_path + "/earthquakes.csv"

datasets = {
    "iris": pd.read_csv(iris_dataset_path),
    "ai_global_index": pd.read_csv(ai_global_index_path),
    "global_earthquake": pd.read_csv(global_earthquake_data_path)
}


In [19]:
iris_df = pd.read_csv(iris_dataset_path)
ai_global_index_df = pd.read_csv(ai_global_index_path)
global_earthquake_data_df = pd.read_csv(global_earthquake_data_path)

In [22]:
import os

# Check if the paths exist
print(os.path.exists(iris_dataset_path))      # Should be True
print(os.path.exists(ai_global_index_path))  # Should be True
print(os.path.exists(global_earthquake_data_path))  # Should be True


True
True
True


### BIRCH Implementation (Based on our Algorithm - see report/Part-1.pdf)

In [6]:
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.cluster import AgglomerativeClustering

class Cluster:
    def __init__(self, points):
        self.points = points

    def merge(self, other_cluster):
        return Cluster(self.points + other_cluster.points)

    def centroid(self):
        return np.mean(self.points, axis=0)


def compute_distance(cluster1, cluster2, linkage='single'):
    if linkage == 'single':
        return min(euclidean(p1, p2) for p1 in cluster1.points for p2 in cluster2.points)
    elif linkage == 'complete':
        return max(euclidean(p1, p2) for p1 in cluster1.points for p2 in cluster2.points)
    elif linkage == 'average':
        distances = [euclidean(p1, p2) for p1 in cluster1.points for p2 in cluster2.points]
        return np.mean(distances)


def hac_custom(data, k, linkage='single'):
    clusters = [Cluster([point]) for point in data]

    while len(clusters) > k:
        min_distance = float('inf')
        to_merge = (None, None)

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance = compute_distance(clusters[i], clusters[j], linkage)
                if distance < min_distance:
                    min_distance = distance
                    to_merge = (i, j)

        cluster1, cluster2 = to_merge
        new_cluster = clusters[cluster1].merge(clusters[cluster2])
        clusters = [c for idx, c in enumerate(clusters) if idx not in (cluster1, cluster2)]
        clusters.append(new_cluster)

    labels = np.zeros(len(data), dtype=int)
    for cluster_idx, cluster in enumerate(clusters):
        for point in cluster.points:
            point_index = np.where((data == point).all(axis=1))[0][0]
            labels[point_index] = cluster_idx

    return clusters, labels


if __name__ == "__main__":
    # Sample dataset
    data = np.array([
        [1, 2], [1, 4], [1, 0],
        [10, 2], [10, 4], [10, 0]
    ])

    # Number of clusters
    k = 2

    # Run custom HAC
    clusters, labels = hac_custom(data, k, linkage='average')

    print("Cluster Labels:", labels)


Cluster Labels: [0 0 0 1 1 1]


In [23]:
results = {}

for name, df in datasets.items():
    print(f"{name} dataset shape: {df.shape}")
    # Drop missing values
    df = df.dropna()

    # Extract numerical features
    X = df.select_dtypes(include=[np.number]).values

    # Normalize the data
    X = StandardScaler().fit_transform(X)

    # Number of clusters
    k = 3  # Adjust as needed

    # Run the custom HAC implementation
    try:
        custom_output = hac_custom(X, k, linkage='average')
        if isinstance(custom_output, tuple):
            custom_labels = custom_output[1]  # Extract the cluster labels
        else:
            custom_labels = custom_output

        print(f"Custom HAC Labels for {name}: {custom_labels}")
    except Exception as e:
        print(f"Error in custom HAC for {name}: {e}")
        continue

    # Run the sklearn HAC implementation
    try:
        sklearn_hac = SklearnAgglomerative(n_clusters=k, linkage='average')
        sklearn_labels = sklearn_hac.fit_predict(X)
        print(f"Sklearn HAC Labels for {name}: {sklearn_labels}")
    except Exception as e:
        print(f"Error in sklearn HAC for {name}: {e}")
        continue

    # Compare the results using Adjusted Rand Index (ARI)
    if custom_labels is not None and sklearn_labels is not None and len(custom_labels) == len(sklearn_labels):
        ari_score = adjusted_rand_score(custom_labels, sklearn_labels)
        results[name] = ari_score
        print(f"Adjusted Rand Index (ARI) for {name}: {ari_score}")
    else:
        print(f"Invalid clustering for {name}.")

# Store results
results = pd.Series(results)
results.to_csv("./../results/hac_comparison.csv", header=False)


iris dataset shape: (150, 5)
Custom HAC Labels for iris: [2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2
 0 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1
 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1
 1 1]
Sklearn HAC Labels for iris: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1
 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1]
Adjusted Rand Index (ARI) for iris: 0.963629008041949
ai_global_index dataset shape: (62, 13)
Custom HAC Labels for ai_global_index: [0 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2]
Sklearn HAC Labels for ai_global_index: [1