## Agglomerative Hierarchical Clustering (HAC) Implementation

In [3]:

# Add Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.cluster import AgglomerativeClustering

In [5]:
# Root folder holding every CSV used below (relative to the working directory).
dataset_path = "./../datasets"

# Individual file locations; kept as module-level names because a later cell reads them.
iris_dataset_path = f"{dataset_path}/iris.csv"
ai_global_index_path = f"{dataset_path}/AI_index_db.csv"
global_earthquake_data_path = f"{dataset_path}/earthquakes.csv"

# Load each CSV once and key the resulting DataFrame by a short dataset name.
datasets = {
    label: pd.read_csv(csv_path)
    for label, csv_path in (
        ("iris", iris_dataset_path),
        ("ai_global_index", ai_global_index_path),
        ("global_earthquake", global_earthquake_data_path),
    )
}


In [None]:
# Read the three CSVs into standalone DataFrames, one per dataset, in the
# same order as the paths are listed.
iris_df, ai_global_index_df, global_earthquake_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        iris_dataset_path,
        ai_global_index_path,
        global_earthquake_data_path,
    )
)

### Custom HAC and BIRCH Implementations (based on our algorithm; see report/Part-1.pdf)

In [6]:
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn.cluster import AgglomerativeClustering

class Cluster:
    """A group of data points produced during agglomerative clustering.

    The only state is ``points``: the collection of member points handed to
    the constructor (stored as given, not copied).
    """

    def __init__(self, points):
        # Collection of member points; each point is expected to be array-like.
        self.points = points

    def merge(self, other_cluster):
        """Return a new Cluster holding the points of both clusters.

        Both point collections are converted to lists before joining, so the
        members are concatenated even when ``points`` is a NumPy array (where
        ``+`` would add element-wise, or fail on mismatched shapes, instead of
        concatenating). Neither input cluster is modified.
        """
        return Cluster(list(self.points) + list(other_cluster.points))

    def centroid(self):
        """Return the coordinate-wise mean of the member points."""
        return np.mean(self.points, axis=0)


def compute_distance(cluster1, cluster2, linkage='single'):
    """Return the linkage distance between two clusters.

    Every point of ``cluster1`` is paired with every point of ``cluster2``,
    the Euclidean distance of each pair is computed, and the pairwise
    distances are reduced according to ``linkage``.

    Parameters
    ----------
    cluster1, cluster2 : objects exposing a ``points`` iterable of points.
    linkage : {'single', 'complete', 'average'}
        'single' takes the minimum pairwise distance, 'complete' the maximum,
        'average' the mean.

    Returns
    -------
    The reduced distance as a float-like scalar.

    Raises
    ------
    ValueError
        If ``linkage`` is not one of the supported names. Previously an
        unknown name fell through and returned ``None``, which only failed
        later when the caller compared it with a float.
    """
    reducers = {'single': min, 'complete': max, 'average': np.mean}
    if linkage not in reducers:
        raise ValueError(
            f"Unsupported linkage {linkage!r}; expected one of "
            "'single', 'complete', 'average'"
        )

    pairwise = [
        euclidean(p1, p2)
        for p1 in cluster1.points
        for p2 in cluster2.points
    ]
    return reducers[linkage](pairwise)


def hac_custom(data, k, linkage='single'):
    """Cluster ``data`` bottom-up until only ``k`` clusters remain.

    Each point starts in its own cluster. On every round all pairs of
    clusters are compared with ``compute_distance`` and the closest pair is
    merged; ties keep the first pair encountered in ``(i, j)`` order with
    ``i < j``. All pairwise distances are recomputed on every round.

    Parameters
    ----------
    data : sized iterable of points (e.g. a 2-D NumPy array, one row per point).
    k : int
        Number of clusters to stop at; must be at least 1.
    linkage : str
        Linkage name forwarded unchanged to ``compute_distance``.

    Returns
    -------
    (clusters, labels)
        ``clusters`` is the list of remaining ``Cluster`` objects and
        ``labels`` is an integer array with ``labels[i]`` equal to the
        position in ``clusters`` of the cluster containing the i-th point.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1, or if no pair of clusters has a distance
        below infinity (for example when the data contains NaN or inf), so no
        merge candidate can be chosen.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")

    clusters = [Cluster([point]) for point in data]
    n_points = len(clusters)

    # members[i] lists the row positions of `data` that belong to clusters[i].
    # Tracking positions directly (instead of searching `data` for each point
    # by value afterwards) keeps labels correct when `data` contains duplicate
    # rows and also works when `data` is not a NumPy array.
    members = [[row] for row in range(n_points)]

    while len(clusters) > k:
        min_distance = float('inf')
        to_merge = (None, None)

        for i in range(len(clusters)):
            for j in range(i + 1, len(clusters)):
                distance = compute_distance(clusters[i], clusters[j], linkage)
                if distance < min_distance:
                    min_distance = distance
                    to_merge = (i, j)

        first, second = to_merge
        if first is None:
            # Every comparison against +inf failed, so nothing was selected.
            raise ValueError(
                "Could not find a pair of clusters to merge; "
                "check the data for NaN or infinite values"
            )

        merged_cluster = clusters[first].merge(clusters[second])
        merged_rows = members[first] + members[second]

        # Drop the two merged entries and append the merged one at the end,
        # keeping `clusters` and `members` aligned position by position.
        clusters = [c for idx, c in enumerate(clusters) if idx not in to_merge]
        members = [m for idx, m in enumerate(members) if idx not in to_merge]
        clusters.append(merged_cluster)
        members.append(merged_rows)

    labels = np.zeros(n_points, dtype=int)
    for cluster_idx, rows in enumerate(members):
        labels[rows] = cluster_idx

    return clusters, labels


if __name__ == "__main__":
    # Toy dataset: three points at x = 1 and three points at x = 10.
    data = np.array([[x, y] for x in (1, 10) for y in (2, 4, 0)])

    # Target number of clusters
    k = 2

    # Average-linkage run of the custom HAC; only the labels are printed.
    clusters, labels = hac_custom(data=data, k=k, linkage='average')

    print("Cluster Labels:", labels)


Cluster Labels: [0 0 0 1 1 1]


In [11]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import Birch as SklearnBIRCH
from sklearn.metrics import adjusted_rand_score
from scipy.spatial.distance import euclidean

# Stand-in for the custom BIRCH implementation; replace the body with the real algorithm.
def birch_clustering(X, threshold, branching_factor, n_clusters, random_state=None):
    """Placeholder for the custom BIRCH clustering routine.

    This stub does not build a CF tree and ignores ``threshold`` and
    ``branching_factor``; it only returns random centroids with values in
    ``[0, 1)`` so that the surrounding comparison code can run.

    Parameters
    ----------
    X : array with a ``shape`` attribute; only ``X.shape[1]`` (the number of
        features) is used.
    threshold, branching_factor : accepted for interface compatibility, unused.
    n_clusters : int
        Number of centroids to generate.
    random_state : int or None, optional
        Seed for reproducible centroids. When ``None`` (the default) the
        centroids are drawn from NumPy's global random state, exactly as
        before this parameter existed.

    Returns
    -------
    (None, centroids)
        ``None`` stands in for the CF tree; ``centroids`` is an array of
        shape ``(n_clusters, X.shape[1])``.
    """
    n_features = X.shape[1]
    if random_state is None:
        # Default path: keep drawing from the global NumPy generator.
        centroids = np.random.rand(n_clusters, n_features)
    else:
        centroids = np.random.RandomState(random_state).rand(n_clusters, n_features)
    return None, centroids

# Seed NumPy's global generator so the synthetic datasets below (and anything
# else in this cell that draws from np.random) are reproducible across runs.
np.random.seed(42)

# Hyper-parameters shared by the custom and the sklearn runs; they do not
# change between datasets, so they are set once instead of on every iteration.
threshold = 0.01  # Adjust as needed
branching_factor = 10  # Adjust as needed
n_clusters = 3  # Adjust as needed

# ARI per dataset name, collected while looping.
ari_by_dataset = {}

# Sample datasets (replace with actual data loading).
# NOTE: this rebinds the name `datasets`, replacing any value it held before.
datasets = {
    "dataset1": pd.DataFrame(np.random.rand(100, 5)),
    "dataset2": pd.DataFrame(np.random.rand(200, 5))
}

for name, df in datasets.items():
    print(f"Dataset: {name}, Type: {type(df)}")

    df = df.dropna()

    # Extract numerical features
    X = df.select_dtypes(include=[np.number]).values

    # Normalize the data
    X = StandardScaler().fit_transform(X)

    # Run the custom BIRCH implementation
    cf_tree, custom_centroids = birch_clustering(X, threshold, branching_factor, n_clusters)
    custom_centroids = np.asarray(custom_centroids)

    # Assign every point to its closest centroid (Euclidean distance).
    # The pairwise distance matrix has shape (n_points, n_centroids); argmin
    # yields integer labels, whereas the previous zeros-initialised array
    # stored them as floats.
    point_to_centroid = np.linalg.norm(
        X[:, None, :] - custom_centroids[None, :, :], axis=2
    )
    custom_labels = point_to_centroid.argmin(axis=1)

    print(f"Custom BIRCH Centroids for {name}:")
    for i, centroid in enumerate(custom_centroids):
        print(f"Cluster {i + 1}: {centroid}")

    print(f"\nCustom BIRCH Labels for {name}: {custom_labels}")

    # Run the sklearn BIRCH implementation
    sklearn_birch = SklearnBIRCH(threshold=threshold, branching_factor=branching_factor, n_clusters=n_clusters)
    sklearn_labels = sklearn_birch.fit_predict(X)

    print(f"Sklearn BIRCH Labels for {name}: {sklearn_labels}")

    # Compare the results using Adjusted Rand Index (ARI)
    ari_score = adjusted_rand_score(custom_labels, sklearn_labels)
    ari_by_dataset[name] = ari_score
    print(f"Adjusted Rand Index (ARI) for {name}: {ari_score}")

# Store results
results = pd.Series(ari_by_dataset)

# Create the output folder first so to_csv does not fail on a fresh checkout.
# `os` comes from the notebook's first import cell.
results_dir = "./../results"
os.makedirs(results_dir, exist_ok=True)
results.to_csv(f"{results_dir}/birch_comparison.csv", header=False)


Dataset: dataset1, Type: <class 'pandas.core.frame.DataFrame'>


TypeError: '<' not supported between instances of 'NoneType' and 'float'