In [1]:
%pip install umap-learn


Collecting umap-learn
  Downloading umap_learn-0.5.7-py3-none-any.whl.metadata (21 kB)
Collecting numba>=0.51.2 (from umap-learn)
  Downloading numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting pynndescent>=0.5 (from umap-learn)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Collecting llvmlite<0.44,>=0.43.0dev0 (from numba>=0.51.2->umap-learn)
  Downloading llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading umap_learn-0.5.7-py3-none-any.whl (88 kB)
Downloading numba-0.60.0-cp39-cp39-macosx_11_0_arm64.whl (2.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading pynndescent-0.5.13-py3-none-any.whl (56 kB)
Downloading llvmlite-0.43.0-cp39-cp39-macosx_11_0_arm64.whl (28.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h

In [4]:
import os
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN
from scipy.ndimage import gaussian_filter
import umap
from collections import Counter
from dotenv import load_dotenv

load_dotenv()

True

In [13]:
import re


def load_data(absolute_file_path):
    """Load a comma-separated dataset whose first column is the class label.

    Every non-blank line is split on commas; the first field becomes the
    label and the remaining fields become the feature row. Blank lines
    (for example an empty line at the end of the file) are skipped.

    Parameters:
    absolute_file_path: path of the text file to read.

    Returns:
    a tuple (data, labels) where `data` is a float32 array holding one row of
    features per record and `labels` is an int32 array holding the first
    field of each record.
    """
    feature_rows = []
    raw_labels = []
    with open(absolute_file_path, 'r') as f:
        for line in f:
            record = line.strip()
            # A blank line would produce an empty feature row (ragged array)
            # and a label that cannot be parsed as an integer.
            if not record:
                continue
            label, *values = record.split(',')
            raw_labels.append(label)
            feature_rows.append(values)

    data = np.array(feature_rows).astype(np.float32)
    labels = np.array(raw_labels).astype(np.int32)
    print("Labels shape: ", labels.shape)
    print("Data shape: ", data.shape)
    return data, labels


def preprocess_images(data, sigma=0.75):
    """Smooth each 28x28 image with a Gaussian filter, then standardize the features.

    Parameters:
    data: array that can be reshaped to (-1, 28, 28), i.e. one flattened
        28x28 image per row.
    sigma: standard deviation of the Gaussian kernel applied along the two
        image axes (defaults to 0.75).

    Returns:
    an array of shape (n_images, 784) after smoothing and `StandardScaler`
    standardization.
    """
    images = data.reshape(-1, 28, 28)
    # sigma is given per axis: 0 on the leading (sample) axis so that
    # neighbouring images are never blended into each other; only the two
    # spatial axes of every image are smoothed.
    smoothed = gaussian_filter(images, sigma=(0, sigma, sigma))
    flattened = smoothed.reshape(-1, 28 * 28)
    scaler = StandardScaler()
    return scaler.fit_transform(flattened)


def process_data_tsne(data, random_state=None):
    """Reduce dimensionality with PCA (100 components) followed by UMAP (20 components).

    The function name is historical: the second stage is UMAP, not t-SNE.

    Parameters:
    data: 2-D feature array to reduce.
    random_state: optional seed forwarded to both PCA and UMAP so that the
        embedding can be reproduced; `None` keeps the libraries' default
        (unseeded) behaviour.

    Returns:
    the UMAP embedding of the PCA-reduced data.
    """
    pca = PCA(n_components=100, random_state=random_state)
    pca_reduced = pca.fit_transform(data)
    print("Data shape after PCA: ", pca_reduced.shape)
    reducer = umap.UMAP(n_components=20, random_state=random_state)
    umap_reduced = reducer.fit_transform(pca_reduced)
    # The message previously said "t-SNE" although the reduction is UMAP.
    print("Data shape after UMAP: ", umap_reduced.shape)
    return umap_reduced


def perform_clustering(data, random_state=None):
    """Cluster `data` into 10 groups with KMeans, fitted on DBSCAN non-noise points.

    DBSCAN (eps=3, min_samples=9) is used only to identify noise points.
    KMeans is fitted on the remaining points. Noise points are then assigned
    to their nearest KMeans centroid, so the returned array always has one
    label per input row and stays aligned with the caller's ground-truth
    labels.

    Parameters:
    data: 2-D feature array, one row per sample.
    random_state: optional seed forwarded to KMeans; `None` keeps the
        default (unseeded) behaviour.

    Returns:
    an array of length `len(data)` with the KMeans cluster index of each row.
    """
    # Step 1: use DBSCAN to identify noise points.
    db = DBSCAN(eps=3, min_samples=9).fit(data)
    print("DBSCAN labels:", np.unique(db.labels_, return_counts=True))

    not_noise = db.labels_ != -1
    is_noise = ~not_noise
    print("Number of noise points: ", int(np.count_nonzero(is_noise)))

    # Step 2: fit KMeans on the clean points only, so noise cannot pull centroids.
    kmeans = KMeans(n_clusters=10, n_init='auto', random_state=random_state)
    kmeans.fit(data[not_noise])

    # Step 3: build a label array covering every input row. Clean points keep
    # their fitted label; noise points get the nearest centroid's label.
    clustered_labels = np.empty(len(data), dtype=kmeans.labels_.dtype)
    clustered_labels[not_noise] = kmeans.labels_
    if is_noise.any():
        clustered_labels[is_noise] = kmeans.predict(data[is_noise])

    print("Clustered labels shape: ", clustered_labels.shape)
    print("Max label: ", np.max(clustered_labels))
    print("Min label: ", np.min(clustered_labels))
    return clustered_labels


def evaluate(true_labels: np.ndarray, pred_labels: np.ndarray) -> tuple:
    """Entropy-based evaluation of a label assignment.

    Parameters:
    true_labels: the ground-truth class labels on the input data.
    pred_labels: the predicted class (cluster) labels on the input data. The
        number of distinct predicted labels may differ from the number of
        distinct true labels.

    Returns:
    a tuple (CM, (cs_e, cr_e, we)) containing the confusion matrix `CM`
    (rows: true classes, columns: predicted clusters, both in sorted label
    order), the count-weighted average class entropy `cs_e`, the
    count-weighted average cluster entropy `cr_e`, and the array
    `we = [cs_e, cr_e]`.
    """
    from scipy.stats import entropy

    assert len(true_labels) == len(pred_labels), "Label predictions don't match"
    print("true", len(true_labels))
    print("pred", len(pred_labels))
    ## Map the labels to index sets {0, ..., k_true - 1} and {0, ..., k_pred - 1}
    t_classes, t_labels = np.unique(true_labels, return_inverse=True)
    p_classes, p_labels = np.unique(pred_labels, return_inverse=True)

    ## Accumulate the counts. The matrix is rectangular so that a clustering
    ## with more (or fewer) clusters than true classes is handled correctly.
    CM = np.zeros(shape=(len(t_classes), len(p_classes)), dtype=np.uint32)
    np.add.at(CM, (t_labels, p_labels), 1)

    ## Compute the entropy of the empirical row/column distributions
    ## (`entropy` normalizes the counts along the given axis).
    counts = CM.astype(float)
    cluster_entropy = entropy(counts, base=2, axis=0)
    class_entropy = entropy(counts, base=2, axis=1)

    ## Average w/ count weights
    w_cluster_entropy = np.sum(cluster_entropy * CM.sum(axis=0)) / len(true_labels)
    w_class_entropy = np.sum(class_entropy * CM.sum(axis=1)) / len(true_labels)
    w_entropies = np.array([w_class_entropy, w_cluster_entropy])

    with np.printoptions(precision=3):
        print(f"Class Entropies: {class_entropy}")
        print(f"Cluster Entropies: {cluster_entropy}")
        print(f"Weighted average entropies: {w_entropies}, (avg: {np.mean(w_entropies):.3f})")
    return CM, (w_class_entropy, w_cluster_entropy, w_entropies)


def main():
    """Run the full pipeline: load, preprocess, reduce, cluster, and evaluate.

    The dataset directory is read from the `DATASET_PATH` environment
    variable; the file `pb1data_XW_8358.txt` is expected inside it.

    Raises:
    RuntimeError: if `DATASET_PATH` is not set.
    """
    dataset_dir = os.environ.get('DATASET_PATH')
    if dataset_dir is None:
        # Fail with an actionable message instead of "NoneType + str".
        raise RuntimeError("DATASET_PATH environment variable is not set")
    # os.path.join works whether or not DATASET_PATH ends with a separator.
    file_path = os.path.join(dataset_dir, "pb1data_XW_8358.txt")

    data, labels = load_data(file_path)
    data = preprocess_images(data)
    embedding = process_data_tsne(data)
    clustered_labels = perform_clustering(embedding)

    counts = Counter(clustered_labels)
    print("Cluster size distribution:")
    for c in sorted(counts):
        print(f"Cluster {c}: {counts[c]} samples")

    CM, (cs_e, cr_e, we) = evaluate(labels, clustered_labels)
    print("Confusion Matrix: \n", CM)
    print("Weighted Class Entropies: ", cs_e)
    print("Weighted Cluster Entropies: ", cr_e)
    print("Weighted Entropies: ", we)


# Run the whole pipeline when this cell (or the exported script) is executed directly.
if __name__ == "__main__":
    main()

Labels shape:  (8358,)
Data shape:  (8358, 784)
Data shape after PCA:  (8358, 100)
Data shape after t-SNE:  (8358, 20)
DBSCAN labels: (array([0, 1, 2, 3, 4]), array([ 670, 3208,  731, 1585, 2164]))
Number of noise points:  0
Clustered labels shape:  (8358,)
Max label:  9
Min label:  0
Cluster size distribution:
Cluster 0: 899 samples
Cluster 1: 1029 samples
Cluster 2: 1143 samples
Cluster 3: 670 samples
Cluster 4: 731 samples
Cluster 5: 1021 samples
Cluster 6: 490 samples
Cluster 7: 1206 samples
Cluster 8: 686 samples
Cluster 9: 483 samples
true 8358
pred 8358
Class Entropies: [0.045 1.    0.134 0.12  0.051 0.174 0.042 0.088 0.35  0.873]
Cluster Entropies: [0.045 0.198 0.694 0.016 0.076 1.069 0.134 0.128 0.047 0.16 ]
Weighted average entropies: [0.348 0.302], (avg: 0.325)
Confusion Matrix: 
 [[   0    0    0  669    2    0    0    1    0    0]
 [ 895    0    0    0    0    0    2    0  683    0]
 [   0    2    1    0    0    0  483    3    1    0]
 [   0 1005    1    0    0    1    1  