In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
    
import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
import numpy as np
from tqdm import tqdm
import matplotlib.colors as mcolors
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
import helpers_datasets as hd
import helpers_clustering as hc
import pandas as pd
from functools import reduce
from collections import defaultdict
from scipy.spatial import distance_matrix

# Palette used by plot_clusters: every CSS4 colour value except the first
# entry of the mapping, shuffled once (unseeded) when this cell runs.
COLORS = np.random.permutation(
    [*mcolors.CSS4_COLORS.values()][1:]
)

def plot_clusters(X, y):
    """Scatter-plot points on the current pyplot axes, coloured by cluster label.

    Parameters
    ----------
    X : array indexable as ``X[row_indices, column]``
        Columns 0 and 1 are used as the horizontal and vertical coordinates.
    y : sequence of integer labels, one per row of ``X``
        Labels ``>= 0`` pick a colour from the module-level ``COLORS`` palette
        and are drawn as circles. Negative labels are drawn as black squares.

    Notes
    -----
    The colour index wraps around the palette, so a label that is greater
    than or equal to ``len(COLORS)`` reuses a colour instead of raising
    ``IndexError``.
    """
    palette_size = len(COLORS)
    cluster_inds = [i for i, yy in enumerate(y) if yy >= 0]
    clusterless_inds = [i for i, yy in enumerate(y) if yy < 0]
    # Modulo keeps the lookup in range when there are more clusters than colours.
    cluster_colors = [COLORS[y[i] % palette_size] for i in cluster_inds]
    plt.scatter(X[cluster_inds, 0], X[cluster_inds, 1], c=cluster_colors, marker="o")
    if len(clusterless_inds) > 0:
        plt.scatter(X[clusterless_inds, 0], X[clusterless_inds, 1], c="black", marker="s")

# The "seaborn-darkgrid" style was renamed to "seaborn-v0_8-darkgrid" in
# matplotlib 3.6 and the old name was removed in 3.8. Use whichever name this
# installation provides; fall back to the original name so that older
# behaviour (including its error, if any) is preserved.
_darkgrid_style = next(
    (name for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
     if name in plt.style.available),
    "seaborn-darkgrid",
)
plt.style.use(_darkgrid_style)

# Load the 2-D embedded dataset through the project helper, using the
# helper's UMAP_ALGO transform option.
X_train, X_test, y_train, y_test = hd.get_2d_mnist_dataset(
    num_train_images=10000, num_test_images=100, transform=hd.UMAP_ALGO)

# Side-by-side view of the labelled train and test points.
plt.figure(figsize=(15,8))
plt.subplot(1,2,1)
plt.title("Train")
plot_clusters(X_train, y_train)
plt.subplot(1,2,2)
plt.title("Test")
plot_clusters(X_test, y_test)

In [None]:
# --- Incoming maps -> relation -> predictions ---
# Value passed as `num_maps_per_cluster` to the incoming sampler below.
num_maps_per_cluster_incoming = 50
incoming_nonexpansive_maps = hc.sample_incoming_nonexpansive_map_from_two_points_same_cluster(
    X_train, X_test, num_maps_per_cluster=num_maps_per_cluster_incoming, y_train=y_train)
# Presumably builds a relation over the X_test.shape[0] test points from the
# sampled maps and the training labels -- confirm against helpers_clustering.
incoming_relation = hc.construct_relation_from_incoming_nonexpansive_maps(
    num_test_points=X_test.shape[0], incoming_nonexpansive_maps=incoming_nonexpansive_maps, y_train=y_train)
# Predictions from the incoming relation; plotted in a later cell under the
# "$Lan_G K(X, d_X)$" title.
lan_predictions = hc.generate_predictions_from_relation(X_test, incoming_relation)



# --- Outgoing maps -> relation -> predictions ---
# Values passed as `num_maps_per_cluster_pair` and `num_cluster_pairs` to the
# outgoing sampler below.
num_maps_per_cluster_pair_outgoing = 1
num_cluster_pairs = 5
# The outgoing sampler takes the incoming relation computed above as input.
outgoing_nonexpansive_maps = hc.sample_outgoing_nonexpansive_map_to_two_points_different_clusters(
    X_train=X_train,
    X_test=X_test,
    relation=incoming_relation,
    num_maps_per_cluster_pair=num_maps_per_cluster_pair_outgoing,
    y_train=y_train,
    num_cluster_pairs=num_cluster_pairs)
# The outgoing relation is constructed from the incoming relation plus the
# outgoing maps (exact semantics live in helpers_clustering -- TODO confirm).
outgoing_relation = hc.construct_relation_from_outgoing_nonexpansive_maps(
    incoming_relation, outgoing_nonexpansive_maps, y_train)
# Predictions from the outgoing relation; plotted in a later cell under the
# "$Ran_G K(X, d_X)$" title.
ran_predictions = hc.generate_predictions_from_relation(X_test, outgoing_relation)



In [None]:
def get_hyperopt_single_linkage_predictions(X, y_test, n_iter=10):
    """Return the single-linkage clustering of ``X`` that best matches ``y_test``.

    Sweeps ``n_iter`` evenly spaced distance thresholds between the smallest
    and largest entries of the pairwise distance matrix of ``X``, runs
    single-linkage agglomerative clustering at each threshold, and keeps the
    labelling with the highest adjusted Rand score against ``y_test``.

    Because the threshold is selected using the ground-truth labels, the
    result is a best-case baseline rather than an unsupervised prediction.

    Parameters
    ----------
    X : array-like of points, one per row
        Pairwise distances are computed with ``scipy.spatial.distance_matrix``.
    y_test : sequence of ground-truth labels, one per row of ``X``
    n_iter : int, default 10
        Number of thresholds to evaluate.

    Returns
    -------
    The ``labels_`` array of the best-scoring clustering. On ties the
    labelling from the smallest threshold is kept. ``None`` if no threshold
    was evaluated (``n_iter == 0``).
    """
    D = distance_matrix(X, X)

    # scikit-learn renamed the `affinity` keyword to `metric`: `affinity` was
    # deprecated in 1.2 and removed in 1.4, while `metric` does not exist
    # before 1.2. Probe once so the function works on both sides of the change.
    try:
        AgglomerativeClustering(metric="precomputed")
        precomputed_kwarg = {"metric": "precomputed"}
    except TypeError:
        precomputed_kwarg = {"affinity": "precomputed"}

    out = None
    best = -np.inf
    # The diagonal of D is zero, so the sweep starts at a threshold of 0.
    for delta in tqdm(np.linspace(np.min(D), np.max(D), n_iter)):
        single_linkage_predictions = AgglomerativeClustering(
            n_clusters=None,
            distance_threshold=delta,
            linkage="single",
            **precomputed_kwarg
        ).fit(D).labels_
        score = adjusted_rand_score(y_test, single_linkage_predictions)
        # Strict comparison: the first (smallest-threshold) best score wins ties.
        if score > best:
            best = score
            out = single_linkage_predictions
    return out


def _plot_scored_panel(position, heading, X, y_true, y_pred):
    """Draw one panel of the 2x2 comparison figure for a predicted clustering.

    The panel title shows ``heading``, the number of distinct predicted labels
    and the adjusted Rand score of ``y_pred`` against ``y_true``; the points
    ``X`` are then drawn coloured by ``y_pred``.
    """
    plt.subplot(2, 2, position)
    plt.title("{} \n Num Unique Clusters: {} \n Adjusted Rand Score with Ground Truth: {}".format(
        heading,
        len(set(y_pred)),
        adjusted_rand_score(y_true, y_pred)))
    plot_clusters(X, y_pred)


plt.figure(figsize=(15,15))

# Panel 1: ground-truth labels (no score to report).
plt.subplot(2,2,1)
plt.title("Ground Truth \n Num Unique Clusters: {}".format(len(set(y_test))))
plot_clusters(X_test, y_test)

# Panel 2: single linkage with the threshold chosen against the ground truth.
# The backslash in "\\delta" is escaped explicitly: "\d" is an invalid escape
# sequence in a normal string literal and triggers a warning on recent Pythons.
single_linkage_predictions = get_hyperopt_single_linkage_predictions(X_test, y_test, 10)
_plot_scored_panel(
    2, "$\\delta$-Single Linkage with Optimal $\\delta$",
    X_test, y_test, single_linkage_predictions)

# Panel 3: predictions derived from the incoming relation.
_plot_scored_panel(3, "$Lan_G K(X, d_X)$", X_test, y_test, lan_predictions)

# Panel 4: predictions derived from the outgoing relation.
_plot_scored_panel(4, "$Ran_G K(X, d_X)$", X_test, y_test, ran_predictions)
