In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

from collections import namedtuple, defaultdict
from functools import reduce

import numpy as np
import random
import matplotlib.pyplot as plt
from sklearn.metrics import adjusted_rand_score
import numpy as np
from tqdm import tqdm
import matplotlib.colors as mcolors
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import adjusted_rand_score
import pandas as pd


import helpers_datasets as hd
import helpers_clustering as hc

# Per-experiment record: one adjusted Rand score for each of the three
# clustering methods compared in this notebook.
Result = namedtuple(
    "Result",
    ["lan_score", "ran_score", "single_linkage_score"],
)

# Source of (X_train, y_train, X_test, y_test) samples, obtained through
# provider.get_transformed_sample().
provider = hd.TransformedSampleProvider()

# Run Experiment

In [None]:

# Number of independent train/test samples to evaluate.
NUM_EXPERIMENTS = 100

# Start from an empty list every time this cell runs.  Without this
# initialisation `results.append` raises NameError on a fresh kernel, and
# re-running the cell would mix scores from earlier runs into `results`.
results = []
for i in range(NUM_EXPERIMENTS):
    # Get one train/test sample from the provider.
    X_train, y_train, X_test, y_test = provider.get_transformed_sample()

    # Test-set predictions from each method, all given the same training data.
    lan_predictions, ran_predictions = hc.get_ran_and_lan_predictions(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test)
    single_linkage_predictions = hc.get_hyperopt_single_linkage_predictions(
        X_train=X_train,
        y_train=y_train,
        X_test=X_test)

    # Score every method against the test labels with the adjusted Rand index.
    result = Result(
        lan_score=adjusted_rand_score(y_test, lan_predictions),
        ran_score=adjusted_rand_score(y_test, ran_predictions),
        single_linkage_score=adjusted_rand_score(y_test, single_linkage_predictions),
    )
    print(result)
    results.append(result)
    
    


In [None]:
def _win_proportion_and_sem(wins, losses):
    """Return ``(proportion, sem)`` for ``wins`` out of ``wins + losses`` trials.

    ``proportion`` is ``wins / (wins + losses)`` and ``sem`` is
    ``sqrt(proportion * (1 - proportion) / (wins + losses))``.

    When there are no decisive trials (``wins + losses == 0``) both values are
    NaN, rather than the division raising ZeroDivisionError.
    """
    decisive = wins + losses
    if decisive == 0:
        return float("nan"), float("nan")
    proportion = wins / decisive
    return proportion, np.sqrt((proportion * (1 - proportion)) / decisive)


# Count strict wins in each direction.  Experiments where the two scores are
# equal match neither comparison, so ties are left out of the denominators.
ct_ran_beats_slink = sum(1 for r in results if r.ran_score > r.single_linkage_score)
ct_lan_beats_slink = sum(1 for r in results if r.lan_score > r.single_linkage_score)
ct_slink_beats_ran = sum(1 for r in results if r.ran_score < r.single_linkage_score)
ct_slink_beats_lan = sum(1 for r in results if r.lan_score < r.single_linkage_score)


prop_lan_beats_slink, sem_lan_beats_slink = _win_proportion_and_sem(
    ct_lan_beats_slink, ct_slink_beats_lan)
print("Lan Beats Slink: {} SEM: {}".format(prop_lan_beats_slink, sem_lan_beats_slink))


prop_ran_beats_slink, sem_ran_beats_slink = _win_proportion_and_sem(
    ct_ran_beats_slink, ct_slink_beats_ran)
print("Ran Beats Slink: {} SEM: {}".format(prop_ran_beats_slink, sem_ran_beats_slink))

# Plot an Example

In [None]:
# Randomly ordered palette built from the CSS4 colour values, with the first
# entry dropped; plot_clusters indexes it by cluster label.
COLORS = np.random.permutation([*mcolors.CSS4_COLORS.values()][1:])

def plot_clusters(X, y):
    """Scatter-plot columns 0 and 1 of ``X`` on the current axes, coloured by label.

    Points whose label is non-negative are drawn as circles in the colour
    ``COLORS[label % len(COLORS)]``.  Points with a negative label are treated
    as clusterless and drawn as black squares.

    Parameters
    ----------
    X : array supporting ``X[row_indices, column]`` indexing (e.g. a 2-D numpy
        array) with at least two columns.
    y : sequence of integer cluster labels, one per row of ``X``.
    """
    palette_size = len(COLORS)
    cluster_inds = [i for i, label in enumerate(y) if label >= 0]
    clusterless_inds = [i for i, label in enumerate(y) if label < 0]

    # Wrap labels around the palette so that a labelling with more clusters
    # than there are colours reuses colours instead of raising IndexError.
    point_colors = [COLORS[int(y[i]) % palette_size] for i in cluster_inds]
    plt.scatter(X[cluster_inds, 0], X[cluster_inds, 1], c=point_colors, marker="o")

    if clusterless_inds:
        plt.scatter(X[clusterless_inds, 0], X[clusterless_inds, 1], c="black", marker="s")


X_train, y_train, X_test, y_test = provider.get_transformed_sample()

# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in
# matplotlib 3.6 and the old names were later removed.  Use whichever name
# this matplotlib provides; if neither exists, keep the current style.
darkgrid_style = next(
    (name for name in ("seaborn-v0_8-darkgrid", "seaborn-darkgrid")
     if name in plt.style.available),
    None)
if darkgrid_style is not None:
    plt.style.use(darkgrid_style)

# Side-by-side view of one sample: training split left, test split right.
plt.figure(figsize=(15,8))
plt.subplot(1,2,1)
plt.title("Train")
plot_clusters(X_train, y_train)
plt.subplot(1,2,2)
plt.title("Test")
plot_clusters(X_test, y_test)

In [None]:
X_train, y_train, X_test, y_test = provider.get_transformed_sample()
single_linkage_predictions = hc.get_hyperopt_single_linkage_predictions(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test)
# NOTE(review): num_cluster_pairs=1 is passed here but not in the experiment
# loop, which relies on the helper's default -- confirm this is intended.
lan_predictions, ran_predictions = hc.get_ran_and_lan_predictions(
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    num_cluster_pairs=1)

# Method label and test-set predictions for each scored panel.  Backslashes
# are doubled because "\d" is not a valid Python escape sequence and triggers
# a warning on current Python versions; the resulting title text is unchanged.
scored_panels = [
    ("$\\delta$-Single Linkage with Optimal $\\delta$", single_linkage_predictions),
    ("$Lan_G K_L(X, d_X)$", lan_predictions),
    ("$Ran_G K_R(X, d_X)$", ran_predictions),
]

plt.figure(figsize=(15,15))

# Panel 1: the test labels themselves.
plt.subplot(2,2,1)
plt.title("Ground Truth \n Num Unique Clusters: {}".format(len(set(y_test))))
plot_clusters(X_test, y_test)

# Panels 2-4: each method's predicted clustering of the test points, titled
# with its cluster count and adjusted Rand score against the test labels.
for panel_index, (method_label, predictions) in enumerate(scored_panels, start=2):
    plt.subplot(2,2,panel_index)
    plt.title(
        "{} \n Num Unique Clusters: {} \n Adjusted Rand Score with Ground Truth: {:.4g}".format(
            method_label,
            len(set(predictions)),
            adjusted_rand_score(y_test, predictions)))
    plot_clusters(X_test, predictions)
