In [1]:
# Scikit-learn reference links:

# https://scikit-learn.org/stable/modules/clustering.html
# https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

In [None]:
# Import Modules

import time
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.preprocessing import StandardScaler
from itertools import cycle, islice

# Seed NumPy's global random number generator so that any draws from
# np.random in later cells are repeatable after Restart Kernel -> Run All.
np.random.seed(0)

In [None]:
# Generate Data Sets
#
# Every generator below receives an explicit seed, so re-running this cell
# on its own reproduces exactly the same data instead of depending on how
# far the global NumPy random state has already been advanced.

n_samples = 1500
random_state1 = 170
random_state2 = 8

# Circles

noisy_circles = datasets.make_circles(n_samples=n_samples, factor=0.5, noise=0.05,
                                      random_state=random_state1)

# Moons

noisy_moons = datasets.make_moons(n_samples=n_samples, noise=0.05,
                                  random_state=random_state1)

# Blobs (with varied variances)

varied = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5],
                             random_state=random_state1)

# Blobs (anisotropically distributed): regular blobs pushed through a fixed
# linear transformation.

X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state1)

transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)

aniso = (X_aniso, y)

# Blobs (with equal variances)

blobs = datasets.make_blobs(n_samples=n_samples, random_state=random_state2)

# Random 2D Data (uniform noise; there are no labels, hence the None)

no_structure = np.random.RandomState(0).rand(n_samples, 2), None

# View Random 2D Data
# A dedicated name is used for the scaled preview so that X / y keep
# referring to the blob data bound above.

plt.figure()
X_preview = StandardScaler().fit_transform(no_structure[0])
plt.scatter(X_preview[:, 0], X_preview[:, 1], s=5)

In [None]:
# Set up cluster parameters
#
# Fits four agglomerative-clustering linkages (single, average, complete,
# ward) on every prepared data set and draws the results as a grid of
# scatter plots: one row per data set, one column per linkage.

# Plot init

plt.figure(figsize=(9 * 1.3 + 2, 14.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01)
plot_num = 1

# Default parameters; each data set may override individual entries.
# NOTE: only 'n_clusters' is read by this cell. 'n_neighbors' is carried
# along in the dictionaries but is not used below.

params_base = {'n_neighbors': 10,
               'n_clusters': 3}

# List with tuples (data set, parameter overrides).
# Deliberately NOT named `datasets`: that name is the imported
# sklearn.datasets module, and rebinding it would break a re-run of the
# data-generation cell.

dataset_configs = [
    (noisy_circles, {'n_clusters': 2}),
    (noisy_moons, {'n_clusters': 2}),
    (varied, {'n_neighbors': 2}),
    (aniso, {'n_neighbors': 2}),
    (blobs, {}),
    (no_structure, {})]

# Color palette, cycled when there are more clusters than colors.
palette = ['#377eb8', '#ff7f00', '#4daf4a',
           '#f781bf', '#a65628', '#984ea3',
           '#999999', '#e41a1c', '#dede00']

# Iterate over the list of data sets and parameters

for i_dataset, (dataset, algorithm_parameters) in enumerate(dataset_configs):

    # Start from the defaults and apply the dataset-specific overrides
    params = params_base.copy()
    params.update(algorithm_parameters)

    # Get the data set and standardize its features
    X, y = dataset
    X = StandardScaler().fit_transform(X)

    # Create cluster objects (one per linkage criterion)
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward')
    complete = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='complete')
    average = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='average')
    single = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='single')

    # Tuple of (algorithm name, estimator); the order defines the plot columns
    clustering_algorithms = (
        ('Single Linkage', single),
        ('Average Linkage', average),
        ('Complete Linkage', complete),
        ('Ward Linkage', ward),
    )

    # Iterate over each algorithm for the current data set
    for name, algorithm in clustering_algorithms:

        # perf_counter is a monotonic clock intended for measuring intervals
        t0 = time.perf_counter()

        # Silence the UserWarning about a disconnected connectivity matrix
        # for the duration of the fit only
        with warnings.catch_warnings():

            warnings.filterwarnings(
                "ignore",
                message="the number of connected components of the " +
                "connectivity matrix is [0-9]{1,2}" +
                " > 1. Completing it to avoid stopping the tree early.",
                category=UserWarning)

            # Fit algorithm to data
            algorithm.fit(X)

        t1 = time.perf_counter()

        # Obtain the cluster labels. The builtin int replaces the np.int
        # alias, which no longer exists in current NumPy releases.
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(int)
        else:
            y_pred = algorithm.predict(X)

        # Select which subplot to draw into
        plt.subplot(len(dataset_configs), len(clustering_algorithms), plot_num)

        # Plot the title only on the first row
        if i_dataset == 0:
            plt.title(name, size=18)

        # One color per predicted cluster label, indexable by label
        colors = np.array(list(islice(cycle(palette), int(max(y_pred) + 1))))

        # Scatter plot, colored by the predicted cluster
        plt.scatter(X[:, 0], X[:, 1], s=5, color=colors[y_pred])

        plt.xlim(-2.5, 2.5)
        plt.ylim(-2.5, 2.5)

        plt.xticks(())
        plt.yticks(())

        # Fit time in the lower-right corner of the subplot, without the
        # leading zero (e.g. ".03s")
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')

        # Advance to the next subplot
        plot_num += 1

plt.show()