In [None]:
# Standard library
import random
from collections import Counter

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.cluster import SpectralClustering, AffinityPropagation
# make_blobs comes from the public sklearn.datasets namespace: the private
# sklearn.datasets.samples_generator module was deprecated in scikit-learn 0.22
# and removed in 0.24, so the old import path fails on current releases.
from sklearn.datasets import make_blobs
from tqdm import tqdm

# Project-local module providing internalValidation / externalValidation
import utils

# Switch matplotlib to interactive mode before any figure is created.
plt.ion()
plt.show()

# Create simulated data

In [None]:
# Number of blobs to generate; the same value is reused later as the target
# cluster count for SpectralClustering.
nb_clusters = 7
# A fixed random_state keeps the simulated data identical across runs.
X, y_true = make_blobs(n_samples=300, centers=nb_clusters,
                       cluster_std=.80, random_state=0)
# Plain function call instead of the IPython-only "/print X.shape" autocall
# escape, which is a SyntaxError outside IPython (e.g. after exporting to .py).
print(X.shape)
plt.title(f'Ground truth simulated data : {nb_clusters} clusters')
# Colour each point by its true blob label; the trailing ';' hides the repr.
plt.scatter(X[:, 0], X[:, 1], s=50, c=y_true);

Let's check some internal cluster quality measures on the ground truth:

In [None]:
# Run the project's internal-validation helper on the data with the ground-truth
# labels, giving a reference point for the clustering results below.
# NOTE(review): the exact metrics are defined in utils and not visible here;
# the surrounding text mentions a silhouette score.
utils.internalValidation(X, y_true)

Since the generated clusters overlap at their borders, the silhouette score is only 0.52.

# Spectral clustering

In [None]:
# Configure the estimator, then fit it in place; `clustering` stays bound to the
# fitted estimator so its labels can be read afterwards.
clustering = SpectralClustering(
    n_clusters=nb_clusters,
    assign_labels="discretize",
    random_state=0,
)
clustering.fit(X)
y_pred = clustering.labels_

plt.title(f'Spectral clustering results ')
# Unpack the first two feature columns as the x / y coordinates and colour each
# point by its predicted cluster; the trailing ';' hides the repr.
plt.scatter(*X[:, :2].T, s=50, c=y_pred);

## Internal validation

In [None]:
# Run the project's internal-validation helper on the data with the labels
# predicted by the most recent clustering (y_pred).
# NOTE(review): the exact metrics are defined in utils and not visible here.
utils.internalValidation(X, y_pred)

## External validation

In [None]:
# Compare the predicted labels against the ground-truth labels using the
# project's external-validation helper.
# NOTE(review): the exact metrics are defined in utils and not visible here.
utils.externalValidation(y_true, y_pred)

# Affinity propagation

In [None]:
# Default-parameter AffinityPropagation: no cluster count is passed in, so the
# number of distinct labels found is reported below.
clustering = AffinityPropagation()
clustering.fit(X)
y_pred = clustering.labels_

# Count the distinct labels assigned by the model.
print(f'Affinity propagation found {np.unique(y_pred).size} clusters')

plt.title(f'Affinity propagation clustering results ')
# Unpack the first two feature columns as the x / y coordinates and colour each
# point by its predicted cluster; the trailing ';' hides the repr.
plt.scatter(*X[:, :2].T, s=50, c=y_pred);

## Internal validation

In [None]:
# Run the project's internal-validation helper on the data with the labels
# predicted by the most recent clustering (y_pred).
# NOTE(review): the exact metrics are defined in utils and not visible here.
utils.internalValidation(X, y_pred)

## External validation

In [None]:
# Compare the predicted labels against the ground-truth labels using the
# project's external-validation helper.
# NOTE(review): the exact metrics are defined in utils and not visible here.
utils.externalValidation(y_true, y_pred)