In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# make_blobs is exposed from the public sklearn.datasets namespace; the old
# private path sklearn.datasets.samples_generator is not available in current
# scikit-learn releases and fails at import time.
from sklearn.datasets import make_blobs

# Synthetic data set: 300 points drawn around 4 centres. random_state is fixed
# so every run of the notebook sees the same points.
X, y_true = make_blobs(n_samples=300, centers=4,
                       cluster_std=2, random_state=42)

# Plot the first two feature columns without cluster colouring.
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.show()

## Cluster jerárquico

Dos criterios principales:

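* Distancia entre puntos (L2 o euclídea, L1 o "del taxista", etc.)
* Criterio de unión (*linkage*) entre grupos
    - Mínimo de distancias (*single*)
    - Máximo de distancias (*complete*)
    - Distancia promedio (*average*)
    - Minimizar el aumento de varianza (*Ward*)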

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage

In [None]:
# Agglomerative clustering of X using Ward linkage.
Z = linkage(X, 'ward')

plt.figure(figsize=(20, 20))
plt.title('Hierarchical Clustering Dendrogram')
# With orientation='right' the tree grows horizontally: the merge distance is
# on the x-axis and the leaves (one per sample) are listed along the y-axis.
# The data are the synthetic blobs in X, not the Iris data set.
plt.xlabel('distance')
plt.ylabel('sample index')
dendrogram(
    Z,
    orientation='right',
    leaf_rotation=0,
    leaf_font_size=8.,
)
plt.show()

## K-Means

In [None]:
# Number of centroids used by the hand-written K-Means below.
k = 4

# Seed NumPy's global generator so the starting centroids are reproducible.
np.random.seed(10)
# Rescale uniform [0, 1) draws to [-10, 10): one row per centroid, two columns.
medias = 20 * np.random.rand(k, 2) - 10

# Data points in the default colour, initial centroids in red.
plt.scatter(X[:, 0], X[:, 1], s=50)
plt.scatter(*medias.T, color='red')
plt.show()

In [None]:
def dist(a, b, ax=1):
    """Return the norm of ``a - b`` taken along axis ``ax``.

    ``a`` and ``b`` are subtracted with NumPy broadcasting, and the norm of
    the difference is computed by ``np.linalg.norm`` with its default order.

    Parameters
    ----------
    a, b : array-like operands supporting ``a - b``.
    ax : axis forwarded to ``np.linalg.norm`` (default 1).
    """
    difference = a - b
    return np.linalg.norm(difference, axis=ax)

In [None]:
# One K-Means iteration (assignment step + update step). Re-run this cell to
# perform further iterations; `medias` is updated in place each time.

# Assignment step: broadcasting X to (n_points, 1, 2) against medias as
# (1, n_centroids, 2) yields the full point-to-centroid distance matrix in one
# call, and argmin picks the nearest centroid for every point.
distances = dist(X[:, np.newaxis, :], medias[np.newaxis, :, :], ax=2)
clusters = np.argmin(distances, axis=1)

# Update step: move each centroid to the mean of its assigned points.
# The centroid count comes from `medias` itself rather than the global `k`,
# which later cells reuse as a loop variable.
for i in range(len(medias)):
    points = X[clusters == i]
    # A centroid with no assigned points keeps its previous position; taking
    # the mean of an empty selection would turn it into NaN.
    if len(points) > 0:
        medias[i] = points.mean(axis=0)

plt.scatter(X[:, 0], X[:, 1], s=50)
plt.scatter(medias[:,0], medias[:,1], color='red')
plt.show()

## Modelos de Mezclas

In [None]:
from sklearn.mixture import GaussianMixture

# Fit a 4-component Gaussian mixture to X. random_state is fixed (same value
# as the later GMM cells) so the fitted model and the predicted labels do not
# change between runs.
gmm = GaussianMixture(n_components=4, random_state=42).fit(X)
labels = gmm.predict(X)

In [None]:
from matplotlib.patches import Ellipse

def draw_ellipse(position, covariance, ax=None, **kwargs):
    """Draw three nested ellipses for a given position and covariance.

    Parameters
    ----------
    position : centre of the ellipses, passed to ``Ellipse`` unchanged.
    covariance : array with a ``shape`` attribute. A ``(2, 2)`` array is
        decomposed with an SVD to obtain the axis lengths and rotation; any
        other shape is unpacked as two per-axis values with no rotation.
    ax : axes to draw on; defaults to the current axes when ``None``.
    **kwargs : forwarded to every ``Ellipse`` patch (e.g. ``alpha``).
    """
    if ax is None:
        ax = plt.gca()

    # Convert covariance to principal axes
    if covariance.shape == (2, 2):
        U, s, _ = np.linalg.svd(covariance)
        # Rotation of the first principal direction, in degrees.
        angle = np.degrees(np.arctan2(U[1, 0], U[0, 0]))
        width, height = 2 * np.sqrt(s)
    else:
        angle = 0
        width, height = 2 * np.sqrt(covariance)

    # Draw the ellipse at 1x, 2x and 3x the base width/height.
    for nsig in range(1, 4):
        # `angle` must be passed by keyword: recent Matplotlib releases
        # reject it as a fourth positional argument.
        ax.add_patch(Ellipse(position, nsig * width, nsig * height,
                             angle=angle, **kwargs))
        
def plot_gmm(gmm, X, label=True, ax=None):
    """Fit ``gmm`` on ``X`` and plot the points with one ellipse set per component.

    Parameters
    ----------
    gmm : mixture model exposing ``fit``, ``predict``, ``means_``,
        ``covariances_`` and ``weights_``. It is (re)fitted on ``X`` here.
    X : array whose first two columns are plotted.
    label : when true, colour the points by predicted component.
    ax : axes to draw on; defaults to the current axes when ``None``.
    """
    if ax is None:
        ax = plt.gca()
    labels = gmm.fit(X).predict(X)
    if label:
        ax.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis', zorder=2)
    else:
        ax.scatter(X[:, 0], X[:, 1], s=40, zorder=2)
    ax.axis('equal')

    # Scale transparency so the heaviest component is drawn with alpha 0.2.
    w_factor = 0.2 / gmm.weights_.max()
    for pos, covar, w in zip(gmm.means_, gmm.covariances_, gmm.weights_):
        # Forward `ax` so the ellipses land on the same axes as the scatter,
        # even when that is not the current axes.
        draw_ellipse(pos, covar, ax=ax, alpha=w * w_factor)

In [None]:
# Unfitted 4-component mixture with a fixed seed; plot_gmm fits it on X and
# draws the points together with the component ellipses.
gmm = GaussianMixture(random_state=42, n_components=4)
plot_gmm(gmm=gmm, X=X)

In [None]:
# Apply a seeded random 2x2 linear map to X.
np.random.seed(45)
transform = np.random.randn(2, 2)
X_stretched = np.dot(X, transform)

plt.scatter(X_stretched[:, 0], X_stretched[:, 1], s=50)
plt.show()

In [None]:
from sklearn.cluster import KMeans

# n_init is set explicitly: its default differs between scikit-learn
# versions, which changes the fitted centroids and triggers a FutureWarning on
# some releases. 10 restarts matches the long-standing historical default.
km = KMeans(n_clusters=4, n_init=10, random_state=1)
km.fit(X_stretched)

In [None]:
# Cluster index assigned by the fitted KMeans model to every stretched point.
clusters_km = km.predict(X_stretched)

# One matplotlib colour code per cluster index.
colores = ['r', 'g', 'b', 'k']

# Scatter each cluster separately so it gets its own colour.
for cc in np.unique(clusters_km):
    in_cluster = clusters_km == cc
    members = X_stretched[in_cluster]
    plt.scatter(members[:, 0], members[:, 1], color=colores[cc])

In [None]:
# Same seeded 4-component mixture as before, this time fitted (inside
# plot_gmm) and plotted on the linearly transformed data.
gmm = GaussianMixture(random_state=42, n_components=4)
plot_gmm(gmm=gmm, X=X_stretched)

## Número de clusters

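En estos algoritmos, el número de grupos se debe definir previamente, por lo que es necesario tener un criterio para elegirlo. Una opción es el "método del codo": graficar la inercia de K-Means en función del número de grupos y buscar el punto a partir del cual deja de disminuir rápidamente.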

In [None]:
# Elbow curve: fit KMeans on X for 1..9 clusters and record the inertia of
# each fit.
var = []
cluster_counts = range(1, 10)
# The loop variable is deliberately not called `k`, so the global `k` used by
# the hand-written K-Means cells is not overwritten when this cell runs.
for n_clusters in cluster_counts:
    # n_init is pinned because its default differs between scikit-learn
    # versions; random_state keeps each fit reproducible.
    km = KMeans(n_clusters=n_clusters, n_init=10, random_state=1)
    km.fit(X)
    var.append(km.inertia_)

plt.plot(cluster_counts, var)
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.show()

In [None]:
# Inertia and score on X of `km`, i.e. whichever KMeans model was fitted most
# recently in the cells above.
# NOTE(review): KMeans.score is presumably the negated objective, so the two
# numbers should differ only in sign; confirm against the scikit-learn docs.
inertia = km.inertia_
score = km.score(X)
print(inertia, score)