<a href="https://colab.research.google.com/github/flaviorv/ml_clustering/blob/main/clustering_at_part3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Imports

In [1]:
from urllib.request import urlretrieve

import numpy as np
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.metrics import normalized_mutual_info_score

### Identifying faces in the Olivetti Faces dataset with K-Means, DBSCAN, and Agglomerative Clustering

In [4]:
# Loading dataset
DATASETS_URL = 'https://github.com/flaviorv/ml_clustering/raw/refs/heads/main/datasets'


def load_or_download(filename):
  """Load a .npy array from the working directory, downloading it first if absent.

  Args:
    filename: Name of the .npy file, used both as the local path and as the
      last path component under DATASETS_URL.

  Returns:
    The array read by np.load.

  Raises:
    urllib.error.URLError: If the file is missing locally and the download
      fails (HTTPError is a subclass).
  """
  try:
    return np.load(filename)
  except FileNotFoundError:
    # Only a missing file triggers a download. Any other failure (corrupt
    # file, interrupt, ...) must surface instead of being hidden by a retry.
    urlretrieve(f'{DATASETS_URL}/{filename}', filename)
    return np.load(filename)


target = load_or_download('olivetti_faces_target.npy')
faces = load_or_download('olivetti_faces.npy')

# 3d matrix to 2d: keep the first axis and collapse the two trailing axes
# (h, w) into a single one, so each sample becomes one flat row.
n_samples, h, w = faces.shape
faces_reshaped = faces.reshape((n_samples, -1))

# Hyperparams: candidate values searched for DBSCAN
eps = [5.5, 6, 6.5, 7]
min_samples = [2, 3, 4, 5]

# Saving results: one slot per algorithm, scores start at 0 and params at None
best_nmi = dict.fromkeys(('dbscan', 'kmeans', 'agglomerative'), 0)
best_params = dict.fromkeys(('dbscan', 'kmeans', 'agglomerative'))

# Getting the best DBSCAN model: try every (eps, min_samples) pair, eps varying
# slowest, and keep the first pair that reaches the highest NMI against target.
grid = [(e, ms) for e in eps for ms in min_samples]
for e, ms in grid:
  dbscan = DBSCAN(eps=e, min_samples=ms).fit(faces_reshaped)
  y_pred = dbscan.labels_
  nmi = normalized_mutual_info_score(target, y_pred)
  # Strict comparison: a later tie never replaces an earlier winner.
  if nmi > best_nmi['dbscan']:
    best_nmi['dbscan'], best_params['dbscan'] = nmi, f'eps: {e} min_samples: {ms}'

# DBSCAN results
print(f'DBSCAN\nNMI {best_nmi["dbscan"]:.2f}\nPARAMS {best_params["dbscan"]}')

# K-Means results
# Single source for the cluster count, so the model and the printed report
# cannot drift apart.
# NOTE(review): presumably one cluster per identity in `target` - confirm.
kmeans_n_clusters = 40
# NOTE(review): n_init is not set, so the number of restarts follows the
# installed scikit-learn default - confirm it matches the reported score.
kmeans = KMeans(n_clusters=kmeans_n_clusters, random_state=42, algorithm='elkan')
kmeans_labels = kmeans.fit_predict(faces_reshaped)

# Record the outcome in the shared result dicts, which already have a 'kmeans' slot.
best_nmi['kmeans'] = normalized_mutual_info_score(target, kmeans_labels)
best_params['kmeans'] = f'n_clusters: {kmeans_n_clusters}'
print(f'\nK-Means\nNMI {best_nmi["kmeans"]:.2f}\nN CLUSTERS {kmeans_n_clusters}')

# Agglomerative Clustering results
# Single source for the cluster count, so the model and the printed report
# cannot drift apart.
# NOTE(review): presumably one cluster per identity in `target` - confirm.
ac_n_clusters = 40
ac = AgglomerativeClustering(n_clusters=ac_n_clusters, linkage='ward')
ac_labels = ac.fit_predict(faces_reshaped)

# Record the outcome in the shared result dicts, which already have an
# 'agglomerative' slot.
best_nmi['agglomerative'] = normalized_mutual_info_score(target, ac_labels)
best_params['agglomerative'] = f'n_clusters: {ac_n_clusters}'
print(f'\nAgglomerative Clustering\nNMI {best_nmi["agglomerative"]:.2f}\nN CLUSTERS {ac_n_clusters}')

DBSCAN
NMI 0.75
PARAMS eps: 6 min_samples: 2

K-Means
NMI 0.77
N CLUSTERS 40

Agglomerative Clustering
NMI 0.81
N CLUSTERS 40
