## Image clustering with K-medoids

In [None]:
# we install scikit-learn-extra to get k-medoids
!pip install scikit-learn-extra

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting scikit-learn-extra
  Downloading scikit_learn_extra-0.2.0-cp37-cp37m-manylinux2010_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
Installing collected packages: scikit-learn-extra
Successfully installed scikit-learn-extra-0.2.0


In [None]:
# Mount Google Drive at /gdrive (requires the Colab runtime) and make it the
# current working directory.
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

In [None]:
from pathlib import Path
import os
# Root folder on the mounted Drive; the data paths used below are built from it.
path = Path('/gdrive/My Drive/')

In [None]:
import pickle
# List what is stored inside the pets dataset folder on Drive.
os.listdir(path / 'pets_images_2/oxford-iiit-pet/')

In [None]:
# List the top-level contents of the Drive root folder.
os.listdir(path)

In [None]:
# Load the two pickled embedding collections from Drive.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# only load files that come from a trusted source.
with open(path / 'pets_images_2/pet_image_embeddings.pkl', 'rb') as handle:
  pets_embeddings = pickle.load(handle)

with open(path / 'dogs_vs_cats/cats_vs_dogs_embeddings.pkl', 'rb') as handle:
  dogs_and_cats_embeddings = pickle.load(handle)

In [None]:
# Number of entries in pets_embeddings, followed by all of its keys.
len(pets_embeddings), pets_embeddings.keys()

In [None]:
# Number of entries in dogs_and_cats_embeddings, followed by all of its keys.
len(dogs_and_cats_embeddings), dogs_and_cats_embeddings.keys()

In [None]:
# Combine both embedding collections into a single dictionary. When a key
# exists in both, the entry from pets_embeddings wins because it is applied last.
image_embeddings_dict = dict(dogs_and_cats_embeddings)
image_embeddings_dict.update(pets_embeddings)
len(image_embeddings_dict)

In [None]:
# For now work with the pets embeddings only: this saves memory for quick
# experiments and avoids having to fix the imread paths used for visualization.
# Note that this replaces the merged dictionary built in the previous cell.
image_embeddings_dict = pets_embeddings

In [None]:
import numpy as np
# Split the mapping into two parallel lists (for convenience):
# images[i] is the key whose embedding is X[i].
images = list(image_embeddings_dict.keys())
X = list(image_embeddings_dict.values())

## Finding nearest neighbors 

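We want to know how similar every image is to its nearest neighbors. We ask for nine neighbors per image; because we query with the same vectors the index was fitted on, each image will normally come back as its own closest neighbor.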

In [None]:
from sklearn.neighbors import NearestNeighbors

# sklearn gives the cosine distance = 1 - cos_similarity
# n_neighbors=9 matches the 3x3 grid used further down to plot a neighborhood.
nbrs = NearestNeighbors(n_neighbors=9, metric='cosine').fit(X) 
nbrs

In [None]:
# Query the fitted index with the same vectors it was fitted on.
# NOTE: despite its name, neighbor_similarities holds cosine *distances*; they
# are turned into similarities (1 - distance) when the neighborhoods are built.
neighbor_similarities, neighbor_indices = nbrs.kneighbors(X)
neighbor_similarities, neighbor_indices

In [None]:
# check sklearn's cosine distance vs our own cosine similarity metric
from numpy.linalg import norm
def cosine_similarity(vector_a, vector_b):
    """Return the cosine similarity of two vectors.

    The value is the inner product of the inputs divided by the product of
    their norms (as computed by numpy.linalg.norm).
    """
    dot_product = np.inner(vector_a, vector_b)
    norm_product = norm(vector_a) * norm(vector_b)
    return dot_product / norm_product


# Map every image to its neighborhood: a list of
# [neighbor_image, cosine_similarity] pairs, in the order returned by kneighbors.
neighborhoods_dict = {}
for i in range(len(neighbor_indices)):
    # Row i of the kneighbors result belongs to X[i], i.e. images[i]. Keying on
    # the first returned neighbor instead can pick a different image when
    # another image has an identical embedding (a tie at distance 0).
    center_image = images[i]
    neighbors_list = []
    for neighbor_index, neighbor_distance in zip(neighbor_indices[i],
                                                 neighbor_similarities[i]):
        # sklearn reports cosine distance; convert it back to a similarity.
        neighbors_list.append([images[neighbor_index], 1.0 - neighbor_distance])
    neighborhoods_dict[center_image] = neighbors_list

In [None]:
# Display the whole image -> neighborhood mapping.
neighborhoods_dict

In [None]:
# Neighborhood of a single example image.
neighborhoods_dict['shiba_inu_92.jpg']

In [None]:
import skimage.io as io
import matplotlib.pyplot as plt


def plot_neighborhood(neighbors_row, image_dir=None):
  """Plot up to nine neighbor images in a 3x3 grid, titled with their similarity.

  Parameters
  ----------
  neighbors_row : sequence of [image_name, similarity]
      One entry per neighbor, as stored in neighborhoods_dict. Only the first
      nine entries are drawn; with fewer entries the remaining cells stay empty.
  image_dir : path-like, optional
      Folder that contains the image files. Defaults to
      path / 'pets_images/oxford-iiit-pet/images'.
      NOTE(review): the embeddings are loaded from 'pets_images_2' -- confirm
      that the default folder is the one that actually holds the images.
  """
  if image_dir is None:
    image_dir = path / 'pets_images/oxford-iiit-pet/images'
  image_dir = Path(image_dir)

  f, axarr = plt.subplots(3, 3, figsize=(15, 15))

  # axarr.flat walks the grid row by row, which matches the neighbor order.
  for ax, neighbor in zip(axarr.flat, neighbors_row):
    ax.imshow(io.imread(image_dir / neighbor[0]))
    ax.set_title(neighbor[1])

  # Hide the axes of every cell, including any that were left empty.
  for ax in axarr.flat:
    ax.axis('off')

# Plot the neighborhood of one example image.
plot_neighborhood(neighborhoods_dict['shiba_inu_92.jpg'])

### Selecting 37 representative images (medoids) 

In [None]:
from sklearn_extra.cluster import KMedoids


# Build a K-medoids model with 37 clusters that compares vectors with the
# cosine distance and is initialised with the k-medoids++ scheme.
# A fixed random_state makes the result reproducible.
kmedoids = KMedoids(
    n_clusters=37,
    metric='cosine',
    init='k-medoids++',
    random_state=0,
)

kmedoids.fit(X)

In [None]:
# Map each image to the label of the cluster it was assigned to.
image_clusters = {image: label for image, label in zip(images, kmedoids.labels_)}
image_clusters

In [None]:
# Count how many images fall into each cluster, largest cluster first.
clusters, counts = np.unique(kmedoids.labels_, return_counts=True)

cluster_counts = dict(
    sorted(zip(clusters, counts), key=lambda pair: pair[1], reverse=True)
)
cluster_counts

In [None]:
# Look up the medoid of every cluster through the row indices that KMedoids
# stores in `medoid_indices_`, instead of comparing every embedding with every
# cluster centre element by element. This yields exactly one image per cluster,
# even when several images share an identical embedding.
# Assumes `images` and `X` are still the parallel lists the model was fitted on.
medoid_indices = kmedoids.medoid_indices_

representative_images = []
representative_embeddings = []
cluster_number_repr_images = []
# Visit the medoids in ascending row order so the lists keep the same ordering
# that iterating over image_embeddings_dict produced.
for cluster_number in np.argsort(medoid_indices, kind='stable'):
    medoid_row = medoid_indices[cluster_number]
    representative_images.append(images[medoid_row])
    representative_embeddings.append(X[medoid_row])
    cluster_number_repr_images.append(int(cluster_number))

In [None]:
# Keys of the images selected as cluster representatives.
representative_images

In [None]:
# Show representative images 0-9 side by side.
f, axarr = plt.subplots(nrows=1, ncols=10, figsize=(15, 15))

for i, ax in enumerate(axarr):
  image_file = path / 'pets_images/oxford-iiit-pet/images' / representative_images[i]
  ax.imshow(io.imread(image_file))
  ax.axis('off')

In [None]:
# Show representative images 10-19 side by side.
f, axarr = plt.subplots(nrows=1, ncols=10, figsize=(15, 15))

for i, ax in enumerate(axarr):
  image_file = path / 'pets_images/oxford-iiit-pet/images' / representative_images[10 + i]
  ax.imshow(io.imread(image_file))
  ax.axis('off')

In [None]:
# Show representative images 20-29 side by side.
f, axarr = plt.subplots(nrows=1, ncols=10, figsize=(15, 15))

for i, ax in enumerate(axarr):
  image_file = path / 'pets_images/oxford-iiit-pet/images' / representative_images[20 + i]
  ax.imshow(io.imread(image_file))
  ax.axis('off')

# Choosing an 'optimal' number of clusters based on the elbow method 

We want to find where the inflection point (the "elbow") of the curve occurs.

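* Distortion: the average of the squared distances from each sample to the center of its cluster.

* Inertia: the sum of the squared distances from each sample to its closest cluster center.

Note: these are the usual K-means definitions. The code below uses K-medoids with the cosine metric and computes the distortion as the mean (unsquared) cosine distance from each sample to its nearest medoid.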

https://www.geeksforgeeks.org/elbow-method-for-optimal-value-of-k-in-kmeans/

In [None]:
# we choose a smaller dataset for this experiment
# (we can go out of RAM in Colab quite easily)
image_embeddings_dict = pets_embeddings

# Split keys and embeddings into two parallel lists:
# images[i] is the key whose embedding is embeddings[i].
images = list(image_embeddings_dict.keys())
embeddings = list(image_embeddings_dict.values())

In [None]:
import numpy as np
# Convert the list of embeddings into a NumPy array; its shape is shown as the
# cell output.
X = np.array(embeddings)
X.shape

In [None]:
# NOTE(review): repeats an assignment already made earlier in this section.
image_embeddings_dict = pets_embeddings

In [None]:
from sklearn.cluster import KMeans 
from sklearn_extra.cluster import KMedoids
from sklearn import metrics 
from scipy.spatial.distance import cdist 
import numpy as np 
import matplotlib.pyplot as plt 


# Fit one K-medoids model per candidate number of clusters and record two
# quality measures for the elbow plots:
#   * distortion: mean cosine distance from each sample to its nearest medoid
#   * inertia:    the fitted model's inertia_ attribute
distortions = []
inertias = []
mapping1 = {}  # k -> distortion
mapping2 = {}  # k -> inertia
K = range(40, 60, 2)

for k in K:
    # Building and fitting the model
    kmedoidModel = KMedoids(n_clusters=k, random_state=0, init='k-medoids++',
                            metric='cosine')
    kmedoidModel.fit(X)

    # Compute the sample-to-medoid distances once and reuse the result.
    nearest_medoid_distance = np.min(
        cdist(X, kmedoidModel.cluster_centers_, 'cosine'), axis=1)
    distortion = nearest_medoid_distance.mean()
    inertia = kmedoidModel.inertia_

    distortions.append(distortion)
    inertias.append(inertia)

    mapping1[k] = distortion
    mapping2[k] = inertia

In [None]:
# Elbow curve: distortion for each candidate number of clusters.
fig, ax = plt.subplots()
ax.plot(K, distortions, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Distortion')
ax.set_title('The Elbow Method using Distortion')
plt.show()

In [None]:
# Elbow curve: inertia for each candidate number of clusters.
fig, ax = plt.subplots()
ax.plot(K, inertias, 'bx-')
ax.set_xlabel('Values of K')
ax.set_ylabel('Inertia')
ax.set_title('The Elbow Method using Inertia')
plt.show()