In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [3]:
!pip install efficientnet_pytorch

Collecting efficientnet_pytorch
  Downloading efficientnet_pytorch-0.7.1.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: efficientnet_pytorch
  Building wheel for efficientnet_pytorch (setup.py) ... [?25l[?25hdone
  Created wheel for efficientnet_pytorch: filename=efficientnet_pytorch-0.7.1-py3-none-any.whl size=16428 sha256=978d42edff40d1a54ce37be9fcbf8d93a1ba6b77911e349b732b2a5ccba21e87
  Stored in directory: /root/.cache/pip/wheels/03/3f/e9/911b1bc46869644912bda90a56bcf7b960f20b5187feea3baf
Successfully built efficientnet_pytorch
Installing collected packages: efficientnet_pytorch
Successfully installed efficientnet_pytorch-0.7.1


In [4]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from efficientnet_pytorch import EfficientNet

from PIL import Image
import faiss
import numpy as np
import os
from os.path import exists, join, isfile, realpath, isdir
from os import listdir, makedirs, walk

import numpy as np
from scipy.spatial.distance import pdist, squareform

import shutil

In [5]:
if exists('/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/retinopatia/train/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/retinopatia/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/retinopatia/'
elif exists('/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/retinopatia/train/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/retinopatia/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/retinopatia/'

In [6]:
def get_dir_files(dir_path: str):
    return [f for f in listdir(dir_path) if isfile(join(dir_path, f))]


def get_dirs(dir_path: str):
    return [d for d in listdir(dir_path) if isdir(join(dir_path, d))]

In [7]:

# cargar EfficientNet-b0 model
model = EfficientNet.from_pretrained('efficientnet-b0')

# remover ultima layer
model._fc = torch.nn.Identity()

model.eval()

# transformacion imagen
transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])


Downloading: "https://github.com/lukemelas/EfficientNet-PyTorch/releases/download/1.0/efficientnet-b0-355c32eb.pth" to /root/.cache/torch/hub/checkpoints/efficientnet-b0-355c32eb.pth
100%|██████████| 20.4M/20.4M [00:00<00:00, 122MB/s] 


Loaded pretrained weights for efficientnet-b0


In [8]:
def extract_features(img_path):
    # Load the image
    img = Image.open(img_path).convert('RGB')

    # Apply the transformations and add an extra dimension (for the batch)
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)

    # Don't calculate gradients
    with torch.no_grad():
        # Get the features from this image
        features = model(img_t)

    # The output will be in the form of a tensor, so we convert it to an array
    # Flatten the tensor to a 1D array
    features = features.numpy().flatten()

    return features


In [9]:
CLASS_NAME = get_dirs(WORK_DIR)[0]
IMAGE_PATH = join(WORK_DIR, CLASS_NAME)
image_paths = get_dir_files(IMAGE_PATH)

In [10]:
# Create a directory for the clusters if it doesn't exist
if not os.path.exists(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient')):
    os.makedirs(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient'))

In [11]:
%cd "$IMAGE_PATH"

/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/retinopatia/train/mild


In [12]:
# Extract features for all images
all_features = []
for image_path in image_paths:
    features = extract_features(image_path)
    all_features.append(features)

# Convert list of features to numpy array
all_features = np.vstack(all_features)

# Dimension of our vector space
d = all_features.shape[1]

# Initialize a FAISS index
index = faiss.IndexFlatL2(d)

# Add vectors to the index
index.add(all_features)

In [13]:
# Number of clusters
k = 10

# Faiss k-means clustering
niter = 30
verbose = True
kmeans = faiss.Clustering(d, k)
kmeans.niter = niter
kmeans.verbose = verbose
kmeans.train(all_features, index)

# The centroids are stored in kmeans.centroids
centroids = faiss.vector_to_array(kmeans.centroids).reshape(k, d)

# To assign each vector to a cluster, you can use the index.search function
D, I = index.search(all_features, 1)

# I contains the cluster assignments for each vector
cluster_assignments = I.reshape(-1)

In [14]:
# Create a dictionary of clusters
clusters = {i: [] for i in range(k)}

# Assign each image to a cluster
for image_path, cluster_id in zip(image_paths, cluster_assignments):
    clusters[cluster_id].append(image_path)

# Now, clusters[i] is a list of images that belong to cluster i
for i in range(k):
    print(f"Cluster {i}:")

    # Create a directory for this cluster if it doesn't exist
    cluster_dir = join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient', f'cluster_{i}')
    if exists(cluster_dir):
      shutil.rmtree(cluster_dir)
    makedirs(cluster_dir)

    for image_path in clusters[i]:
        # Copy the image into the cluster directory
        shutil.copy(image_path, cluster_dir)

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:
Cluster 5:
Cluster 6:
Cluster 7:
Cluster 8:
Cluster 9:


In [15]:
def compute_distances(feature_vectors, centroid):
    # Convert list to numpy array
    feature_vectors = np.vstack(feature_vectors)

    # Compute distances
    distances = np.linalg.norm(feature_vectors - centroid, axis=1)

    return distances

In [16]:
N = 25  # number of images you want to keep per cluster

# Initialize dictionaries for average and standard deviation of similarities
average_similarities = {}
std_dev_similarities = {}
centroid_distances = {}
top_average_similarities = {}
top_std_dev_similarities = {}
best_clusters = {}

for cluster_id, image_paths in clusters.items():
    # Extract the feature vectors for images in this cluster
    feature_vectors = np.array([extract_features(image_path) for image_path in image_paths])

    # Calculate distances from centroid
    distances = compute_distances(feature_vectors, centroids[cluster_id])

    # Save distances to centroid_distances dictionary
    centroid_distances[cluster_id] = distances

    # Pair images with their distances and sort by distance
    image_distances = sorted(zip(image_paths, distances), key=lambda x: x[1])

    # Calculate pairwise cosine similarities
    similarity_matrix = 1 - squareform(pdist(feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    similarities = similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities
    average_similarities[cluster_id] = np.mean(similarities)
    std_dev_similarities[cluster_id] = np.std(similarities)

    # Select the top N images
    top_images = image_distances[:N]

    # Extract the feature vectors for the top N images in this cluster
    top_feature_vectors = feature_vectors[:N]

    # Calculate pairwise cosine similarities for top images
    top_similarity_matrix = 1 - squareform(pdist(top_feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    top_similarities = top_similarity_matrix[np.triu_indices(top_similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities for top images
    top_average_similarities[cluster_id] = np.mean(top_similarities)
    top_std_dev_similarities[cluster_id] = np.std(top_similarities)

    # Check the condition for similarities and standard deviations
    if top_average_similarities[cluster_id] - top_std_dev_similarities[cluster_id] >= 0.65:
        # Only proceed with this cluster if the condition is met
        best_clusters[cluster_id] = True

        # Now, copy these images to the new directory
        cluster_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_efficient', 'best_results', f'cluster_{cluster_id}')
        if exists(cluster_dir):
          shutil.rmtree(cluster_dir)
        makedirs(cluster_dir)
        for image_path, _ in top_images:
            # Copy the image into the new cluster directory
            shutil.copy(image_path, cluster_dir)


In [17]:
for cluster_id in centroid_distances.keys():
    # Define the path to the cluster directory
    cluster_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_efficient', 'best_results', f'cluster_{cluster_id}')

    # Get a count of all files in the directory (assumes all files are images)
    image_count = len(os.listdir(cluster_dir)) if os.path.exists(cluster_dir) else 0

    print(f"Cluster {cluster_id}:")
    print(f"Image count: {image_count}")
    print(f"Average similarity: {top_average_similarities[cluster_id]}")
    print(f"Standard deviation of similarity: {top_std_dev_similarities[cluster_id]}\n")

Cluster 0:
Image count: 25
Average similarity: 0.8886942576646841
Standard deviation of similarity: 0.044125791116829995

Cluster 1:
Image count: 25
Average similarity: 0.9354241216544406
Standard deviation of similarity: 0.022694040385913584

Cluster 2:
Image count: 15
Average similarity: 0.9205824015707429
Standard deviation of similarity: 0.030245922968237266

Cluster 3:
Image count: 19
Average similarity: 0.9302734992610463
Standard deviation of similarity: 0.022000396889910782

Cluster 4:
Image count: 25
Average similarity: 0.9178726634536148
Standard deviation of similarity: 0.04044263686490279

Cluster 5:
Image count: 7
Average similarity: 0.9002619424125406
Standard deviation of similarity: 0.023681112457936443

Cluster 6:
Image count: 25
Average similarity: 0.8824425863364069
Standard deviation of similarity: 0.045916936696064385

Cluster 7:
Image count: 13
Average similarity: 0.9201084646535025
Standard deviation of similarity: 0.03096833241746696

Cluster 8:
Image count: 25


In [20]:
dirclus = join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_clip')
# Now, change the working directory
os.chdir(dirclus)
folder_to_compress='best_results'
zip_file=f'{CLASS_NAME}_clusters.zip'
!zip -r -q "$zip_file" "$folder_to_compress"