In [36]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [37]:
!pip install faiss-cpu

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [38]:
!pip install efficientnet_pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [39]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from efficientnet_pytorch import EfficientNet

from PIL import Image
import faiss
import numpy as np
import os
from os.path import exists, join, isfile, realpath, isdir
from os import listdir, makedirs, walk

import numpy as np
from scipy.spatial.distance import pdist, squareform

import shutil

In [40]:
if exists('/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/melanoma_cancer/train/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/melanoma_cancer/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/melanoma/'
elif exists('/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/melanoma_cancer/train/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/melanoma_cancer/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/melanoma/'

In [41]:
def get_dir_files(dir_path: str):
    return [f for f in listdir(dir_path) if isfile(join(dir_path, f))]


def get_dirs(dir_path: str):
    return [d for d in listdir(dir_path) if isdir(join(dir_path, d))]

In [42]:

# cargar EfficientNet-b0 model
model = EfficientNet.from_pretrained('efficientnet-b0')

# remover ultima layer
model._fc = torch.nn.Identity()

model.eval()

# transformacion imagen
transform = transforms.Compose([
    transforms.Resize((256, 256)), 
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])


Loaded pretrained weights for efficientnet-b0


In [43]:
def extract_features(img_path):
    # Load the image
    img = Image.open(img_path).convert('RGB')

    # Apply the transformations and add an extra dimension (for the batch)
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)

    # Don't calculate gradients
    with torch.no_grad():
        # Get the features from this image
        features = model(img_t)

    # The output will be in the form of a tensor, so we convert it to an array
    # Flatten the tensor to a 1D array
    features = features.numpy().flatten()

    return features


In [44]:
CLASS_NAME = get_dirs(WORK_DIR)[0]
IMAGE_PATH = join(WORK_DIR, CLASS_NAME)
image_paths = get_dir_files(IMAGE_PATH)

In [45]:
%cd "$IMAGE_PATH"

/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/melanoma_cancer/train/malignant


In [46]:
# Extract features for all images
all_features = []
for image_path in image_paths:
    features = extract_features(image_path)
    all_features.append(features)

# Convert list of features to numpy array
all_features = np.vstack(all_features)

# Dimension of our vector space
d = all_features.shape[1]

# Initialize a FAISS index
index = faiss.IndexFlatL2(d)

# Add vectors to the index
index.add(all_features)

In [47]:
# Number of clusters
k = 10

# Faiss k-means clustering
niter = 30
verbose = True
kmeans = faiss.Clustering(d, k)
kmeans.niter = niter
kmeans.verbose = verbose
kmeans.train(all_features, index)

# The centroids are stored in kmeans.centroids
centroids = faiss.vector_to_array(kmeans.centroids).reshape(k, d)

# To assign each vector to a cluster, you can use the index.search function
D, I = index.search(all_features, 1)

# I contains the cluster assignments for each vector
cluster_assignments = I.reshape(-1)

In [48]:
# Create a dictionary of clusters
clusters = {i: [] for i in range(k)}

# Assign each image to a cluster
for image_path, cluster_id in zip(image_paths, cluster_assignments):
    clusters[cluster_id].append(image_path)

# Create a directory for the clusters if it doesn't exist
if not os.path.exists(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient')):
    os.makedirs(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient'))

# Now, clusters[i] is a list of images that belong to cluster i
for i in range(k):
    print(f"Cluster {i}:")

    # Create a directory for this cluster if it doesn't exist
    cluster_dir = join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_efficient', f'cluster_{i}')
    if not os.path.exists(cluster_dir):
        os.makedirs(cluster_dir)

    for image_path in clusters[i]:
        print(image_path)
        # Copy the image into the cluster directory
        shutil.copy(image_path, cluster_dir)

Cluster 0:
melanoma_8736.jpg
melanoma_8567.jpg
melanoma_8999.jpg
melanoma_6099.jpg
melanoma_8474.jpg
melanoma_7963.jpg
melanoma_5648.jpg
melanoma_5692.jpg
melanoma_5219.jpg
melanoma_8797.jpg
melanoma_6201.jpg
melanoma_6514.jpg
melanoma_7213.jpg
melanoma_8682.jpg
melanoma_7310.jpg
melanoma_6035.jpg
melanoma_8377.jpg
melanoma_7626.jpg
melanoma_6456.jpg
melanoma_7082.jpg
melanoma_7165.jpg
melanoma_9510.jpg
melanoma_8078.jpg
melanoma_6645.jpg
melanoma_5787.jpg
melanoma_8867.jpg
melanoma_6324.jpg
melanoma_5356.jpg
melanoma_8587.jpg
melanoma_6467.jpg
melanoma_7256.jpg
melanoma_8454.jpg
melanoma_7118.jpg
melanoma_5023.jpg
melanoma_6147.jpg
melanoma_6091.jpg
melanoma_8642.jpg
melanoma_6575.jpg
melanoma_8715.jpg
melanoma_7518.jpg
melanoma_8496.jpg
melanoma_6196.jpg
melanoma_6319.jpg
melanoma_6675.jpg
melanoma_6777.jpg
melanoma_8009.jpg
melanoma_7872.jpg
melanoma_9461.jpg
melanoma_5758.jpg
melanoma_7326.jpg
melanoma_8930.jpg
melanoma_8535.jpg
melanoma_8366.jpg
melanoma_6030.jpg
melanoma_6414.jpg

In [49]:
def compute_distances(feature_vectors, centroid):
    # Convert list to numpy array
    feature_vectors = np.vstack(feature_vectors)

    # Compute distances
    distances = np.linalg.norm(feature_vectors - centroid, axis=1)

    return distances

In [50]:
N = 25  # number of images you want to keep per cluster

# Initialize dictionaries for average and standard deviation of similarities
average_similarities = {}
std_dev_similarities = {}
centroid_distances = {}
top_average_similarities = {}
top_std_dev_similarities = {}
best_clusters = {}

for cluster_id, image_paths in clusters.items():
    # Extract the feature vectors for images in this cluster
    feature_vectors = np.array([extract_features(image_path) for image_path in image_paths])

    # Calculate distances from centroid
    distances = compute_distances(feature_vectors, centroids[cluster_id])

    # Save distances to centroid_distances dictionary
    centroid_distances[cluster_id] = distances

    # Pair images with their distances and sort by distance
    image_distances = sorted(zip(image_paths, distances), key=lambda x: x[1])

    # Calculate pairwise cosine similarities
    similarity_matrix = 1 - squareform(pdist(feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    similarities = similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities
    average_similarities[cluster_id] = np.mean(similarities)
    std_dev_similarities[cluster_id] = np.std(similarities)

    # Select the top N images
    top_images = image_distances[:N]

    # Extract the feature vectors for the top N images in this cluster
    top_feature_vectors = feature_vectors[:N]

    # Calculate pairwise cosine similarities for top images
    top_similarity_matrix = 1 - squareform(pdist(top_feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    top_similarities = top_similarity_matrix[np.triu_indices(top_similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities for top images
    top_average_similarities[cluster_id] = np.mean(top_similarities)
    top_std_dev_similarities[cluster_id] = np.std(top_similarities)

    # Check the condition for similarities and standard deviations
    if top_average_similarities[cluster_id] - top_std_dev_similarities[cluster_id] >= 0.65:
        # Only proceed with this cluster if the condition is met
        best_clusters[cluster_id] = True

        # Now, copy these images to the new directory
        cluster_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_efficient', 'best_results', f'cluster_{cluster_id}')
        if not os.path.exists(cluster_dir):
            os.makedirs(cluster_dir)
        for image_path, _ in top_images:
            print(image_path)
            # Copy the image into the new cluster directory
            shutil.copy(image_path, cluster_dir)


melanoma_6206.jpg
melanoma_8090.jpg
melanoma_7424.jpg
melanoma_5816.jpg
melanoma_7051.jpg
melanoma_6220.jpg
melanoma_7698.jpg
melanoma_7757.jpg
melanoma_7440.jpg
melanoma_8695.jpg
melanoma_7353.jpg
melanoma_8237.jpg
melanoma_7449.jpg
melanoma_8046.jpg
melanoma_9039.jpg
melanoma_7433.jpg
melanoma_6808.jpg
melanoma_7555.jpg
melanoma_6695.jpg
melanoma_7669.jpg
melanoma_5791.jpg
melanoma_6274.jpg
melanoma_7163.jpg
melanoma_5346.jpg
melanoma_5719.jpg


In [51]:
for cluster_id in clusters.keys():
    print(f"Cluster {cluster_id}:")
    print(f"Average similarity: {average_similarities[cluster_id]}")
    print(f"Standard deviation of similarity: {std_dev_similarities[cluster_id]}\n")

Cluster 0:
Average similarity: 0.6362655674993154
Standard deviation of similarity: 0.10479549695151821

Cluster 1:
Average similarity: 0.6361987087955908
Standard deviation of similarity: 0.12364023570996477

Cluster 2:
Average similarity: 0.6014868129514259
Standard deviation of similarity: 0.11797012478798477

Cluster 3:
Average similarity: 0.6399419901600085
Standard deviation of similarity: 0.13239883063809077

Cluster 4:
Average similarity: 0.7116967091172455
Standard deviation of similarity: 0.0906802130453223

Cluster 5:
Average similarity: 0.6235105721403528
Standard deviation of similarity: 0.12956084502583118

Cluster 6:
Average similarity: 0.6973301218437553
Standard deviation of similarity: 0.10458012158694198

Cluster 7:
Average similarity: 0.5839995180493185
Standard deviation of similarity: 0.11210373395026604

Cluster 8:
Average similarity: 0.6073414343697879
Standard deviation of similarity: 0.10257370216767843

Cluster 9:
Average similarity: 0.7360979717074395
Standa

In [52]:

# Now print the average similarity and standard deviation of similarity for each cluster's top N images
for cluster_id in centroid_distances.keys():
    print(f"Cluster {cluster_id}:")
    print(f"Average similarity: {top_average_similarities[cluster_id]}")
    print(f"Standard deviation of similarity: {top_std_dev_similarities[cluster_id]}\n")


Cluster 0:
Average similarity: 0.6424883593153904
Standard deviation of similarity: 0.09943590639537514

Cluster 1:
Average similarity: 0.6422539893089669
Standard deviation of similarity: 0.10790988918267408

Cluster 2:
Average similarity: 0.6229996958574843
Standard deviation of similarity: 0.1008404868320282

Cluster 3:
Average similarity: 0.6398379109487685
Standard deviation of similarity: 0.12522433361016952

Cluster 4:
Average similarity: 0.7341939393299292
Standard deviation of similarity: 0.08680602159728332

Cluster 5:
Average similarity: 0.5949122468871345
Standard deviation of similarity: 0.1476978604283407

Cluster 6:
Average similarity: 0.6738822720887744
Standard deviation of similarity: 0.12735403693693642

Cluster 7:
Average similarity: 0.5677972065617398
Standard deviation of similarity: 0.11564637216894039

Cluster 8:
Average similarity: 0.6112169686607937
Standard deviation of similarity: 0.10588475014397765

Cluster 9:
Average similarity: 0.7517226358748829
Standar