In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!cd /d drive

/bin/bash: line 1: cd: too many arguments


In [3]:
!dir

drive  sample_data


In [4]:
%cd "/content/drive/MyDrive/ORT/Tesis/Codes/datasets/original_datasets/"

/content/drive/MyDrive/ORT/Tesis/Codes/datasets/original_datasets


In [5]:
# !unzip imagewang-320.zip
!pip install transformers
!pip install faiss-cpu



In [6]:
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from transformers import CLIPImageProcessor, CLIPModel, CLIPTokenizer

from PIL import Image
import faiss
import numpy as np
import os
from os.path import exists, join, isfile, realpath, isdir
from os import listdir, makedirs, walk

import numpy as np
from scipy.spatial.distance import pdist, squareform

import shutil

from sklearn.cluster import KMeans
import numpy as np


In [7]:
if exists('/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/pneumoconiosis/pneumoconiosis_resized/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Master/Codes/datasets/transformed_datasets/pneumoconiosis/pneumoconiosis_resized/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/'
elif exists('/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/pneumoconiosis/pneumoconiosis_resized/'):
  WORK_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/pneumoconiosis/pneumoconiosis_resized/train/'
  CLUSTER_DIR = '/content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/'

In [8]:
def get_dir_files(dir_path: str):
    return [f for f in listdir(dir_path) if isfile(join(dir_path, f))]


def get_dirs(dir_path: str):
    return [d for d in listdir(dir_path) if isdir(join(dir_path, d))]

def count_files_in_dirs(base_path):
    for root, dirs, files in walk(base_path):
        print(f"cantidad de imagenes en {root}: {len(files)}")


In [9]:
# Load the CLIP model
model_ID = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(model_ID)

preprocess = CLIPImageProcessor.from_pretrained(model_ID)

# Define a function to load an image and preprocess it for CLIP
def load_and_preprocess_image(image_path):
    # Load the image from the specified path
    image = Image.open(image_path)

    # Apply the CLIP preprocessing to the image
    image = preprocess(image, return_tensors="pt")

    # Return the preprocessed image
    return image

# transformacion imagen
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])


In [10]:
def extract_features(img_path):
    # Load the image
    img = Image.open(img_path).convert('RGB')

    # Apply the transformations and add an extra dimension (for the batch)
    img_t = transform(img)
    img_t = img_t.unsqueeze(0)

    # Don't calculate gradients
    with torch.no_grad():
        # Get the features from this image
        features = model.get_image_features(img_t)

    # The output will be in the form of a tensor, so we convert it to an array
    # Flatten the tensor to a 1D array
    features = features.numpy().flatten()

    return features


In [11]:
CLASS_NAME = get_dirs(WORK_DIR)[1]
IMAGE_PATH = join(WORK_DIR, CLASS_NAME)
image_paths = get_dir_files(IMAGE_PATH)

In [12]:
%cd "$IMAGE_PATH"

/content/drive/MyDrive/ORT/Tesis/Codes/datasets/transformed_datasets/pneumoconiosis/pneumoconiosis_resized/train/1


In [13]:
# Extract features for all images
all_features = []
image_features = {}
for image_path in image_paths:
    features = extract_features(image_path)
    all_features.append(features)
    image_features[image_path] = features

# Convert list of features to numpy array
all_features = np.vstack(all_features)

# Dimension of our vector space
d = all_features.shape[1]

# Initialize a FAISS index
index = faiss.IndexFlatL2(d)

# Add vectors to the index
index.add(all_features)

In [14]:
# # Define a range of clusters you want to try (1 to 20, for example)
# range_n_clusters = range(1, 21)

# # Create an empty list to hold the WCSS for each cluster size
# wcss = []

# for num_clusters in range_n_clusters:
#     # Train KMeans for each cluster size
#     kmeans = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=0)
#     kmeans.fit(all_features)

#     # Append WCSS for this cluster size
#     wcss.append(kmeans.inertia_)

# # Calculate the difference in WCSS between each number of clusters
# diff = np.diff(wcss)

# # Calculate the difference in differences
# diff_r = diff[1:] / diff[:-1]

# # Find the index where the rate of change starts to increase rapidly (the elbow point)
# k_opt = np.where(diff_r == np.min(diff_r))[0][0] + 2

# print("Optimal number of clusters:", k_opt)

In [15]:
# Number of clusters
k = 5

# Faiss k-means clustering
niter = 20
verbose = True
kmeans = faiss.Clustering(d, k)
kmeans.niter = niter
kmeans.verbose = verbose
kmeans.train(all_features, index)

# The centroids are stored in kmeans.centroids
centroids = faiss.vector_to_array(kmeans.centroids).reshape(k, d)

# To assign each vector to a cluster, you can use the index.search function
D, I = index.search(all_features, 1)

# I contains the cluster assignments for each vector
cluster_assignments = I.reshape(-1)

In [16]:
# Create a dictionary of clusters
clusters = {i: [] for i in range(k)}

# Assign each image to a cluster
for image_path, cluster_id in zip(image_paths, cluster_assignments):
    clusters[cluster_id].append(image_path)

# Create a directory for the clusters if it doesn't exist
if not os.path.exists(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_clip')):
    os.makedirs(join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_clip'))

# Now, clusters[i] is a list of images that belong to cluster i
for i in range(k):
    print(f"Cluster {i}:")

    # Create a directory for this cluster if it doesn't exist
    cluster_dir = join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_clip', f'cluster_{i}')
    if exists(cluster_dir):
      shutil.rmtree(cluster_dir)
    makedirs(cluster_dir)

    for image_path in clusters[i]:
        # Copy the image into the cluster directory
        shutil.copy(image_path, cluster_dir)

Cluster 0:
Cluster 1:
Cluster 2:
Cluster 3:
Cluster 4:


In [17]:
def compute_distances(feature_vectors, centroid):
    # Convert list to numpy array
    feature_vectors = np.vstack(feature_vectors)

    # Compute distances
    distances = np.linalg.norm(feature_vectors - centroid, axis=1)

    return distances

In [18]:
N = 25  # number of images you want to keep per cluster

# Initialize dictionaries for average and standard deviation of similarities
average_similarities = {}
std_dev_similarities = {}
centroid_distances = {}
top_average_similarities = {}
top_std_dev_similarities = {}
best_clusters = {}

for cluster_id, image_paths in clusters.items():
    # Extract the feature vectors for images in this cluster
    feature_vectors = np.array([extract_features(image_path) for image_path in image_paths])

    # Calculate distances from centroid
    distances = compute_distances(feature_vectors, centroids[cluster_id])

    # Save distances to centroid_distances dictionary
    centroid_distances[cluster_id] = distances

    # Pair images with their distances and sort by distance
    image_distances = sorted(zip(image_paths, distances), key=lambda x: x[1])

    # Calculate pairwise cosine similarities
    similarity_matrix = 1 - squareform(pdist(feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    similarities = similarity_matrix[np.triu_indices(similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities
    average_similarities[cluster_id] = np.mean(similarities)
    std_dev_similarities[cluster_id] = np.std(similarities)

    # Select the top N images
    top_images = image_distances[:N]

    # Extract the feature vectors for the top N images in this cluster
    top_feature_vectors = feature_vectors[:N]

    # Calculate pairwise cosine similarities for top images
    top_similarity_matrix = 1 - squareform(pdist(top_feature_vectors, metric='cosine'))

    # Flatten the matrix into a list and remove self-similarities (diagonal of the matrix)
    top_similarities = top_similarity_matrix[np.triu_indices(top_similarity_matrix.shape[0], k=1)]

    # Calculate the average and standard deviation of similarities for top images
    top_average_similarities[cluster_id] = np.mean(top_similarities)
    top_std_dev_similarities[cluster_id] = np.std(top_similarities)

    # Check the condition for similarities and standard deviations
    if top_average_similarities[cluster_id] - top_std_dev_similarities[cluster_id] >= 0.65:
        # Only proceed with this cluster if the condition is met
        best_clusters[cluster_id] = True

        # Now, copy these images to the new directory
        cluster_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_clip', 'best_results', f'cluster_{cluster_id}')
        if exists(cluster_dir):
          shutil.rmtree(cluster_dir)
        makedirs(cluster_dir)
        for image_path, _ in top_images:
            # Copy the image into the new cluster directory
            shutil.copy(image_path, cluster_dir)


In [19]:
for cluster_id in centroid_distances.keys():
    # Define the path to the cluster directory
    cluster_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_clip', 'best_results', f'cluster_{cluster_id}')

    # Get a count of all files in the directory (assumes all files are images)
    image_count = len(os.listdir(cluster_dir)) if os.path.exists(cluster_dir) else 0

    print(f"Cluster {cluster_id}:")
    print(f"Image count: {image_count}")
    print(f"Average similarity: {top_average_similarities[cluster_id]}")
    print(f"Standard deviation of similarity: {top_std_dev_similarities[cluster_id]}\n")

Cluster 0:
Image count: 17
Average similarity: 0.9806486840295047
Standard deviation of similarity: 0.0062844574782212614

Cluster 1:
Image count: 25
Average similarity: 0.9859809393061323
Standard deviation of similarity: 0.004431116246495783

Cluster 2:
Image count: 25
Average similarity: 0.9827191757573807
Standard deviation of similarity: 0.005742660162247082

Cluster 3:
Image count: 13
Average similarity: 0.9853410116389091
Standard deviation of similarity: 0.004141346983842292

Cluster 4:
Image count: 11
Average similarity: 0.965779163492399
Standard deviation of similarity: 0.011640635242215298



In [20]:
clusters_dir = join(CLUSTER_DIR, f'clusters_{CLASS_NAME}_clip')
count_files_in_dirs(clusters_dir)

cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip: 0
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/cluster_0: 17
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/cluster_1: 36
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/cluster_2: 37
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/cluster_3: 13
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/cluster_4: 11
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/best_results: 0
cantidad de imagenes en /content/drive/MyDrive/ORT/Tesis/Codes/clustering/pneumoconiosis/clusters_1_clip/best_results/cluster_0: 17
cantidad de imagenes en /content/drive/MyDri

In [21]:
dirclus = join(CLUSTER_DIR,f'clusters_{CLASS_NAME}_clip')
# Now, change the working directory
os.chdir(dirclus)
folder_to_compress='best_results'
zip_file=f'{CLASS_NAME}_clusters.zip'
!zip -r -q "$zip_file" "$folder_to_compress"

In [22]:
# for cluster_id in clusters.keys():
#     print(f"Cluster {cluster_id}:")
#     print(f"Average similarity: {average_similarities[cluster_id]}")
#     print(f"Standard deviation of similarity: {std_dev_similarities[cluster_id]}\n")

In [23]:
# best_clusters = sorted(clusters.keys(), key=lambda x: (average_similarities[x], -std_dev_similarities[x]), reverse=True)
# best_clusters

In [24]:
# %cd "$CLUSTER_DIR""clusters_glioma_efficient"
# if not os.path.exists('best_results'):
#     os.makedirs('best_results')

# numero = 0
# for cluster_id in best_clusters[:2]:
#     numero += 1
#     src_dir = f"cluster_{cluster_id}"
#     dest_dir = os.path.join('best_results', f"cluster_{numero}")
#     if not os.path.exists(dest_dir):
#         os.makedirs(dest_dir)
#     for filename in os.listdir(src_dir):
#         src_file = os.path.join(src_dir, filename)
#         dest_file = os.path.join(dest_dir, filename)
#         shutil.copy(src_file, dest_file)


# !zip -r -q "/content/drive/MyDrive/ORT/Tesis/Codes/clustering/brain_tumor/clusters_glioma_efficient/best_results/glioma_clusters.zip" "/content/drive/MyDrive/ORT/Tesis/Codes/clustering/brain_tumor/clusters_glioma_efficient/best_results"


In [25]:
# from google.colab import files
# files.download('glioma_1.zip')