In [114]:
import re
from glob import glob
from zennitcrp.crp.attribution import CondAttribution
import os
import json
from zennit.composites import EpsilonPlusFlat
from zennit.canonizers import SequentialMergeBatchNorm
from crp.attribution import CondAttribution
from sklearn.cluster import KMeans
import numpy as np
import hdbscan
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from sklearn.metrics import silhouette_score
import torch
from torchvision.models.vgg import vgg16_bn
import torchvision.transforms as T
from PIL import Image

In [22]:
def analysis_clusters(methods_cluster,
                      heatmaps_scaled,
                      picture_name,
                      cluster_folder_save_path,
                      min_cluster : int = 2,
                      max_cluster : int = 16):
    """
    Run every clustering method for a range of cluster counts and persist the results.

    For each cluster count in ``range(min_cluster, max_cluster)`` (``max_cluster``
    is therefore exclusive) and each entry of ``methods_cluster``, a model is
    built, fitted with ``fit_predict`` and scored with the silhouette score.

    Two files are written into ``cluster_folder_save_path``:
    - ``clusters_{picture_name}.npz``: one label array per key
      ``"{n_clusters}_{method_name}"``;
    - ``scores_{picture_name}.npz``: one score array per method name, plus the
      ``MIN_CLUSTER_CONST`` / ``MAX_CLUSTER_CONST`` entries holding the bounds.

    Arguments:
    - methods_cluster: dict {method_name: factory}; ``factory(n_clusters)`` must
      return an object exposing ``fit_predict``. Every method in this dict is
      saved, not only a fixed set of known names.
    - heatmaps_scaled: data passed as-is to ``fit_predict`` and ``silhouette_score``
    - picture_name: suffix used in the two output file names
    - cluster_folder_save_path: folder receiving the two .npz files
    - min_cluster: first cluster count tried (inclusive)
    - max_cluster: upper bound of the cluster counts tried (exclusive)

    Returns:
    - (silhouette_scores, cluster_range) where silhouette_scores maps each method
      name to the list of scores, aligned with cluster_range.
    """
    cluster_range = range(min_cluster, max_cluster)
    n_samples = len(heatmaps_scaled)
    silhouette_scores = {method: [] for method in methods_cluster}
    # One {n_clusters: labels} mapping per method, whatever the method is called.
    labels_by_method = {method: {} for method in methods_cluster}

    for n_clusters in cluster_range:
        for method_name, clustering_function in methods_cluster.items():
            model = clustering_function(n_clusters)
            labels = model.fit_predict(heatmaps_scaled)

            # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
            # outside of that range the sentinel -1 is recorded instead of raising.
            n_labels = len(set(labels))
            if 1 < n_labels < n_samples:
                score = silhouette_score(heatmaps_scaled, labels)
            else:
                score = -1
            silhouette_scores[method_name].append(score)
            labels_by_method[method_name][n_clusters] = labels

    general_dict = {}
    general_score_dict = {}

    # Flatten to the on-disk layout: methods in dict order, then cluster counts.
    for method_name in methods_cluster:
        for n_clusters, labels in labels_by_method[method_name].items():
            general_dict[f"{n_clusters}_{method_name}"] = np.array(labels)
        general_score_dict[f"{method_name}"] = np.array(silhouette_scores[method_name])
    general_score_dict[f"{MIN_CLUSTER_CONST}"] = min_cluster
    general_score_dict[f"{MAX_CLUSTER_CONST}"] = max_cluster

    np.savez(f"{cluster_folder_save_path}/clusters_{picture_name}.npz", **general_dict)
    np.savez(f"{cluster_folder_save_path}/scores_{picture_name}.npz", **general_score_dict)
    return silhouette_scores, cluster_range

In [12]:
def plot_silhouette_scores(silhouette_scores, cluster_range, file_name):
    """
    Draw one silhouette-score curve per clustering method on a single figure.

    Arguments:
    - silhouette_scores: dict {method_name: scores}, each sequence aligned with cluster_range
    - cluster_range: cluster counts used as x values
    - file_name: text appended on the second line of the figure title
    """
    figure, axes = plt.subplots(figsize=(10, 6))

    for method_label in silhouette_scores:
        axes.plot(
            cluster_range,
            silhouette_scores[method_label],
            marker='o',
            linestyle='-',
            label=method_label,
        )

    axes.set(
        xlabel="Nombre de Clusters",
        ylabel="Silhouette Score",
        title=f"Comparaison des Algorithmes de Clustering\n{file_name}",
    )
    axes.legend()
    axes.grid(True)
    plt.show()

In [None]:
def load_and_process_normalized(file_name : str,
                                base_path,
                                layer_name : str = "layer_40"):
    """
    Load the heatmaps of one layer from an .npz file and min-max normalise them.

    The file is read from ``{base_path}/heatmap/{file_name}``. The absolute value
    of the stored array is taken, then each entry along the first axis is
    rescaled independently to [0, 1]. An entry whose values are all equal is
    left at zero (avoids a division by zero).

    Arguments:
    - file_name: name of the .npz file (with extension) inside the "heatmap" folder
    - base_path: folder containing the "heatmap" sub-folder
    - layer_name: key of the array to read inside the .npz file

    Returns:
    - (heatmaps, heatmaps_flat): the absolute-valued heatmaps with their original
      shape, and the normalised heatmaps reshaped to (shape[0], -1).
    """
    # The context manager closes the underlying file once the array is read.
    with np.load(f"{base_path}/heatmap/{file_name}") as data:
        heatmaps = np.abs(data[layer_name])

    # Per-entry min / max over every axis except the first one.
    reduce_axes = tuple(range(1, heatmaps.ndim))
    min_vals = heatmaps.min(axis=reduce_axes, keepdims=True)
    max_vals = heatmaps.max(axis=reduce_axes, keepdims=True)
    spans = max_vals - min_vals

    # Keep floating dtypes as they are; integer inputs need a float buffer,
    # otherwise the scaled values would be truncated.
    if np.issubdtype(heatmaps.dtype, np.floating):
        scaled_dtype = heatmaps.dtype
    else:
        scaled_dtype = np.float64
    heatmaps_scaled = np.zeros(heatmaps.shape, dtype=scaled_dtype)
    # Entries with a zero span are skipped and therefore stay at zero.
    np.divide(heatmaps - min_vals, spans, out=heatmaps_scaled, where=spans > 0)

    heatmaps_flat = heatmaps_scaled.reshape(heatmaps_scaled.shape[0], -1)
    return heatmaps, heatmaps_flat

In [None]:
def compute_feature_importance(model, input_tensor, layer_idx, num_features, pred_class):
    """
    Measure how much each feature (channel) of one layer contributes to a class score.

    The softmax probability of ``pred_class`` is computed once on the unmodified
    model. Each feature of ``model.features[layer_idx]`` is then zeroed in turn
    through a temporary forward hook, and the drop in probability is recorded as
    the importance of that feature.

    The input tensor is not modified: every forward pass runs under
    ``torch.no_grad()``, so no gradient flag is needed on it.

    Arguments:
    - model: model exposing an indexable ``features`` container
    - input_tensor: input batch; the score is read from its first sample
    - layer_idx: index of the layer inside ``model.features`` (e.g. 40)
    - num_features: number of features to ablate in that layer (e.g. 512)
    - pred_class: index of the class whose probability is tracked

    Returns:
    - A dict {feature_idx: importance} sorted by decreasing importance, where
      importance = original probability - probability with the feature zeroed.
    """
    target_layer = model.features[layer_idx]

    def _class_probability():
        # Probability of pred_class for the first sample, without autograd tracking.
        with torch.no_grad():
            logits = model(input_tensor)
            probabilities = torch.nn.functional.softmax(logits, dim=1)
            return probabilities[0, pred_class].item()

    original_score = _class_probability()

    feature_importance = {}

    # Disable each feature one at a time and measure the impact.
    for feature_idx in range(num_features):
        # feature_idx is bound as a default argument so the hook keeps its own index.
        def zero_out_feature(module, input, output, feature_idx=feature_idx):
            output[:, feature_idx, :, :] = 0
            return output

        hook = target_layer.register_forward_hook(zero_out_feature)
        try:
            new_score = _class_probability()
        finally:
            # Always unregister the hook, even if the forward pass fails, so the
            # model is never left with a feature permanently zeroed.
            hook.remove()

        feature_importance[feature_idx] = float(original_score - new_score)

    # Sort the features by decreasing importance.
    sorted_importance = dict(sorted(feature_importance.items(), key=lambda item: item[1], reverse=True))

    return sorted_importance


In [None]:
def processPicture(model,
                   transform,
                   global_dictionary : dict,
                   picture_path : str,
                   heatmap_folder_save_path : str,
                   device : str = "cpu"):
    """
    Compute and store the conditional heatmaps and feature importances of one picture.

    Steps:
    1. load the picture, apply ``transform`` and predict its class;
    2. for every layer listed in the module-level ``features_per_layer`` dict
       ({layer index: number of features}), compute one conditional heatmap per
       feature with ``CondAttribution`` (in batches of 8 conditions);
    3. compute the ablation importance of every feature with
       ``compute_feature_importance``;
    4. save the heatmaps to ``{heatmap_folder_save_path}/{image_name}.npz``
       (one ``layer_{idx}`` array per layer).

    Arguments:
    - model: network to explain
    - transform: callable turning a PIL image into a tensor
    - global_dictionary: dict updated in place with an entry keyed by the image name
    - picture_path: path of the picture to process
    - heatmap_folder_save_path: folder receiving the .npz file
    - device: device on which the input tensor is placed

    Returns:
    - global_dictionary, with ``global_dictionary[image_name]`` holding the
      predicted class, its probability (Python float) and the per-layer
      feature importances.
    """
    image_name = os.path.splitext(os.path.basename(picture_path))[0]
    # The context manager closes the file; convert() loads the pixel data first.
    with Image.open(picture_path) as raw_image:
        image = raw_image.convert("RGB")
    input_tensor = transform(image).unsqueeze(0).to(device)
    input_tensor.requires_grad = True

    # The prediction itself needs no autograd graph.
    with torch.no_grad():
        output = model(input_tensor)
        pred_class = torch.argmax(output, dim=1).item()
        probs = torch.nn.functional.softmax(output, dim=1)
        probability = probs[0, pred_class].item()

    local_dictionary = {
        CLASSE_PREDICTED: pred_class,
        # Stored as a plain float so the entry does not keep a tensor alive.
        PROBABILITY: probability,
    }

    composite = EpsilonPlusFlat([SequentialMergeBatchNorm()])
    attribution = CondAttribution(model, no_param_grad=True)

    # NOTE(review): the attribution target class is the literal 40, not
    # pred_class. Kept as-is to preserve existing results; confirm it is intended.
    target_class = 40
    num_feature_per_batch = 8

    features_dict = {}
    layers_heatmaps = {}
    for layer_idx, num_features in features_per_layer.items():
        # The condition key follows the layer being processed.
        layer_name = f"features.{layer_idx}"
        all_heatmaps = []
        for start in range(0, num_features, num_feature_per_batch):
            stop = min(start + num_feature_per_batch, num_features)
            conditions = [{"y": [target_class], layer_name: [j]} for j in range(start, stop)]
            heatmaps, _, _, _ = attribution(input_tensor, conditions, composite)
            # Move each batch to host memory right away.
            all_heatmaps.append(heatmaps.detach().cpu().numpy())
        layers_heatmaps[layer_idx] = np.concatenate(all_heatmaps, axis=0)

        features_dict[layer_idx] = compute_feature_importance(model,
                                                              input_tensor,
                                                              layer_idx=layer_idx,
                                                              num_features=num_features,
                                                              pred_class=pred_class)

    save_path = os.path.join(heatmap_folder_save_path, f"{image_name}.npz")
    save_dict = {f"layer_{idx_layers}": layer_maps for idx_layers, layer_maps in layers_heatmaps.items()}
    np.savez(save_path, **save_dict)

    local_dictionary[FEATURES] = features_dict
    global_dictionary[image_name] = local_dictionary
    return global_dictionary

In [None]:
def tensor_to_list(obj):
    """
    JSON ``default`` hook: convert a torch tensor through ``Tensor.tolist()``.

    Raises:
    - TypeError for any object that is not a torch.Tensor.
    """
    if not isinstance(obj, torch.Tensor):
        raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")
    return obj.tolist()

In [None]:
def get_filename_without_extension(path: str) -> str:
    """Return the last component of ``path`` with its final extension removed."""
    base_name = os.path.basename(path)
    stem, _extension = os.path.splitext(base_name)
    return stem

In [None]:
def create_json_from_data(data, output_filename):
    """
    Write the per-image results to a structured JSON file.

    For every image only the predicted class, the probability and the features
    dictionary are kept. Objects json cannot encode natively are passed to
    ``tensor_to_list``.

    :param data: dict {image_name: info} where info holds the three entries above.
    :param output_filename: path of the JSON file to create.
    """
    kept_keys = (CLASSE_PREDICTED, PROBABILITY, FEATURES)

    # Keep exactly the three expected entries, in a fixed order, for each image.
    image_data = {
        image_name: {key: info[key] for key in kept_keys}
        for image_name, info in data.items()
    }

    with open(output_filename, 'w') as json_file:
        json.dump(image_data, json_file, indent=4, default=tensor_to_list)
    print(f"Le fichier JSON '{output_filename}' a été créé avec succès.")

Code nécessaire standard

In [None]:
# Use the first CUDA device when one is available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# NOTE(review): the positional True is presumably the legacy `pretrained` flag
# (load pretrained weights) — confirm against the installed torchvision version.
model_vgg16 = vgg16_bn(True).to(device)
# Evaluation mode: the model is only used for inference and attribution here.
model_vgg16.eval()

# Preprocessing applied to every picture: resize to 224x224, convert to a
# tensor, then normalise each channel with the mean / std values below.
transform_vgg16 = T.Compose([
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

Constantes

In [None]:
# Keys of the per-image result dictionary built by processPicture and written
# to JSON by create_json_from_data.
CLASSE_PREDICTED = "classe_predicted"  # index of the predicted class
PROBABILITY = "probability"  # softmax probability of the predicted class
FEATURES = "features"  # per-layer feature-importance dictionaries

In [None]:
# Names of the clustering methods; used as keys of `methods_cluster` and inside
# the keys of the .npz files written by analysis_clusters.
GMM_CONST = "GMM"
KMEANS_CONST = "KMeans"
SPECTRAL_CONST = "SpectralClustering"
AGGLOMERATIVE_CONST = "AgglomerativeClustering"
HDBSCAN_CONST = "HDBSCAN"  # declared but not referenced elsewhere in the visible cells
# Keys under which the cluster-count bounds are stored in the scores .npz file.
MIN_CLUSTER_CONST = "min_cluster"
MAX_CLUSTER_CONST = "max_cluster"

TODO : modifier pour mettre le bon chemin
picture_folder_path : chemin vers le dossier contenant les photos
cluster_folder_save_path : chemin où sont sauvegardés les .npz des clusters et les scores de chaque méthode de clustering
heatmap_folder_save_path : chemin où sont sauvegardés les .npz des heatmaps et l'importance de chaque feature dans un .json

In [None]:
# Root folder of the experiment; the picture, heatmap and cluster folders are
# all derived from it.
base_path = "./data/v6/clusters"

In [None]:
# Input folder holding the pictures to analyse; it must already exist.
picture_folder_path = f"{base_path}/pictures"
assert os.path.exists(picture_folder_path), f"Le dossier '{picture_folder_path}' n'existe pas."

In [None]:
# Output folder for the cluster labels and silhouette scores (.npz); created if missing.
cluster_folder_save_path = f"{base_path}/clusters"
os.makedirs(cluster_folder_save_path, exist_ok=True)
# Output folder for the heatmaps (.npz) and the feature-importance JSON files;
# created if missing.
heatmap_folder_save_path = f"{base_path}/heatmap"
os.makedirs(heatmap_folder_save_path, exist_ok=True)

Modifier pour rajouter d'autres couches si voulu

In [None]:
# Layers to analyse: {index into model.features: number of features ablated and
# explained in that layer}. Read as a global by processPicture.
features_per_layer = {
    40: 512
}

In [None]:
# Start from an empty result dictionary and list the pictures to process.
global_dictionary = {}
# Only files with the ".jpeg" extension are picked up.
image_paths = glob(os.path.join(picture_folder_path, "*.jpeg"))
print(f"Nombre d'images trouvées : {len(image_paths)}")

Modifier pour ne pas traiter certaines images

In [None]:
# Paths of pictures to skip. The extraction loop also appends every processed
# picture to this list, so re-running that loop skips pictures already done.
avoid_images = []

Extraction des features pour chaque image

In [None]:
# Process every picture: heatmaps are saved by processPicture, then the
# feature importances of that single picture are written to their own JSON file.
for image_path in image_paths:
    # Reset for each picture so that every JSON file describes one picture only.
    global_dictionary = {}
    if image_path in avoid_images:
        print(f"Image {image_path} avoid")
        continue
    image_name = get_filename_without_extension(image_path)
    global_dictionary = processPicture(model=model_vgg16,
                                       transform=transform_vgg16,
                                       global_dictionary = global_dictionary,
                                       picture_path = image_path,
                                       heatmap_folder_save_path=heatmap_folder_save_path,
                                       device=device)
    # Mark the picture as done so a re-run of this cell does not process it again.
    avoid_images.append(image_path)
    create_json_from_data(global_dictionary, f"{heatmap_folder_save_path}/{image_name}_importance.json")  # per-picture feature-importance file

Modifier pour supprimer ou rajouter d'autres méthodes de clustering

In [None]:
# Clustering methods to compare: {method name: factory}. Each factory takes the
# number of clusters and returns a fresh, unfitted estimator.
# random_state is fixed to 42 wherever it is passed.
methods_cluster = {
    GMM_CONST: lambda n: GaussianMixture(n_components=n, random_state=42, covariance_type='diag', reg_covar=1e-3),
    KMEANS_CONST: lambda n: KMeans(n_clusters=n, random_state=42),
    SPECTRAL_CONST: lambda n: SpectralClustering(n_clusters=n, affinity='nearest_neighbors', random_state=42),
    AGGLOMERATIVE_CONST: lambda n: AgglomerativeClustering(n_clusters=n),
}

In [None]:
global_dictionary = {}
# Every .npz file of the heatmap folder is a set of heatmaps to cluster.
heatmaps_file_list = glob(os.path.join(heatmap_folder_save_path, "*.npz"))
# Report the number of .npz files actually found (not the number of pictures).
print(f"Nombre de 'npz' trouvées : {len(heatmaps_file_list)}")

Analyse de clustering et affichage de l'évaluation des différentes méthodes et des différents nombres de clusters

In [None]:
# For every saved heatmap file: normalise the heatmaps, run every clustering
# method for each cluster count, save the results and plot the silhouette scores.
for heatmaps_file in heatmaps_file_list:
    picture_name = f"{get_filename_without_extension(heatmaps_file)}"
    file_name_with_extension = os.path.basename(heatmaps_file)
    # The loader rebuilds the path as {base_path}/heatmap/{file name}.
    heatmaps, heatmaps_scaled = load_and_process_normalized(file_name_with_extension, base_path)
    # max_cluster is exclusive: cluster counts 2 to 15 are evaluated.
    silhouette_scores, cluster_range = analysis_clusters(methods_cluster = methods_cluster,
                                                         heatmaps_scaled = heatmaps_scaled,
                                                         picture_name = picture_name,
                                                         cluster_folder_save_path = cluster_folder_save_path,
                                                         min_cluster=2,
                                                         max_cluster=16)
    plot_silhouette_scores(silhouette_scores = silhouette_scores,
                           cluster_range = cluster_range,
                           file_name = picture_name)