# Notebook for clustering analysis

## Loading libraries and KMeans parameters
Modify the KMeans parameters, especially `n_clusters`, in `config.json`.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import json

from sklearn.cluster import KMeans

In [None]:
from ddae1d.utils import match_cluster_labels, sort_cluster_labels_by_size
from ddae1d.paths import PROJECT_ROOT

In [None]:
# Read the clustering configuration. JSON is UTF-8 by specification, so the
# encoding is stated explicitly instead of relying on the platform default
# (which is not UTF-8 on every system).
with open('config.json', encoding='utf-8') as f:
    config = json.load(f)

# Keyword arguments that are later unpacked into KMeans(**kmeans_params).
kmeans_params = config['kmeans_params']

## Loading and flattening data for KMeans clustering

In [None]:
# Load the preprocessed map from disk; it is expected to be a 3D array
# (the unpacking below fails otherwise).
final_map = np.load(
    PROJECT_ROOT / 'data' / 'preprocessed' / 'final-map' / 'preprocessed-final-map.npy'
)
# Keep the two leading dimensions so label arrays can be folded back to 2D.
n_x, n_y, _ = np.shape(final_map)
# Collapse the two leading axes: one row per (x, y) position.
final_map = np.reshape(final_map, (n_x * n_y, -1))

In [None]:
# Load the denoised map and flatten it to n_x * n_y rows, reusing the
# dimensions taken from the preprocessed map.
denoised_final_map = np.load(
    PROJECT_ROOT / 'data' / 'results' / 'denoised' / 'denoised-final-map.npy'
)
denoised_final_map = np.reshape(denoised_final_map, (n_x * n_y, -1))

## L2 normalisation *(TODO: import the L2 norms from the denoising notebook)*

In [None]:
# Row-wise L2 norms of the noisy map, computed once and shared by both maps.
# Rows with a zero norm yield NaN/inf after the division.
# NOTE(review): the denoised map is scaled by the norms of the *noisy* map,
# not by its own norms; this looks intentional -- confirm.
noisy_row_norms = np.linalg.norm(final_map, axis=1, keepdims=True)
final_map_norm = final_map / noisy_row_norms
denoised_final_map_norm = denoised_final_map / noisy_row_norms

## Temporarily removing NaNs

In [None]:
# True for rows of the normalised noisy map that contain no NaN
# (infinite values are not filtered here).
mask_noisy = np.logical_not(np.isnan(final_map_norm).any(axis=1))
# NaN-free subset that is passed to KMeans.
final_map_tmp = final_map_norm[mask_noisy]

In [None]:
# True for rows of the normalised denoised map that contain no NaN
# (infinite values are not filtered here).
mask_denoised = np.logical_not(np.isnan(denoised_final_map_norm).any(axis=1))
# NaN-free subset that is passed to KMeans.
denoised_final_map_tmp = denoised_final_map_norm[mask_denoised]

## Performing clustering

In [None]:
# Fit KMeans (configured from kmeans_params) on the NaN-free noisy rows and
# keep the per-row cluster assignment found during fitting.
kmeans_noisy = KMeans(**kmeans_params).fit(final_map_tmp)
labels_noisy_tmp = kmeans_noisy.labels_

In [None]:
# Fit a separate KMeans model, with the same parameters, on the NaN-free
# denoised rows and keep the per-row cluster assignment found during fitting.
kmeans_denoised = KMeans(**kmeans_params).fit(denoised_final_map_tmp)
labels_denoised_tmp = kmeans_denoised.labels_

## Setting the labels of NaN-containing rows to `-1`

In [None]:
# Start with one label per flattened position, all set to the placeholder -1,
# then write the KMeans labels into the rows that were actually clustered.
labels_noisy = np.full(shape=n_x * n_y, fill_value=-1)
labels_noisy[mask_noisy] = labels_noisy_tmp

In [None]:
# Start with one label per flattened position, all set to the placeholder -1,
# then write the KMeans labels into the rows that were actually clustered.
labels_denoised = np.full(shape=n_x * n_y, fill_value=-1)
labels_denoised[mask_denoised] = labels_denoised_tmp

## Reshaping labels into a 2D array

In [None]:
# Fold both flat label vectors back onto the (n_x, n_y) grid of the map.
labels_noisy, labels_denoised = (
    np.reshape(flat_labels, (n_x, n_y))
    for flat_labels in (labels_noisy, labels_denoised)
)

## Sort noisy cluster labels by cluster size (preserving `-1`)
This uses the dedicated function `sort_cluster_labels_by_size` from `ddae1d.utils`.

In [None]:
# Relabel the noisy clustering with the project helper from ddae1d.utils.
# NOTE(review): presumably labels are reordered by cluster size and the -1
# placeholder is left untouched; the helper is not visible here -- confirm.
labels_noisy = sort_cluster_labels_by_size(labels_noisy)

## Align cluster labels between noisy and denoised results for optimal correspondence

This uses the dedicated function `match_cluster_labels` from `ddae1d.utils`.

In [None]:
# Relabel the denoised clustering against the noisy one with the project
# helper from ddae1d.utils, so the two label maps can be compared directly.
# NOTE(review): presumably the first argument is the reference labelling and
# the second is the one being remapped; the helper is not visible -- confirm.
labels_denoised = match_cluster_labels(labels_noisy, labels_denoised)

## Plotting results

In [None]:
# Show the 2D label map of the noisy clustering as an image.
fig_noisy, ax_noisy = plt.subplots(figsize=(8, 6))
ax_noisy.imshow(labels_noisy, cmap='coolwarm')
ax_noisy.set_title('KMeans Clusters (Noisy Data)')
fig_noisy.tight_layout()
plt.show()

In [None]:
# Show the 2D label map of the denoised clustering as an image.
fig_denoised, ax_denoised = plt.subplots(figsize=(8, 6))
ax_denoised.imshow(labels_denoised, cmap='coolwarm')
ax_denoised.set_title('KMeans Clusters (Denoised Data)')
fig_denoised.tight_layout()
plt.show()