# Unsupervised clustering for HESS telescope events

Trying to cluster HESS telescope events to see if we can automatically identify muon rings. These are important for calibrating the optical efficiency but currently we identify them manually which is slow.

The idea is to use CNN features from the event images and then cluster them to see if muons separate from other cosmic ray events. If it works, this could be useful for automated calibration.

In [1]:
%matplotlib inline

import time
import os, os.path
import random
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

import tables

try:
    import tensorflow as tf
    from tensorflow.keras.applications import VGG16, VGG19, ResNet50
    print(f"TensorFlow {tf.__version__} available")
    cnn_available = True
except ImportError:
    print("TensorFlow not available")
    cnn_available = False

# Configuration
DATA_DIR = "../data"
flashcam_files = sorted(glob.glob(f"{DATA_DIR}/flashcam_run178799_full_*.h5"))
print(f"Found {len(flashcam_files)} files")

# Quick check of first file
if flashcam_files:
    with tables.open_file(flashcam_files[0], 'r') as f:
        print(f"Events per file: {f.root.images.shape[0]}")
        print(f"Image shape: {f.root.images.shape[1:]}")

TensorFlow not available
Found 0 files


In [2]:
# Check TensorFlow installation
import sys
print(f"Python executable: {sys.executable}")
print(f"Python path: {sys.path}")

try:
    import tensorflow
    print(f"TensorFlow found at: {tensorflow.__file__}")
    print(f"TensorFlow version: {tensorflow.__version__}")
except ImportError as e:
    print(f"TensorFlow import failed: {e}")

# Check if tensorflow is in the installed packages
import subprocess
result = subprocess.run([sys.executable, '-m', 'pip', 'list'], 
                       capture_output=True, text=True)
tf_lines = [line for line in result.stdout.split('\n') if 'tensorflow' in line.lower()]
print(f"TensorFlow packages found: {tf_lines}")

Python executable: /home/hpc/b129dc/b129dc28/miniconda3/bin/python
Python path: ['/home/hpc/b129dc/b129dc28/miniconda3/lib/python313.zip', '/home/hpc/b129dc/b129dc28/miniconda3/lib/python3.13', '/home/hpc/b129dc/b129dc28/miniconda3/lib/python3.13/lib-dynload', '', '/home/hpc/b129dc/b129dc28/miniconda3/lib/python3.13/site-packages']
TensorFlow import failed: No module named 'tensorflow'
TensorFlow packages found: []


### Let's import the libraries we'll need

Keras is using a TensorFlow backend in our case here.

In [3]:
import time
import os, os.path
import random
import cv2
import glob
import keras
import matplotlib
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA

import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'cv2'

## Dataset information

HESS flashcam data is stored in HDF5 files with the following format:
flashcam_run178799_full_XXX.h5

Each file contains:
- images: 56x56 pixel intensity maps (photoelectron counts)
- event_nr: event numbers
- n_pixels: number of active pixels per event
- total_charge: total charge (pe) per event

Files are numbered sequentially (000-161) with ~10k events per file.
We'll use this to load subsets of data for clustering analysis.

In [7]:
# directory where data files are stored
DATA_DIR = "../data"

def dataset_stats():
    flashcam_files = sorted(glob.glob(f"{DATA_DIR}/flashcam/flashcam_run178799_full_*.h5"))
    
    stats = []
    
    for file_path in flashcam_files:
        file_name = os.path.basename(file_path)
        
        with tables.open_file(file_path, 'r') as f:
            n_events = f.root.images.shape[0]
            charges = f.root.total_charge[:]
            n_pixels = f.root.n_pixels[:]
            
            stats.append({
                "File": file_name,
                "Events": n_events,
                "Mean charge": charges.mean(),
                "Mean pixels": n_pixels.mean()
            })
    
    return pd.DataFrame(stats)

In [5]:
# Show file statistics
dataset = dataset_stats().set_index("File")
dataset[["Events", "Mean charge", "Mean pixels"]]

Unnamed: 0_level_0,Events,Mean charge,Mean pixels
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
flashcam_run178799_full_000.h5,10000,207.004318,15.658600
flashcam_run178799_full_001.h5,10000,211.432007,16.007700
flashcam_run178799_full_002.h5,10000,207.352402,15.687600
flashcam_run178799_full_003.h5,10000,211.275223,15.904100
flashcam_run178799_full_004.h5,10000,210.083298,15.905500
...,...,...,...
flashcam_run178799_full_157.h5,10000,280.651855,18.840700
flashcam_run178799_full_158.h5,10000,269.676819,18.762200
flashcam_run178799_full_159.h5,10000,265.999268,18.510300
flashcam_run178799_full_160.h5,10000,273.485352,18.643700


### Loading the images

Load events from selected files and prepare them for clustering.
Events are kept at native 56x56 resolution initially, can resize later if needed for CNNs.

In [6]:
def load_images(file_indices, events_per_file=50):
    """Load HESS events from specified files"""
    
    images = []
    labels = []
    
    for file_idx in file_indices:
        file_path = f"{DATA_DIR}/flashcam_run178799_full_{file_idx:03d}.h5"
        
        with tables.open_file(file_path, 'r') as f:
            loaded = 0
            
            for i in range(f.root.images.shape[0]):
                if loaded >= events_per_file:
                    break
                    
                img = f.root.images[i]
                img_clean = np.nan_to_num(img, nan=0.0)
                
                # Skip if no significant signal
                if np.max(img_clean) < 1.0:
                    continue
                
                images.append(img_clean)
                labels.append(file_idx)
                loaded += 1
    
    return images, labels

#### Load events from selected files

Choose a few files to start with. Working with the full dataset would be more interesting but requires more memory.

In [None]:
file_ids = [0, 10, 20, 30]  
images, labels = load_images(file_ids, events_per_file=50)

print(f"Loaded {len(images)} events")
print(f"Image shape: {np.array(images[0]).shape}")
print(f"Range: {np.min(images[0]):.2f} to {np.max(images[0]):.2f}")

### Event visualization

Let's have a look at some sample events from each file to see what we're working with.

In [None]:
def show_random_images(images, labels, number_of_images_to_show=2):
    
    for file_id in set(labels):
        indices = [i for i, label in enumerate(labels) if label == file_id]
        random_indices = [random.choice(indices) for i in range(number_of_images_to_show)]
        
        fig, axes = plt.subplots(1, number_of_images_to_show, figsize=(8, 4))
        if number_of_images_to_show == 1:
            axes = [axes]
        
        for i, ax in enumerate(axes):
            img = images[random_indices[i]]
            im = ax.imshow(img, cmap='viridis', origin='lower')
            ax.set_title(f'File {file_id:03d}')
            plt.colorbar(im, ax=ax)
        
        plt.tight_layout()
        plt.show()

show_random_images(images, labels)

In [None]:
show_random_images(images, labels)

### Normalize

Convert to numpy arrays and normalize the images for clustering analysis.

In [None]:
def normalise_images(images, labels):
    
    images = np.array(images, dtype=np.float32)
    labels = np.array(labels)
    
    # Min-max normalization
    images_norm = (images - images.min()) / (images.max() - images.min())
    
    return images_norm, labels

images, labels = normalise_images(images, labels)

print(f"Shape: {images.shape}")
print(f"Range: {images.min():.3f} to {images.max():.3f}")
print(f"Files: {np.unique(labels)}")

In [None]:
images, labels = normalise_images(images, labels)

### Shuffle data

For clustering we just need to shuffle the events since we don't have true labels to split on.

In [None]:
def shuffle_data(images, labels):
    indices = np.arange(len(images))
    np.random.shuffle(indices)
    
    return images[indices], labels[indices]

X_train, y_train = shuffle_data(images, labels)

print(f"Data shape: {X_train.shape}")
print(f"Ready for clustering")

In [None]:
X_train, y_train = shuffle_data(images, labels)

### Load pre-trained CNN models

Load VGG16, VGG19, ResNet50 with ImageNet weights for feature extraction.
Need to resize our 56x56 images to 224x224 for these models.

In [None]:
from scipy.ndimage import zoom

def prepare_for_cnn(X_train):
    """Convert 56x56 to 224x224 RGB for CNN input"""
    X_resized = []
    scale_factor = 224 / 56
    
    for img in X_train:
        img_224 = zoom(img, scale_factor, order=1)
        img_rgb = np.stack([img_224, img_224, img_224], axis=-1)
        X_resized.append(img_rgb)
    
    return np.array(X_resized, dtype=np.float32)

try:
    X_train_cnn = prepare_for_cnn(X_train)
    print(f"CNN input shape: {X_train_cnn.shape}")
    
    vgg16_model = VGG16(include_top=False, weights="imagenet", input_shape=(224,224,3))
    vgg19_model = VGG19(include_top=False, weights="imagenet", input_shape=(224,224,3))
    resnet50_model = ResNet50(include_top=False, weights="imagenet", input_shape=(224,224,3))
    
    print("Loaded CNN models")
    cnn_available = True
    
except:
    print("TensorFlow not available, using raw pixels")
    cnn_available = False
    X_train_cnn = None

In [None]:
# Load the models with ImageNet weights

vgg16_model = keras.applications.vgg16.VGG16(include_top=False, weights="imagenet", input_shape=(224,224,3))

vgg19_model = keras.applications.vgg19.VGG19(include_top=False, weights="imagenet", input_shape=(224,224,3))

resnet50_model = keras.applications.resnet50.ResNet50(include_top=False, weights="imagenet", input_shape=(224,224,3))




### Output... falls flat

The covnet models will give us 3D vectors that represent the image. We need to flatten these for the clustering algorithms to start working with them.

In [None]:
def covnet_transform(covnet_model, raw_images):
    """Extract features using CNN and flatten"""
    
    # Pass data through the network
    pred = covnet_model.predict(raw_images)
    
    # Flatten the array
    flat = pred.reshape(raw_images.shape[0], -1)
    
    return flat

if cnn_available:
    # Extract features using CNNs
    vgg16_output = covnet_transform(vgg16_model, X_train_cnn)
    print(f"VGG16 flattened output has {vgg16_output.shape[1]} features")
    
    vgg19_output = covnet_transform(vgg19_model, X_train_cnn)
    print(f"VGG19 flattened output has {vgg19_output.shape[1]} features")
    
    resnet50_output = covnet_transform(resnet50_model, X_train_cnn)
    print(f"ResNet50 flattened output has {resnet50_output.shape[1]} features")
    
else:
    # Use raw pixel values as features
    raw_features = X_train.reshape(X_train.shape[0], -1)
    print(f"Using raw pixel features: {raw_features.shape[1]} features per image")
    print(f"Total data shape: {raw_features.shape}")
    
    # Assign to the same variable names for consistency
    vgg16_output = raw_features
    vgg19_output = raw_features  
    resnet50_output = raw_features

In [None]:
# PCA for dimensionality reduction

def create_fit_PCA(data, n_components=None):
    p = PCA(n_components=n_components, random_state=42)
    p.fit(data)
    return p

# Create PCA instances for each feature set (all the same in our case)
vgg16_pca = create_fit_PCA(vgg16_output)
vgg19_pca = create_fit_PCA(vgg19_output)
resnet50_pca = create_fit_PCA(resnet50_output)

# Function to plot cumulative explained variance
def pca_cumsum_plot(pca, title="PCA"):
    plt.figure(figsize=(8, 5))
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.title(f'{title} - Cumulative Explained Variance')
    plt.grid(True)
    plt.show()

# Plot explained variance for raw pixel features
pca_cumsum_plot(vgg16_pca, "Raw Pixel Features")

# Transform the data using PCA
vgg16_output_pca = vgg16_pca.transform(vgg16_output)
vgg19_output_pca = vgg19_pca.transform(vgg19_output)
resnet50_output_pca = resnet50_pca.transform(resnet50_output)

print(f"Original features: {vgg16_output.shape[1]}")
print(f"PCA features: {vgg16_output_pca.shape[1]}")
print(f"Explained variance with all components: {np.sum(vgg16_pca.explained_variance_ratio_):.3f}")

The above cell shows us the number of features each covnet gives to a single image. When we compare these to the original size of the image 224 x 224 x 3 = 150,528 pixels/features, we can see that this is a large reduction in what the clustering algorithms will have to work with.

 

Hopefully these reduces number of feature are represent more meaningful features in the image structure.

In [None]:
# Clustering functions

def create_train_kmeans(data, number_of_clusters=4):
    k = KMeans(n_clusters=number_of_clusters, n_init=10, random_state=42)
    
    start = time.time()
    k.fit(data)
    end = time.time()
    
    print(f"Training took {end-start:.3f} seconds")
    
    return k

def create_train_gmm(data, number_of_clusters=4):
    g = GaussianMixture(n_components=number_of_clusters, covariance_type="full", random_state=42)
    
    start = time.time()
    g.fit(data)
    end = time.time()
    
    print(f"Training took {end-start:.3f} seconds")
    
    return g

# Try different numbers of clusters for HESS data
n_clusters = 4  # Start with 4 like the original notebook

print("KMeans clustering:")
print("Raw features:")
K_raw = create_train_kmeans(vgg16_output, n_clusters)

print("\nPCA features:")
K_raw_pca = create_train_kmeans(vgg16_output_pca, n_clusters)

print("\nGaussian Mixture clustering:")
print("PCA features:")
G_raw_pca = create_train_gmm(vgg16_output_pca, n_clusters)

# Get cluster predictions
k_raw_pred = K_raw.predict(vgg16_output)
k_raw_pred_pca = K_raw_pca.predict(vgg16_output_pca)
g_raw_pred_pca = G_raw_pca.predict(vgg16_output_pca)

print(f"\nClustering complete!")
print(f"KMeans raw: {len(set(k_raw_pred))} clusters")
print(f"KMeans PCA: {len(set(k_raw_pred_pca))} clusters") 
print(f"GMM PCA: {len(set(g_raw_pred_pca))} clusters")

### Ask the clustering algo what it thinks is what

In [None]:
# Let's pass the data into the algorithm and predict who lies in which cluster. 
# Since we're using the same data that we trained it on, this should give us the training results.

# Here we create and fit a KMeans model with the PCA outputs
print("KMeans (PCA): \n")

print("VGG16")
K_vgg16_pca = create_train_kmeans(vgg16_output_pca)

print("\nVGG19")
K_vgg19_pca = create_train_kmeans(vgg19_output_pca)

print("\nResNet50")
K_resnet50_pca = create_train_kmeans(resnet50_output_pca)

In [None]:
# Same for Gaussian Model
print("GMM (PCA): \n")

print("VGG16")
G_vgg16_pca = create_train_gmm(vgg16_output_pca)

print("\nVGG19")
G_vgg19_pca = create_train_gmm(vgg19_output_pca)

print("\nResNet50")
G_resnet50_pca = create_train_gmm(resnet50_output_pca)


In [None]:
# Let's also create models for the covnet outputs without PCA for comparison
print("KMeans: \n")

print("VGG16:")
K_vgg16 = create_train_kmeans(vgg16_output)

print("\nVGG19:")
K_vgg19 = create_train_kmeans(vgg19_output)

print("\nResNet50:")
K_resnet50 = create_train_kmeans(resnet50_output)


#### Attempts to run the Gaussian Mixtue Model on the outputs without PCA always give an out of memory error. I am therefore unable to test these and conclude that they are impractical to use.

In [None]:
# Now we get the custer model predictions

# KMeans with PCA outputs
k_vgg16_pred_pca = K_vgg16_pca.predict(vgg16_output_pca)
k_vgg19_pred_pca = K_vgg19_pca.predict(vgg19_output_pca)
k_resnet50_pred_pca = K_resnet50_pca.predict(resnet50_output_pca)

# KMeans with CovNet outputs
k_vgg16_pred = K_vgg16.predict(vgg16_output)
k_vgg19_pred = K_vgg19.predict(vgg19_output)
k_resnet50_pred = K_resnet50.predict(resnet50_output)

# Gaussian Mixture with PCA outputs
g_resnet50_pred_pca = G_resnet50_pca.predict(resnet50_output_pca)
g_vgg16_pred_pca = G_vgg16_pca.predict(vgg16_output_pca)
g_vgg19_pred_pca = G_vgg19_pca.predict(vgg19_output_pca)

Remember that the clustering algorith does not detect which images are cats and which are dogs, it only groups images that look alike together and assigns them a number arbitrarily. 

We now need to count how many of each label are in  each cluster, this way we can take a look and if sufficient eperation has happened we can quicly see which cluster is which label. So let's write a function that does that.

In [None]:
# Analyze the clusters

def cluster_label_count(clusters, labels):
    """Count how many events from each file end up in each cluster"""
    
    count = {}
    
    unique_clusters = list(set(clusters))
    unique_labels = list(set(labels))
    
    for cluster in unique_clusters:
        count[cluster] = {}
        for label in unique_labels:
            count[cluster][label] = 0
    
    for i in range(len(clusters)):
        count[clusters[i]][labels[i]] += 1
    
    cluster_df = pd.DataFrame(count)
    return cluster_df

# Analyze cluster distributions
print("Cluster distributions by file:")
print("\nKMeans Raw features:")
raw_clusters = cluster_label_count(k_raw_pred, y_train)
print(raw_clusters)

print("\nKMeans PCA features:")
pca_clusters = cluster_label_count(k_raw_pred_pca, y_train)
print(pca_clusters)

print("\nGMM PCA features:")
gmm_clusters = cluster_label_count(g_raw_pred_pca, y_train)
print(gmm_clusters)

# Show some sample events from each cluster
def show_cluster_samples(images, predictions, n_clusters=4, events_per_cluster=3):
    
    fig, axes = plt.subplots(n_clusters, events_per_cluster, figsize=(12, 3*n_clusters))
    
    for cluster_id in range(n_clusters):
        # Find events in this cluster
        cluster_indices = [i for i, pred in enumerate(predictions) if pred == cluster_id]
        
        if len(cluster_indices) == 0:
            continue
            
        # Select random samples
        sample_indices = np.random.choice(cluster_indices, 
                                        min(events_per_cluster, len(cluster_indices)), 
                                        replace=False)
        
        for j, idx in enumerate(sample_indices):
            ax = axes[cluster_id, j] if n_clusters > 1 else axes[j]
            
            im = ax.imshow(images[idx], cmap='viridis', origin='lower')
            ax.set_title(f'Cluster {cluster_id}\nEvent {idx}')
            ax.axis('off')
            plt.colorbar(im, ax=ax)
    
    plt.suptitle('Sample Events by Cluster (KMeans PCA)', fontsize=14)
    plt.tight_layout()
    plt.show()

print("\nSample events from each cluster:")
show_cluster_samples(X_train, k_raw_pred_pca)

In [None]:
# Analyze what makes each cluster different

def analyze_cluster_properties(images, labels, predictions):
    """Analyze physical properties of events in each cluster"""
    
    results = []
    
    for cluster_id in set(predictions):
        cluster_indices = [i for i, pred in enumerate(predictions) if pred == cluster_id]
        
        if len(cluster_indices) == 0:
            continue
            
        cluster_images = images[cluster_indices]
        
        # Calculate properties
        total_charges = [np.sum(img) for img in cluster_images]
        max_pixels = [np.max(img) for img in cluster_images]
        n_pixels_active = [np.sum(img > 0.1) for img in cluster_images]  # pixels above 10% of max
        
        results.append({
            'Cluster': cluster_id,
            'N_events': len(cluster_indices),
            'Mean_charge': np.mean(total_charges),
            'Mean_max_pixel': np.mean(max_pixels),
            'Mean_active_pixels': np.mean(n_pixels_active),
            'Charge_std': np.std(total_charges)
        })
    
    return pd.DataFrame(results)

print("Cluster characteristics (KMeans PCA):")
cluster_props = analyze_cluster_properties(X_train, y_train, k_raw_pred_pca)
print(cluster_props)

# Show representative event from each cluster (the one closest to cluster center)
def show_representative_events(images, predictions, model):
    """Show the event closest to each cluster center"""
    
    n_clusters = len(set(predictions))
    centers = model.cluster_centers_
    
    fig, axes = plt.subplots(1, n_clusters, figsize=(4*n_clusters, 4))
    if n_clusters == 1:
        axes = [axes]
    
    for cluster_id in range(n_clusters):
        cluster_indices = [i for i, pred in enumerate(predictions) if pred == cluster_id]
        
        if len(cluster_indices) == 0:
            continue
            
        # Find event closest to cluster center
        cluster_data = vgg16_output_pca[cluster_indices]
        distances = np.linalg.norm(cluster_data - centers[cluster_id], axis=1)
        closest_idx = cluster_indices[np.argmin(distances)]
        
        ax = axes[cluster_id]
        im = ax.imshow(images[closest_idx], cmap='viridis', origin='lower')
        ax.set_title(f'Cluster {cluster_id}\n{len(cluster_indices)} events\nRepresentative event')
        ax.axis('off')
        plt.colorbar(im, ax=ax)
    
    plt.suptitle('Representative Events (Closest to Cluster Centers)', fontsize=14)
    plt.tight_layout()
    plt.show()

print("\nRepresentative events:")
show_representative_events(X_train, k_raw_pred_pca, K_raw_pca)