# 3. extract 1 file of h5 images -> useable data

this notebook is for extracting individual crystals from the large h5 files, and converting to their own individual png + have a corresponding csv with a bunch of stats about the particle

(notebook follows on from 2. which has more comments + details)

* one file at a time -> potentially loop in the future
* assume Jonny format, all images stacked together
* !! importantly, using Jaffeux et al. 2022 paper for processing details !!

In [1]:
import xarray as xr
import matplotlib.pyplot as plt
import matplotlib as mpl
import numpy as np
from glob import glob
import seaborn as sns
import xesmf as xe
import pandas as pd
#import def_homebrew as hb ## homemade functions xox
from scipy.special import gamma
import netCDF4 as nc
from datetime import datetime, timedelta

import h5py ####
from PIL import Image
#from IPython.display import display #
#import cv2 # not working
import os

from scipy.ndimage import convolve, label
from skimage.measure import regionprops, find_contours
from scipy.spatial import ConvexHull, distance_matrix
from skimage.morphology import remove_small_holes ## remove holes <3
from scipy.ndimage import binary_fill_holes
from skimage import measure
from cv2 import cvtColor, COLOR_BGR2GRAY, threshold, THRESH_BINARY, THRESH_OTSU

In [2]:
## files location
ds_loc = '/home/users/esree/data/2ds/'
hvps_loc = '/home/users/esree/data/hvps/'

#file of interest
file_name = 'Export_base220730153000.h5' # example file # Export_base220720161837.h5 #Export_base220730153000
f2ds = h5py.File(ds_loc+ file_name,'r') # open file

# break file into two - data + time
ds_image = f2ds['ImageData'] 
ds_time = f2ds['ImageTimes']
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
##### make xarray of useful time data #####
sec_since = ds_time[:,0]
pixel_slice = ds_time[:,1]
pix_sum = pixel_slice.cumsum(dtype = 'int')

## make useful datetime format (not seconds since midnight)
# using the file name for reference
date_str = file_name[11:17]
starting_date = datetime.strptime(date_str, '%y%m%d')
time_deltas = [timedelta(seconds=float(sec)) for sec in sec_since]
utc_time = [starting_date + delta for delta in time_deltas]

time_xr =xr.Dataset({
    'utc_time':utc_time,
    'pixel_slice': pixel_slice,
    'pix_sum': pix_sum})
## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

In [3]:
## this is the code for splitting up the big h5 file -> individual images
save_path = '/gws/nopw/j04/dcmex/users/ezriab/2dprocessed/'
folder_name = f'flight_{file_name[11:23]}' # each flightset -> own folder

if not os.path.exists(save_path+folder_name):
    os.makedirs(save_path+folder_name)
    print("Folder created successfully!")
else:
    print("Folder already exists.")
save_loc = save_path+folder_name+'/'

Folder already exists.


In [13]:
## functions to make this run smoothly
def stats_description(bw_crystal):
    '''take binary image, fill in small holes and returns object containing stats about crystal'''
    
    filled_particle = remove_small_holes(bw_crystal.image, area_threshold=4) # fill in voids within binary image - better estimation of stats # may need to be altered
    
    # can see the filled in particle if needs be
    #plt.imshow(filled_particle, cmap='gray')
    if filled_particle.shape[0] < 2 or filled_particle.shape[1] < 2:
        return None
    
    contours = measure.find_contours(filled_particle, 0.5)
    if contours:
        contour = max(contours, key=lambda x: x.shape[0])  # Sort contours by area (largest first) and select the largest contour
        
        labeled_image = measure.label(filled_particle)  # Label the image based on the threshold
        region = measure.regionprops(labeled_image)[0]  # Assumes largest labeled region corresponds to largest contour
        
        return region
    else:
        return None

## ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# function to add padding to extracted crystal - better sample to match Jaffeux
'''
def add_padding(one_crystal, desired_png_size):
    # Calculate padding
    pad_height = (desired_png_size - one_crystal.shape[0]) // 2
    pad_width = (desired_png_size - one_crystal.shape[1]) // 2
    
    # Pad the image
    padded_image = np.pad(one_crystal, ((pad_height, pad_height), (pad_width, pad_width)), mode='constant', constant_values=255)
    
    # In case the dimensions are odd, add an extra row/column to match the desired size
    if padded_image.shape[0] < desired_png_size:
        padded_image = np.pad(padded_image, ((0, 1), (0, 0)), mode='constant', constant_values=255)
    if padded_image.shape[1] < desired_png_size:
        padded_image = np.pad(padded_image, ((0, 0), (0, 1)), mode='constant', constant_values=255)

    return padded_image

'''
def add_padding(one_crystal, desired_png_size):
    # Calculate padding
    pad_height = max(0, (desired_png_size - one_crystal.shape[0]) // 2)
    pad_width = max(0, (desired_png_size - one_crystal.shape[1]) // 2)
    
    # Pad the image
    padded_image = np.pad(one_crystal, ((pad_height, pad_height), (pad_width, pad_width)), mode='constant', constant_values=255)
    
    # In case the dimensions are odd, add an extra row/column to match the desired size
    if padded_image.shape[0] < desired_png_size:
        padded_image = np.pad(padded_image, ((0, 1), (0, 0)), mode='constant', constant_values=255)
    if padded_image.shape[1] < desired_png_size:
        padded_image = np.pad(padded_image, ((0, 0), (0, 1)), mode='constant', constant_values=255)

    return padded_image

In [5]:
## cleaning of whole h5 file 
## this has been edited - allow for corresponding time
pix_sum = time_xr['pix_sum']
utc_time = time_xr['utc_time']

# Calculate the difference
diff = np.diff(pix_sum.values)

# Create a mask where the difference is greater than 4 - i.e. select segments of significant sizze
mask = diff > 4
# Apply the mask to select the corresponding values from pix_sum and utc_time
selected_pix_sum = pix_sum[:-1][mask]
selected_utc_time = utc_time[:-1][mask]

In [6]:
length_threshold = 300 # mu - need this minimum length of max dimension to extract the particle
pixel_resolution = 10 # mu
desired_image_size = 120

In [41]:
minimum_area = 15 # very quick metric to stop the processing of particles with area < 10 pixels

###  set up dataframe, used to extract from raw h5 file + has stats about the particle
columns = [
    "name",
    "particle_label",
    "start_index",
    "end_index",
    "start_time",
    "end_time",
    "major_axis_length",
    "minor_axis_length",
    "orientation",
    "centroid",
    "area",
    "perimeter",
    "y0",
    "y1"
]
particle_df = pd.DataFrame(columns=columns)

In [42]:
# ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~ ~  #
# loop though length of images that are somewhat usable
for i in range(len(selected_pix_sum)-2):
    # pull out selected area + do analysis
    one_crystal = f2ds['ImageData'][:,int(selected_pix_sum[i]):int(selected_pix_sum[i+1])] # extract 1 crystal
    binary_image = (one_crystal == 0) ## important, convert regions where 0 = True (our bits of interest), all else false
    
    labeled_image, num_features = label(binary_image) # identify connected true areas
    # labeled_image = array, with each true area given a number to identify them
    # num_features = number of unique connected components in image. Have to literally have adjacent pixel, not diagonal (this will make them seperate)
    
    props = regionprops(labeled_image) # creates quick list of properties describing each feature detected in the image.

    if props:
        for particle in props:
            # quickly get rid of tiny particles
            if particle.area >= minimum_area:
                ## basic info
                coords = particle.coords # basically gives coords of each point of interest
                x_values = np.unique(coords[:, 1])
                s_idx = int(selected_pix_sum[i] + x_values[0])
                e_idx = int(selected_pix_sum[i] + x_values[-1])
                
                ## more complex stats
                spec_region = stats_description(particle)
                ### !!!!!! important part - this is where useful crystals are getting stats + being recorded in the dataframe    
                
                if spec_region and spec_region.major_axis_length * pixel_resolution >= length_threshold:
                    # nice way of saving data - lenth + measurements are correct in microns
                    one_particle_data = {
                            #"image_index": image_index,
                            "name": f'{s_idx}_{particle.label}',
                            "particle_label": particle.label,
                            "start_index": s_idx,
                            "end_index": e_idx,
                            "start_time": str(selected_utc_time[i].values).split('T')[1], # more friendly time
                            "end_time": str(selected_utc_time[i+1].values).split('T')[1], # more friendly time
                            
                            #"start_time": str(time_xr['utc_time'][s_idx].values).split('T')[1], # more friendly time
                            #"end_time": str(time_xr['utc_time'][e_idx].values).split('T')[1],
                        
                            #"start_time": time_xr['utc_time'][s_idx],  # assuming 'time_xr' is pre-defined and syncs with indices
                            #"end_time": time_xr['utc_time'][e_idx],
                            "major_axis_length": spec_region.major_axis_length * pixel_resolution,
                            "minor_axis_length": spec_region.minor_axis_length * pixel_resolution,
                            "orientation": spec_region.orientation,
                            "centroid": spec_region.centroid,
                            "area": (spec_region.area * (pixel_resolution**2)),
                            "perimeter": (spec_region.perimeter * pixel_resolution),
                            "y0": coords[0][0],
                            "y1": coords[-1][0]
                            }
                    print(f'{s_idx} done')
                    one_particle_data_df = pd.DataFrame([one_particle_data])
                    particle_df = pd.concat([particle_df, one_particle_data_df], ignore_index=True)

15 done


  particle_df = pd.concat([particle_df, one_particle_data_df], ignore_index=True)


1378 done
10304 done
17025 done
17220 done
17443 done
17511 done
17528 done
18899 done
18950 done
19332 done
19988 done
20746 done
21357 done
23789 done
24893 done
27086 done
43643 done
82129 done
82930 done
83241 done
83507 done
83911 done
84220 done
84631 done
84697 done
85399 done
87289 done
87555 done
88368 done
89165 done
89218 done
92190 done
99744 done
102052 done
103901 done
107420 done
108176 done
108323 done
110119 done
110582 done
110636 done
110731 done
112859 done
114067 done
115716 done
115814 done
116196 done
116389 done
116525 done
116790 done
117442 done
118254 done
118459 done
119855 done
122654 done
124056 done
128116 done
128396 done
128992 done
129325 done
129450 done
129531 done
131237 done
131380 done
132036 done
132294 done
132414 done
132559 done
133072 done
133721 done
134526 done
134580 done
134726 done
134927 done
135546 done
136206 done
136767 done
136915 done
136974 done
137044 done
137351 done
137675 done
137931 done
137952 done
138365 done
138430 done
13

In [43]:
particle_df

Unnamed: 0,name,particle_label,start_index,end_index,start_time,end_time,major_axis_length,minor_axis_length,orientation,centroid,area,perimeter,y0,y1
0,15_82,82,15,21,16:51:03.577000000,16:51:05.465000000,345.340872,53.001538,-0.036097,"(18.294117647058822, 3.764705882352941)",8500.0,696.923882,97,127
1,1378_22,22,1378,1413,16:51:06.058000000,16:51:06.120000000,400.146102,332.045327,-0.155526,"(20.21533923303835, 18.13765978367748)",101700.0,1593.675324,69,112
2,10304_41,41,10304,10353,16:51:09.615000000,16:51:09.786000000,491.870635,442.699181,0.068674,"(25.55128205128205, 21.843101343101342)",163800.0,2298.589104,18,69
3,17025_1,1,17025,17071,16:55:31.789000000,16:55:31.789000000,480.666215,422.675017,-0.052924,"(26.03423680456491, 22.95506419400856)",140200.0,1999.177849,83,127
4,17220_1,1,17220,17242,16:55:31.929000000,16:55:32.038000000,312.156569,85.096358,-0.898120,"(9.671717171717171, 11.247474747474747)",19800.0,719.766594,105,124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5666,1115041_1,1,1115041,1115575,19:17:26.639000000,19:17:26.639000000,4312.847208,1366.941642,-1.507026,"(73.55683314080004, 308.3652043091637)",3722300.0,17218.610024,0,127
5667,1115610_4,4,1115610,1116095,19:17:27.638000000,19:17:27.638000000,4228.153555,369.898696,1.568560,"(25.213804046013486, 231.9259222530742)",1008400.0,14541.726189,89,127
5668,1115584_6,6,1115584,1115636,19:17:27.638000000,19:17:27.638000000,544.230723,15.554171,-1.548255,"(0.4305555555555556, 24.680555555555557)",7200.0,700.000000,95,96
5669,1116143_1,1,1116143,1116476,19:17:28.589000000,19:17:28.589000000,3138.281795,1361.698152,1.406323,"(68.20192066281697, 164.0024165202109)",3186400.0,12498.305192,0,127


In [7]:
## for making images post csv been made
particle_df = pd.read_csv(f'{save_loc}particle_stats.csv')

In [14]:
## image saving:
for index, row in particle_df.iterrows():
    # extract image data from main h5 file
    extract_crystal_image = f2ds['ImageData'][row['y0']-2:row['y1']+3,row['start_index']-2:row['end_index']+3] # a couple of points have been added around the image, as it appears to cut it off
    padded_crystal = add_padding(extract_crystal_image, 200) # add padding 
    save_name = row['name']

    # save
    plt.imsave(f'{save_loc}{save_name}.png', padded_crystal, cmap='Greys')
    print(f'{save_name} saved')

15_82 saved
1378_22 saved
10304_41 saved
17025_1 saved
17220_1 saved
17443_1 saved
17511_1 saved
17528_1 saved
18899_1 saved
18950_3 saved
19332_7 saved
19988_1 saved
20746_1 saved
21357_1 saved
23789_12 saved
24893_1 saved
27086_185 saved
43643_19 saved
82129_7 saved
82930_1 saved
83241_1 saved
83507_7 saved
83911_1 saved
84220_2 saved
84631_1 saved
84697_7 saved
85399_82 saved
87289_5 saved
87555_1 saved
88368_111 saved
89165_1 saved
89218_97 saved
92190_2 saved
99744_1 saved
102052_3 saved
103901_2 saved
107420_26 saved
108176_6 saved
108323_122 saved
110119_4 saved
110582_7 saved
110636_19 saved
110731_1 saved
112859_10 saved
114067_111 saved
115716_12 saved
115814_21 saved
116196_7 saved
116389_1 saved
116525_3 saved
116790_2 saved
117442_8 saved
118254_9 saved
118459_1 saved
119855_283 saved
122654_3 saved
124056_1 saved
128116_3 saved
128396_1 saved
128992_3 saved
129325_4 saved
129450_3 saved
129531_3 saved
131237_10 saved
131380_3 saved
132036_3 saved
132294_1 saved
132414_1 s

In [44]:
# save df of stats
particle_df.to_csv(f'{save_loc}particle_stats.csv', index=False)