The provided script is designed for the analysis of collective behavior in nematodes. 

Here we conduct the analysis comparing all the strains belonging to the divergent data set. 

# Load Packages #

In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from skimage.measure import label, regionprops
from skimage.draw import circle_perimeter
from math import sqrt
from skimage.io import imread
from skimage.transform import resize
from skimage.filters import threshold_otsu
from tabulate import tabulate
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.image as mpimg
from matplotlib import rcParams
import h5py    # for reading HDF5 files 
import cv2     # for image conversion
import re    # use it to sort but not sure what it is!
from scipy.stats import zscore
from skimage import draw, measure
import pywt
from scipy.spatial.distance import cdist
import seaborn as sns
from scipy.stats import multivariate_normal
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters import threshold_otsu
from skimage import color
import glob

# Data extraction #

### Data set information overview: ###

1. Divergent Sets:

    Consists of 12 genotypically distinct strains. These are strains that are further apart on the phenotypic tree. 
    Ideal for determining broad-sense heritability with repeated measurements of specific phenotypes.
    This allows us to look into the relationship between the behavior and the genetic makeup. 

2. Mapping Sets:

    Comprises 48 strains suitable for collecting phenotype data for broad diversity or genome-wide association studies.
    Using multiple strain sets can enhance the statistical strength in genome-wide association studies.

## Handling and processing image data stored in HDF5 files ##

### Workflow: ###

1. The script sets the paths to your HDF5 files and the directory for extracted frames.

2. It ensures the output directory exists for each strain and each experiment.

3. It retrieves the paths of HDF5 files for each strain and creates corresponding output directories.

4. It extracts frames from HDF5 files and saves them in the respective experiment subfolders.

5. It loads images from these output directories for further analysis.

6. It creates a dictionary labels_strain_list mapping experiment labels to the loaded images.

### Functions: ###

ensure_directory_exists: Ensures a directory exists, and if not, creates it.

extract_experiment_name: Extracts the experiment name from the HDF5 file name. Adjust the logic here depending on your file naming convention.

create_output_directories_for_experiments: Creates output directories for each experiment under each strain.

extract_frames_from_hdf5: Extracts frames from an HDF5 file and saves them in the specified output directory.

load_images_from_folder: Loads images from a given folder.

In [None]:
def ensure_directory_exists(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory path to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
        OSError: If the directory cannot be created.
    """
    # exist_ok=True avoids the race between a separate existence check and
    # the creation call, and refuses a regular file sitting at ``path``.
    os.makedirs(path, exist_ok=True)

def extract_experiment_name(file_path):
    """Derive the experiment name from an HDF5 file path.

    The experiment name is everything in the file's base name before the
    last underscore, so ``N2_run1_0001.hdf5`` gives ``N2_run1``. Adjust this
    logic if your file naming convention differs.

    Args:
        file_path: Path (or bare file name) of the HDF5 file.

    Returns:
        The experiment name as a string. When the base name contains no
        underscore, the base name without its extension is returned, so the
        result is not an empty string shared by every such file.
    """
    base_name = os.path.basename(file_path)
    prefix, separator, _ = base_name.rpartition('_')
    if separator:
        # Same result as joining all underscore-separated parts but the last.
        return prefix
    # No underscore at all: fall back to the file name minus its extension.
    return os.path.splitext(base_name)[0]

def create_output_directories_for_experiments(root_path, output_root=None):
    """Create one output directory per HDF5 file found under each strain folder.

    Every immediate sub-directory of ``root_path`` is treated as a strain
    folder. For each ``*.hdf5`` file directly inside a strain folder, a
    directory ``<destination>/<strain>/<experiment_name>`` is created, where
    the experiment name comes from ``extract_experiment_name``.

    Args:
        root_path: Directory containing one sub-directory per strain.
        output_root: Optional directory under which the per-strain output
            tree is created. Defaults to ``root_path``, which places the
            output directories next to the HDF5 files.

    Returns:
        A dict mapping each HDF5 file path to its output directory. Strain
        folders and files are visited in sorted order so the dict's insertion
        order is reproducible between runs.
    """
    destination_root = root_path if output_root is None else output_root
    output_dirs = {}

    # os.listdir and glob give no ordering guarantee; sort for reproducibility.
    for strain in sorted(os.listdir(root_path)):
        strain_path = os.path.join(root_path, strain)
        if not os.path.isdir(strain_path):
            continue
        for file_path in sorted(glob.glob(os.path.join(strain_path, '*.hdf5'))):
            experiment_name = extract_experiment_name(file_path)
            output_dir = os.path.join(destination_root, strain, experiment_name)
            ensure_directory_exists(output_dir)
            output_dirs[file_path] = output_dir

    return output_dirs

def extract_frames_from_hdf5(file_path, output_directory, step=500):
    """Write every ``step``-th entry of an HDF5 file's ``/mask`` dataset as a JPEG.

    Entries are taken along the dataset's first axis, starting at index 0.
    Each one is saved in ``output_directory`` under its index, zero-padded to
    six digits (``000000.jpg``, ``000500.jpg``, ... for the default step).

    Args:
        file_path: Path of the HDF5 file to read.
        output_directory: Existing directory that receives the JPEG files.
        step: Stride between saved indices along the first axis.

    Note:
        The value returned by ``cv2.imwrite`` is not checked.
    """
    with h5py.File(file_path, 'r') as hdf5_file:
        mask_frames = hdf5_file['/mask']
        frame_count = mask_frames.shape[0]
        for frame_index in range(0, frame_count, step):
            frame_path = os.path.join(output_directory, f"{frame_index:06d}.jpg")
            cv2.imwrite(frame_path, mask_frames[frame_index, :])

def load_images_from_folder(folder):
    """Load every ``.jpg`` file in ``folder`` as a grayscale image.

    Files are read in sorted file-name order, so the zero-padded frame names
    written by ``extract_frames_from_hdf5`` come back in frame-index order.

    Args:
        folder: Directory to scan (not recursive).

    Returns:
        A list of images as returned by ``cv2.imread`` with
        ``cv2.IMREAD_GRAYSCALE``. Files that ``cv2.imread`` could not read
        (it returned ``None``) are skipped.
    """
    all_images = []
    # os.listdir returns names in arbitrary order; sort to keep frames in sequence.
    image_files = sorted(f for f in os.listdir(folder) if f.endswith(".jpg"))
    for filename in image_files:
        image = cv2.imread(os.path.join(folder, filename), cv2.IMREAD_GRAYSCALE)
        if image is not None:
            all_images.append(image)
    return all_images

In [None]:
# Set paths
# root_path: folder whose immediate sub-folders (one per strain) contain the *.hdf5 files.
root_path = '/Volumes/TOSHIBA_EXT/Phenotype_features_collective/Data'
# NOTE(review): output_root_path is created below but is not passed to any call
# in this cell; the per-experiment folders are derived from root_path instead.
# Confirm whether the extracted frames were meant to go under output_root_path.
output_root_path = '/Volumes/TOSHIBA_EXT/Phenotype_features_collective/ExtractedFrames'

# Ensure the output root folder exists
ensure_directory_exists(output_root_path)

# Create output directories for each experiment under each strain
# (returns a dict: HDF5 file path -> output directory for that file)
output_directories = create_output_directories_for_experiments(root_path)

# Extract frames and load images
# labels_strain_list maps experiment label -> list of grayscale images.
# Two files that yield the same experiment label overwrite each other's entry.
labels_strain_list = {}
for file_path, output_dir in output_directories.items():
    # Writes the sampled frames as .jpg files into output_dir (default step of 500).
    extract_frames_from_hdf5(file_path, output_dir)
    # Reads back every .jpg in output_dir, including any already present
    # before this run.
    images = load_images_from_folder(output_dir)
    experiment_label = extract_experiment_name(file_path)
    labels_strain_list[experiment_label] = images