The provided script is designed for the analysis of collective behavior in nematodes. 

Here we conduct the analysis comparing all the strains belonging to the divergent data set. 

# Load Packages #

In [1]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from skimage.measure import label, regionprops
from skimage.draw import circle_perimeter
from math import sqrt
from skimage.io import imread
from skimage.transform import resize
from skimage.filters import threshold_otsu
from tabulate import tabulate
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.image as mpimg
from matplotlib import rcParams
import h5py    # for reading HDF5 files 
import cv2     # for image conversion
import re    # regular-expression module (standard library); used here for sorting
from scipy.stats import zscore
from skimage import draw, measure
import pywt
from scipy.spatial.distance import cdist
import seaborn as sns
from scipy.stats import multivariate_normal
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters import threshold_otsu
from skimage import color
import glob

# Data extraction #

### Data set information overview: ###

1. Divergent Sets:

    Consists of 12 genotypically distinct strains. These are strains that are further apart on the phenotypic tree. 
    Ideal for determining broad-sense heritability with repeated measurements of specific phenotypes.
    This allows us to look into the relationship between the behavior and the genetic makeup. 

2. Mapping Sets:

    Comprises 48 strains suitable for collecting phenotype data for broad diversity or genome-wide association studies.
    Using multiple strain sets can enhance the statistical strength in genome-wide association studies.

### Workflow: Handling and processing image data stored in HDF5 files ###

In hdf5_frames_extractor: 

1. The script sets the paths to your HDF5 files and the directory for extracted frames.

2. It ensures the output directory exists for each strain and each experiment.

3. It retrieves the paths of HDF5 files for each strain and creates corresponding output directories.

4. It extracts frames from HDF5 files and saves them in the respective experiment subfolders.

Strain_comparison_study: 

5. Load Images: Use the load_images_from_folder function to load frames from each experiment.

6. It creates a dictionary labels_strain_list mapping experiment labels to the loaded images.

7. Feature Extraction: Write functions to extract the necessary features from each frame.

8. Aggregate and Analyze: Aggregate the features per experiment, and then perform statistical analyses and visualizations.

9. Comparison: Compare these features across different strains.

### Functions: ###

extract_experiment_name: Extracts the experiment name from a given file or folder path.

load_images_from_folder: Loads all images in the specified folder and returns a list of images.

labels_strain_list: A dictionary that maps each experiment's label to its corresponding images. This is created by iterating through each strain's folder and then through each experiment's subfolder within the strain.

In [None]:
def extract_experiment_name(file_path):
    """Derive the experiment name from a file or folder path.

    The experiment name is the last path component with its final
    underscore-separated part removed, e.g. ``/data/N2/exp1_run2_frames``
    gives ``exp1_run2``.

    Args:
        file_path: Path to a file or folder. Trailing path separators are
            ignored.

    Returns:
        The experiment name as a string. If the last path component contains
        no underscore it is returned unchanged, so that distinct folders do
        not all collapse to the same empty name.
    """
    # Strip trailing separators first: os.path.basename('a/b/') is '' otherwise.
    separators = os.sep + (os.altsep or '')
    base_name = os.path.basename(file_path.rstrip(separators))

    # Split on the LAST underscore only; everything before it is the name.
    head, underscore, _ = base_name.rpartition('_')
    return head if underscore else base_name

def load_images_from_folder(folder):
    """Load every ``.jpg`` file in ``folder`` as a grayscale image.

    Args:
        folder: Path of the directory to read images from.

    Returns:
        A list with one entry per successfully read image, in sorted
        (lexicographic) file-name order. Files for which ``cv2.imread``
        returns ``None`` are skipped.
    """
    all_images = []
    # os.listdir returns entries in arbitrary order; sort so the result is
    # reproducible across runs and filesystems. NOTE: the order is
    # lexicographic, so it matches frame order only if frame numbers in the
    # file names are zero-padded.
    image_files = sorted(f for f in os.listdir(folder) if f.endswith(".jpg"))
    for filename in image_files:
        filepath = os.path.join(folder, filename)
        image = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)
        # Skip files that could not be read instead of storing None.
        if image is not None:
            all_images.append(image)
    return all_images

### Image Loading ###

In [None]:
# Root directory holding the extracted frames; its sub-directories are treated as strains
output_root_path = '/Volumes/TOSHIBA_EXT/Phenotype_features_collective/ExtractedFrames'

# Experiment label -> list of images loaded for that experiment
labels_strain_list = {}

# Keep only the directories found directly under the root (one per strain)
strain_folders = list(filter(
    lambda name: os.path.isdir(os.path.join(output_root_path, name)),
    os.listdir(output_root_path),
))

for strain in strain_folders:
    # Every sub-directory of a strain folder is treated as one experiment
    strain_dir = os.path.join(output_root_path, strain)
    experiment_folders = [
        path
        for path in (os.path.join(strain_dir, entry) for entry in os.listdir(strain_dir))
        if os.path.isdir(path)
    ]
    for folder in experiment_folders:
        images = load_images_from_folder(folder)
        experiment_label = extract_experiment_name(folder)
        # NOTE: if two folders yield the same label, the later one replaces the earlier
        labels_strain_list[experiment_label] = images

# At this point 'labels_strain_list' maps each experiment label to its loaded images

# Feature Extraction #