In [None]:
import os
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
from skimage.measure import label, regionprops
from skimage.draw import circle_perimeter
from math import sqrt
from skimage.io import imread
from skimage.transform import resize
from skimage.filters import threshold_otsu
from tabulate import tabulate
from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
import matplotlib.image as mpimg
from matplotlib import rcParams
import h5py    # for reading HDF5 files 
import cv2     # for image conversion
import re    # use it to sort but not sure what it is!
from scipy.stats import zscore
from skimage import draw, measure
import pywt
from scipy.spatial.distance import cdist
import seaborn as sns
from scipy.stats import multivariate_normal
import matplotlib.colors as colors
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
import cv2
from skimage.filters import threshold_otsu
from skimage import color
import glob

# Data extraction #

### Data set information overview: ###

1. Divergent Sets:

    Consists of 12 genotypically distinct strains. These are strains that are further apart on the phenotypic tree. 
    Ideal for determining broad-sense heritability with repeated measurements of specific phenotypes.
    This allows us to look into the relationship between the behavior and the genetic makeup. 

2. Mapping Sets:

    Comprises 48 strains suitable for collecting phenotype data for broad diversity or genome-wide association studies.
    Using multiple strain sets can enhance the statistical strength in genome-wide association studies.

### Pipeline: ###

1. Take hdf5 tierpsy output and extract frames

ATTENTION: change the output_root_path depending on which data set you are using 

In [None]:
def extract_label_from_path(file_path):
    """Return the strain label embedded in a file name.

    The label is the part of the base file name that starts at index 6 and
    stops just before the first underscore found at or after that index.

    Parameters
    ----------
    file_path : str
        Path whose last component carries the label.

    Returns
    -------
    str
        The extracted label, or "Unknown" when the file name contains no
        underscore at or after index 6.
    """
    label_start = 6
    file_name = os.path.basename(file_path)

    # str.find signals "not found" with -1 instead of raising.
    label_end = file_name.find('_', label_start)
    if label_end == -1:
        return "Unknown"

    return file_name[label_start:label_end]

### Accessing single strains all videos ###

1. Keeping all frames separated

2. Putting all frames from all videos in same folder for analysis 

Extract frames from each hdf5 file: 


This code snippet is designed to process a series of HDF5 files, specifically for extracting and saving image data:

1. Iterate Over HDF5 Files: The for loop iterates over a list of file paths (hdf5_file_paths).

2. Open HDF5 File: Inside the loop, each file is opened using h5py.File.

3. Access Image Dataset: The script accesses a dataset named /mask within each HDF5 file. This dataset presumably contains image data. Information about the dataset, like its shape and data type, is printed out.

4. Frame Extraction Step: A variable step is set, which determines the interval at which frames (or elements) from the dataset will be extracted. In this code, it's set to 500, meaning every 500th frame will be considered.

5. Output Directory Preparation: For each HDF5 file, an output directory is created to store the extracted images. This directory is based on the name of the HDF5 file and is located within a root output path (output_root_path).

6. Create Directory If Necessary: The script checks if the output directory already exists. If it doesn't, it creates the directory using os.makedirs.

7. Extract and Save Frames: Another loop iterates through the image dataset (img_ds2), jumping step frames at a time. Each selected frame is saved as a JPEG image. The naming convention for these images is based on their index in the dataset, formatted to have six digits with leading zeros if necessary.

This code is useful for batch processing HDF5 files containing image data, particularly when you only need to extract and save certain frames (e.g., for downsampling or reducing data size).

### 1. ###

In [None]:
def ensure_directory_exists(path):
    """Create the directory ``path`` (and any missing parents) if needed.

    ``exist_ok=True`` lets ``os.makedirs`` deal with the "already there"
    case itself, so there is no gap between checking for the directory and
    creating it in which another process could create it first.

    Parameters
    ----------
    path : str
        Directory that must exist after the call.

    Raises
    ------
    FileExistsError
        If ``path`` exists but is not a directory.
    """
    os.makedirs(path, exist_ok=True)

def extract_frames_from_hdf5(file_paths, output_directories, step=500):
    """Save every ``step``-th frame of each file's ``/mask`` dataset as a JPEG.

    Frames are named after their zero-padded index in the dataset
    (``000000.jpg``, ``000500.jpg``, ...).

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the HDF5 files to read.
    output_directories : mapping
        Maps each entry of ``file_paths`` to the folder its frames are
        written to. The folder is created if it does not exist yet.
    step : int, optional
        Interval between saved frames along the first axis of ``/mask``.

    Raises
    ------
    KeyError
        If a path in ``file_paths`` has no entry in ``output_directories``.
    """
    for path in file_paths:
        out_dir = output_directories[path]
        # cv2.imwrite does not create missing folders; it only reports
        # failure through its return value, so make sure the target exists.
        os.makedirs(out_dir, exist_ok=True)

        with h5py.File(path, 'r') as hdf:
            img_ds2 = hdf['/mask']
            print(f'Image Dataset info: Shape={img_ds2.shape}, Dtype={img_ds2.dtype}')
            for i in range(0, img_ds2.shape[0], step):
                name = "{:06d}.jpg".format(i)
                target = os.path.join(out_dir, name)
                # Surface failed writes instead of silently dropping frames.
                if not cv2.imwrite(target, img_ds2[i, :]):
                    print(f"Failed to write image: {target}")

def load_images_from_folder(main_folder, every_jth_image=10):
    """Load a subsample of the JPEG frames in each subfolder as grayscale.

    Every direct subfolder of ``main_folder`` is scanned for ``.jpg`` files.
    Within a subfolder the files are sorted by name and every
    ``every_jth_image``-th one is read with OpenCV in grayscale mode.

    Parameters
    ----------
    main_folder : str
        Folder whose direct subfolders contain the extracted frames.
    every_jth_image : int, optional
        Stride used when picking files from each sorted file list.

    Returns
    -------
    list
        All successfully loaded images as one flat list, ordered by
        subfolder name and then by file name. Files that OpenCV cannot
        read are reported and skipped.
    """
    all_images = []

    # os.listdir gives no ordering guarantee; sort the subfolders so the
    # order of the returned images is reproducible across runs and machines.
    subfolders = sorted(
        os.path.join(main_folder, d)
        for d in os.listdir(main_folder)
        if os.path.isdir(os.path.join(main_folder, d))
    )

    for folder in subfolders:
        imagefiles = sorted(f for f in os.listdir(folder) if f.endswith(".jpg"))

        print(f"Found {len(imagefiles)} image files in {folder}")

        for ii in range(0, len(imagefiles), every_jth_image):
            filepath = os.path.join(folder, imagefiles[ii])
            currentimage = cv2.imread(filepath, cv2.IMREAD_GRAYSCALE)

            # cv2.imread returns None instead of raising on unreadable files.
            if currentimage is None:
                print(f"Failed to load image: {filepath}")
                continue

            all_images.append(currentimage)

    return all_images

# Set paths
# (Paths are set first: the earlier version loaded images from
# output_root_path before it was assigned, which fails on a fresh kernel.)
hdf5_folder_path = r'/Volumes/TOSHIBA_EXT/Phenotype_features_collective/Data/CB4856'
output_root_path = os.path.join(hdf5_folder_path, 'framesExtracted')

# Ensure the output root folder exists
ensure_directory_exists(output_root_path)

# Get HDF5 file paths and create output directories
# Sorted so the file order (and anything indexed by it) is reproducible.
hdf5_file_paths = sorted(glob.glob(os.path.join(hdf5_folder_path, '*.hdf5')))
# NOTE(review): create_output_directories is not defined in this part of the
# notebook. extract_frames_from_hdf5 indexes its result by HDF5 path and uses
# the value as a folder, so it presumably returns {hdf5_path: output_folder}
# - confirm it is defined before this cell runs.
output_directories = create_output_directories(hdf5_file_paths, output_root_path)

# Extract frames from HDF5 files
extract_frames_from_hdf5(hdf5_file_paths, output_directories)

# Load images (after extraction, so the freshly written frames are included)
all_images = load_images_from_folder(output_root_path)
print("Total number of images loaded: ", len(all_images))

# PLOT FRAMES FROM VIDEO

# One figure per video. all_images (from load_images_from_folder) is a flat
# list of 2-D frames with no per-video grouping, so iterating over it yields
# single frames rather than lists of frames. The frames are therefore read
# back here from each video's own output folder, which keeps every figure's
# frames and title matched to the same HDF5 file.
num_cols = 5
frame_stride = 10  # same subsampling as load_images_from_folder's default

for idx, file_path in enumerate(hdf5_file_paths):
    frame_dir = output_directories[file_path]

    # Collect this video's frames in file-name (= frame index) order
    frame_names = []
    if os.path.isdir(frame_dir):
        frame_names = sorted(f for f in os.listdir(frame_dir) if f.endswith(".jpg"))

    video_images = []
    for frame_name in frame_names[::frame_stride]:
        frame = cv2.imread(os.path.join(frame_dir, frame_name), cv2.IMREAD_GRAYSCALE)
        if frame is not None:  # cv2.imread returns None for unreadable files
            video_images.append(frame)

    num_frames = len(video_images)

    # Skip if no images are loaded
    if num_frames == 0:
        print(f"No images to display for index {idx}.")
        continue

    # Calculate the number of rows for the subplot grid
    num_rows = int(np.ceil(num_frames / num_cols))

    # Create a new figure for each video; squeeze=False always returns a
    # 2-D array of axes, even for a single row.
    fig, axs = plt.subplots(num_rows, num_cols, figsize=(15, 15), squeeze=False)
    axs = axs.ravel()

    # Set the title for the figure using the label
    # NOTE(review): labels_strain_list is not defined in this part of the
    # notebook; presumably it maps each HDF5 path to its strain label
    # (cf. extract_label_from_path) - confirm.
    label_file = labels_strain_list[file_path]
    fig.suptitle(f"Video from {label_file}", fontsize=16)

    # Hide every axis first so unused grid cells stay blank
    for ax in axs:
        ax.axis('off')

    # Plot each frame (grayscale reads are always 2-D)
    for ax, frame in zip(axs, video_images):
        ax.imshow(frame, cmap='gray')

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])  # Adjust layout to make room for title
    plt.show()
