In [1]:
#import required libraries
import os
import numpy as np
import pandas as pd
from PIL import Image
import shutil
import time
import yaml
from pathlib import Path
from tqdm.notebook import tqdm

In [9]:
# Set random seed for reproducibility
np.random.seed(42)

# Define dataset paths
data_path = r"C:\Users\puvia\OneDrive\Documents\GitHub\Locating-Bacterial-Flagellar-Motors-from-3D-tomographs-using-YOLO-model\data"
train_dir = os.path.join(data_path, "train")
test_dir = os.path.join(data_path, "test")  # Separate test dataset
labels_file = os.path.join(data_path, "train_labels.csv")

# Define YOLO dataset structure
yolo_dataset_dir = r"C:\Users\puvia\OneDrive\Documents\GitHub\Locating-Bacterial-Flagellar-Motors-from-3D-tomographs-using-YOLO-model\data\yolo_dataset"
yolo_images_train = os.path.join(yolo_dataset_dir, "images", "train")
yolo_images_test = os.path.join(yolo_dataset_dir, "images", "test")
yolo_labels_train = os.path.join(yolo_dataset_dir, "labels", "train")
yolo_labels_test = os.path.join(yolo_dataset_dir, "labels", "test")

# Create directories
for dir_path in [yolo_images_train, yolo_images_test, yolo_labels_train, yolo_labels_test]:
    os.makedirs(dir_path, exist_ok=True)

# Constants
TRUST = 4  # Number of slices above and below center slice (total 2*TRUST + 1 slices)
BOX_SIZE = 24  # Bounding box size for annotations (in pixels)

def normalize_slice(slice_data):
    """
    We convert the images into arrays and use only the pixels between 2 to 98 percentiles which means, not too dark or white.
    """
    p2, p98 = np.percentile(slice_data, [2, 98])
    clipped_data = np.clip(slice_data, p2, p98)
    return np.uint8(255 * (clipped_data - p2) / (p98 - p2))

def process_tomogram_set(tomogram_ids, images_dir, labels_dir, set_name):
    """
    Process a set of tomograms for YOLO format
    """
    labels_df = pd.read_csv(labels_file)
    motor_counts = []
    
    for tomo_id in tomogram_ids: #each folder in the train or test dataset folder
        tomo_motors = labels_df[labels_df['tomo_id'] == tomo_id] #pick only the tomo_id of the available data
        for _, motor in tomo_motors.iterrows(): #for every index and row in tomo_motors dataset
            if pd.isna(motor['Motor axis 0']): #skip if axis 0 is not available
                continue 
            motor_counts.append( #or add a list of tuples with the following column data of each row
                (tomo_id, 
                 int(motor['Motor axis 0']), #z_center
                 int(motor['Motor axis 1']), #x_center
                 int(motor['Motor axis 2']), #y_center
                 int(motor['Array shape (axis 0)'])) #z_max
            )
    
    processed_slices = 0
    for tomo_id, z_center, y_center, x_center, z_max in motor_counts:

        #YOLO only considers center slice, TRUST number of slices before the center, TRUST number of slices after the center
        for z in range(max(0, z_center - TRUST), min(z_max - 1, z_center + TRUST) + 1):
            slice_filename = f"slice_{z:04d}.jpg"
            src_path = os.path.join(train_dir, tomo_id, slice_filename)
            
            if not os.path.exists(src_path):
                print(f"Warning: {src_path} does not exist, skipping.")
                continue
            
            img = Image.open(src_path)
            img_array = np.array(img)
            normalized_img = normalize_slice(img_array)
            
            dest_filename = f"{tomo_id}_z{z:04d}_y{y_center:04d}_x{x_center:04d}.jpg"
            dest_path = os.path.join(images_dir, dest_filename)
            Image.fromarray(normalized_img).save(dest_path)
            
            img_width, img_height = img.size
            label_path = os.path.join(labels_dir, dest_filename.replace('.jpg', '.txt'))
            with open(label_path, 'w') as f:
                f.write(f"0 {x_center/img_width} {y_center/img_height} {BOX_SIZE/img_width} {BOX_SIZE/img_height}\n")
            
            processed_slices += 1
    
    return processed_slices, len(motor_counts)




In [10]:
# Process training and test datasets
train_tomos = os.listdir(train_dir)
test_tomos = os.listdir(test_dir)

total_train_slices, total_train_motors = process_tomogram_set(train_tomos, yolo_images_train, yolo_labels_train, "training")
total_test_slices, total_test_motors = process_tomogram_set(test_tomos, yolo_images_test, yolo_labels_test, "test")

# Create YAML configuration file for YOLO
yaml_content = {
    'path': yolo_dataset_dir,
    'train': 'images/train',
    'val': 'images/test',
    'names': {0: 'motor'}
}
with open(os.path.join(yolo_dataset_dir, 'dataset.yaml'), 'w') as f:
    yaml.dump(yaml_content, f, default_flow_style=False)

print(f"\nDataset ready for YOLO:")
print(f"- Training data: {total_train_motors} motors, {total_train_slices} slices")
print(f"- Test data: {total_test_motors} motors, {total_test_slices} slices")
print(f"- Dataset directory: {yolo_dataset_dir}")
print(f"- YAML configuration: {os.path.join(yolo_dataset_dir, 'dataset.yaml')}")


Dataset ready for YOLO:
- Training data: 737 motors, 5198 slices
- Test data: 3 motors, 22 slices
- Dataset directory: C:\Users\puvia\OneDrive\Documents\GitHub\Locating-Bacterial-Flagellar-Motors-from-3D-tomographs-using-YOLO-model\data\yolo_dataset
- YAML configuration: C:\Users\puvia\OneDrive\Documents\GitHub\Locating-Bacterial-Flagellar-Motors-from-3D-tomographs-using-YOLO-model\data\yolo_dataset\dataset.yaml
