DATA DOWNLOADING

In [None]:
# NOTE: Dataset files are available upon request
# The original dataset was stored in Google Cloud Storage
# Please contact the repository maintainers for access to the training data

# !gsutil cp -r gs://BUCKET_PATH .

In [None]:
# NOTE: Dataset files are available upon request
# The original dataset was downloaded from Roboflow
# API key has been removed for security. Please use your own Roboflow account.

# !pip install roboflow
# 
# from roboflow import Roboflow
# rf = Roboflow(api_key="YOUR_API_KEY_HERE")
# project = rf.workspace("WORKSPACE_ID").project("PROJECT_ID")
# version = project.version(2)
# dataset = version.download("yolov11")

DATA PREPROCESSING FOR YOLOv11 TRAINING PIPELINE

Combining multiple dataset folders, merging images and labels, and splitting them into train/validation/test sets in YOLO format.

In [None]:
import os
import shutil
import random
from pathlib import Path
from typing import List, Tuple
import yaml


def _split_index(total: int, ratio: float) -> int:
    """Return the slice index for `ratio` of `total`, tolerant of float round-off."""
    # The small epsilon keeps e.g. 10 * (0.7 + 0.3) == 9.999... from truncating to 9.
    return max(0, min(total, int(total * ratio + 1e-9)))


def _unique_stem(stem: str, folder_name: str, used_stems: set) -> str:
    """Return an output file stem that does not clash with any stem already in `used_stems`."""
    candidate = stem
    if candidate.lower() in used_stems:
        # Same file name seen in an earlier folder: disambiguate with the folder name.
        candidate = f"{folder_name}_{stem}"
    base = candidate
    counter = 1
    while candidate.lower() in used_stems:
        candidate = f"{base}_{counter}"
        counter += 1
    # Compared case-insensitively so names also stay distinct on case-insensitive filesystems.
    used_stems.add(candidate.lower())
    return candidate


def _copy_pairs(pairs, images_dir: Path, labels_dir: Path) -> None:
    """Copy each (image, label, output_stem) triple into the given split directories."""
    for img_path, label_path, out_stem in pairs:
        shutil.copy2(img_path, images_dir / f"{out_stem}{img_path.suffix}")
        shutil.copy2(label_path, labels_dir / f"{out_stem}.txt")


def combine_datasets(source_folders: List[str], output_dir: str, train_ratio: float = 0.7, val_ratio: float = 0.2, seed: int = 42):
    """
    Combine multiple dataset folders into a single YOLO format dataset with train/val/test split.

    Args:
        source_folders: List of paths to dataset folders (each should contain images/ and labels/ subdirs)
        output_dir: Path to output directory for combined dataset
        train_ratio: Ratio of data to use for training (0.0-1.0)
        val_ratio: Ratio of data to use for validation (0.0-1.0)
        seed: Random seed for reproducible splits (seeds the global `random` generator)

    Returns:
        Tuple (train_count, val_count, test_count) with the number of pairs copied to each split.

    Raises:
        ValueError: If a ratio is outside 0.0-1.0 or train_ratio + val_ratio exceeds 1.0.

    Note: test_ratio = 1.0 - train_ratio - val_ratio

    Files are copied under their original names. When the same file name occurs more
    than once across the source folders, later occurrences are prefixed with their
    source folder name so that no image/label pair overwrites another. Existing
    files in output_dir are not removed.
    """
    # Validate the split first so that a bad configuration fails before any filesystem work.
    if not (0.0 <= train_ratio <= 1.0) or not (0.0 <= val_ratio <= 1.0):
        raise ValueError("train_ratio and val_ratio must each be between 0.0 and 1.0")
    test_ratio = 1.0 - train_ratio - val_ratio
    # Tolerate float round-off: 1.0 - 0.8 - 0.2 evaluates to about -5.6e-17, not 0.
    if test_ratio < -1e-9:
        raise ValueError("train_ratio + val_ratio cannot exceed 1.0")
    test_ratio = max(0.0, test_ratio)

    random.seed(seed)

    # Create output directory structure
    output_path = Path(output_dir)
    train_images_dir = output_path / "train" / "images"
    train_labels_dir = output_path / "train" / "labels"
    val_images_dir = output_path / "val" / "images"
    val_labels_dir = output_path / "val" / "labels"
    test_images_dir = output_path / "test" / "images"
    test_labels_dir = output_path / "test" / "labels"

    for dir_path in [train_images_dir, train_labels_dir, val_images_dir, val_labels_dir, test_images_dir, test_labels_dir]:
        dir_path.mkdir(parents=True, exist_ok=True)

    # Collect all image-label pairs from source folders as (image, label, output_stem)
    image_suffixes = {'.jpg', '.jpeg', '.png', '.bmp'}
    all_pairs = []
    used_stems = set()
    renamed_count = 0

    for folder in source_folders:
        folder_path = Path(folder)
        images_dir = folder_path / "images"
        labels_dir = folder_path / "labels"

        if not images_dir.exists() or not labels_dir.exists():
            print(f"Warning: {folder} missing images/ or labels/ subdirectory")
            continue

        # Sorted so that the seeded shuffle below starts from the same order on every
        # machine; directory listing order is otherwise filesystem-dependent.
        for img_file in sorted(images_dir.glob("*")):
            if img_file.suffix.lower() not in image_suffixes:
                continue
            label_file = labels_dir / f"{img_file.stem}.txt"
            if not label_file.exists():
                print(f"Warning: No label found for {img_file}")
                continue
            out_stem = _unique_stem(img_file.stem, folder_path.name, used_stems)
            if out_stem != img_file.stem:
                renamed_count += 1
            all_pairs.append((img_file, label_file, out_stem))

    print(f"Found {len(all_pairs)} image-label pairs")
    if renamed_count:
        print(f"Note: renamed {renamed_count} pairs with duplicate file names to avoid overwriting")

    # Shuffle and split
    random.shuffle(all_pairs)
    train_split_idx = _split_index(len(all_pairs), train_ratio)
    val_split_idx = _split_index(len(all_pairs), train_ratio + val_ratio)

    train_pairs = all_pairs[:train_split_idx]
    val_pairs = all_pairs[train_split_idx:val_split_idx]
    test_pairs = all_pairs[val_split_idx:]

    print(f"Train: {len(train_pairs)} samples ({train_ratio:.1%})")
    print(f"Val: {len(val_pairs)} samples ({val_ratio:.1%})")
    print(f"Test: {len(test_pairs)} samples ({test_ratio:.1%})")

    # Copy files into their split directories
    _copy_pairs(train_pairs, train_images_dir, train_labels_dir)
    _copy_pairs(val_pairs, val_images_dir, val_labels_dir)
    _copy_pairs(test_pairs, test_images_dir, test_labels_dir)

    print(f"Dataset combined successfully in {output_dir}")
    return len(train_pairs), len(val_pairs), len(test_pairs)


def create_yolo_config(output_dir: str, class_names: List[str]):
    """
    Create YOLO dataset configuration file (data.yaml) inside the dataset directory.

    Args:
        output_dir: Path to dataset directory (created if it does not exist yet)
        class_names: Class names for the dataset, in class-index order

    Returns:
        Path to the written data.yaml file.
    """
    dataset_dir = Path(output_dir)
    # Allow calling this before (or without) combine_datasets having created the folder.
    dataset_dir.mkdir(parents=True, exist_ok=True)

    # Coerce to plain str/list: yaml.dump would otherwise emit Python-specific tags
    # for Path or tuple arguments instead of plain YAML scalars/sequences.
    names = list(class_names)
    config = {
        'path': str(output_dir),
        'train': 'train',
        'val': 'val',
        'test': 'test',
        'nc': len(names),
        'names': names
    }

    config_path = dataset_dir / "data.yaml"
    with open(config_path, 'w') as f:
        yaml.dump(config, f, default_flow_style=False)

    print(f"YOLO config saved to {config_path}")
    return config_path


if __name__ == "__main__":
    # ---- Configuration -------------------------------------------------
    # Dataset folders to merge; each is expected to hold images/ and labels/.
    SOURCE_FOLDERS = [
        # Add your dataset folder paths here
        "PATH_1",
        "PATH_2",
        "PATH_3"
    ]

    OUTPUT_DIR = "combined_yolo_dataset"
    TRAIN_RATIO = 0.7    # 70% of the pairs go to training
    VAL_RATIO = 0.15     # 15% go to validation; the remaining 15% become the test split
    RANDOM_SEED = 42     # seed passed to combine_datasets for the shuffle

    # Class names for spine vertebrae
    CLASS_NAMES = ['L1', 'L2', 'L3', 'L4', 'L5']

    # ---- Run -----------------------------------------------------------
    if not SOURCE_FOLDERS:
        # Nothing to process until at least one source folder is listed.
        print("Please specify SOURCE_FOLDERS in the configuration section")
    else:
        print("Starting data preprocessing...")

        # Merge the source folders and split them into train/val/test.
        train_count, val_count, test_count = combine_datasets(
            source_folders=SOURCE_FOLDERS,
            output_dir=OUTPUT_DIR,
            train_ratio=TRAIN_RATIO,
            val_ratio=VAL_RATIO,
            seed=RANDOM_SEED
        )

        # Write data.yaml next to the split folders.
        config_path = create_yolo_config(OUTPUT_DIR, CLASS_NAMES)

        # Summary of the run, one item per line.
        print(
            "\n=== Data Preprocessing Complete ===",
            f"Training samples: {train_count}",
            f"Validation samples: {val_count}",
            f"Test samples: {test_count}",
            f"Dataset ready at: {OUTPUT_DIR}",
            f"Config file: {config_path}",
            sep="\n",
        )

Starting data preprocessing...
Found 15553 image-label pairs
Train: 10887 samples (70.0%)
Val: 2333 samples (15.0%)
Test: 2333 samples (15.0%)
Dataset combined successfully in combined_yolo_dataset
YOLO config saved to combined_yolo_dataset\data.yaml

=== Data Preprocessing Complete ===
Training samples: 10887
Validation samples: 2333
Test samples: 2333
Dataset ready at: combined_yolo_dataset
Config file: combined_yolo_dataset\data.yaml


COLOR GRADING

In [6]:
# import cv2
# import numpy as np
# import matplotlib.pyplot as plt
# from pathlib import Path
# import os

# # Get all images from dataset
# dataset_path = Path("combined_yolo_dataset")
# image_paths = []
# for split in ["train", "val", "test"]:
#     images_dir = dataset_path / split / "images"
#     if images_dir.exists():
#         for img_file in images_dir.glob("*.jpg"):
#             image_paths.append(img_file)
#         for img_file in images_dir.glob("*.png"):
#             image_paths.append(img_file)

# print(f"Found {len(image_paths)} images")

In [7]:
# import random

# sample_images = random.sample(image_paths, min(10, len(image_paths)))

# for i, img_path in enumerate(sample_images):
#     img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)

#     # Create color grading options
#     original = img
#     histogram = cv2.equalizeHist(img)
#     clahe_light = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8)).apply(img)
#     clahe_strong = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8)).apply(img)

#     # Display options
#     fig, axes = plt.subplots(1, 4, figsize=(16, 4))
#     fig.suptitle(f'Color Grading Options - {img_path.name}')

#     axes[0].imshow(original, cmap='gray')
#     axes[0].set_title('Original')
#     axes[0].axis('off')

#     axes[1].imshow(histogram, cmap='gray')
#     axes[1].set_title('Histogram EQ')
#     axes[1].axis('off')

#     axes[2].imshow(clahe_light, cmap='gray')
#     axes[2].set_title('CLAHE Light')
#     axes[2].axis('off')

#     axes[3].imshow(clahe_strong, cmap='gray')
#     axes[3].set_title('CLAHE Strong')
#     axes[3].axis('off')

#     plt.tight_layout()
#     plt.show()

In [8]:
# # Cell 3: Apply chosen color grading to all images
# # Choose option: 0=original, 1=histogram, 2=clahe_light, 3=clahe_strong
# chosen_option = 3  # CLAHE STRONG

# print(f"Applying color grading option {chosen_option} to all images...")

# processed_count = 0
# for img_path in image_paths:
#     img = cv2.imread(str(img_path), cv2.IMREAD_GRAYSCALE)

#     if chosen_option == 0:
#         processed = img  # Original
#     elif chosen_option == 1:
#         processed = cv2.equalizeHist(img)  # Histogram
#     elif chosen_option == 2:
#         clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8,8))
#         processed = clahe.apply(img)  # CLAHE Light
#     elif chosen_option == 3:
#         clahe = cv2.createCLAHE(clipLimit=4.0, tileGridSize=(8,8))
#         processed = clahe.apply(img)  # CLAHE Strong

#     # Save processed image
#     cv2.imwrite(str(img_path), processed)
#     processed_count += 1

# print(f"Color grading applied to {processed_count} images")

MODEL TRAINING

In [9]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.218-py3-none-any.whl.metadata (37 kB)
Collecting opencv-python>=4.6.0 (from ultralytics)
  Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl.metadata (19 kB)
Collecting polars (from ultralytics)
  Downloading polars-1.34.0-py3-none-any.whl.metadata (10 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.17-py3-none-any.whl.metadata (14 kB)
Collecting polars-runtime-32==1.34.0 (from polars->ultralytics)
  Downloading polars_runtime_32-1.34.0-cp39-abi3-win_amd64.whl.metadata (1.5 kB)
Downloading ultralytics-8.3.218-py3-none-any.whl (1.1 MB)
   ---------------------------------------- 0.0/1.1 MB ? eta -:--:--
   ---------------------------------------- 1.1/1.1 MB 12.1 MB/s eta 0:00:00
Downloading opencv_python-4.12.0.88-cp37-abi3-win_amd64.whl (39.0 MB)
   ---------------------------------------- 0.0/39.0 MB ? eta -:--:--
   -- ------------------------------------- 2.6/39.0 MB 13.8 MB/s


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [10]:
# YOLOv11 training pipeline
from ultralytics import YOLO

# Start from the pretrained nano checkpoint
# (alternatives: yolo11s.pt, yolo11m.pt, yolo11l.pt, yolo11x.pt)
model = YOLO('yolo11n.pt')

# Data augmentation: keep the spine upright, allow only mild geometric changes
augmentation_settings = {
    'flipud': 0.0,     # no vertical flip (spine should stay vertical)
    'fliplr': 0.5,     # 50% horizontal flip
    'degrees': 0.0,    # no rotation
    'translate': 0.1,  # small translation
    'scale': 0.1,      # small scaling
    'shear': 0.0,      # no shearing
}

# Dataset, schedule and checkpointing
train_settings = {
    'data': 'combined_yolo_dataset/data.yaml',
    'epochs': 100,
    'imgsz': 640,
    'batch': 16,
    'patience': 10,
    'save_period': 10,
}

results = model.train(**train_settings, **augmentation_settings)


Creating new Ultralytics Settings v0.0.6 file  
View Ultralytics Settings with 'yolo settings' or at 'C:\Users\adytc\AppData\Roaming\Ultralytics\settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
[KDownloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n.pt to 'yolo11n.pt': 100% ━━━━━━━━━━━━ 5.4MB 12.6MB/s 0.4s.4s<0.0s.5s
Ultralytics 8.3.218  Python-3.13.4 torch-2.7.1+cu126 CUDA:0 (NVIDIA GeForce RTX 4060, 8188MiB)
[34m[1mengine\trainer: [0magnostic_nms=False, amp=True, augment=False, auto_augment=randaugment, batch=16, bgr=0.0, box=7.5, cache=False, cfg=None, classes=None, close_mosaic=10, cls=0.5, compile=False, conf=None, copy_paste=0.0, copy_paste_mode=flip, cos_lr=False, cutmix=0.0, data=combined_yolo_dataset/data.yaml, degrees=0.0, deterministic=True, device=None, dfl=1.5, dnn=False, dropout=0.0, dynamic=False, embed=None

In [None]:
# Validation
# Evaluates `model`, the object created and trained in the training cell above,
# so that cell must have been run in this kernel first.
# The recorded output below shows this call scanning the val split
# (combined_yolo_dataset\val\labels.cache).
# NOTE(review): the held-out test split is not evaluated anywhere in this notebook;
# presumably model.val(split='test') would do that - confirm against the Ultralytics docs.
val_results = model.val()

# # Export model
# model.export(format='onnx')

Ultralytics 8.3.218  Python-3.13.4 torch-2.7.1+cu126 CUDA:0 (NVIDIA GeForce RTX 4060, 8188MiB)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs
[34m[1mval: [0mFast image access  (ping: 0.10.0 ms, read: 2101.41232.5 MB/s, size: 1420.5 KB)
[K[34m[1mval: [0mScanning C:\vscode workspace\skai\combined_yolo_dataset\val\labels.cache... 2333 images, 0 backgrounds, 0 corrupt: 100% ━━━━━━━━━━━━ 2333/2333 5.5Mit/s 0.0s0s
[K                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% ━━━━━━━━━━━━ 146/146 9.3it/s 15.6s<0.2s
                   all       2333      11659      0.955      0.965      0.979      0.867
                    L1       2328       2329      0.957      0.963      0.978      0.866
                    L2       2330       2332      0.955      0.962      0.979       0.88
                    L3       2332       2332      0.958      0.966      0.981      0.895
                    L4       2333       2335      0

EXPORT TO WEB APPLICATION

In [4]:
!pip install ultralytics

from ultralytics import YOLO
import shutil
from pathlib import Path

# Load the best trained model
model = YOLO('runs/detect/train/weights/best.pt')

# Export to ONNX format with WEB-COMPATIBLE settings
print("Exporting model to ONNX format for web deployment...")
onnx_path = model.export(
    format='onnx',
    imgsz=640,          # Input image size
    opset=12,           # ONNX opset version (12+ recommended for web)
    simplify=True,      # Simplify model graph
    dynamic=False,      # Fixed input size (required for web)
    half=False,         # Use FP32 (required for browser compatibility)
    int8=False,         # No INT8 quantization
    batch=1             # Fixed batch size
)
print(f"Model exported to: {onnx_path}")

# Copy to web application public folder
web_models_dir = Path('../public/models')
web_models_dir.mkdir(parents=True, exist_ok=True)

destination = web_models_dir / 'best.onnx'
shutil.copy2(onnx_path, destination)

print(f"\n✅ Model successfully exported to web application!")
print(f"   Location: {destination}")
print(f"   Size: {destination.stat().st_size / (1024*1024):.2f} MB")
print(f"\n📝 Export settings (web-compatible):")
print(f"   - ONNX opset: 12 (browser-compatible)")
print(f"   - Precision: FP32 (full precision)")
print(f"   - Dynamic shapes: Disabled (fixed 640x640 input)")
print(f"   - Simplification: Enabled (optimized graph)")
print(f"\nThe web application will now use this updated model.")

Exporting model to ONNX format for web deployment...
Ultralytics 8.3.222  Python-3.13.5 torch-2.9.0+cu126 CPU (AMD Ryzen 5 8400F 6-Core Processor)
YOLO11n summary (fused): 100 layers, 2,583,127 parameters, 0 gradients, 6.3 GFLOPs

[34m[1mPyTorch:[0m starting from 'runs\detect\train\weights\best.pt' with input shape (1, 3, 640, 640) BCHW and output shape(s) (1, 9, 8400) (5.2 MB)

[34m[1mONNX:[0m starting export with onnx 1.19.1 opset 12...
[34m[1mONNX:[0m slimming with onnxslim 0.1.72...
[34m[1mONNX:[0m export success  0.9s, saved as 'runs\detect\train\weights\best.onnx' (10.1 MB)

Export complete (1.1s)
Results saved to [1mC:\vscode workspace\sk.ai\sk.ai\model\runs\detect\train\weights[0m
Predict:         yolo predict task=detect model=runs\detect\train\weights\best.onnx imgsz=640  
Validate:        yolo val task=detect model=runs\detect\train\weights\best.onnx imgsz=640 data=combined_yolo_dataset/data.yaml  
Visualize:       https://netron.app
Model exported to: runs\det