# Instance Segmentation - YOLOv11 Training
Fine-tune YOLOv11-seg for instance segmentation on Apple Silicon (MPS) or CUDA, falling back to CPU when neither is available.

In [None]:
# !pip install ultralytics

In [None]:
import os
from pathlib import Path
from ultralytics import YOLO
import torch

# Report the installed PyTorch version and which accelerator backends it can see.
print(
    f'PyTorch: {torch.__version__}',
    f'MPS available: {torch.backends.mps.is_available()}',
    f'CUDA available: {torch.cuda.is_available()}',
    sep='\n',
)

## Configuration

In [None]:
# Path to the dataset's data.yaml (YOLOv8/v11 segmentation layout).
# Each label row is: <class_id> <x1> <y1> <x2> <y2> ... <xn> <yn> (normalized polygon coords)
DATA_YAML = '<path_to_data.yaml>'

# Directory that receives the trained model
OUTPUT_DIR = './checkpoints/yolo11s_seg'

# Pretrained weights to start from (segmentation variants end with -seg):
# yolo11n-seg (nano), yolo11s-seg (small), yolo11m-seg (medium), yolo11l-seg (large)
MODEL = 'yolo11s-seg.pt'  # small - good balance of speed and accuracy

# Training hyperparameters
EPOCHS = 100
BATCH_SIZE = 8  # kept low: segmentation is more memory intensive
IMG_SIZE = 640  # input resolution
PATIENCE = 15   # early stopping patience

# Compute device, in order of preference: MPS, then CUDA device id 0, then CPU
DEVICE = (
    'mps' if torch.backends.mps.is_available()
    else 0 if torch.cuda.is_available()
    else 'cpu'
)

# Echo the effective settings so they are recorded in the cell output
print(
    f'Device: {DEVICE}',
    f'Model: {MODEL}',
    f'Epochs: {EPOCHS}',
    f'Batch size: {BATCH_SIZE}',
    f'Image size: {IMG_SIZE}',
    sep='\n',
)

## Load Model

In [None]:
# Build the model from the weights named in MODEL and report what was loaded
model = YOLO(MODEL)
print(f'Loaded {MODEL}', f'Task: {model.task}', sep='\n')

## Training

In [None]:
# Train the segmentation model.
# All settings are collected in one mapping and then unpacked into model.train().
train_settings = dict(
    # Data and schedule (taken from the configuration cell)
    data=DATA_YAML,
    epochs=EPOCHS,
    batch=BATCH_SIZE,
    imgsz=IMG_SIZE,
    device=DEVICE,
    patience=PATIENCE,
    # Output location: <OUTPUT_DIR>/train, reused across runs
    project=OUTPUT_DIR,
    name='train',
    exist_ok=True,
    # Optimization
    pretrained=True,
    optimizer='AdamW',
    lr0=0.001,
    lrf=0.01,
    warmup_epochs=3,
    close_mosaic=10,
    # Artifacts and validation
    plots=True,
    save=True,
    val=True,
    # Segmentation-specific
    mask_ratio=4,  # mask downsample ratio
    overlap_mask=True,  # overlap masks during training
)
results = model.train(**train_settings)

print('\nTraining complete!')

## Evaluate on Validation Set

In [None]:
# Load the best checkpoint from the training output directory and validate it
best_weights = f'{OUTPUT_DIR}/train/weights/best.pt'
best_model = YOLO(best_weights)
val_results = best_model.val(data=DATA_YAML)

# Box metrics
box_metrics = val_results.box
print(f"\nBox mAP50: {box_metrics.map50:.4f}")
print(f"Box mAP50-95: {box_metrics.map:.4f}")

# Mask metrics
mask_metrics = val_results.seg
print(f"\nMask mAP50: {mask_metrics.map50:.4f}")
print(f"Mask mAP50-95: {mask_metrics.map:.4f}")

## Single Image Inference

In [None]:
import cv2
import numpy as np
from IPython.display import display, Image
import tempfile

def predict_and_display(model, image_path, conf=0.5):
    """Run segmentation inference on one image and display the annotated result.

    Args:
        model: Object exposing ``predict(source, conf=...)``. The first item of
            its return value must provide ``plot()``, ``masks``, ``boxes`` and
            ``names``.
        image_path: Image source, passed through to ``model.predict`` unchanged.
        conf: Confidence threshold forwarded to ``model.predict``.

    Returns:
        The results object returned by ``model.predict``.

    Raises:
        RuntimeError: If the annotated image cannot be encoded as JPEG.
    """
    results = model.predict(image_path, conf=conf)
    first = results[0]

    # Get annotated image with masks
    annotated = first.plot()

    # Encode in memory instead of going through a temp file: nothing is left
    # behind on disk, and no file has to be re-opened by path while a handle
    # to it is still held open.
    ok, encoded = cv2.imencode('.jpg', annotated)
    if not ok:
        raise RuntimeError('Failed to encode annotated image as JPEG')
    display(Image(data=encoded.tobytes(), format='jpeg'))

    # Print detections
    if first.masks is None:
        print('No objects detected')
        return results

    print(f'Found {len(first.masks)} objects')
    # Only the boxes are needed for the listing (class id and confidence).
    # `score` is used so the `conf` threshold parameter is not rebound.
    for index, box in enumerate(first.boxes, start=1):
        cls = int(box.cls)
        score = float(box.conf)
        name = first.names[cls]
        print(f'  {index}. {name}: {score:.2f}')

    return results

In [None]:
# Test on a single image
# predict_and_display(best_model, '/path/to/test/image.jpg', conf=0.5)

## Video Inference with Segmentation Masks

In [None]:
import cv2
import time

def process_video_segmentation(model, input_path, output_path, conf=0.5, display=False):
    """Run segmentation on every frame of a video and write an annotated copy.

    Args:
        model: Object exposing ``predict(frame, conf=..., verbose=...)``. The
            first item of its return value must provide ``plot()``.
        input_path: Source passed to ``cv2.VideoCapture``.
        output_path: Destination passed to ``cv2.VideoWriter`` (written with
            the ``mp4v`` codec).
        conf: Confidence threshold forwarded to ``model.predict``.
        display: When true, show each annotated frame in an OpenCV window;
            pressing ``q`` stops processing early.

    Returns:
        Number of frames written.

    Raises:
        OSError: If the input video cannot be opened or the output writer
            cannot be created.
    """
    cap = cv2.VideoCapture(input_path)
    if not cap.isOpened():
        cap.release()
        raise OSError(f'Could not open input video: {input_path}')

    out = None
    frame_count = 0
    total_time = 0.0

    # try/finally so the capture, writer and preview window are released even
    # when inference or writing raises part-way through the video.
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        if not (fps > 0):
            # The capture did not report a usable frame rate (0 or NaN);
            # fall back to a conventional default for the writer.
            fps = 30.0
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        if not out.isOpened():
            raise OSError(f'Could not open output video for writing: {output_path}')

        print(f'Processing: {width}x{height} @ {fps:.1f} fps ({total_frames} frames)')

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Predict with segmentation; only the inference call is timed.
            # perf_counter is monotonic, so the interval cannot go negative.
            start = time.perf_counter()
            results = model.predict(frame, conf=conf, verbose=False)
            total_time += time.perf_counter() - start

            # Draw results (includes masks)
            annotated = results[0].plot()
            out.write(annotated)
            frame_count += 1

            if frame_count % 100 == 0 and total_time > 0:
                avg_fps = frame_count / total_time
                print(f'Progress: {frame_count}/{total_frames} ({avg_fps:.1f} FPS)')

            if display:
                cv2.imshow('Segmentation Output', annotated)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        cap.release()
        if out is not None:
            out.release()
        if display:
            cv2.destroyAllWindows()

    # Without frames (or measurable time) there is no rate to report; return
    # instead of dividing by zero.
    if frame_count == 0 or total_time <= 0:
        print(f'\nDone. No inference timing available ({frame_count} frames written to {output_path})')
        return frame_count

    avg_fps = frame_count / total_time
    print(f'\nDone. Saved to {output_path}')
    print(f'Average inference: {avg_fps:.1f} FPS ({1000/avg_fps:.1f}ms per frame)')
    return frame_count

In [None]:
# Process a video (update paths)
# process_video_segmentation(
#     best_model,
#     input_path='/path/to/input.mp4',
#     output_path='/path/to/output_seg.mp4',
#     conf=0.5,
#     display=True
# )

## Export Model

In [None]:
# Export to various formats
# best_model.export(format='onnx')  # ONNX
# best_model.export(format='coreml')  # CoreML for iOS/macOS
# best_model.export(format='engine')  # TensorRT for NVIDIA GPUs