# Wire Bounding Box Detection - RF-DETR Training
Fine-tune RF-DETR for wire detection, using Apple Silicon (MPS) acceleration when available and falling back to CUDA or CPU otherwise.

In [None]:
# Uncomment to install the dependencies into the kernel's environment:
# %pip install rfdetr supervision

In [2]:
import os

# Enable the MPS -> CPU fallback for unsupported operations (required for RF-DETR on
# Apple Silicon). The flag must be in the environment BEFORE torch is first imported,
# and rfdetr imports torch itself, so this assignment has to precede every import below.
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'

from pathlib import Path

import cv2
import matplotlib.pyplot as plt
import numpy as np
import supervision as sv
import torch
from PIL import Image
from rfdetr import RFDETRBase, RFDETRLarge

## Configuration

In [None]:
# DATA_DIR must point at a COCO-format dataset laid out as follows:
#   dataset/
#     train/
#       _annotations.coco.json
#       image1.jpg, image2.jpg, ...
#     valid/
#       _annotations.coco.json
#       image1.jpg, image2.jpg, ...
#     test/ (optional)
#       _annotations.coco.json
#       image1.jpg, image2.jpg, ...

# --- paths ---
DATA_DIR = '<data_dir>'
OUTPUT_DIR = './checkpoints/rfdetr_jan7'

# --- training hyperparameters ---
EPOCHS = 50
BATCH_SIZE = 4
GRAD_ACCUM_STEPS = 4  # effective batch size = BATCH_SIZE * GRAD_ACCUM_STEPS
LEARNING_RATE = 1e-4
RESOLUTION = 560  # input resolution (default 560)

# --- model variant: 'base' or 'large' ---
MODEL_VARIANT = 'base'

# --- device ---
# Accelerators are probed in priority order (MPS first, then CUDA); the first one that
# reports itself available wins, and CPU is used when none does.
_ACCELERATORS = (
    ('mps', torch.backends.mps.is_available,
     'Using MPS (Apple Silicon) with CPU fallback for unsupported ops'),
    ('cuda', torch.cuda.is_available, 'Using CUDA'),
)
DEVICE, _device_banner = next(
    ((name, banner) for name, is_available, banner in _ACCELERATORS if is_available()),
    ('cpu', 'Using CPU'),
)
print(_device_banner)

Using MPS (Apple Silicon) with CPU fallback for unsupported ops


## Load Model

In [3]:


# instantiate the variant named by MODEL_VARIANT; any value other than 'large' selects base
if MODEL_VARIANT != 'large':
    model, loaded_msg = RFDETRBase(), 'Loaded RF-DETR Base'
else:
    model, loaded_msg = RFDETRLarge(), 'Loaded RF-DETR Large'
print(loaded_msg)

Loading pretrain weights
Loaded RF-DETR Base


## Training

In [None]:
# make sure the checkpoint directory exists before training starts
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# echo the run configuration, one setting per line, followed by a blank line
run_summary = (
    f'Dataset: {DATA_DIR}',
    f'Output: {OUTPUT_DIR}',
    f'Device: {DEVICE}',
    f'Epochs: {EPOCHS}',
    f'Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE * GRAD_ACCUM_STEPS})',
    f'Learning rate: {LEARNING_RATE}',
)
print(*run_summary, sep='\n')
print()

# every training option in one place, handed to model.train() as keyword arguments
train_settings = dict(
    dataset_dir=DATA_DIR,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    grad_accum_steps=GRAD_ACCUM_STEPS,
    lr=LEARNING_RATE,
    resolution=RESOLUTION,
    output_dir=OUTPUT_DIR,
    device=DEVICE,
    use_ema=True,
    gradient_checkpointing=True,  # helps with memory on MPS
    checkpoint_interval=10,
)
model.train(**train_settings)

print('\nTraining complete!')

Dataset: /Users/elisd/Desktop/vult/data/wires-one-class-bbox-COCO-format
Output: ./checkpoints/rfdetr_jan7
Device: mps
Epochs: 50
Batch size: 4 (effective: 16)
Learning rate: 0.0001

Unable to initialize TensorBoard. Logging is turned off for this session.  Run 'pip install tensorboard' to enable logging.
Not using distributed mode
git:
  sha: N/A, status: clean, branch: N/A

Namespace(num_classes=2, grad_accum_steps=4, amp=True, lr=0.0001, lr_encoder=0.00015, batch_size=4, weight_decay=0.0001, epochs=50, lr_drop=100, clip_max_norm=0.1, lr_vit_layer_decay=0.8, lr_component_decay=0.7, do_benchmark=False, dropout=0, drop_path=0.0, drop_mode='standard', drop_schedule='constant', cutoff_epoch=0, pretrained_encoder=None, pretrain_weights='rf-detr-base.pth', pretrain_exclude_keys=None, pretrain_keys_modify_to_load=None, pretrained_distiller=None, encoder='dinov2_windowed_small', vit_encoder_num_layers=12, window_block_indexes=None, position_embedding='sine', out_feature_indexes=[2, 5, 8, 11]

fatal: not a git repository (or any of the parent directories): .git


Epoch: [0]  [  0/276]  eta: 2:33:21  lr: 0.000100  class_error: -0.00  loss: 14.0371 (14.0371)  loss_ce: 0.6401 (0.6401)  loss_bbox: 1.0160 (1.0160)  loss_giou: 1.4825 (1.4825)  loss_ce_0: 0.5939 (0.5939)  loss_bbox_0: 1.6259 (1.6259)  loss_giou_0: 1.4471 (1.4471)  loss_ce_1: 0.5547 (0.5547)  loss_bbox_1: 1.3343 (1.3343)  loss_giou_1: 1.6193 (1.6193)  loss_ce_enc: 0.5320 (0.5320)  loss_bbox_enc: 1.5720 (1.5720)  loss_giou_enc: 1.6192 (1.6192)  loss_ce_unscaled: 0.6401 (0.6401)  class_error_unscaled: -0.0000 (-0.0000)  loss_bbox_unscaled: 0.2032 (0.2032)  loss_giou_unscaled: 0.7412 (0.7412)  cardinality_error_unscaled: 1.7500 (1.7500)  loss_ce_0_unscaled: 0.5939 (0.5939)  loss_bbox_0_unscaled: 0.3252 (0.3252)  loss_giou_0_unscaled: 0.7236 (0.7236)  cardinality_error_0_unscaled: 1.7500 (1.7500)  loss_ce_1_unscaled: 0.5547 (0.5547)  loss_bbox_1_unscaled: 0.2669 (0.2669)  loss_giou_1_unscaled: 0.8097 (0.8097)  cardinality_error_1_unscaled: 1.7500 (1.7500)  loss_ce_enc_unscaled: 0.5320 (0.5

## Load Trained Model

In [None]:
# locate the best checkpoint inside OUTPUT_DIR
best_checkpoint = Path(OUTPUT_DIR) / 'checkpoint_best_total.pth'

if not best_checkpoint.exists():
    # nothing to load: only report it (trained_model is not assigned in this case)
    print(f'Checkpoint not found: {best_checkpoint}')
else:
    # build the class matching MODEL_VARIANT, initialised from the checkpoint weights
    trained_model = (RFDETRLarge if MODEL_VARIANT == 'large' else RFDETRBase)(
        pretrain_weights=str(best_checkpoint)
    )
    print(f'Loaded checkpoint: {best_checkpoint}')

## Test Prediction

In [None]:
def predict_and_visualize(model, image_path, threshold=0.5):
    """Run prediction on an image and visualize results.

    Args:
        model: Object exposing ``predict(image, threshold=...)``. Its result is drawn with
            the supervision annotators and must provide ``confidence`` and ``len()``.
        image_path: Path of the image file to open with PIL.
        threshold: Value forwarded to ``model.predict``. Defaults to 0.5.

    Returns:
        The detections object returned by ``model.predict``.
    """
    # Read the pixels inside a context manager so the file handle is closed right away,
    # and normalise to 3-channel RGB so grayscale / RGBA / palette files are handled too.
    # NOTE(review): assumes the model expects RGB input — confirm for other model types.
    with Image.open(image_path) as source:
        image = source.convert('RGB')

    detections = model.predict(image, threshold=threshold)

    # draw on a copy so the image that was fed to the model is left unmodified
    annotated = image.copy()
    annotated = sv.BoxAnnotator().annotate(annotated, detections)

    # one label per detection: its confidence with two decimals
    labels = [f'{conf:.2f}' for conf in detections.confidence]
    annotated = sv.LabelAnnotator().annotate(annotated, detections, labels)

    plt.figure(figsize=(12, 8))
    plt.imshow(annotated)
    plt.axis('off')
    plt.title(f'{len(detections)} detections')
    plt.show()

    return detections

In [None]:
# run the trained model on a single image and display the annotated result
test_image = '<path_to_test_image>'
detections = predict_and_visualize(
    model=trained_model,
    image_path=test_image,
    threshold=0.5,
)

## Video Inference (Optional)

In [5]:
def process_video(model, input_path, output_path, threshold=0.5, display=False):
    """Process video with trained model.

    Reads ``input_path`` frame by frame, runs ``model.predict`` on each frame, draws the
    detected boxes with their confidence, and writes the annotated frames to
    ``output_path`` using the ``mp4v`` codec at the source resolution and frame rate.

    Args:
        model: Object exposing ``predict(image, threshold=...)`` whose result provides
            ``xyxy`` and ``confidence``.
        input_path: Source video (``str`` or path-like).
        output_path: Destination video file (``str`` or path-like).
        threshold: Value forwarded to ``model.predict``. Defaults to 0.5.
        display: When True, show each annotated frame in an OpenCV window; pressing
            ``q`` stops processing early.

    Returns:
        Number of frames written to the output video.

    Raises:
        OSError: If the input video cannot be opened.
    """
    # str() lets callers pass pathlib.Path objects as well as plain strings
    cap = cv2.VideoCapture(str(input_path))
    if not cap.isOpened():
        cap.release()
        raise OSError(f'Could not open video: {input_path}')

    out = None
    frame_count = 0
    try:
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        fps = cap.get(cv2.CAP_PROP_FPS)
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(str(output_path), fourcc, fps, (width, height))

        print(f'Processing: {width}x{height} @ {fps:.1f} fps ({total_frames} frames)')

        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # convert BGR to RGB for prediction
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            pil_image = Image.fromarray(rgb_frame)

            # predict
            detections = model.predict(pil_image, threshold=threshold)

            # draw boxes (on the original BGR frame, which is what the writer expects)
            display_frame = frame.copy()
            for box, conf in zip(detections.xyxy, detections.confidence):
                x1, y1, x2, y2 = box.astype(int)

                cv2.rectangle(display_frame, (x1, y1), (x2, y2), (0, 255, 0), 2)
                label = f'{conf:.2f}'
                cv2.putText(display_frame, label, (x1, y1 - 5),
                            cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 255, 0), 2)

            out.write(display_frame)
            frame_count += 1

            if frame_count % 100 == 0:
                if total_frames > 0:
                    print(f'Progress: {frame_count}/{total_frames} ({100*frame_count/total_frames:.1f}%)')
                else:
                    # the reported frame count is not positive here, so skip the
                    # percentage instead of dividing by zero
                    print(f'Progress: {frame_count} frames')

            if display:
                cv2.imshow('Output', display_frame)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    break
    finally:
        # always release the capture/writer (and close windows), even if a frame fails
        cap.release()
        if out is not None:
            out.release()
        if display:
            cv2.destroyAllWindows()

    print(f'Done! Saved to {output_path}')
    return frame_count

In [3]:
# load previously trained weights into a base-variant model
# NOTE: this is a machine-specific absolute path; adjust it for your environment
checkpoint_path = '/Users/elisd/Desktop/vult/models/trained_models/bounding_box_rfdetr_jan7/rfdetr_jan7/checkpoint_best_total.pth'
trained_model = RFDETRBase(pretrain_weights=checkpoint_path)

Loading pretrain weights


In [6]:
# annotate a video with the trained model; with display=True a preview window is shown
# and pressing `q` in it stops processing early
process_video(
    trained_model,
    '/Users/elisd/Desktop/overhead.mp4',
    '/Users/elisd/Desktop/overhead_test.mp4',
    display=True,
    threshold=0.7,
)

Model is not optimized for inference. Latency may be higher than expected. You can optimize the model for inference by calling model.optimize_for_inference().


Processing: 800x600 @ 30.0 fps (1770 frames)




Done! Saved to /Users/elisd/Desktop/overhead_test.mp4


36