In [1]:
import fiftyone as fo
import fiftyone.brain as fob
import fiftyone.zoo as foz

from fiftyone.utils.huggingface import load_from_hub

In [2]:
# Load dataset from Hugging Face Hub
# Source repo: https://huggingface.co/datasets/Voxel51/fisheye8k
# The import is capped at 100 samples and stored locally as "fisheye8k-100".
dataset = load_from_hub(
    "Voxel51/fisheye8k",
    name="fisheye8k-100",
    max_samples=100,
    overwrite=True,
)

  from .autonotebook import tqdm as notebook_tqdm


Downloading config file fiftyone.yml from Voxel51/fisheye8k
Loading dataset
Importing samples...
 100% |█████████████████| 100/100 [8.6ms elapsed, 0s remaining, 11.7K samples/s]      


In [3]:
# Peek at the first sample only; nothing is printed if the dataset is empty
first_sample = next(iter(dataset), None)
if first_sample is not None:
    print(first_sample)

<Sample: {
    'id': '6761e9c84d61e31cfcf2b7d0',
    'media_type': 'image',
    'filepath': '/home/eo233/fiftyone/huggingface/hub/Voxel51/fisheye8k/data/camera3_A_0.png',
    'tags': ['train'],
    'metadata': <ImageMetadata: {
        'size_bytes': None,
        'mime_type': None,
        'width': 1225,
        'height': 1088,
        'num_channels': None,
    }>,
    'created_at': datetime.datetime(2025, 3, 17, 22, 20, 36, 537000),
    'last_modified_at': datetime.datetime(2025, 3, 17, 22, 20, 36, 537000),
    'detections': <Detections: {
        'detections': [
            <Detection: {
                'id': '6761e9c84d61e31cfcf2b776',
                'attributes': {},
                'tags': [],
                'label': 'Bike',
                'bounding_box': [
                    0.11836734693877551,
                    0.31525735294117646,
                    0.060408163265306125,
                    0.09926470588235294,
                ],
                'mask': None,
          

In [4]:
# Compute and visualize image embeddings
# The zoo model embeds each image, and the "tsne" method builds the
# visualization that is stored under the "mobilenet_tsne" brain key.
model_embeddings = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
fob.compute_visualization(
    dataset,
    model=model_embeddings,
    method="tsne",
    brain_key="mobilenet_tsne",
    num_workers=16,
    seed=51,  # t-SNE is stochastic; a fixed seed keeps the layout reproducible across re-runs
)

Computing embeddings...
 100% |█████████████████| 100/100 [4.9s elapsed, 0s remaining, 17.8 samples/s]      
Generating visualization...




[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Indexed 100 samples in 0.000s...
[t-SNE] Computed neighbors for 100 samples in 0.088s...
[t-SNE] Computed conditional probabilities for sample 100 / 100
[t-SNE] Mean sigma: 1.372347
[t-SNE] Computed conditional probabilities in 0.002s
[t-SNE] Iteration 50: error = 55.2537537, gradient norm = 0.2911240 (50 iterations in 0.014s)
[t-SNE] Iteration 100: error = 54.4217834, gradient norm = 0.3335772 (50 iterations in 0.013s)
[t-SNE] Iteration 150: error = 55.4286613, gradient norm = 0.3036207 (50 iterations in 0.014s)
[t-SNE] Iteration 200: error = 53.5542450, gradient norm = 0.3284553 (50 iterations in 0.013s)
[t-SNE] Iteration 250: error = 55.4162216, gradient norm = 0.3306054 (50 iterations in 0.014s)
[t-SNE] KL divergence after 250 iterations with early exaggeration: 55.416222
[t-SNE] Iteration 300: error = 0.5796844, gradient norm = 0.0166757 (50 iterations in 0.011s)
[t-SNE] Iteration 350: error = 0.2880096, gradient norm = 0.0120758 (

<fiftyone.brain.visualization.VisualizationResults at 0x7f277803a420>

In [None]:
# Compute and visualize detection embeddings
# Same embedding model as the image-level run, but applied to the object
# patches in the "detections" field instead of to whole images.
model_embeddings = foz.load_zoo_model("mobilenet-v2-imagenet-torch")
fob.compute_visualization(
    dataset,
    patches_field="detections",
    model=model_embeddings,
    method="tsne",
    # Use a dedicated key: "mobilenet_tsne" already holds the image-level run,
    # and reusing it here would collide with those results.
    brain_key="mobilenet_tsne_detections",
    num_workers=16,
    seed=51,  # t-SNE is stochastic; a fixed seed keeps the layout reproducible across re-runs
)

In [5]:
# Zero-Shot Object Detection based on existing object classes
# Models from Hugging Face: https://huggingface.co/models?pipeline_tag=zero-shot-object-detection&library=transformers&sort=trending
dataset_classes = dataset.default_classes
print(f"Object classes in dataset: {dataset_classes}")


def apply_zero_shot_detector(samples, name_or_path, label_field, classes, confidence_thresh=0.2):
    """Load a zero-shot detection checkpoint and store its predictions on ``samples``.

    Args:
        samples: the FiftyOne dataset (or view) to run inference on
        name_or_path: Hugging Face checkpoint passed to the zoo model loader
        label_field: name of the field the predictions are written to
        classes: class names the zero-shot model is prompted with
        confidence_thresh: threshold forwarded to ``apply_model``

    Returns:
        the loaded zoo model, so the caller can keep a handle on it
    """
    model = foz.load_zoo_model(
        "zero-shot-detection-transformer-torch",
        name_or_path=name_or_path,
        classes=classes,
    )
    samples.apply_model(
        model,
        label_field=label_field,
        confidence_thresh=confidence_thresh,
        progress=True,
    )
    return model


# Grounding DINO
model_zero_shot_grounding_dino = apply_zero_shot_detector(
    dataset,
    name_or_path="IDEA-Research/grounding-dino-base",
    label_field="grounding_dino",
    classes=dataset_classes,
)

# OwlV2
model_zero_shot_owlv2 = apply_zero_shot_detector(
    dataset,
    name_or_path="google/owlv2-base-patch16-ensemble",
    label_field="owlv2",
    classes=dataset_classes,
)

Object classes in dataset: ['Bus', 'Bike', 'Car', 'Pedestrian', 'Truck']
 100% |█████████████████| 100/100 [57.1s elapsed, 0s remaining, 1.6 samples/s]      
 100% |█████████████████| 100/100 [1.2m elapsed, 0s remaining, 1.1 samples/s]      


In [None]:
# Evaluate detections
# Score each zero-shot model's predictions against the ground truth stored in
# "detections". The returned results are kept so the metrics requested via
# compute_mAP=True are actually reported instead of being discarded.
detection_eval_results = {}
for pred_field in ("grounding_dino", "owlv2"):
    detection_eval_results[pred_field] = dataset.evaluate_detections(
        pred_field,
        gt_field="detections",
        eval_key=f"eval_{pred_field}",  # -> "eval_grounding_dino" / "eval_owlv2"
        compute_mAP=True,
    )

# Summarize both models side by side
for pred_field, results in detection_eval_results.items():
    print(f"{pred_field} mAP: {results.mAP():.4f}")

Evaluating detections...
  35% |█████------------|  35/100 [3.3s elapsed, 5.2s remaining, 18.2 samples/s]    

In [None]:
# Launch the FiftyOne App
# Bind the returned session to a name so later cells can refer to it
session = fo.launch_app(dataset)