# Init

In [1]:
%load_ext autoreload
%autoreload 2

import fiftyone as fo
import fiftyone.zoo as foz
from fiftyone import ViewField as F
import fiftyone.brain as fob
import fiftyone.utils.eval as foue
import config

import cv2

In [2]:
# Launch the FiftyOne App and keep the session handle; later cells change what
# the App shows by assigning to `session.view`.
from IPython.display import clear_output
session = fo.launch_app()
# session.open_tab()
clear_output()  # drop this cell's output once the App has been launched

# Load dataset

In [41]:
# fo.delete_dataset(name=config.DATASET_MOD_NAME)

In [3]:
# Work on a separate, renamed copy of the source dataset.
# First try to reuse the copy that is already stored in the database; if it is
# missing, load the source dataset by name, clone it and give the clone the
# working-copy name.
try:
    dataset = fo.load_dataset(name=config.DATASET_MOD_NAME)
except ValueError:
    print(f"Dataset {config.DATASET_MOD_NAME} not exists, cloning from zoo")
    source_dataset = fo.load_dataset(name=config.DATASET_ZOO_NAME)
    dataset = source_dataset.clone()
    dataset.name = config.DATASET_MOD_NAME

# Mark the working copy as persistent in either case
dataset.persistent = True
# dataset = dataset.view()

# Filter labels by class

In [11]:
# Keep only the labels whose class is listed in config.CLASSES_OF_INTEREST.
# `view` is the working view that most of the following cells operate on.
class_of_interest = F("label").is_in(config.CLASSES_OF_INTEREST)
view = dataset.filter_labels(config.LABEL_FIELD, class_of_interest)
# Optional: restrict to a single split instead
# dataset = dataset.match_tags('validation')
session.view = view
clear_output()

# Remove duplicates

In [13]:
# Overlap threshold at or above which a label is treated as a duplicate
IOU_THRES = 0.75

# Store a "max_iou" attribute on the labels of the class-filtered view
# (classwise=True is passed through to the FiftyOne utility).
foue.compute_max_ious(
    view,
    config.LABEL_FIELD,
    attr_name="max_iou",
    classwise=True,
)

# Select, from the full dataset, only the labels whose max_iou reaches the
# threshold and show them in the App for inspection.
overlaps_too_much = F("max_iou") >= IOU_THRES
dups_view = dataset.filter_labels(config.LABEL_FIELD, overlaps_too_much)
session.view = dups_view
clear_output()

In [14]:
# Three alternative ways to get rid of the duplicates found above; only
# Method 2 is active.

# # Method 1: In the App, tag labels as 'duplicate', then delete them with the
# # three lines below
# print(dataset.count_label_tags())
# dataset.delete_labels(tags="duplicate")
# print(dataset.count_label_tags())

# # Method 2: Delete every sample that appears in dups_view.
# # NOTE: this removes whole samples (image + all of its labels) from the
# # dataset, not just the overlapping labels, and cannot be undone.
dataset.delete_samples(dups_view)

# # Method 3: Tag the duplicate labels programmatically and delete only those
# print(dataset.count_label_tags())
# dups_view.tag_labels("duplicate", label_fields=config.LABEL_FIELD)
# print(dataset.count_label_tags())
# dataset.delete_labels(tags="duplicate")
# print(dataset.count_label_tags())

# Calculate uniqueness


In [91]:
# Run the FiftyOne Brain uniqueness computation on the class-filtered view.
# The result is read back below via the "uniqueness" sample field.
# The recorded run embedded 27888 samples in roughly 7 minutes.
fob.compute_uniqueness(view)
dataset.save()  # save the dataset to the database after the brain run

Generating embeddings...
 100% |█████████████| 27888/27888 [6.9m elapsed, 0s remaining, 108.6 samples/s]      
Computing uniqueness...
Computing neighbors for 27888 embeddings; this may take awhile...
Uniqueness computation complete


In [42]:
# Order the view by the "uniqueness" field (sort_by default order) and show it
# in the App.
unique_view = view.sort_by("uniqueness")
session.view = unique_view
clear_output()

# Calculate similarity


In [92]:
# Build a similarity index over the view and register it under the brain key
# "image_sim". Expensive: the recorded run took about 2.1 hours for 27888
# samples, so avoid re-running this cell needlessly.
fob.compute_similarity(view, brain_key="image_sim")
dataset.save()  # save the dataset to the database after the brain run

Computing embeddings...
 100% |█████████████| 27888/27888 [2.1h elapsed, 0s remaining, 4.2 samples/s]      


# Add custom tag

## Lightness

## Lightness on whole img

In [None]:
# Declare a float sample field named 'lightness'; it is filled in by the
# set_values() call further down.
dataset.add_sample_field(field_name='lightness', ftype=fo.FloatField)

In [None]:
import cv2
import numpy as np
from tqdm import tqdm

# Mean of the HSV "value" channel for every image in the view.
# The list is built in the same order as view.values("filepath") so that it can
# be written back one-to-one with view.set_values('lightness', ...).
lightness_values = []

for f in tqdm(view.values("filepath")):
    img = cv2.imread(f)
    if img is None:
        # cv2.imread() does not raise on a missing or undecodable file, it
        # returns None. Store a placeholder instead of crashing in cvtColor so
        # the list stays aligned with the samples of the view.
        lightness_values.append(None)
        continue

    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    # Channel 2 of an HSV image is V; cast to a plain Python float for storage
    v = float(np.mean(hsv[..., 2]))
    lightness_values.append(v)

In [None]:
# Write the per-image values into the 'lightness' field. `lightness_values`
# must hold exactly one entry per sample of `view`, in view order.
view.set_values('lightness', lightness_values)

In [None]:
# Order the view by the 'lightness' field (sort_by default order) and show it
# in the App.
light_view = view.sort_by("lightness")
session.view = light_view
clear_output()

In [15]:
# Number of samples per sample tag across the whole dataset (not just `view`)
dataset.count_sample_tags()

{'test': 15095, 'validation': 5025, 'del': 142, 'train': 7768}

In [9]:
# Display the view summary: sample count, tags, field schema and view stages
view

Dataset:     open-images-v6_mod
Media type:  image
Num samples: 27888
Tags:        ['del', 'test', 'train', 'validation']
Sample fields:
    id:             fiftyone.core.fields.ObjectIdField
    filepath:       fiftyone.core.fields.StringField
    tags:           fiftyone.core.fields.ListField(fiftyone.core.fields.StringField)
    metadata:       fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.metadata.Metadata)
    detections:     fiftyone.core.fields.EmbeddedDocumentField(fiftyone.core.labels.Detections)
    open_images_id: fiftyone.core.fields.StringField
    lightness:      fiftyone.core.fields.FloatField
    contrast:       fiftyone.core.fields.FloatField
    uniqueness:     fiftyone.core.fields.FloatField
View stages:
    1. FilterLabels(field='detections', filter={'$in': ['$$this.label', [...]]}, only_matches=True, trajectories=False)

## Lowest lightness from boxes in img

In [80]:
import cv2
import numpy as np
from tqdm import tqdm


def _darkest_box_stats(img, detections):
    """Return ``(lightness, contrast)`` of the darkest detection box of an image.

    For every detection the box is cropped from ``img`` and the mean of its HSV
    "value" channel is computed. The box with the lowest mean wins; its mean is
    the lightness and the standard deviation of the same channel is the
    contrast.

    Args:
        img: BGR image as returned by ``cv2.imread``, or ``None`` when the file
            could not be read.
        detections: iterable of objects exposing ``bounding_box`` as relative
            ``[x, y, width, height]``.

    Returns:
        ``(lightness, contrast)`` as floats, or ``(None, None)`` when the image
        is unreadable or no detection yields a non-empty crop.
    """
    if img is None:
        return None, None

    h, w = img.shape[:2]
    lowest_v = None
    lowest_c = None
    for det in detections:
        x1, y1, wl, hl = det.bounding_box
        # Right/bottom edges are derived first because x1/y1 are converted to
        # pixels right after. Clamp to the image so a slightly out-of-range box
        # cannot produce negative (wrapping) slice indices.
        x2 = min(int((x1 + wl) * w), w)
        y2 = min(int((y1 + hl) * h), h)
        x1 = max(int(x1 * w), 0)
        y1 = max(int(y1 * h), 0)

        box_img = img[y1:y2, x1:x2]
        if box_img.size == 0:
            # Boxes thinner than one pixel truncate to an empty crop, which
            # cv2.cvtColor rejects.
            continue

        value = cv2.cvtColor(box_img, cv2.COLOR_BGR2HSV)[..., 2]
        new_v = float(np.mean(value))
        if lowest_v is None or new_v < lowest_v:
            lowest_v = new_v
            lowest_c = float(np.std(value))

    return lowest_v, lowest_c


lightness_values = []
contrast_values = []

for sample in tqdm(view):
    img = cv2.imread(sample.filepath)
    label = sample[config.LABEL_FIELD]
    detections = label.detections if label is not None else []

    # State is recomputed per sample, so a sample without a usable box gets
    # (None, None) instead of a sentinel lightness and the previous sample's
    # contrast.
    v, c = _darkest_box_stats(img, detections)
    lightness_values.append(v)
    contrast_values.append(c)

# Both lists were built while iterating `view`, so they line up with it
view.set_values('lightness', lightness_values)
view.set_values('contrast', contrast_values)

light_view = view.sort_by("lightness")
session.view = light_view
clear_output()

 63%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████                                                                                                     | 17579/27888 [05:37<03:04, 55.90it/s]Corrupt JPEG data: bad Huffman code
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 27888/27888 [08:42<00:00, 53.36it/s]


In [None]:
# NOTE(review): this loop has no visible effect. For every sample id it
# re-fetches the complete 'lightness' and 'contrast' value lists of the view and
# discards them; `i` and `id_` are never used. It looks like an unfinished
# experiment. Confirm the intent before running it on a large view, since each
# iteration repeats the same two full-view reads.
for i, id_ in enumerate(view.values('id')):
    view.values('lightness')
    view.values('contrast')

## Aggregate statistics (box area)

In [None]:
# The area expression below reads $metadata.width / $metadata.height, so make
# sure metadata is populated for the samples being aggregated.
view.compute_metadata()

# Expression that computes the area of a bounding box, in pixels
# Bboxes are in [top-left-x, top-left-y, width, height] format (relative units)
bbox_width = F("bounding_box")[2] * F("$metadata.width")
bbox_height = F("bounding_box")[3] * F("$metadata.height")
bbox_area = bbox_width * bbox_height

# Apply the area expression to every detection of the configured label field.
# The path used to be hard-coded to "ground_truth", a field this dataset does
# not use; the rest of the notebook reads labels from config.LABEL_FIELD.
gt_areas = F(f"{config.LABEL_FIELD}.detections[]").apply(bbox_area)

# (min, max) and mean box area. Aggregated on `view`, i.e. on the same samples
# whose metadata was computed above and restricted to the classes of interest.
print(view.bounds(gt_areas))
print(view.mean(gt_areas))

In [None]:
# Scratch cell: crop the detection boxes of the first sample only (note the
# `break`). After it runs, `img` is that sample's image and `box_img` is the
# crop of its last detection.
for sample in view:
    fp = sample.filepath
    img = cv2.imread(fp)
    h, w = img.shape[:2]
    for det in sample[config.LABEL_FIELD].detections:
        box = det.bounding_box
        x1, y1, wl, hl = box
        # Relative [x, y, width, height] -> pixel corners. The right/bottom
        # edges are derived first because x1/y1 are overwritten afterwards.
        x2, y2 = int((x1 + wl) * w), int((y1 + hl) * h)
        x1, y1 = int(x1 * w), int(y1 * h)
        box_img = img[y1:y2, x1:x2]

    break

In [None]:
# Inspect the raw detections list of the dataset's first sample
dataset.first()[config.LABEL_FIELD]['detections']

# Remove labels that are in a group or inside the object

In [16]:
# Select the labels whose 'IsGroupOf' attribute is true and tag them "isGroup".
# The `== True` is intentional: comparing a ViewField builds a filter
# expression, so it must not be simplified away.
is_group = F('IsGroupOf') == True
group_view = view.filter_labels(config.LABEL_FIELD, is_group)
group_view.tag_labels("isGroup", label_fields=config.LABEL_FIELD)
print(dataset.count_label_tags())

# Show the tagged labels in the App
session.view = group_view
clear_output()

In [17]:
# Select the labels whose 'IsInside' attribute is true and tag them "isInside".
# The `== True` is intentional: comparing a ViewField builds a filter
# expression, so it must not be simplified away.
is_inside = F('IsInside') == True
inside_view = view.filter_labels(config.LABEL_FIELD, is_inside)
inside_view.tag_labels("isInside", label_fields=config.LABEL_FIELD)
print(dataset.count_label_tags())

# Show the tagged labels in the App
session.view = inside_view
clear_output()

Exception in thread Thread-9:
Traceback (most recent call last):
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/site-packages/fiftyone/core/client.py", line 125, in run_client
    io_loop.run_sync(connect)
  File "/home/dhp/.local/lib/python3.7/site-packages/tornado/ioloop.py", line 530, in run_sync
    return future_cell[0].result()
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/site-packages/fiftyone/core/client.py", line 104, in connect
    message["state"], with_config=config
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/site-packages/fiftyone/core/state.py", line 132, in from_dict
    view = fov.DatasetView._build(dataset, stages)
  File "/home/dhp/miniconda3/envs/dhp/lib/python3.7/site-packages/fiftyone/core/view.py", line 736

In [16]:
# Two alternative clean-ups for the labels tagged above; only Method 2 is
# active.

# # Method 1: delete only the tagged labels, keep their samples
# print(dataset.count_label_tags())
# dataset.delete_labels(tags="isGroup")
# dataset.delete_labels(tags="isInside")
# print(dataset.count_label_tags())

# # Method 2: delete every sample that appears in group_view / inside_view.
# # NOTE: this removes whole samples from the dataset, including any other
# # labels on them, and cannot be undone.
# The label-tag counts are printed before and after for comparison.
print(dataset.count_label_tags())
dataset.delete_samples(group_view)
dataset.delete_samples(inside_view)
print(dataset.count_label_tags())

{}
{}


# Export data and labels

In [18]:
# Re-run this cell whenever the dataset has been modified.
# Every sample tag present in the view is exported as its own split.
# NOTE(review): that includes non-split tags (an earlier output lists a 'del'
# tag) - confirm whether those should really be exported.

# Arguments shared by every split
export_options = dict(
    export_dir=config.EXPORT_DIR,
    dataset_type=config.EXPORT_DATASET_TYPE,
    label_field=config.LABEL_FIELD,
    classes=config.CLASSES_OF_INTEREST,
)

for split in view.count_sample_tags():
    print(f'Exporting {split}')
    split_view = view.match_tags(split)

    # Export the samples carrying this tag
    split_view.export(split=split, **export_options)

# Final per-tag sample counts of the whole dataset, shown as the cell output
dataset.count_sample_tags()

Exporting test
Directory 'export' already exists; export will be merged with existing files


KeyboardInterrupt: 