# Datum demonstrator - basics

**The aim of this notebook is to provide example code for using Datum dataset management library and tools.**

In [1]:
import sys, os
import random
from pathlib import Path

import numpy as np
from tqdm import tqdm

from datum.datasets.detection_dataset import Dataset
from datum.formatters import DatumFormatter
from datum.readers.voc_detection_reader import VocDetectionReader
from datum.readers.coco_detection_reader import CocoDetectionReader
from datum.readers import DatumReader
from datum.transformers import EntryMapper, ObservableMapper, AttributesTransformer
from datum.utils.datastore import list_existing_datastores, list_existing_datasets, register_dataset, load_dataset

## Advanced operations : combine multiple base Datum features to construct a new custom + filtered + composite dataset

In [2]:
# (1) Construct datasets to manipulate (15s)

# Construct VOC train dataset
root_dir = Path('/home/clacroix/databases/VOC2012/')
train_images_list = Path('/home/clacroix/databases/VOC2012/ImageSets/Main/val.txt')
reader = VocDetectionReader(root_dir, 'jpg', set_images_lists={'train': train_images_list})
voc_train = Dataset()
reader.feed(voc_train)

to_int_xmin = ObservableMapper(['xmin'], ['xmin'], lambda x: int(round(x)))
to_int_ymin = ObservableMapper(['ymin'], ['ymin'], lambda x: int(round(x)))
to_int_xmax = ObservableMapper(['xmax'], ['xmax'], lambda x: int(round(x)))
to_int_ymax = ObservableMapper(['ymax'], ['ymax'], lambda x: int(round(x)))
voc_transformer = AttributesTransformer(observables_mappers=[to_int_xmin, to_int_ymin,
                                                             to_int_xmax, to_int_ymax])
voc_transformer.transform(voc_train)

# Construct COCO train dataset (15 sec)
root_dir = Path('/home/clacroix/workspace_corentin/databases/coco/')
reader = CocoDetectionReader(root_dir, 'jpg')
coco_val = Dataset()
reader.feed(coco_val, sets=['val2017'])

100%|██████████| 5823/5823 [00:01<00:00, 3585.00it/s]
100%|██████████| 5000/5000 [00:00<00:00, 51987.30it/s]
100%|██████████| 36781/36781 [00:00<00:00, 78514.65it/s]


In [3]:
# (2) Modify Coco dataset to align to VOC standards : use EntryMapper and ObservableMapper (30sec)

# set attribute : 'train2017'-> 'train"
setmapper = EntryMapper(['set'], ['set'], lambda x: 'train' if x == 'train2017' else x)

# Remap cat+cow+dog to animal and motorcycle to motorbike
def remap(name):
    if name in ['cat', 'cow', 'dog']:
        return 'animal'
    elif name == 'motorcycle':
        return 'motorbike'
    else:
        return name
nameremapper = ObservableMapper(['name'], ['name'], remap, obs_type='object')

# Add 1 to xmin/ymin
expand_xmin = ObservableMapper(['xmin'], ['xmin'], lambda x: x + 1)
expand_ymin = ObservableMapper(['ymin'], ['ymin'], lambda x: x + 1)

# apply all mappers
coco_transformer = AttributesTransformer(entries_mappers=[setmapper],
                                         observables_mappers=[nameremapper,
                                                              to_int_xmin, to_int_ymin,
                                                              to_int_xmax, to_int_ymax,
                                                              expand_xmin, expand_ymin])
coco_transformer.transform(coco_val)

In [4]:
# (3) Filter COCO objects based en their classes (5s)
coco_obs = coco_val.observables_df

# Filter out non VOC target classes
target_classes = ['bicycle', 'boat', 'car', 'motorcycle']
coco_obs['target'] = coco_obs['name'].apply(lambda x: True if x in target_classes else False)
valid = coco_obs[coco_obs['target']]

# Manually remove invalid observables from Datum dataset
to_remove = coco_obs.drop(valid.index).index.values
for obs_id in tqdm(to_remove):
    coco_val.remove_observable(obs_id)

# Remove images without GT objects
coco_val.remove_entries_without_obs()

# (4) Filter COCO images based on their characteristics (aspect ratio, dimension, #objects, etc)
ar_min, ar_max = 1.3, 1.8
min_height = 400
max_n_objects = 15
keep = 200

# create objects DataFrame + columns aspect_ratio (ar), n_objects 
coco_images = coco_val.entries_df
coco_images['ar'] = coco_images['width'] / coco_images['height']
coco_images['n_objects'] = coco_images['obs_ids'].apply(lambda x: len(x))

# filter based on aspect ratio
valid = coco_images[(coco_images['ar'] > ar_min) & (coco_images['ar'] < ar_max)]

# filter based on size
valid = valid[valid['height'] > min_height]

# filter based on #objects
valid = valid[valid['n_objects'] < max_n_objects]

# keep randomly only 1000 images
np.random.seed(42)
drop_indices = np.random.choice(valid.index, keep, replace=False)
valid = valid.drop(drop_indices)

# Manually remove invalid images from Datum dataset
to_remove_idxs = coco_images.drop(valid.index).index.values
for entry_id in tqdm(to_remove_idxs):
    coco_val.remove_entry(int(entry_id))

100%|██████████| 34103/34103 [00:00<00:00, 340245.88it/s]
100%|██████████| 519/519 [00:00<00:00, 172164.17it/s]


In [5]:
# (5) Merge datasets and format result back to disk
voc_coco250 = voc_train + coco_val
print(len(voc_coco250))

save_path = Path('/home/clacroix/tmp/VOC_COCO250/')
formatter = DatumFormatter(save_path)
formatter.format(voc_coco250)

6040


## Datum Datastore features

Available functions :

- list_existing_datastores() : display all existing datastores

<pre> $list-existing-datastores </pre>

- list_existing_datasets() :  display all existing datasets

<pre> $list-existing-datasets </pre>

- register_dataset() : register a dataset under Datum format @dataset_root_path to datastore with dataset_name

<pre> $register-dataset dataset_root_path dataset_name datastore </pre>


- load_dataset() : utility function for loading a dataset from a datastore

In [6]:
# (1) Show information about existing datastores : command-line of Python functions
list_existing_datastores()
list_existing_datasets()

# (2) Add some datasets to a datastore : command-line or Python function register_dataset()
register_dataset('/home/clacroix/tmp/VOC_COCO250/',
                 'voc_coco200',
                 'detection')
list_existing_datasets()

# DATASTORES
## detection

# DATASTORE detection (/home/clacroix/.local/lib/python3.6/site-packages/datum/datastores/detection.csv)
## coco_val_reduced_datum (path : /home/corentin/tmp/datum_datum_format_test)



# DATASTORE detection (/home/clacroix/.local/lib/python3.6/site-packages/datum/datastores/detection.csv)
## coco_val_reduced_datum (path : /home/corentin/tmp/datum_datum_format_test)
## voc_coco200 (path : /home/clacroix/tmp/VOC_COCO250)




In [7]:
# (3) Load a dataset from a datastore
voc_coco250_ah = load_dataset('voc_coco200')

entry, observables = voc_coco250_ah[458]
print('Entry #458 : \n{}\n'.format(entry))
print('Entry #458 1st observable : \n{}\n'.format(observables[0]))

Entry #458 : 
{'name': '2008_001765', 'filename': '2008_001765.jpg', 'dir': '/home/clacroix/databases/VOC2012/JPEGImages', 'width': 375, 'height': 500, 'set': 'train', 'idx': 458, 'obs_ids': [1202, 1203, 1204, 1205, 1206, 1207, 1208]}

Entry #458 1st observable : 
{'type': 'object', 'name': 'person', 'xmin': 97, 'ymin': 151, 'xmax': 252, 'ymax': 500, 'idx': 1202, 'entry_id': 458}

