# Datum demonstrator - basics

**This notebook provides example code for using the Datum dataset-management library and its tools.**

In [1]:
import sys, os
import random
from pathlib import Path

import numpy as np
from tqdm import tqdm

from datum.datasets import Entry, Observable, Dataset
from datum.readers import ObsAttributeConstructor, VocDetectionReader, CocoDetectionReader
from datum.readers import load_datum_dataset
from datum.formatters import VocDetectionFormatter, DatumFormatter
from datum.transformers import EntryMapper, ObservableMapper, AttributesTransformer

## Construct datasets with existing readers and existing data

In [2]:
# Load the VOC2012 dataset; the validation image list is exposed as the 'test' set
root_dir = Path('/home/clacroix/databases/VOC2012/')
# Derive the image list from root_dir so that pointing the notebook at another
# copy of VOC only requires changing a single path.
test_images_list = root_dir / 'ImageSets' / 'Main' / 'val.txt'

# Extra observable attribute to read for every 'object' observable.
# NOTE(review): the positional arguments are presumably (observable type, attribute name,
# construction mode, value type, default value, set_to_default, lookup path) -- confirm
# against datum.readers.ObsAttributeConstructor.
pose_attribute = ObsAttributeConstructor('object', 'pose', 'lookup', str, 'unknown', True, ['pose'])
reader = VocDetectionReader(root_dir, 'jpg',
                            set_images_lists={'test': test_images_list},
                            entries_constructors=[],
                            obs_constructors=[pose_attribute])
voc = Dataset() # Makes sure that entries and observables have some basic required attributes
reader.feed(voc)  # fills `voc` in place with what the reader finds on disk

100%|██████████| 5823/5823 [00:02<00:00, 2040.77it/s]


In [3]:
# Load COCO val dataset
root_dir = Path('/home/clacroix/databases/coco/')

# Pass an extra ObsAttributeConstructor to the reader so that every 'object'
# observable also gets an "iscrowd" attribute (declared as int, 'lookup' mode).
# NOTE(review): the remaining positional arguments (None, False, ['iscrowd']) are presumably
# the default value, the set_to_default flag and the lookup path -- confirm against datum.readers.
iscrowd_constructor = ObsAttributeConstructor('object', 'iscrowd', 'lookup', int, None, False, ['iscrowd'])

reader = CocoDetectionReader(root_dir, 'jpg', obs_constructors=[iscrowd_constructor])

# Read only the 'val2017' set into a fresh dataset
coco_val = Dataset()
reader.feed(coco_val, sets=['val2017'])

100%|██████████| 5000/5000 [00:00<00:00, 58095.27it/s]
100%|██████████| 36781/36781 [00:00<00:00, 71195.61it/s]


In [10]:
# Query an entry together with its observables:
# indexing the dataset is unpacked here as an (entry, observables) pair.
# (i) by entry index (matches the entry's 'idx' attribute in the output)
entry, observables = coco_val[458]
print('Entry #458 : \n{}\n'.format(entry))
print('Entry #458 1st observable : \n{}\n'.format(observables[0]))

# (ii) by entry name (matches the entry's 'name' attribute in the output)
entry, observables = coco_val['000000491497']
print('Entry "000000491497" : \n{}\n'.format(entry))
print('Entry "000000491497" 1st observable : \n{}\n'.format(observables[0]))

Entry #458 : 
{'name': '000000384670', 'filename': '000000384670.jpg', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 640, 'height': 480, 'depth': 3, 'set': 'val2017', 'idx': 458, 'obs_ids': [29457, 29469, 29965, 32161]}

Entry #458 1st observable : 
{'type': 'object', 'iscrowd': 0, 'name': 'person', 'xmin': 308.19, 'ymin': 94.6, 'xmax': 397.32, 'ymax': 325.47, 'idx': 29457, 'entry_id': 458}

Entry "000000491497" : 
{'name': '000000491497', 'filename': '000000491497.jpg', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 375, 'height': 500, 'depth': 3, 'set': 'val2017', 'idx': 13, 'obs_ids': [4146, 4330, 6106, 6116, 6124, 6129, 6131, 6132, 7062, 7069, 8054, 8203]}

Entry "000000491497" 1st observable : 
{'type': 'object', 'iscrowd': 0, 'name': 'tv', 'xmin': 1.03, 'ymin': 180.41, 'xmax': 92.78, 'ymax': 367.01, 'idx': 4146, 'entry_id': 13}



## Add / remove entries and observables in a dataset

In [11]:
# Add one new entry together with a single observable attached to it.
# The entry is described by a plain dict of attributes.
entry = {
    'name': 'foo',
    'filename': 'foo.jpg',
    'dir': '/',
    'width': 600,
    'height': 400,
    'depth': 3,
    'set': 'val2017',
}
# Observables are given as a list of dicts; here a single 'horse' bounding box.
observables = [
    {
        'type': 'object',
        'name': 'horse',
        'xmin': 0.0,
        'ymin': 0.0,
        'xmax': 10.0,
        'ymax': 10.0,
    },
]

# NOTE(review): the return value is presumably the index given to the new entry -- confirm
idx = coco_val.add_entry(entry, observables=observables)

In [18]:
# Remove an observable from an entry
# List the entry's observables before the removal
entry, observables = coco_val['000000491497']
for position, observable in enumerate(observables):
    print('Entry "000000491497" {}-th observable : \n {} \n'.format(position, observable))

# An observable is removed through its own 'idx'; drop the entry's first one
first_observable_idx = observables[0].idx
coco_val.remove_observable(first_observable_idx)

# Query the entry again to list what it holds after the removal
remaining_observables = coco_val['000000491497'][1]
for position, observable in enumerate(remaining_observables):
    print('After removal - entry "000000491497" {}-th observable : \n {} \n'.format(position, observable))

Entry "000000491497" 0-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'tv', 'xmin': 2.0300000000000002, 'ymin': 180.41, 'xmax': 92.78, 'ymax': 367.01, 'idx': 4146, 'entry_id': 13, 'area': 16933.95} 

Entry "000000491497" 1-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'chair', 'xmin': 168.38, 'ymin': 293.56, 'xmax': 354.08, 'ymax': 492.06, 'idx': 4330, 'entry_id': 13, 'area': 36861.45} 

Entry "000000491497" 2-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'book', 'xmin': 297.35, 'ymin': 217.67, 'xmax': 375.0, 'ymax': 267.68, 'idx': 6106, 'entry_id': 13, 'area': 3883.2765000000004} 

Entry "000000491497" 3-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'book', 'xmin': 344.38, 'ymin': 150.84, 'xmax': 353.48, 'ymax': 196.18, 'idx': 6116, 'entry_id': 13, 'area': 412.5940000000011} 

Entry "000000491497" 4-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'book', 'xmin': 338.41, 'ymin': 276.9, 'xmax': 345.17, 'ymax': 332.71, 'idx': 

## Update existing entries / observables

In [14]:
# Update the data of an existing observable
entry, observables = coco_val[357]
for k, obs in enumerate(observables):
    print('Entry #357 {}-th observable : \n {} \n'.format(k, obs))

# Overwrite the bounding box of the entry's first observable (addressed by its 'idx')
update = {'xmin': 0.0, 'ymin': 0.0, 'xmax': 10.0, 'ymax': 10.0}
coco_val.update_observable_data(observables[0].idx, update)

# Query the entry again rather than printing the objects fetched before the update,
# so the listing is guaranteed to show what the dataset holds now.
entry, observables = coco_val[357]
for k, obs in enumerate(observables):
    print('After update - Entry #357 {}-th observable : \n {} \n'.format(k, obs))

Entry #357 0-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'teddy bear', 'xmin': 177.04, 'ymin': 217.34, 'xmax': 309.28, 'ymax': 388.53, 'idx': 30461, 'entry_id': 357} 

Entry #357 1-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'banana', 'xmin': 395.53, 'ymin': 320.81, 'xmax': 437.89, 'ymax': 375.34000000000003, 'idx': 31469, 'entry_id': 357} 

Entry #357 2-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'banana', 'xmin': 120.5, 'ymin': 292.07, 'xmax': 175.77, 'ymax': 331.93, 'idx': 31470, 'entry_id': 357} 

Entry #357 3-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'banana', 'xmin': 96.26, 'ymin': 302.47, 'xmax': 124.08000000000001, 'ymax': 323.43, 'idx': 31471, 'entry_id': 357} 

Entry #357 4-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'bowl', 'xmin': 57.66, 'ymin': 322.0, 'xmax': 174.7, 'ymax': 384.95, 'idx': 32273, 'entry_id': 357} 

After update - Entry #357 0-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name

In [15]:
# Try to update a forbidden observable attribute ('entry_id')
entry, observables = coco_val[357]

update = {'entry_id': 42}
# The failure is the point of this cell: catch it and display it, so that
# "Restart & Run All" can still reach the cells below.
# NOTE(review): the recorded output shows a ForbiddenAttribute error, but its import path
# is not visible in this notebook, hence the broad `except Exception` -- narrow it once confirmed.
try:
    coco_val.update_observable_data(observables[0].idx, update)
except Exception as err:
    print('{}: {}'.format(type(err).__name__, err))

ForbiddenAttribute: Modification of forbidden attribute entry_id in {'type': 'object', 'iscrowd': 0, 'name': 'teddy bear', 'xmin': 0.0, 'ymin': 0.0, 'xmax': 10.0, 'ymax': 10.0, 'idx': 30461, 'entry_id': 357}

## Apply modifications to the attributes of all entries / observables: AttributesTransformer

In [16]:
# Apply modifications to all entries attributes
# Show one entry before the transformation, for comparison with the result
entry, observables = coco_val[458]
print('Entry #458 : \n {} \n'.format(entry))

# Upper 'filename' attribute (the whole string, extension included)
# NOTE(review): the EntryMapper arguments are presumably (input attribute names,
# output attribute names, function) -- confirm against datum.transformers.
upper = EntryMapper(['filename'], ['filename'], lambda f: f.upper())

# Construct path attribute containing complete image disk path
def f(directory, filename):
    """Return the complete image path: `directory` and `filename` joined by '/'.

    `directory` is passed through str() first, so a pathlib path is accepted;
    `filename` must already be a string.
    """
    return '/'.join((str(directory), filename))
# Build a new 'path' attribute from the 'dir' and 'filename' attributes using f
full_path = EntryMapper(['dir', 'filename'], ['path'], f)

# Apply attributes mappers to entire dataset
# coco_val is modified in place. In the recorded output 'path' contains the upper-cased
# filename, i.e. `upper` has taken effect before `full_path` reads 'filename'.
transformer = AttributesTransformer(entries_mappers=[upper, full_path])
transformer.transform(coco_val)

# Query the same entry again to show the transformed attributes
entry, observables = coco_val[458]
print('Entry #458 - after attributes modification : \n {} \n'.format(entry))

Entry #458 : 
 {'name': '000000384670', 'filename': '000000384670.jpg', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 640, 'height': 480, 'depth': 3, 'set': 'val2017', 'idx': 458, 'obs_ids': [29457, 29469, 29965, 32161]} 

Entry #458 - after attributes modification : 
 {'name': '000000384670', 'filename': '000000384670.JPG', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 640, 'height': 480, 'depth': 3, 'set': 'val2017', 'idx': 458, 'obs_ids': [29457, 29469, 29965, 32161], 'path': '/home/clacroix/rouky/workspace_corentin/databases/coco/val2017/000000384670.JPG'} 



In [22]:
# Apply modifications to all observables attributes (~ 10s)
# Show one observable before the transformation, for comparison with the result
entry, observables = coco_val[452]
print('Entry #452 0-th observable : \n {} \n'.format(observables[0]))

# xmin <- xmin + 1
# NOTE: this mapping is not idempotent -- every time it is applied to the dataset,
# each 'xmin' is shifted by one more unit.
expand_xmin_mapper = ObservableMapper(['xmin'], ['xmin'], lambda x: x+1)

# Compute width attribute for GT boxes
def width(xmin, xmax):
    """Return the horizontal extent of a box, i.e. `xmax - xmin`."""
    extent = xmax - xmin
    return extent
# Store width(xmin, xmax) in a new 'width' attribute of each observable
width_mapper = ObservableMapper(['xmin', 'xmax'], ['width'], width)

# Apply attributes mappers to entire dataset (coco_val is modified in place)
transformer = AttributesTransformer(observables_mappers=[expand_xmin_mapper, width_mapper])
transformer.transform(coco_val)

# Query the same entry again; the label now names the entry actually displayed (#452)
# and calls the printed object what it is: an observable, not an attribute.
entry, observables = coco_val[452]
print('Entry #452 0-th observable - after modifications : \n {} \n'.format(observables[0]))

Entry #452 0-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'potted plant', 'xmin': 159.79, 'ymin': 184.66, 'xmax': 291.29999999999995, 'ymax': 350.25, 'idx': 24375, 'entry_id': 452, 'area': 21776.740899999993} 

Entry #458 0-th attribute - after modifications : 
 {'type': 'object', 'iscrowd': 0, 'name': 'potted plant', 'xmin': 160.79, 'ymin': 184.66, 'xmax': 291.29999999999995, 'ymax': 350.25, 'idx': 24375, 'entry_id': 452, 'area': 21776.740899999993, 'width': 130.50999999999996} 



## Iterate over a dataset

In [19]:
# Iterating over a dataset yields one (entry, observables) pair per entry
for entry, observables in tqdm(coco_val):
    # Only display the entry whose index is 458; skip all the others
    if entry.idx != 458:
        continue
    print('Entry #458 : \n {} \n'.format(entry))
    print('Entry #458 0-th observable : \n {} \n'.format(observables[0]))

100%|██████████| 5001/5001 [00:00<00:00, 236354.07it/s]

Entry #458 : 
 {'name': '000000384670', 'filename': '000000384670.JPG', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 640, 'height': 480, 'depth': 3, 'set': 'val2017', 'idx': 458, 'obs_ids': [29457, 29469, 29965, 32161], 'path': '/home/clacroix/rouky/workspace_corentin/databases/coco/val2017/000000384670.JPG'} 

Entry #458 0-th observable : 
 {'type': 'object', 'iscrowd': 0, 'name': 'person', 'xmin': 309.19, 'ymin': 94.6, 'xmax': 397.32, 'ymax': 325.47, 'idx': 29457, 'entry_id': 458, 'area': 20346.5731} 






## Merge multiple datasets

In [4]:
# Merge 2 datasets : COCO + VOC
# The `+` operator combines both datasets; the result is bound to a new name
new_merged_dataset = coco_val + voc
# Show the first (entry, observables) pair of the merged dataset
# (a COCO 'val2017' entry in the recorded output)
print(new_merged_dataset[0])

({'name': '000000397133', 'filename': '000000397133.jpg', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 640, 'height': 427, 'depth': 3, 'set': 'val2017', 'idx': 0, 'obs_ids': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]}, [{'type': 'object', 'iscrowd': 0, 'name': 'bottle', 'xmin': 217.62, 'ymin': 240.54, 'xmax': 256.61, 'ymax': 298.28999999999996, 'idx': 0, 'entry_id': 0}, {'type': 'object', 'iscrowd': 0, 'name': 'dining table', 'xmin': 1.0, 'ymin': 240.24, 'xmax': 347.63, 'ymax': 427.0, 'idx': 1, 'entry_id': 0}, {'type': 'object', 'iscrowd': 0, 'name': 'person', 'xmin': 388.66, 'ymin': 69.92, 'xmax': 498.07000000000005, 'ymax': 347.54, 'idx': 2, 'entry_id': 0}, {'type': 'object', 'iscrowd': 0, 'name': 'knife', 'xmin': 135.57, 'ymin': 249.43, 'xmax': 157.89, 'ymax': 278.22, 'idx': 3, 'entry_id': 0}, {'type': 'object', 'iscrowd': 0, 'name': 'bowl', 'xmin': 31.28, 'ymin': 344.0, 'xmax': 99.4, 'ymax': 384.83, 'idx': 4, '

## Format a dataset to disk

In [25]:
# Format dataset at VOC disk format
# Reduce dataset size to make formatting < 1 sec:
# copy the entries at positions 0..500 (inclusive) into a fresh dataset
coco_val_reduced = Dataset()
last_copied_position = 500
for position, (entry, observables) in enumerate(coco_val):
    coco_val_reduced.add_entry(entry, observables=observables)
    if position == last_copied_position:
        break

# Write the reduced dataset below the target directory
formatter_root_path = Path('/home/clacroix/tmp/datum_voc_format_test/')
formatter = VocDetectionFormatter(formatter_root_path)
formatter.format(coco_val_reduced, copy_images=False) # format dataset

Constructing annotation "width" with path ['annotation', 'size', 'width'], type <class 'int'>, default value=None and set_to_default=False
Constructing annotation "height" with path ['annotation', 'size', 'height'], type <class 'int'>, default value=None and set_to_default=False
Constructing annotation "depth" with path ['annotation', 'size', 'depth'], type <class 'int'>, default value=3 and set_to_default=True
Constructing annotation "name" for observable type "object" with path ['name'], type <class 'str'>, default value=None and set_to_default=False
Constructing annotation "xmin" for observable type "object" with path ['bndbox', 'xmin'], type <class 'int'>, default value=None and set_to_default=False
Constructing annotation "ymin" for observable type "object" with path ['bndbox', 'ymin'], type <class 'int'>, default value=None and set_to_default=False
Constructing annotation "xmax" for observable type "object" with path ['bndbox', 'xmax'], type <class 'int'>, default value=None and 

In [26]:
# Format dataset at Datum disk format
formatter_root_path = Path('/home/clacroix/tmp/datum_datum_format_test/')
formatter = DatumFormatter(formatter_root_path)
formatter.format(coco_val_reduced) # format dataset

# Reload from Datum format (same directory the formatter was given)
coco_val_reduced_2 = load_datum_dataset(formatter_root_path)

# Compare the same entry before and after the round trip through disk.
# In the recorded output the only visible difference is 'dir': a PosixPath in the
# in-memory dataset, a plain string in the reloaded one.
entry, _ = coco_val_reduced['000000491497']
print('coco_val_reduced - Entry "000000491497" : \n {} \n'.format(entry))

entry, _ = coco_val_reduced_2['000000491497']
print('coco_val_reduced_2 - Entry "000000491497" : \n {} \n'.format(entry))

coco_val_reduced - Entry "000000491497" : 
 {'name': '000000491497', 'filename': '000000491497.JPG', 'dir': PosixPath('/home/clacroix/rouky/workspace_corentin/databases/coco/val2017'), 'width': 375, 'height': 500, 'depth': 3, 'set': 'val2017', 'idx': 13, 'obs_ids': [145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155], 'path': '/home/clacroix/rouky/workspace_corentin/databases/coco/val2017/000000491497.JPG'} 

coco_val_reduced_2 - Entry "000000491497" : 
 {'name': '000000491497', 'filename': '000000491497.JPG', 'dir': '/home/clacroix/rouky/workspace_corentin/databases/coco/val2017', 'width': 375, 'height': 500, 'depth': 3, 'set': 'val2017', 'idx': 13, 'obs_ids': [145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155], 'path': '/home/clacroix/rouky/workspace_corentin/databases/coco/val2017/000000491497.JPG'} 



## Extract dataframes from dataset

In [23]:
# Entries as a table: one row per entry, one column per entry attribute
# (presumably a pandas DataFrame, judging by the rendered output -- confirm)
coco_val.entries_df

Unnamed: 0,name,filename,dir,width,height,depth,set,idx,obs_ids,path
0,000000397133,000000397133.JPG,/home/clacroix/rouky/workspace_corentin/databa...,640,427,3,val2017,0,"[158, 292, 620, 1422, 1491, 1507, 1795, 2091, ...",/home/clacroix/rouky/workspace_corentin/databa...
1,000000037777,000000037777.JPG,/home/clacroix/rouky/workspace_corentin/databa...,352,230,3,val2017,1,"[31, 239, 248, 293, 899, 1544, 1770, 1830, 278...",/home/clacroix/rouky/workspace_corentin/databa...
2,000000252219,000000252219.JPG,/home/clacroix/rouky/workspace_corentin/databa...,640,428,3,val2017,2,"[1255, 1262, 1269, 2493, 2527, 2650, 3289]",/home/clacroix/rouky/workspace_corentin/databa...
3,000000087038,000000087038.JPG,/home/clacroix/rouky/workspace_corentin/databa...,640,480,3,val2017,3,"[314, 2072, 2090, 2121, 2164, 2195, 2204, 2304...",/home/clacroix/rouky/workspace_corentin/databa...
4,000000174482,000000174482.JPG,/home/clacroix/rouky/workspace_corentin/databa...,640,388,3,val2017,4,"[336, 395, 436, 1153, 1156, 2325, 2340, 2344, ...",/home/clacroix/rouky/workspace_corentin/databa...
...,...,...,...,...,...,...,...,...,...,...
4996,000000168974,000000168974.JPG,/home/clacroix/rouky/workspace_corentin/databa...,375,500,3,val2017,4996,"[33297, 33781, 36154]",/home/clacroix/rouky/workspace_corentin/databa...
4997,000000552775,000000552775.JPG,/home/clacroix/rouky/workspace_corentin/databa...,375,500,3,val2017,4997,"[33003, 33054, 34112, 34130, 34292, 35462, 356...",/home/clacroix/rouky/workspace_corentin/databa...
4998,000000394940,000000394940.JPG,/home/clacroix/rouky/workspace_corentin/databa...,426,640,3,val2017,4998,"[34080, 35761, 35835, 36217]",/home/clacroix/rouky/workspace_corentin/databa...
4999,000000015335,000000015335.JPG,/home/clacroix/rouky/workspace_corentin/databa...,640,480,3,val2017,4999,"[33072, 34532, 34550, 34581, 34650, 34665, 347...",/home/clacroix/rouky/workspace_corentin/databa...


In [24]:
# Observables as a table: one row per observable, one column per observable attribute
# (presumably a pandas DataFrame, judging by the rendered output -- confirm)
coco_val.observables_df

Unnamed: 0,type,iscrowd,name,xmin,ymin,xmax,ymax,idx,entry_id,area,width
0,object,0.0,dog,476.07,395.93,511.72,424.60,0,3828,1050.7555,35.65
1,object,0.0,dog,275.10,200.23,424.07,480.00,1,1392,41957.1069,148.97
2,object,0.0,dog,127.71,196.18,497.56,552.99,2,756,132322.9885,369.85
3,object,0.0,dog,115.71,154.82,480.00,634.17,3,495,175101.7615,364.29
4,object,0.0,dog,203.61,89.65,600.83,340.67,4,4052,99961.1844,397.22
...,...,...,...,...,...,...,...,...,...,...,...
36778,object,1.0,bottle,9.00,75.00,480.00,338.00,36778,1486,124136.0000,471.00
36779,object,1.0,banana,13.00,41.00,413.00,193.00,36779,2109,60952.0000,400.00
36780,object,1.0,person,3.00,34.00,639.00,422.00,36780,664,247156.0000,636.00
36781,object,,horse,3.00,0.00,10.00,10.00,36781,5000,80.0000,7.00
