# Open Images Dataset V4 Starter

[Open Images Dataset V4](https://storage.googleapis.com/openimages/web/index.html)

This notebook explains how to prepare an annotation database for the Open Images Dataset V4.

In [1]:
# Install dl-cliche in advance: $ pip install git+https://github.com/daisukelab/dl-cliche.git@master
from dlcliche.notebook import *
from dlcliche.utils import *
from od_anno import *

### 1. Set your folder

In [None]:
# Root folder of your local Open Images Dataset V4 copy.
# '/path/to/oidv4' is a placeholder: replace it with your own path before running the cells below.
OIDV4 = Path('/path/to/oidv4')

### 2. Confirm you have the following files

In [2]:
# List the dataset root folder so you can confirm the expected CSV files and image folders are present.
! ls {OIDV4}

class-descriptions-boxable.csv
test
test-annotations-bbox.csv
test-annotations-human-imagelabels-boxable.csv
test-images-with-rotation.csv
train
train-annotations-bbox.csv
train_annotations.csv
train-annotations-human-imagelabels-boxable.csv
train-image-ids-with-human-parts-and-mammal-boxes.txt
train-images-boxable-with-rotation.csv
validation
validation-annotations-bbox.csv
validation-annotations-human-imagelabels-boxable.csv
validation-images-with-rotation.csv


### 3. Create annotation database

Run the following command to create `annotations.csv` inside the folder that holds your copy of the dataset.

```sh
$ python make_database_oidv4.py /path/to/oidv4/annotations.csv /path/to/oidv4
```

This gathers all the annotation data together; the main job is collecting the shape of every image.
It then writes a single database CSV file for further use.

CAUTION: This takes hours to complete, so run it in a shell rather than in the notebook.

### 4. Check the created database

In [3]:
# Open the annotation database created in step 3, then preview its first five rows.
od = ODAnno(OIDV4 / 'annotations.csv', OIDV4)
od.anno_df.head(5)

Unnamed: 0,ImageID,File,Label,XMin,XMax,YMin,YMax,Width,Height,Rotation,Split,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
0,000002b66c9c498e,train/000002b66c9c498e.jpg,Person,0.0125,0.195312,0.148438,0.5875,1024.0,1024.0,0.0,train,0,1,0,0,0
1,000002b66c9c498e,train/000002b66c9c498e.jpg,Person,0.025,0.276563,0.714063,0.948438,1024.0,1024.0,0.0,train,0,1,0,0,0
2,000002b66c9c498e,train/000002b66c9c498e.jpg,Person,0.151562,0.310937,0.198437,0.590625,1024.0,1024.0,0.0,train,1,0,0,0,0
3,000002b66c9c498e,train/000002b66c9c498e.jpg,Person,0.25625,0.429688,0.651563,0.925,1024.0,1024.0,0.0,train,1,0,0,0,0
4,000002b66c9c498e,train/000002b66c9c498e.jpg,Person,0.257812,0.346875,0.235938,0.385938,1024.0,1024.0,0.0,train,1,0,0,0,0


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of the annotation table.
od.anno_df.describe()

Unnamed: 0,XMin,XMax,YMin,YMax,Width,Height,Rotation,IsOccluded,IsTruncated,IsGroupOf,IsDepiction,IsInside
count,14610230.0,14610230.0,14610230.0,14610230.0,14610230.0,14610230.0,12648960.0,14610230.0,14610230.0,14610230.0,14610230.0,14610230.0
mean,0.4021163,0.5945006,0.3923172,0.6709587,981.0224,769.7436,1.666226,0.6590767,0.2494148,0.05836226,0.05300998,0.0009387943
std,0.2775938,0.27827,0.2425361,0.2496681,138.8364,149.0829,19.84718,0.4771672,0.4361204,0.2407281,0.2306379,0.06270441
min,0.0,0.000625,0.0,0.000625,256.0,256.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
25%,0.160625,0.37207,0.204503,0.481646,1024.0,683.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.39,0.605,0.381875,0.68609,1024.0,766.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,0.623019,0.83625,0.566604,0.898876,1024.0,768.0,0.0,1.0,1.0,0.0,0.0,0.0
max,0.999375,1.0,0.999167,1.0,7360.0,7360.0,270.0,1.0,1.0,1.0,1.0,1.0


Now it's done. The following is the list of classes and the number of entries for each class.

In [11]:
# Print the number of entries per class label, taken from the `Label` column of the annotation table.
print_class_balance('Class balance', od.anno_df.Label)

Class balance = {'Accordion': 955, 'Adhesive tape': 255, 'Aircraft': 1898, 'Airplane': 21285, 'Alarm clock': 169, 'Alpaca': 829, 'Ambulance': 447, 'Animal': 17442, 'Ant': 925, 'Antelope': 1568, 'Apple': 3898, 'Armadillo': 56, 'Artichoke': 376, 'Asparagus': 387, 'Auto part': 13586, 'Axe': 148, 'Backpack': 1216, 'Bagel': 640, 'Baked goods': 23010, 'Balance beam': 326, 'Ball': 6845, 'Balloon': 13505, 'Banana': 1612, 'Band-aid': 36, 'Banjo': 264, 'Barge': 983, 'Barrel': 2086, 'Baseball bat': 1228, 'Baseball glove': 2529, 'Bat': 655, 'Bathroom accessory': 2678, 'Bathroom cabinet': 358, 'Bathtub': 545, 'Beaker': 168, 'Bear': 427, 'Bed': 3563, 'Bee': 11401, 'Beehive': 511, 'Beer': 9565, 'Beetle': 3523, 'Bell pepper': 802, 'Belt': 422, 'Bench': 7229, 'Bicycle': 40161, 'Bicycle helmet': 15952, 'Bicycle wheel': 59521, 'Bidet': 440, 'Billboard': 9823, 'Billiard table': 912, 'Binoculars': 123, 'Bird': 47921, 'Blender': 235, 'Blue jay': 259, 'Boat': 79113, 'Bomb': 8, 'Book': 41280, 'Bookcase': 5307