### Load data from the .csv files
- `train-images-boxable.csv` contains each image's name and URL
- `train-annotations-bbox.csv` contains the bounding-box info, together with the image ID (name) and the image label name
- `class-descriptions-boxable.csv` maps each image label name to its class name

https://storage.googleapis.com/openimages/web/download.html
https://storage.googleapis.com/openimages/2018_04/bbox_labels_600_hierarchy_visualizer/circle.html

In [None]:
import os
import random
import numpy as np
import pandas as pd
from skimage import io

In [None]:
# Base directory for the files this notebook writes (CSV subsets, images):
# the working directory at the moment this cell is executed.
DIR = os.path.abspath(os.curdir)

In [None]:
# Image index table; later cells read its 'image_name' and 'image_url' columns.
images_boxable = pd.read_csv(filepath_or_buffer='train-images-boxable.csv')
# Preview the first five rows as the cell output.
images_boxable.head(n=5)

In [None]:
# Bounding-box annotation table; later cells read its 'LabelName' and
# 'ImageID' columns.
annotations_bbox = pd.read_csv(filepath_or_buffer='train-annotations-bbox.csv')
# Preview the first five rows as the cell output.
annotations_bbox.head(n=5)

In [None]:
# Lookup table between label names ('name' column) and readable class names
# ('class' column).
# NOTE(review): the 'name'/'class' lookups in later cells need this CSV to
# start with a matching header row -- confirm the local copy has one.
class_descriptions = pd.read_csv(filepath_or_buffer='class-descriptions-boxable.csv')
# Preview the first five rows as the cell output.
class_descriptions.head(n=5)

### Get the subset containing maples and Christmas trees

In [None]:
# Keep only the class-description rows whose readable class name matches
# each of the two target classes.
maple_pd = class_descriptions.loc[class_descriptions['class'].eq('Maple')]
christmas_tree_pd = class_descriptions.loc[class_descriptions['class'].eq('Christmas tree')]

In [None]:
# Label name (value of the 'name' column) from the first matching row of each
# selection. An empty selection raises IndexError here.
label_name_maple = maple_pd['name'].iloc[0]
label_name_christmas_tree = christmas_tree_pd['name'].iloc[0]

In [None]:
# Show the label names resolved for the two classes.
print(f'Maple label: {label_name_maple}')
print(f'Christmas tree: {label_name_christmas_tree}')

In [None]:
# Bounding-box rows annotated with each target label name.
maple_bbox = annotations_bbox.loc[annotations_bbox['LabelName'].eq(label_name_maple)]
christmas_tree_bbox = annotations_bbox.loc[annotations_bbox['LabelName'].eq(label_name_christmas_tree)]

In [None]:
# Report how many bounding-box rows were found per class.
print(f'{maple_bbox.shape[0]} maples in the dataset')
print(f'{christmas_tree_bbox.shape[0]} christmas trees in the dataset')

In [None]:
# An image can carry several boxes of the same class, so collapse the
# 'ImageID' column of each selection to its sorted unique values.
maple_img_id = np.unique(maple_bbox['ImageID'])
christmas_tree_id = np.unique(christmas_tree_bbox['ImageID'])

In [None]:
# Report how many distinct images contain each class.
print(f'{maple_img_id.size} images which contain maples')
print(f'{christmas_tree_id.size} images which contain christmas trees')

Pick 1000 random images for each class (or all of them, if a class has fewer than 1000 images)

In [None]:
# Number of image ids to keep per class; slicing returns fewer when a class
# has fewer ids than this.
n = 1000

# Shuffle a copy of each id array (the originals stay untouched). The global
# RNG is re-seeded before each shuffle so the selection is reproducible.
copy_maple_id = np.copy(maple_img_id)
random.seed(1)
random.shuffle(copy_maple_id)
subset_maple_img_id = copy_maple_id[:n]

copy_christmas_tree_id = np.copy(christmas_tree_id)
random.seed(1)
random.shuffle(copy_christmas_tree_id)
subset_christmas_tree_img_id = copy_christmas_tree_id[:n]

In [None]:
# For every selected image id, pull the matching row(s) of the image index.
# The index stores file names, hence the '.jpg' suffix added to each id.
# Each list holds one (possibly empty) DataFrame per id.
subset_maple_img_url = [
    images_boxable.loc[images_boxable['image_name'].eq(f'{name}.jpg')]
    for name in subset_maple_img_id
]
subset_christmas_tree_img_url = [
    images_boxable.loc[images_boxable['image_name'].eq(f'{name}.jpg')]
    for name in subset_christmas_tree_img_id
]

In [None]:
# Stack the per-image lookup results into one table per class and save each
# table as a CSV file under DIR.
#
# pd.concat replaces the row-by-row DataFrame.append loop: DataFrame.append
# no longer exists in pandas >= 2.0, and the shared loop indexed the
# christmas-tree list by the length of the maple list, which fails (or drops
# rows) whenever the two lists differ in length.
# pd.concat raises on an empty list, so fall back to an empty DataFrame,
# which is what the original loop produced in that case.
subset_maple_df = (
    pd.concat(subset_maple_img_url, ignore_index=True)
    if len(subset_maple_img_url) > 0
    else pd.DataFrame()
)
subset_christmas_tree_df = (
    pd.concat(subset_christmas_tree_img_url, ignore_index=True)
    if len(subset_christmas_tree_img_url) > 0
    else pd.DataFrame()
)

subset_maple_df.to_csv(os.path.join(DIR, 'subset_maple.csv'))
subset_christmas_tree_df.to_csv(os.path.join(DIR, 'subset_christmas_tree.csv'))

In [None]:
# Preview the first five rows of the maple subset as the cell output.
subset_maple_df.head(n=5)

In [None]:
# Download URLs of the selected images, one array per class.
maple_urls = subset_maple_df['image_url'].to_numpy()
christmas_tree_urls = subset_christmas_tree_df['image_url'].to_numpy()

### Download images

In [None]:
# Create the image output folder under DIR. exist_ok=True keeps the cell
# re-runnable: a plain os.mkdir raises FileExistsError once the folder exists.
os.makedirs(os.path.join(DIR, 'images'), exist_ok=True)

In [None]:
# Download every selected image and store it in DIR/images, named after the
# last path component of its URL.
#
# The images were previously written directly into DIR, leaving the 'images'
# folder created for them unused.
images_dir = os.path.join(DIR, 'images')
# Make sure the target folder exists even when this cell is run on its own.
os.makedirs(images_dir, exist_ok=True)

# Same download/save procedure for both classes, maples first.
for urls in (maple_urls, christmas_tree_urls):
    for url in urls:
        img = io.imread(url)
        img_path = os.path.join(images_dir, url.split('/')[-1])
        io.imsave(img_path, img)