In [1]:
# install requirements
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from the cells below.

%pip install boto3
%pip install tqdm
%pip install requests
%pip install pandas
%pip install opencv-python

Collecting boto3
  Downloading boto3-1.34.86-py3-none-any.whl (139 kB)
[K     |████████████████████████████████| 139 kB 1.2 MB/s 
[?25hCollecting jmespath<2.0.0,>=0.7.1
  Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Collecting s3transfer<0.11.0,>=0.10.0
  Downloading s3transfer-0.10.1-py3-none-any.whl (82 kB)
[K     |████████████████████████████████| 82 kB 579 kB/s 
[?25hCollecting botocore<1.35.0,>=1.34.86
  Downloading botocore-1.34.86-py3-none-any.whl (12.1 MB)
[K     |████████████████████████████████| 12.1 MB 9.6 MB/s 
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.34.86 botocore-1.34.86 jmespath-1.0.1 s3transfer-0.10.1
Collecting pandas
  Downloading pandas-2.2.2-cp39-cp39-macosx_11_0_arm64.whl (11.3 MB)
[K     |████████████████████████████████| 11.3 MB 35.3 MB/s 
[?25hCollecting pytz>=2020.1
  Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
[K     |████████████████████████████████| 505 kB 3.1 MB/s 
Collecti

In [2]:
import ast
import os
import shutil
import argparse
import sys

import requests

import pandas as pd


def _download_if_missing(url):
    """Download ``url`` into the current directory unless the file already exists."""
    filename = url.split('/')[-1]
    if os.path.exists(filename):
        return

    print('downloading {}...'.format(filename))
    # Stream to a temporary name and rename at the end, so an interrupted or
    # failed download is never mistaken for a complete cached file.
    partial_path = filename + '.part'
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(partial_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, filename)


def _iter_bbox_rows(csv_path):
    """Yield (image_id, label, x1, x2, y1, y2) string tuples from a bbox annotation CSV."""
    with open(csv_path, 'r') as f:
        for line in f:
            fields = line.rstrip('\r\n').split(',')
            # Blank or truncated lines carry no usable box; skip them.
            if len(fields) < 8:
                continue
            yield fields[0], fields[2], fields[4], fields[5], fields[6], fields[7]


def process(classes, data_out_dir, yolov8_format, max_number_images_per_class):
    """Download Open Images samples for ``classes`` and write box annotations.

    Annotation CSVs, the class description CSV and ``downloader.py`` are
    fetched into the current working directory if they are not already there.
    Images are selected per split, downloaded through ``downloader.py`` and
    copied into ``data_out_dir`` together with one annotation text file per
    image (one ``class_index xc yc w h`` line per box).

    Args:
        classes: Display names of the classes to download. The written class
            index is the position of the class in this list.
        data_out_dir: Directory that receives the dataset. Existing split
            directories inside it are deleted and recreated.
        yolov8_format: If true, the result is laid out as
            ``images/<split>`` and ``labels/<split>``; otherwise as
            ``<split>/imgs`` and ``<split>/anns``.
        max_number_images_per_class: Maximum number of images selected per
            class in each split, or ``None`` for no limit.

    Raises:
        ValueError: If a name in ``classes`` is not a known display name.
        requests.HTTPError: If one of the required files cannot be downloaded.
    """
    import subprocess

    if max_number_images_per_class is None:
        max_number_images_per_class = sys.maxsize

    train_data_url = 'https://storage.googleapis.com/openimages/v6/oidv6-train-annotations-bbox.csv'
    val_data_url = 'https://storage.googleapis.com/openimages/v5/validation-annotations-bbox.csv'
    test_data_url = 'https://storage.googleapis.com/openimages/v5/test-annotations-bbox.csv'

    downloader_url = 'https://raw.githubusercontent.com/openimages/dataset/master/downloader.py'

    class_names_all_url = 'https://storage.googleapis.com/openimages/v7/oidv7-class-descriptions.csv'

    for url in [train_data_url, val_data_url, test_data_url, class_names_all_url, downloader_url]:
        _download_if_missing(url)

    bbox_files = [url.split('/')[-1] for url in (train_data_url, val_data_url, test_data_url)]

    # Map display name -> label id once; the first occurrence wins for duplicates.
    classes_all = pd.read_csv(class_names_all_url.split('/')[-1])
    label_by_name = {}
    for display_name, label_name in zip(classes_all['DisplayName'], classes_all['LabelName']):
        label_by_name.setdefault(display_name, label_name)

    class_ids = []
    for class_ in classes:
        if class_ not in label_by_name:
            raise ValueError('Class name not found: {}'.format(class_))
        class_ids.append(label_by_name[class_])

    # Label id -> index written to the annotation files (first position wins).
    class_index = {}
    for index, class_id in enumerate(class_ids):
        class_index.setdefault(class_id, index)

    # Pass 1: choose the images to download and write the downloader's list file.
    image_list_file_path = os.path.join('.', 'image_list_file')
    selected_ids = set()
    with open(image_list_file_path, 'w') as fw:
        for split_name, csv_name in zip(['train', 'validation', 'test'], bbox_files):
            images_per_class = [0] * len(class_ids)
            for image_id, class_name, _, _, _, _ in _iter_bbox_rows(csv_name):
                index = class_index.get(class_name)
                if index is None or image_id in selected_ids:
                    continue
                if images_per_class[index] >= max_number_images_per_class:
                    continue
                selected_ids.add(image_id)
                images_per_class[index] += 1
                fw.write('{}/{}\n'.format(split_name, image_id))
                # Nothing more can be selected from this split once every
                # class is full, so stop reading the (potentially huge) CSV.
                if min(images_per_class) >= max_number_images_per_class:
                    break

    out_dir = './.out'
    shutil.rmtree(out_dir, ignore_errors=True)
    # Run the downloader with the interpreter that runs this code, so it sees
    # the same installed packages.
    result = subprocess.run([sys.executable, 'downloader.py', image_list_file_path,
                             '--download_folder={}'.format(out_dir)])
    if result.returncode != 0:
        # Best effort: keep going with whatever was downloaded.
        print('warning: downloader.py exited with status {}'.format(result.returncode))

    downloaded = set(os.listdir(out_dir)) if os.path.isdir(out_dir) else set()

    for set_ in ['train', 'val', 'test']:
        for dir_ in [os.path.join(data_out_dir, set_),
                     os.path.join(data_out_dir, set_, 'imgs'),
                     os.path.join(data_out_dir, set_, 'anns')]:
            if os.path.exists(dir_):
                shutil.rmtree(dir_)
            os.makedirs(dir_)

    # Pass 2: copy each downloaded image into its split and append one
    # annotation line per bounding box of a requested class.
    for set_, csv_name in zip(['train', 'val', 'test'], bbox_files):
        print(csv_name)
        imgs_dir = os.path.join(data_out_dir, set_, 'imgs')
        anns_dir = os.path.join(data_out_dir, set_, 'anns')
        for image_id, class_name, x1, x2, y1, y2 in _iter_bbox_rows(csv_name):
            index = class_index.get(class_name)
            image_name = '{}.jpg'.format(image_id)
            if index is None or image_name not in downloaded:
                continue

            image_dst = os.path.join(imgs_dir, image_name)
            if not os.path.exists(image_dst):
                shutil.copy(os.path.join(out_dir, image_name), image_dst)

            # class_id, xc, yc, w, h
            x1, x2, y1, y2 = (float(value) for value in (x1, x2, y1, y2))
            xc = (x1 + x2) / 2
            yc = (y1 + y2) / 2
            w = x2 - x1
            h = y2 - y1
            with open(os.path.join(anns_dir, '{}.txt'.format(image_id)), 'a') as f_ann:
                f_ann.write('{} {} {} {} {}\n'.format(index, xc, yc, w, h))

    shutil.rmtree(out_dir, ignore_errors=True)

    if yolov8_format:
        for set_ in ['train', 'val', 'test']:
            images_dir = os.path.join(data_out_dir, 'images', set_)
            labels_dir = os.path.join(data_out_dir, 'labels', set_)
            for dir_ in [images_dir, labels_dir]:
                if os.path.exists(dir_):
                    shutil.rmtree(dir_)
                os.makedirs(dir_)

            for source_name, target_dir in [('imgs', images_dir), ('anns', labels_dir)]:
                source_dir = os.path.join(data_out_dir, set_, source_name)
                for filename in os.listdir(source_dir):
                    shutil.copy(os.path.join(source_dir, filename), os.path.join(target_dir, filename))

            shutil.rmtree(os.path.join(data_out_dir, set_))


In [3]:
# display names of every class to download from the Open Images dataset
classes = ['Person']

# directory that receives the prepared dataset
out_dir = './data'

# upper bound on the images selected per class in each split (None disables the limit)
max_number_images_per_class = 200

# True lays the result out as images/<split> and labels/<split>
yolov8_format = True

process(
    classes=classes,
    data_out_dir=out_dir,
    yolov8_format=yolov8_format,
    max_number_images_per_class=max_number_images_per_class,
)

downloading oidv6-train-annotations-bbox.csv...
downloading validation-annotations-bbox.csv...
downloading test-annotations-bbox.csv...
downloading oidv7-class-descriptions.csv...
downloading downloader.py...
oidv6-train-annotations-bbox.csv
validation-annotations-bbox.csv
test-annotations-bbox.csv


In [None]:
# zip the data directory (recursively) into data.zip in the current working directory
# NOTE(review): the absolute path assumes the notebook runs from /content, so that
# the './data' directory written above is /content/data - confirm outside Colab

!zip -r data.zip /content/data

In [None]:
# mount Google Drive at /content/gdrive
# NOTE(review): google.colab is presumably only importable inside a Colab runtime - confirm

from google.colab import drive

drive.mount('/content/gdrive')

In [6]:
# copy data to your Google Drive
# both paths are local (Drive is mounted as a directory), so a plain cp is enough

!cp '/content/data.zip' '/content/gdrive/My Drive/data.zip'