In [None]:
!pip install ibm-cos-sdk python-dotenv

In [None]:
import os
import shutil
import json
import uuid

import ibm_boto3

from dotenv import load_dotenv

In [None]:
# Let python-dotenv load settings from a local .env file (if one exists) into
# the process environment, so the os.getenv lookups in the next cell can find
# them.
load_dotenv()

In [None]:
# Connection settings are read from the environment; any variable that is not
# set falls back to an empty string.
bucket, access_key_id, secret_access_key, endpoint_url = (
    os.getenv(setting_name, "")
    for setting_name in ("BUCKET", "ACCESS_KEY_ID", "SECRET_ACCESS_KEY", "ENDPOINT_URL")
)

In [None]:
# Create the S3-compatible Cloud Object Storage resource handle from the
# credentials and endpoint read from the environment.
cos = ibm_boto3.resource(
    "s3",
    endpoint_url=endpoint_url,
    aws_access_key_id=access_key_id,
    aws_secret_access_key=secret_access_key,
)

# Load the annotations: fetch "_annotations.json" from the bucket, parse it,
# and keep the value stored under its "annotations" key.
try:
    annotations_object = cos.Object(bucket, "_annotations.json")
    annotations = json.loads(annotations_object.get()["Body"].read())["annotations"]
except Exception as e:
    print("Unable to retrieve annotations: {}".format(e))
    # Fall back to an empty mapping so `annotations` is always defined (and
    # never stale from an earlier run); otherwise the failure would resurface
    # in the next cell as an unrelated NameError.
    annotations = {}

In [None]:
# Download every annotated image into data/<label>/ so that each label gets
# its own folder of images.
data_dir = "data"
# exist_ok lets this cell be re-run without failing on the existing folder
os.makedirs(data_dir, exist_ok=True)

# collect the distinct labels; sorting makes the processing order deterministic
labels = sorted({annotation["label"] for image in annotations.values() for annotation in image})

for label in labels:
    # find the images that carry the given label; each image is listed once,
    # even when several of its annotations share this label
    image_list = [
        image_name
        for image_name, image_annotations in annotations.items()
        if any(annotation["label"] == label for annotation in image_annotations)
    ]

    # make directory for the label to store images in
    train_label_dir = os.path.join(data_dir, label)
    os.makedirs(train_label_dir, exist_ok=True)

    # download the images into their label folder
    for im in image_list:
        # a random file name avoids collisions between objects whose keys
        # differ only by their prefix; the original extension is kept
        extension = os.path.splitext(im)[1]
        target_path = os.path.join(train_label_dir, str(uuid.uuid4()) + extension)
        try:
            cos.meta.client.download_file(bucket, im, target_path)
        except Exception as e:
            # best effort: report the failure and carry on with the next image
            print("Error: {}, skipping {}...".format(e, im))