In [14]:
import numpy as np
from PIL import Image
import requests
import io
import json
import datetime as dt
import logging
from shapely import wkt
from shapely.geometry import Polygon
import requests
from PIL import Image
import cv2
from tqdm.notebook import tqdm
import os

In [15]:
# Path of the label export JSON to convert. Despite the "_url" suffix, this
# value is passed to open() as a local file path rather than fetched over HTTP.
data_json_url = 'export-2022-12-04T18_21_34.728Z.json'
# Alternative input file:
# data_json_url = 'data.json'

In [16]:
# Parse KEY=VALUE pairs from the local .env file into ENV, then build the
# Authorization header used for authenticated mask downloads.
ENV = {}
with open(".env", 'r') as f:
    for line_number, raw_line in enumerate(f, start=1):
        # strip() also removes a trailing '\r' from CRLF files, which would
        # otherwise end up inside the value (and therefore inside the token).
        line = raw_line.strip()
        # Blank lines and comments carry no setting.
        if not line or line.startswith('#'):
            continue
        # Split on the first '=' only: values such as API tokens may
        # themselves contain '=' characters.
        key, separator, value = line.partition("=")
        if not separator:
            raise ValueError(
                f".env line {line_number} is not of the form KEY=VALUE: {line!r}")
        ENV[key.strip()] = value.strip()
headers = {'Authorization': f"Bearer {ENV['LABELBOX_API_KEY']}"}

In [17]:
def url_to_mask(url):
    """Return the first channel of the mask image at ``url`` as a NumPy array.

    The image is cached under ``data/masks/`` using the part of the URL that
    follows the Labelbox mask prefix (query string removed). A cached file is
    reused; otherwise the image is downloaded with the module-level
    ``headers`` and written to the cache.

    Args:
        url: Mask image URL.

    Returns:
        2-D array holding channel 0 of the image.
        NOTE(review): indexing with ``[:, :, 0]`` assumes the decoded image
        has a channel axis (e.g. RGB/RGBA) - confirm for all mask sources.

    Raises:
        requests.exceptions.HTTPError: if the download returns an error status.
    """
    mask_name = url.replace('https://api.labelbox.com/masks/feature/', '').split('?')[0]
    filepath = f"data/masks/{mask_name}"
    if os.path.exists(filepath):
        img = Image.open(filepath)
    else:
        response = requests.get(url, headers=headers)
        # Fail loudly rather than caching an error body as if it were a mask;
        # a poisoned cache entry would break every later run as well.
        response.raise_for_status()
        # The cache directory does not exist on a fresh checkout.
        os.makedirs(os.path.dirname(filepath), exist_ok=True)
        with open(filepath, 'wb') as f:
            f.write(response.content)
        img = Image.open(io.BytesIO(response.content))
    return np.asarray(img)[:, :, 0]

### Building the actual dataset

In [18]:
# Load the label export. JSON text is UTF-8, so the encoding is stated
# explicitly instead of relying on the platform default.
with open(data_json_url, 'r', encoding='utf-8') as f:
    label_data = json.load(f)

In [19]:
# COCO document skeleton: the list fields start empty, and the "info" header
# is taken from the first record of the label export.
coco = dict(
    info=dict(
        year=dt.datetime.now(dt.timezone.utc).year,
        version=None,
        description=label_data[0]['Project Name'],
        contributor=label_data[0]['Created By'],
        url='labelbox.com',
        date_created=dt.datetime.now(dt.timezone.utc).isoformat(),
    ),
    images=[],
    annotations=[],
    licenses=[],
    categories=[],
)


In [20]:
# Collect every distinct object class name that appears in the labels.
categories = set()
for data in label_data:
    for obj in data['Label']['objects']:
        categories.add(obj['value'])

# Number the categories in sorted order. Iteration order of a set of strings
# varies between interpreter runs, so numbering the set directly would assign
# different ids to the same category on every kernel restart.
category_ids = {category: idx + 1 for idx, category in enumerate(sorted(categories))}
coco_categories = [{"supercategory": category, "id": category_id, "name": category}
                   for category, category_id in category_ids.items()]
coco['categories'] = coco_categories


def category_to_id(category):
    """Return the 1-based COCO category id assigned to ``category``.

    Args:
        category: Category name as found in an object's ``'value'`` field.

    Raises:
        ValueError: if ``category`` did not occur in ``label_data``.
    """
    try:
        return category_ids[category]
    except KeyError:
        raise ValueError(f"{category!r} is not a known category") from None

In [21]:
# Convert every labelled image into one COCO "image" entry plus one
# "annotation" entry per labelled object.
for data in tqdm(label_data):
    # Records without any labelled object contribute nothing to the dataset.
    if len(data['Label']['objects']) < 1: continue
    try:
        response = requests.get(data['Labeled Data'], stream=True)
    except requests.exceptions.MissingSchema:
        logging.exception(('"Labeled Data" field must be a URL. '
                            'Support for local files coming soon'))
        continue
    except requests.exceptions.ConnectionError:
        logging.exception('Failed to fetch image from {}'
                            .format(data['Labeled Data']))
        continue

    # An error response (e.g. an expired signed URL) is not an image; skip it
    # the same way connection failures are skipped instead of crashing in PIL.
    if not response.ok:
        logging.error('Failed to fetch image from {} (HTTP status {})'
                        .format(data['Labeled Data'], response.status_code))
        response.close()
        continue

    # The image is opened only to read its dimensions.
    response.raw.decode_content = True
    im = Image.open(response.raw)
    width, height = im.size
    # Release the streamed connection now that the size is known.
    response.close()

    image = {
        "id": data['ID'],
        "width": width,
        "height": height,
        "file_name": data['Labeled Data'].replace("https://storage.labelbox.com/", "").split("?")[0],
        "license": None,
        "flickr_url": data['Labeled Data'],
        "coco_url": data['Labeled Data'],
        "date_captured": None,
        "categories": coco_categories,
    }

    coco['images'].append(image)

    for obj in data['Label']['objects']:
        cat_id = category_to_id(obj['value'])
        imgray = url_to_mask(obj['instanceURI'])
        _, thresh = cv2.threshold(imgray, 127, 255, cv2.THRESH_BINARY)
        contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

        # A mask with no pixel above the threshold yields no contour at all.
        if len(contours) == 0: continue

        # Each contour entry wraps a single (x, y) point; the last contour is used.
        # NOTE(review): with RETR_TREE the last contour is not necessarily the
        # outer or the largest one - confirm this is the intended choice.
        points = [point[0] for point in contours[-1]]

        # The degenerate-shape check must run on the current object's points
        # and before the polygon is built: fewer than three points cannot
        # form a polygon.
        if len(points) < 3: continue
        polygon = Polygon(points)

        # COCO stores a polygon as a flat [x0, y0, x1, y1, ...] list.
        segmentation = []
        for x, y in polygon.exterior.coords:
            segmentation.extend([x, y])

        min_x, min_y, max_x, max_y = polygon.bounds
        annotation = {
            "id": len(coco['annotations']) + 1,
            "image_id": data['ID'],
            "category_id": cat_id,
            "segmentation": [segmentation],
            "area": polygon.area,
            # COCO bounding boxes are [x, y, width, height].
            "bbox": [min_x, min_y, max_x - min_x, max_y - min_y],
            "iscrowd": 0
        }

        coco['annotations'].append(annotation)

  0%|          | 0/193 [00:00<?, ?it/s]

In [23]:
# Serialize first: opening the file in 'w' mode truncates it, so a
# serialization error raised afterwards would destroy an existing
# annotations file.
coco_json = json.dumps(coco)
with open('data/annotations_dec.json', 'w') as f:
    f.write(coco_json)