### Preprocessing: Get Cropped Images for AniML Dataset
Reads a JSON file from AniML containing image names, bounding box coordinates and labels, and produces one cropped image per bounding box for each observation.

Inputs:
- Json file containing image names, bounding box coordinates and labels.
- List of labels to skip
- Dictionary to provide name matching between AniML and Wildlife Insights

Outputs:
- Dataframe with image ID, image name, label, and species' "common name" for given label.

In [1]:
import json
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import random
%matplotlib inline

In [2]:
# create dictionary to store image paths
def image_path_dict(data):
    """Map each image id in ``data["images"]`` to its path under ``images/``.

    The ``file_name`` of every entry is rewritten before it is prefixed with
    ``images/``:

    * a space directly in front of a slash is removed,
    * every remaining space becomes an underscore,
    * every colon becomes an underscore.

    Returns a dict ``{image id: "images/<rewritten file_name>"}``.
    """
    # order matters: " /" has to be collapsed before the remaining spaces
    # are turned into underscores
    substitutions = ((" /", "/"), (" ", "_"), (":", "_"))

    def _rewrite(file_name):
        for old, new in substitutions:
            file_name = file_name.replace(old, new)
        return file_name

    return {
        entry["id"]: f"images/{_rewrite(entry['file_name'])}"
        for entry in data["images"]
    }

In [3]:
# crop given image based on bounding box coordinates
def crop_box(img, box):
    """Return the part of ``img`` covered by a COCO-style bounding box.

    Parameters
    ----------
    img : three-dimensional array indexed as (row, column, channel).
    box : sequence whose first four items are (x, y, width, height).
        The origin may be negative, and the box may extend past the image.

    Returns
    -------
    ``img[y0:y1, x0:x1, :]`` where the box has been clipped to the image
    bounds. The result is empty when the box lies entirely outside the image.
    """
    y_dim, x_dim, _ = img.shape

    x, y, width, height = box[0], box[1], box[2], box[3]

    # The far edges are derived from the *unclamped* origin: clamping first
    # and then adding the full width/height would make the crop reach past
    # the real box whenever the origin is negative.
    x1 = min(max(x + width, 0), x_dim)
    y1 = min(max(y + height, 0), y_dim)

    # Clip the near edges as well; a negative start would otherwise be
    # interpreted by slicing as "count from the end".
    x0 = min(max(x, 0), x_dim)
    y0 = min(max(y, 0), y_dim)

    return img[y0:y1, x0:x1, :]

In [4]:
# create dictionary to store categories
def cat_dictionary(data, labels_to_skip, name_matching):
    """Build ``{category id: matched name}`` for the categories that are kept.

    Every entry of ``data["categories"]`` whose ``id`` is not in
    ``labels_to_skip`` is included; its ``name`` is translated through
    ``name_matching``. A kept category whose name is missing from
    ``name_matching`` raises ``KeyError``.
    """
    return {
        entry["id"]: name_matching[entry["name"]]
        for entry in data["categories"]
        if entry["id"] not in labels_to_skip
    }

In [5]:
# create cropped images and store them in given path
def create_cropped_images(data, image_paths, path = "images-bboxes/jldp", labels_to_skip = None):
    """Crop every usable annotation out of its image and save it under ``path``.

    Parameters
    ----------
    data : dict with an ``"annotations"`` list; each annotation provides
        ``"image_id"``, ``"category_id"`` and ``"bbox"``.
    image_paths : dict mapping image id to the path of the source image.
    path : directory the crops are written to; created on first write.
    labels_to_skip : optional collection of category ids to ignore. When
        omitted, the module-level ``labels_to_skip`` variable is used if it
        exists (this is what earlier versions of this function relied on).

    Returns
    -------
    ``(labels_img_id, labels_img_name, labels_label, errors)``. The first
    three lists are aligned and describe the crops that were written
    successfully. ``errors`` holds the annotations whose source image could
    not be read or whose crop could not be written.

    Crops are named ``{image_id}_{annotation index:04d}.jpg``, where the index
    is the position of the annotation in ``data["annotations"]``.
    """
    import os

    if labels_to_skip is None:
        # backward compatibility with callers that only set the global
        labels_to_skip = globals().get("labels_to_skip", ())

    labels_img_id = []
    labels_img_name = []
    labels_label = []
    errors = []
    output_dir_ready = False

    for i, annot in enumerate(data["annotations"]):

        img_id = annot["image_id"]
        label = annot["category_id"]
        bbox = annot["bbox"]

        # annotations without a usable bounding box cannot be cropped
        if bbox is None or len(bbox) == 0 or bbox[0] is None:
            continue

        if label in labels_to_skip:
            continue

        # cv2.imread reports an unreadable/missing file by returning None
        # instead of raising
        img = cv2.imread(image_paths[img_id])
        if img is None:
            errors.append(annot)
            continue

        img_bbox = crop_box(img, bbox)

        new_name = f"{img_id}_{i:04d}"
        new_path = f"{path}/{new_name}.jpg"

        # create the target directory lazily so nothing is created when
        # there is nothing to write
        if not output_dir_ready:
            os.makedirs(path, exist_ok=True)
            output_dir_ready = True

        # cv2.imwrite returns False on most failures and raises cv2.error
        # for e.g. an empty crop; treat both as a failed write
        try:
            written = cv2.imwrite(new_path, img_bbox)
        except cv2.error:
            written = False

        if not written:
            errors.append(annot)
            continue

        # only record crops that actually exist on disk
        labels_img_id.append(img_id)
        labels_img_name.append(new_name)
        labels_label.append(label)

    return labels_img_id, labels_img_name, labels_label, errors

In [6]:
# create labels dataframe
def create_labels_df(labels_img_id, labels_img_name, labels_label, categories):
    """Assemble the labels dataframe from the three aligned lists.

    The frame has the columns ``img_id``, ``img_name`` and ``label`` plus
    ``common_name``, which is the entry of ``categories`` for each label.
    A label that is missing from ``categories`` raises ``KeyError``.
    """
    def _common_name(label_id):
        return categories[label_id]

    columns = {
        "img_id": labels_img_id,
        "img_name": labels_img_name,
        "label": labels_label,
    }
    frame = pd.DataFrame(columns)
    frame["common_name"] = frame["label"].apply(_common_name)
    return frame

In [7]:
# create df with images and labels based on given json path
def get_img_label_df(animl_json, labels_to_skip, name_matching):
    """Run the whole preprocessing for one AniML json file.

    Loads ``animl_json``, builds the image-path and category dictionaries,
    writes the cropped images through ``create_cropped_images`` and returns
    the labels dataframe built by ``create_labels_df``.

    Parameters
    ----------
    animl_json : path of the json file to read.
    labels_to_skip : category ids to leave out of the category dictionary.
    name_matching : dict translating category names in the json file to the
        names stored in the ``common_name`` column.

    NOTE(review): ``create_cropped_images`` is called without
    ``labels_to_skip`` and looks that name up at notebook level, so the
    notebook-level variable must hold the same ids as the argument passed
    here. The list of failed annotations it returns is discarded.
    """
    # the context manager closes the file even if json parsing fails
    with open(animl_json) as file:
        data = json.load(file)

    # dictionary to store image paths
    image_paths = image_path_dict(data)

    # dictionary to store categories
    categories = cat_dictionary(data, labels_to_skip, name_matching)

    # image ids, names and labels
    labels_img_id, labels_img_name, labels_label, _ = create_cropped_images(data, image_paths)

    # dataframe to store labels
    labels = create_labels_df(labels_img_id, labels_img_name, labels_label, categories)
    return labels

In [8]:
# json file exported from AniML
animl_json = "jldp-animl-cct.json"

# category ids left out of the dataset, grouped by reason:
#   no useful info:   0 (empty), 10 (none), 11 (person)
#   don't make sense: 4 (boycot), 5 (c), 7 (dd), 13 (pwe)
#   too unspecific:   1 (animal), 2 (bird), 15 (rodent)
#   ambiguous:        12 (pig - contains domestic and wild pigs)
labels_to_skip = [0, 1, 2, 4, 5, 7, 10, 11, 12, 13, 15]

# manual mapping from AniML category names to the common names stored
# in the labels dataframe
name_matching = dict(
    bobcat="Bobcat",
    coyote="Coyote",
    deer="Mule Deer",
    dog="Domestic Dog",
    raccoon="Northern Raccoon",
    skunk="Striped Skunk",
)

# crop the images and collect one labels row per crop
labels = get_img_label_df(
    animl_json,
    labels_to_skip=labels_to_skip,
    name_matching=name_matching,
)

In [11]:
# show the labels dataframe (one row per cropped image)
labels

Unnamed: 0,img_id,img_name,label,common_name
0,1d180eeb08d46ae06b1f09f0a9580dae,1d180eeb08d46ae06b1f09f0a9580dae_0018,6,Coyote
1,4ce92a455b504ef50b9ddf5e2250f799,4ce92a455b504ef50b9ddf5e2250f799_0019,6,Coyote
2,ea1322f836ac39e9224f58aceff01919,ea1322f836ac39e9224f58aceff01919_0020,6,Coyote
3,5e725bbdf56bf8e575ac088f4d13381c,5e725bbdf56bf8e575ac088f4d13381c_0021,6,Coyote
4,d97c58fa1279da3f926cc80744437608,d97c58fa1279da3f926cc80744437608_0022,6,Coyote
...,...,...,...,...
2670,jldp:02ea11cea3794ee22cffdd66235a8e76,jldp:02ea11cea3794ee22cffdd66235a8e76_6150,8,Mule Deer
2671,jldp:7b1e0549b33d2ca922afe2423cf3367c,jldp:7b1e0549b33d2ca922afe2423cf3367c_6151,6,Coyote
2672,jldp:c2d28b49b86c2f8c1baec791134eab53,jldp:c2d28b49b86c2f8c1baec791134eab53_6152,6,Coyote
2673,jldp:269e05af4f8b1fcc803537a147b4ec95,jldp:269e05af4f8b1fcc803537a147b4ec95_6153,6,Coyote


In [10]:
# write the labels dataframe to df_animl.csv
labels.to_csv("df_animl.csv")