### Preprocessing - Merge Data
Merges cropped images (stored locally) with label annotations, cleans the merged data, and writes the resulting dataframe to `df.csv`.

Input:
- Path where cropped images from MegaDetector detections are stored.
- CSV file with species labels.

Output:
- Clean dataframe with image IDs and human-reviewed species labels.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2

In [2]:
def df_annotated(annotations):
    """Load the annotation CSV and return the human-reviewed species labels.

    Parameters
    ----------
    annotations : str or file-like
        Path (or buffer) of the annotations CSV. It must contain the columns
        ``identified_by``, ``image_id``, ``class``, ``species`` and
        ``common_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``image_id``, ``class``, ``species``, ``common_name`` with
        exact duplicate rows removed. The original row index is preserved.
    """
    labels = pd.read_csv(annotations)

    # keep everything that was not labelled by computer vision
    # (rows with a missing "identified_by" also pass this test)
    human_reviewed = labels["identified_by"] != "Computer vision"

    # unused columns are dropped before de-duplicating: the same animal can
    # appear twice when only a dropped column differs, e.g. image
    # "e6ea708e-001d-48d2-bc3c-74bc38309fae" has one observation with behavior
    # "walking" and another with "jumping"
    kept_columns = ["image_id", "class", "species", "common_name"]
    return labels.loc[human_reviewed, kept_columns].drop_duplicates()

In [3]:
def df_detected(images_path):
    """Return one row per cropped MegaDetector detection found in ``images_path``.

    Crop files are expected to be named ``{image_id}_{confidence}.{ext}``; the
    ``_{confidence}`` suffix is added by the previous preprocessing step.

    Parameters
    ----------
    images_path : str
        Directory containing the cropped images.

    Returns
    -------
    pandas.DataFrame
        ``image_id``: file name without extension and without the trailing
        ``_{confidence}``; ``img_name``: file name without extension;
        ``megadetector_output``: always 1, a flag marking that a detection
        exists for the image. Rows are ordered by file name.
    """
    # os.listdir returns entries in arbitrary order, so sort for reproducible
    # row order; hidden entries (.DS_Store, .ipynb_checkpoints, ...) are not crops
    file_names = sorted(
        entry for entry in os.listdir(images_path) if not entry.startswith(".")
    )

    # splitext strips only the final extension, so a dot elsewhere in the
    # name (e.g. a confidence written as "0.93") does not truncate it
    img_name = [os.path.splitext(file_name)[0] for file_name in file_names]

    # modify image ID to remove the extra "_{confidence}" suffix;
    # a name without "_" carries no suffix and is used as the ID unchanged
    img_id = []
    for name in img_name:
        stem, separator, _confidence = name.rpartition("_")
        img_id.append(stem if separator else name)

    # add column to check whether MegaDetector detected something in the image
    megadetector = [1] * len(img_id)

    df_img = pd.DataFrame({"image_id": img_id,
                           "img_name": img_name,
                           "megadetector_output": megadetector})

    return df_img

In [4]:
def df_subset(df):
    """Return the "clean" rows of the merged annotation/detection dataframe.

    A row is kept only when all of the following hold:

    1. its ``image_id`` has at least one non-null ``megadetector_output``
    2. ``common_name`` is present (a human-reviewed label exists)
    3. ``species`` is present
    4. ``species`` is not the placeholder ``"No CV Result"``

    Author's notes on the data: there are 3,326 images for which human
    reviewers found annotations but MegaDetector found nothing, and 49,598
    images have no human-reviewed label (``df["common_name"].isna().sum()``).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``image_id``, ``megadetector_output``,
        ``common_name`` and ``species``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index.
    """
    # 1) count() ignores NaN, so an image_id whose rows all lack a detection
    #    flag ends up with a count of 0
    detections_per_image = df.groupby("image_id")["megadetector_output"].count()
    undetected_ids = detections_per_image.index[detections_per_image == 0]
    has_detection = ~df["image_id"].isin(undetected_ids)

    # 2) human-reviewed label present
    has_label = df["common_name"].notna()

    # 3) species present
    has_species = df["species"].notna()

    # 4) species is not the "No CV Result" placeholder
    has_cv_result = df["species"] != "No CV Result"

    return df[has_detection & has_label & has_species & has_cv_result]

In [5]:
def df_uniqueobs(df):
    """Keep only the rows whose ``image_id`` has exactly one ``img_name`` row.

    There are issues when merging images with more than one observation, so
    for now only images with a single observation are included.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``image_id`` and ``img_name``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` belonging to single-observation images, with their
        original index.
    """
    # count() skips missing img_name values, so an image without any crop
    # gets a count of 0 and is excluded as well
    rows_per_image = df.groupby("image_id")["img_name"].count()
    single_observation_ids = rows_per_image.index[rows_per_image == 1]
    return df[df["image_id"].isin(single_observation_ids)]

In [6]:
# inputs (relative paths): annotation CSV and directory with the cropped detections
annotations = "jldp-annotations.csv"
images_path = "images-bboxes/jldp"

# human-reviewed annotations with duplicates removed
# 14,192 unique images
df_annot = df_annotated(annotations)

# one row per cropped image in images_path
# 62,080 animals detected by MegaDetector
df_img = df_detected(images_path)

# outer join: images present in only one of the two sources are kept here
# (with NaN in the other source's columns) and filtered out by df_subset below
df = df_annot.merge(df_img, on="image_id", how="outer")

# 12,383 observations (including duplicates when images have > 1 animal)
df = df_subset(df)

# 9,251 observations (with only images that include 1 animal)
df_unique = df_uniqueobs(df)

# save the single-observation subset; the dataframe index is written as the first CSV column
df_unique.to_csv("df.csv")

In [7]:
# inspect the merged dataframe
df

Unnamed: 0,image_id,class,species,common_name,img_name,megadetector_output
0,2ce50250-84ca-4a83-b263-2f24bf522d0d,Mammalia,latrans,Coyote,2ce50250-84ca-4a83-b263-2f24bf522d0d_932,1.0
2,3b45f93b-1539-4a7a-8577-8de6a12015e1,Mammalia,latrans,Coyote,3b45f93b-1539-4a7a-8577-8de6a12015e1_647,1.0
3,85f21059-cb89-48bf-98c6-30440c02f0f5,Mammalia,latrans,Coyote,85f21059-cb89-48bf-98c6-30440c02f0f5_947,1.0
4,ed0e7715-c052-4599-a366-bd50134aca77,Mammalia,latrans,Coyote,ed0e7715-c052-4599-a366-bd50134aca77_939,1.0
5,a9879613-86d9-4638-b098-32602dc341b0,Mammalia,scrofa,Wild Boar,a9879613-86d9-4638-b098-32602dc341b0_520,1.0
...,...,...,...,...,...,...
16220,56e27cbb-91d3-45ba-82db-f7f739f418ca,Mammalia,scrofa,Wild Boar,56e27cbb-91d3-45ba-82db-f7f739f418ca_823,1.0
16221,5c916beb-2522-4564-b9da-4b1b7eafc17e,Mammalia,scrofa,Wild Boar,5c916beb-2522-4564-b9da-4b1b7eafc17e_871,1.0
16222,a7dfdafd-bed1-4057-92a6-9c34c86775a1,Mammalia,scrofa,Wild Boar,a7dfdafd-bed1-4057-92a6-9c34c86775a1_910,1.0
16223,9b50d978-866e-4fd7-b813-de2dd824b163,Mammalia,scrofa,Wild Boar,9b50d978-866e-4fd7-b813-de2dd824b163_779,1.0


In [10]:
# example image that carries two different human annotations
df_annot[df_annot["image_id"] == "8fdef6d7-9186-419b-9da9-845416afbe2a"]

Unnamed: 0,image_id,class,species,common_name
15142,8fdef6d7-9186-419b-9da9-845416afbe2a,Aves,penicillatus,Brandt's Cormorant
15991,8fdef6d7-9186-419b-9da9-845416afbe2a,Mammalia,familiaris,Domestic Dog


In [11]:
# the example image also has two cropped detections
df_img[df_img["image_id"] == "8fdef6d7-9186-419b-9da9-845416afbe2a"]

Unnamed: 0,image_id,img_name,megadetector_output
34690,8fdef6d7-9186-419b-9da9-845416afbe2a,8fdef6d7-9186-419b-9da9-845416afbe2a_508,1
34691,8fdef6d7-9186-419b-9da9-845416afbe2a,8fdef6d7-9186-419b-9da9-845416afbe2a_866,1


In [9]:
# merging on image_id pairs every annotation with every crop of the example image:
# 2 annotations x 2 crops = 4 rows, so labels cannot be matched to individual crops
df[df["image_id"] == "8fdef6d7-9186-419b-9da9-845416afbe2a"]

Unnamed: 0,image_id,class,species,common_name,img_name,megadetector_output
5361,8fdef6d7-9186-419b-9da9-845416afbe2a,Aves,penicillatus,Brandt's Cormorant,8fdef6d7-9186-419b-9da9-845416afbe2a_508,1.0
5362,8fdef6d7-9186-419b-9da9-845416afbe2a,Aves,penicillatus,Brandt's Cormorant,8fdef6d7-9186-419b-9da9-845416afbe2a_866,1.0
5363,8fdef6d7-9186-419b-9da9-845416afbe2a,Mammalia,familiaris,Domestic Dog,8fdef6d7-9186-419b-9da9-845416afbe2a_508,1.0
5364,8fdef6d7-9186-419b-9da9-845416afbe2a,Mammalia,familiaris,Domestic Dog,8fdef6d7-9186-419b-9da9-845416afbe2a_866,1.0


In [20]:
# scratch check: number of non-null img_name rows per image_id
# (the same count that df_uniqueobs computes)
df_count = df[["image_id", "img_name"]].groupby("image_id").count()

In [28]:
# rows whose image_id has exactly one img_name row (the filter used in df_uniqueobs)
# NOTE(review): the saved output contains rows without any label, which df_subset
# would have removed, so this cell appears to have been run on the un-subsetted
# merge; confirm by re-running the notebook top to bottom
df[df["image_id"].isin(list(df_count[df_count["img_name"] == 1].index))]

Unnamed: 0,image_id,class,species,common_name,img_name,megadetector_output
0,2ce50250-84ca-4a83-b263-2f24bf522d0d,Mammalia,latrans,Coyote,2ce50250-84ca-4a83-b263-2f24bf522d0d_932,1.0
1,cad9d5ae-503c-436d-a81e-4cbc322f2492,Mammalia,,Canine Family,cad9d5ae-503c-436d-a81e-4cbc322f2492_927,1.0
2,3b45f93b-1539-4a7a-8577-8de6a12015e1,Mammalia,latrans,Coyote,3b45f93b-1539-4a7a-8577-8de6a12015e1_647,1.0
3,85f21059-cb89-48bf-98c6-30440c02f0f5,Mammalia,latrans,Coyote,85f21059-cb89-48bf-98c6-30440c02f0f5_947,1.0
4,ed0e7715-c052-4599-a366-bd50134aca77,Mammalia,latrans,Coyote,ed0e7715-c052-4599-a366-bd50134aca77_939,1.0
...,...,...,...,...,...,...
65584,fff45ee1-6821-4f1b-a355-ec3b9d00e4fb_500,,,,fff45ee1-6821-4f1b-a355-ec3b9d00e4fb_500_944,1.0
65585,fff45ee1-6821-4f1b-a355-ec3b9d00e4fb,,,,fff45ee1-6821-4f1b-a355-ec3b9d00e4fb_952,1.0
65586,fffa5c94-e4b2-4a62-9566-1662bf2ec63f_500,,,,fffa5c94-e4b2-4a62-9566-1662bf2ec63f_500_961,1.0
65587,fffa5c94-e4b2-4a62-9566-1662bf2ec63f,,,,fffa5c94-e4b2-4a62-9566-1662bf2ec63f_976,1.0


### Exploratory Data Analysis

In [24]:
# number of cropped images per (species, common_name), most frequent first
# author's notes: 33 species, with strong class imbalance,
# e.g. 4,613 coyotes, 8 great foxes, 1 house finch
(
    df[["species", "common_name", "img_name"]]
    .groupby(["species", "common_name"], as_index=False)
    .count()
    .sort_values(by="img_name", ascending=False)
)

Unnamed: 0,species,common_name,img_name
14,latrans,Coyote,4613
27,scrofa,Wild Boar,3537
12,hemionus,Mule Deer,1297
29,taurus,Domestic Cattle,757
23,occidentalis,Western fence lizard,475
2,aura,Turkey Vulture,381
5,californicus,California Gull,275
25,platyrhynchos,Mallard,268
13,herodias,Great Blue Heron,182
22,occidentalis,Western Gull,173
