In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os
import cv2

## Preprocessing - Merge Data (1)
Merges the locally stored cropped images with their label annotations, applies some data cleaning, and produces the resulting dataframe for the AWS pictures.

Input:
- Path where cropped images from MegaDetector detections are stored.
- CSV file with species labels.

Output:
- Clean dataframe with image IDs and human-reviewed species labels.

In [7]:
def df_annotated(annotations):
    """Load the annotation spreadsheet and keep only human-reviewed labels.

    Parameters
    ----------
    annotations : str or file-like
        Anything ``pd.read_excel`` accepts. The sheet must contain the columns
        ``identified_by``, ``img_id``, ``class``, ``species`` and ``common_name``.

    Returns
    -------
    pandas.DataFrame
        Columns ``img_id``, ``class``, ``species`` and ``common_name``, without
        the rows identified by "Computer vision" and without exact duplicates.
        Unused columns are dropped; the original row index is kept.
    """
    keep_cols = ["img_id", "class", "species", "common_name"]
    raw = pd.read_excel(annotations)

    # Discard annotations identified by computer vision.
    not_cv = raw["identified_by"] != "Computer vision"
    reviewed = raw.loc[not_cv, keep_cols]

    # 14,232 images at this point, but some are duplicated,
    # e.g. "e6ea708e-001d-48d2-bc3c-74bc38309fae": same animal, one observation
    # labels the behavior as "walking" and the other one as "jumping".
    # With the behavior column gone those rows are identical, so collapse them.
    return reviewed.drop_duplicates()

In [3]:
def df_detected(images_path):
    """Build a dataframe listing the cropped MegaDetector detections in a folder.

    Every entry returned by ``os.listdir(images_path)`` becomes one row; no
    filtering by file type is applied.

    Parameters
    ----------
    images_path : str
        Directory containing the cropped images. File names are expected to
        look like ``{img_id}_{confidence}.{ext}`` (the ``_{confidence}`` suffix
        was added in a previous preprocessing step).

    Returns
    -------
    pandas.DataFrame
        Columns ``img_id`` (file name without extension and without the trailing
        ``_{confidence}`` part), ``img_name`` (file name without extension) and
        ``megadetector_output`` (always 1: the crop exists, so something was
        detected). Rows are ordered by file name.
    """
    # os.path.splitext strips only the final extension, so names that contain
    # additional dots are not truncated at the first dot.
    # Sorting makes the row order independent of the arbitrary os.listdir order.
    img_name = [os.path.splitext(fname)[0] for fname in sorted(os.listdir(images_path))]

    # modify image ID to remove the extra "_{confidence}" suffix;
    # rpartition yields "" when the name has no underscore at all
    img_id = [name.rpartition("_")[0] for name in img_name]

    # add column to check whether MegaDetector detected something in the image
    megadetector = [1] * len(img_id)

    df_img = pd.DataFrame({"img_id": img_id,
                           "img_name": img_name,
                           "megadetector_output": megadetector})

    return df_img

In [4]:
def df_subset(df):
    """Return the "clean" subset of the merged annotation/detection dataframe.

    Keeps only rows that have both a MegaDetector detection and a usable
    human-reviewed label.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``img_id``, ``megadetector_output``,
        ``common_name`` and ``species``.

    Returns
    -------
    pandas.DataFrame
        Rows of ``df`` (original index kept) after removing, in order:
        1) images with no MegaDetector detection,
        2) rows with no human-reviewed label (``common_name`` missing),
        3) rows with no species,
        4) rows whose species is "No CV Result".
    """
    # Notes from the original data:
    # there are 3,326 images for which human reviewers found annotations but MegaDetector found nothing
    # 49,598 images have no human-reviewed label
    # df["common_name"].isna().sum()

    # 1) remove images for which MegaDetector didn't find anything.
    # count() ignores NaN, so an img_id whose rows all lack a detection counts 0.
    det_counts = df[["img_id", "megadetector_output"]].groupby("img_id", as_index=False).count()
    det_none = det_counts.loc[det_counts["megadetector_output"] == 0, "img_id"]
    df = df[~df["img_id"].isin(det_none)]

    # 2) remove images with no human-reviewed label
    has_label = df["common_name"].notna()

    # 3) remove images with no species
    has_species = df["species"].notna()

    # 4) remove images with "no CV result"
    has_result = df["species"] != "No CV Result"

    return df[has_label & has_species & has_result]

In [5]:
def df_uniqueobs(df):
    """Keep only the images that have exactly one observation.

    Merging images with more than one observation is problematic, so for now
    only images whose ``img_id`` has a single ``img_name`` entry are retained.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``img_id`` and ``img_name``.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose ``img_id`` has exactly one non-null
        ``img_name``; the original row index is kept.
    """
    # number of (non-null) crops recorded for each image
    per_image = df.groupby("img_id")["img_name"].count()
    single_ids = per_image.index[per_image == 1]
    return df[df["img_id"].isin(single_ids)]

In [8]:
# inputs: annotation spreadsheet and folder with the cropped MegaDetector detections
annotations = "jldp-annotations-IB.xlsx"
images_path = "images-bboxes/jldp"

# human-reviewed annotations after de-duplication: 14,192 unique images
df_annot = df_annotated(annotations)

# one row per file found in images_path: 62,080 animals detected by MegaDetector
df_img = df_detected(images_path)

# outer join keeps rows that exist on only one side;
# df_subset is responsible for discarding them in the next step
df = df_annot.merge(df_img, on="img_id", how="outer")

# 12,383 observations (including duplicates when images have > 1 animal)
df = df_subset(df)

# 9,251 observations (with only images that include 1 animal)
df_unique = df_uniqueobs(df)
# NOTE: to_csv writes the row index as well by default, so the file
# starts with an extra unnamed column
df_unique.to_csv("df_aws.csv")

## Preprocessing - Merge Data (2)
Merges the dataframe produced from the AWS data with the dataframe of Animl annotations.

In [31]:
# df1: AWS observations ("df_aws.csv"); df2: Animl observations ("df_animl.csv")
# NOTE(review): "df_animl.csv" is not written by any cell visible here —
# presumably produced by a separate preprocessing step; confirm
df1 = pd.read_csv("df_aws.csv")
df2 = pd.read_csv("df_animl.csv")

In [32]:
# keep the same three columns from both sources and stack the AWS rows on top
# of the Animl rows; ignore_index=True renumbers the combined rows from 0
cols = ["img_id", "common_name", "img_name"]
df = pd.concat([df1[cols], df2[cols]], ignore_index=True)

In [33]:
# save the combined table to disk, then show it as the cell output
df.to_csv("df.csv")
df

Unnamed: 0,img_id,common_name,img_name
0,2ce50250-84ca-4a83-b263-2f24bf522d0d,Coyote,2ce50250-84ca-4a83-b263-2f24bf522d0d_932
1,3b45f93b-1539-4a7a-8577-8de6a12015e1,Coyote,3b45f93b-1539-4a7a-8577-8de6a12015e1_647
2,85f21059-cb89-48bf-98c6-30440c02f0f5,Coyote,85f21059-cb89-48bf-98c6-30440c02f0f5_947
3,ed0e7715-c052-4599-a366-bd50134aca77,Coyote,ed0e7715-c052-4599-a366-bd50134aca77_939
4,1f29dc37-29b1-4c7b-937d-8bc4e31372ec,Coyote,1f29dc37-29b1-4c7b-937d-8bc4e31372ec_937
...,...,...,...
11921,jldp:02ea11cea3794ee22cffdd66235a8e76,Mule Deer,jldp:02ea11cea3794ee22cffdd66235a8e76_6150
11922,jldp:7b1e0549b33d2ca922afe2423cf3367c,Coyote,jldp:7b1e0549b33d2ca922afe2423cf3367c_6151
11923,jldp:c2d28b49b86c2f8c1baec791134eab53,Coyote,jldp:c2d28b49b86c2f8c1baec791134eab53_6152
11924,jldp:269e05af4f8b1fcc803537a147b4ec95,Coyote,jldp:269e05af4f8b1fcc803537a147b4ec95_6153


### Exploratory Data Analysis

In [36]:
# 33 species, with a strong class imbalance:
# e.g. 5,409 coyotes, 8 great foxes, 1 house finch
(
    df[["common_name", "img_name"]]
    .groupby(["common_name"], as_index=False)
    .count()
    .sort_values(by="img_name", ascending=False)
)

Unnamed: 0,common_name,img_name
11,Coyote,5409
22,Mule Deer,2265
32,Wild Boar,2178
12,Domestic Cattle,595
31,Western fence lizard,475
30,Western Gull,158
6,California Gull,151
16,Great Blue Heron,125
27,Turkey Vulture,113
21,Mallard,101
