## 1. Setup

In [57]:
import os

import numpy as np
import pandas as pd

import cv2
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [58]:
# Relative locations of the project data used throughout the notebook
DATA_PATH = '../data/'
# Raw inputs (CSV tables are read from here)
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')
# Image files live in a sub-folder of the raw data
IMG_PATH = os.path.join(DATA_PATH, 'raw', 'images')

## 2. Load datasets

In [59]:
# Read the per-image information table from the raw data folder
img_info_csv = os.path.join(RAW_DATA_PATH, 'image_info.csv')
hs_img_info_df = pd.read_csv(img_info_csv)

# Preview the first rows
hs_img_info_df.head()

Unnamed: 0,Name,Author,Number_HSparrows,Source,License
0,51028450922_6e5c22a557_w.jpg,John Freshney,1,Flickr,Attribution 2.0 Generic (CC BY 2.0)
1,50582637617_025db80280_w.jpg,I Am birdsaspoetry.com,1,Flickr,Attribution 2.0 Generic (CC BY 2.0)
2,51629116862_45cc557dcd_w.jpg,pete beard,1,Flickr,Attribution 2.0 Generic (CC BY 2.0)
3,51326420290_5596ceb008_w.jpg,pete beard,1,Flickr,Attribution 2.0 Generic (CC BY 2.0)
4,51571836216_f2f22feed1_e.jpg,"611catbirds, too",2,Flickr,Attribution 2.0 Generic (CC BY 2.0)


In [60]:
# Summarise the image-info table: column dtypes, non-null counts and memory usage
hs_img_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 501 entries, 0 to 500
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Name              501 non-null    object
 1   Author            501 non-null    object
 2   Number_HSparrows  501 non-null    int64 
 3   Source            501 non-null    object
 4   License           501 non-null    object
dtypes: int64(1), object(4)
memory usage: 19.7+ KB


In [61]:
# Number of rows in the image-info table that repeat an earlier row in every column
hs_img_info_df.duplicated().sum()

0

In [62]:
# Read the bounding-box annotations from the raw data folder
bbox_csv = os.path.join(RAW_DATA_PATH, 'bboxes/bounding_boxes.csv')
hs_img_bbox_df = pd.read_csv(bbox_csv)

# Preview the first rows
hs_img_bbox_df.head()

Unnamed: 0,label_name,bbox_x,bbox_y,bbox_width,bbox_height,image_name,image_width,image_height
0,house_sparrow,185,75,149,166,10392264536_04a4ae14ee_w.jpg,400,267
1,house_sparrow,5,96,230,181,10455990684_02e02ca64d_w.jpg,400,300
2,house_sparrow,231,75,165,217,10455990684_02e02ca64d_w.jpg,400,300
3,house_sparrow,530,100,174,133,10736475654_56604258c5_c.jpg,800,600
4,house_sparrow,378,237,268,147,10736475654_56604258c5_c.jpg,800,600


In [63]:
# Summarise the bounding-box table: column dtypes, non-null counts and memory usage
hs_img_bbox_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1201 entries, 0 to 1200
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   label_name    1201 non-null   object
 1   bbox_x        1201 non-null   int64 
 2   bbox_y        1201 non-null   int64 
 3   bbox_width    1201 non-null   int64 
 4   bbox_height   1201 non-null   int64 
 5   image_name    1201 non-null   object
 6   image_width   1201 non-null   int64 
 7   image_height  1201 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 75.2+ KB


In [64]:
# Number of rows in the bounding-box table that repeat an earlier row in every column
hs_img_bbox_df.duplicated().sum()

0

## 3. Check datasets

In [65]:
def compare_two_img_lists(img_l1, img_l2):
    """Compare two collections of image names and print a short report.

    Both inputs are sorted first, so the order of the names does not matter.
    Exactly one of three reports is printed:

    * ``Equal!: N images`` when the sorted lists are identical.
    * ``Duplicates!`` followed by the repeated names found in each list, when
      the lists differ and at least one of them contains a repeated name.
    * ``Not Equal! K image(s)`` followed by the sorted names that occur in
      only one of the two lists, otherwise.

    Args:
        img_l1: Iterable of image names.
        img_l2: Iterable of image names.

    Returns:
        None. The result is only printed.
    """
    img_l1 = sorted(img_l1)
    img_l2 = sorted(img_l2)

    if img_l1 == img_l2:
        print(f"Equal!: {len(img_l1)} images")
        return

    # After sorting, repeated names are adjacent, so comparing neighbours
    # is enough to find them.
    dups_l1 = sorted({cur for cur, nxt in zip(img_l1, img_l1[1:]) if cur == nxt})
    dups_l2 = sorted({cur for cur, nxt in zip(img_l2, img_l2[1:]) if cur == nxt})

    if dups_l1 or dups_l2:
        print("Duplicates!")
        # Name the offenders so the problem can be fixed without further digging.
        print(f"First list: {dups_l1}")
        print(f"Second list: {dups_l2}")
        return

    # Names present in exactly one list; sorted so the report is reproducible.
    not_match = sorted(set(img_l1) ^ set(img_l2))
    print(f"Not Equal! {len(not_match)} image(s)")
    print(not_match)

In [66]:
# Compare the image files on disk with the names listed in the info table
hs_img_names = os.listdir(IMG_PATH)
df_img_names = hs_img_info_df['Name'].tolist()

compare_two_img_lists(img_l1=hs_img_names, img_l2=df_img_names)

Not Equal! 1 image(s)
['32080514535_28e593ce40_c.jpg']


In [67]:
# Compare the image files on disk with the distinct image names in the bounding-box table
bbox_img_names = hs_img_bbox_df['image_name'].unique().tolist()

compare_two_img_lists(img_l1=hs_img_names, img_l2=bbox_img_names)

Equal!: 500 images


In [68]:
# Drop the info row whose image name was reported as unmatched above
is_valid_entry = hs_img_info_df['Name'].ne('32080514535_28e593ce40_c.jpg')
hs_img_info_df = hs_img_info_df.loc[is_valid_entry]

In [69]:
# Count the bounding boxes annotated for each image.
# rename_axis() + reset_index(name=...) set both column names explicitly, so
# the result does not depend on the labels value_counts() gives its output.
# Those labels changed in pandas 2.0; renaming 'index'/'image_name' afterwards
# then no longer produces a 'Name' column and the merge below fails.
bbox_number_df = (
    hs_img_bbox_df['image_name'].value_counts()
                                .rename_axis('Name')
                                .reset_index(name='Number_Bboxes'))

# Put the declared sparrow count next to the bounding-box count per image.
# The outer join keeps images that appear in only one of the two tables.
hs_bbox_number_df = (hs_img_info_df[['Name', 'Number_HSparrows']]
                         .merge(bbox_number_df, on='Name', how='outer'))

hs_bbox_number_df.head()


Unnamed: 0,Name,Number_HSparrows,Number_Bboxes
0,51028450922_6e5c22a557_w.jpg,1,1
1,50582637617_025db80280_w.jpg,1,1
2,51629116862_45cc557dcd_w.jpg,1,1
3,51326420290_5596ceb008_w.jpg,1,1
4,51571836216_f2f22feed1_e.jpg,2,2


In [70]:
# Keep only the images whose declared sparrow count differs from their bounding-box count
count_mismatch = hs_bbox_number_df['Number_HSparrows'].ne(hs_bbox_number_df['Number_Bboxes'])
diff_number_df = hs_bbox_number_df.loc[count_mismatch]

diff_number_df

Unnamed: 0,Name,Number_HSparrows,Number_Bboxes
131,50290086398_7680547fd9_b.jpg,21,20
148,50290762416_0ac8c4bdf8_b.jpg,20,21
158,5935039566_a1f8b12406_c.jpg,1,2
167,11142365154_92794d7bc8_c.jpg,4,3
173,50290910432_01738ec2a4_b.jpg,19,20
194,50609223351_94863c7bb3_c.jpg,3,4
205,49279450632_b142d667dc_b.jpg,8,7
224,34665526810_63b5f2da24_w.jpg,1,2
225,31560672978_5f54bd711b_c.jpg,4,5
237,50183829093_5cf15f754f_c.jpg,1,2


In [71]:
# View the images with the wrong number of house sparrows or bounding boxes.
# Each image is shown in an OpenCV window with its bounding boxes drawn on it.
# Pressing ESC or closing the window stops the review; any other key (or the
# timeout) moves on to the next image.
window_name = 'House Sparrows'
bbox_cols = ['bbox_x', 'bbox_y', 'bbox_width', 'bbox_height']
wait_ms = 120000  # maximum time to wait for a key press per image

try:
    for img_name in diff_number_df.Name:
        img = cv2.imread(os.path.join(IMG_PATH, img_name))
        if img is None:
            # cv2.imread returns None instead of raising when a file cannot be
            # read; skip it rather than crash while drawing.
            print(f"Could not read image: {img_name}")
            continue

        img_df = hs_img_bbox_df[hs_img_bbox_df.image_name == img_name]

        # Draw bounding box rectangles on images.
        # .tolist() yields plain Python ints for the OpenCV point tuples.
        for x, y, w, h in img_df[bbox_cols].values.tolist():
            img = cv2.rectangle(img, (x, y), (x + w, y + h), color=(255, 0, 0), thickness=2)

        # Label the image with its file name
        cv2.putText(img, img_name, org=(0, 25),
                    fontFace=cv2.FONT_HERSHEY_PLAIN, fontScale=2,
                    color=(0, 69, 255), thickness=2)
        cv2.namedWindow(window_name, cv2.WINDOW_NORMAL)
        cv2.moveWindow(window_name, 200, 0)
        cv2.resizeWindow(window_name, 900, 700)
        cv2.imshow(window_name, img)

        k = cv2.waitKey(wait_ms)
        # Close all of the image windows by pressing the ESC key on a keyboard
        # or the X button on an open window
        if k == 27:
            print("The ESC key was pressed.")
            break
        # "< 1" also treats a negative return value as "not visible"
        # (NOTE(review): some GUI backends appear to report -1 for a closed window).
        elif cv2.getWindowProperty(window_name, cv2.WND_PROP_VISIBLE) < 1:
            print("The X button was pressed on the window.")
            break
finally:
    # Always release the GUI windows, even if an error interrupts the loop
    cv2.destroyAllWindows()

print("All of the image windows are closed.")

The ESC key was pressed.
All of the image windows are closed.
