## 1. Setup

In [2]:
import os

import numpy as np
import pandas as pd

In [3]:
# Locations of the project data, relative to the notebook's working directory
DATA_PATH = '../data/'
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')
# Images live in an 'images' folder inside the raw data directory
IMG_PATH = os.path.join(DATA_PATH, 'raw', 'images')

In [4]:
# Read image_info.csv from the raw data directory into a DataFrame
img_info_df = pd.read_csv(
    os.path.join(RAW_DATA_PATH, 'image_info.csv')
)

In [5]:
# Read the bounding box table (bboxes/bounding_boxes.csv under the raw data directory)
img_bbox_df = pd.read_csv(
    os.path.join(RAW_DATA_PATH, 'bboxes/bounding_boxes.csv')
)

In [6]:
# Collect the names of all entries in the image directory
# (bare names without a path prefix; nothing is filtered out)
img_names = os.listdir(path=IMG_PATH)

In [7]:
# Accumulator for the summary report: check name -> result dict of that check
validation_results = dict()

## 2. Create helper functions

In [8]:
def check_that_two_sorted_lists_are_equal(l1, l2):
    """Compare two collections of items regardless of their order.

    Both inputs are sorted before the comparison. Returns a one-entry dict:
    {'PASS': None} -- the sorted inputs are identical;
    {'WARNING: Duplicates!': n} -- the inputs differ and at least one of them
        contains repeated items; n is the total number of surplus copies
        across both inputs;
    {'FAIL': items} -- the inputs differ and neither has repeated items;
        items lists the values found in only one of the two inputs.
    """
    first, second = sorted(l1), sorted(l2)
    if first == second:
        return {'PASS': None}

    uniq_first, uniq_second = set(first), set(second)

    # Surplus copies = how many items would have to go to make each input unique
    surplus_copies = (len(first) - len(uniq_first)) + (len(second) - len(uniq_second))
    if surplus_copies:
        return {'WARNING: Duplicates!': surplus_copies}

    return {'FAIL': list(uniq_first ^ uniq_second)}

In [9]:
def check_that_series_less_or_equal(s1, s2, comparison_sign, s3=None):
    """Check that an element-wise comparison of s1 against s2 holds everywhere.

    Parameters:
    s1 -- a pd.Series object
    s2 -- a pd.Series object or a value
    comparison_sign -- one of '==', '<', '<='
    s3 -- an optional pd.Series object; when given, the mask of failed
          comparisons is applied to it and the index of the selected
          entries is reported.

    Returns:
    {'PASS': None} if the comparison is True for every label, otherwise
    {'FAIL': index} with the index labels where it is not.

    When s2 is a Series the comparison is aligned on index labels, so a label
    present in only one of s1 / s2 counts as a failed comparison and is
    reported as well.

    Raises:
    ValueError -- if comparison_sign is not one of the supported signs.
    """
    method_by_sign = {'==': 'eq', '<': 'lt', '<=': 'le'}
    if comparison_sign not in method_by_sign:
        raise ValueError(
            f"Unsupported comparison_sign {comparison_sign!r}; "
            "expected one of '==', '<', '<='"
        )

    # Get the result of the comparison
    comp_series = getattr(s1, method_by_sign[comparison_sign])(s2)

    # Judge the aligned result itself: counting True values against len(s1)
    # would overlook labels that exist only in s2.
    if comp_series.all():
        return {'PASS': None}

    failed_mask = ~comp_series
    if isinstance(s3, pd.Series):
        return {'FAIL': s3[failed_mask].index}
    # Select from the comparison result (not from s1) so that labels missing
    # from s1 are reported too.
    return {'FAIL': comp_series[failed_mask].index}
        

## 3. Check Data Consistency

### 3.1. images vs image_info.csv

In [10]:
# Compare the entries found in the image directory with the names listed in image_info.csv
validation_results['Image Name and Number Match Check (info)'] = (
    check_that_two_sorted_lists_are_equal(img_info_df['Name'].to_list(), img_names)
)

### 3.2. bounding_boxes.csv

In [11]:
# Check the correctness of bounding box parameters: along each axis, neither the
# box origin nor the origin plus the box size may exceed the image size.
for bb_param, img_param in [('bbox_x', 'width'),
                            ('bbox_y', 'height')]:

    # '' -> check the origin alone; 'bbox_<size>' -> check origin + box size
    for add_bb_param in ('', 'bbox_' + img_param):
        add_values = img_bbox_df[add_bb_param] if add_bb_param else 0

        # Use the shared comparison helper rather than repeating its
        # PASS / FAIL logic inline
        validation_results[f'Bbox Parameter Check ({bb_param} + {add_bb_param})'] = (
            check_that_series_less_or_equal(
                img_bbox_df[bb_param].add(add_values),
                img_bbox_df['image_' + img_param],
                '<='
            )
        )

In [12]:
# Check the correctness of image parameters: every image name must be recorded
# with exactly one distinct width and one distinct height across its rows
uniq_img_param_df = (
    img_bbox_df
    .groupby('image_name', group_keys=True)[['image_width', 'image_height']]
    .nunique()
)

for img_param in ('image_width', 'image_height'):
    validation_results[f'Image Parameter Check ({img_param})'] = (
        check_that_series_less_or_equal(uniq_img_param_df[img_param], 1, '==')
    )

### 3.3. images vs bounding_boxes.csv

In [13]:
# Compare the entries found in the image directory with the distinct image names in bounding_boxes.csv
validation_results['Image Name and Number Match Check (bbox)'] = (
    check_that_two_sorted_lists_are_equal(img_bbox_df['image_name'].unique(), img_names)
)

### 3.4. image_info.csv vs bounding_boxes.csv

In [14]:
# Check if the number of house sparrows and the number of bounding boxes match
number_hsparrows = (
    img_info_df[['Name', 'Number_HSparrows']]
    .sort_values(by='Name')
    .set_index('Name')
    # Squeeze the column axis only: a bare squeeze() would collapse a one-row
    # table into a scalar, which the Series comparison below cannot handle
    .squeeze(axis='columns')
)
# Number of bounding box rows per image name
number_bboxes = img_bbox_df['image_name'].sort_values().value_counts(sort=False)

validation_results['HSparrows and Bboxes Number Check'] = check_that_series_less_or_equal(
    number_hsparrows, number_bboxes, '==')

## 4. Summary Report

In [15]:
# Display the outcome of every check, keyed by check name
validation_results

{'Image Name and Number Match Check (info)': {'PASS': None},
 'Bbox Parameter Check (bbox_x + )': {'PASS': None},
 'Bbox Parameter Check (bbox_x + bbox_width)': {'FAIL': Int64Index([565], dtype='int64')},
 'Bbox Parameter Check (bbox_y + )': {'PASS': None},
 'Bbox Parameter Check (bbox_y + bbox_height)': {'PASS': None},
 'Image Parameter Check (image_width)': {'PASS': None},
 'Image Parameter Check (image_height)': {'PASS': None},
 'Image Name and Number Match Check (bbox)': {'PASS': None},
 'HSparrows and Bboxes Number Check': {'PASS': None}}

In [16]:
# Show the result of the failed check
failed_check_result = img_bbox_df[['image_name', 'bbox_x', 'bbox_width', 'image_width']].iloc[565]

failed_check_result

image_name     43661281862_5d2a15a38c_w.jpg
bbox_x                                  343
bbox_width                               57
image_width                             399
Name: 565, dtype: object