## 1. Setup

In [None]:
import os

import numpy as np
import pandas as pd

In [None]:
# CSV file names, joined onto RAW_DATA_PATH when the datasets are loaded
BBOX_CSV_DATA_FILE = 'bboxes/bounding_boxes.csv'
INFO_CSV_DATA_FILE = 'image_info.csv'

# Directory layout: <DATA_PATH>/raw holds the raw data, <DATA_PATH>/raw/images the images
DATA_PATH = '../data/'
RAW_DATA_PATH = os.path.join(DATA_PATH, 'raw')
IMG_PATH = os.path.join(DATA_PATH, 'raw', 'images')

In [None]:
# Load the dataset with image information from <RAW_DATA_PATH>/<INFO_CSV_DATA_FILE>.
# The checks below read its `Name` and `Number_HSparrows` columns.
img_info_df = pd.read_csv(os.path.join(RAW_DATA_PATH, INFO_CSV_DATA_FILE))

In [None]:
# Load the bounding box dataset from <RAW_DATA_PATH>/<BBOX_CSV_DATA_FILE>.
# The checks below read its `image_name`, `bbox_*` and `image_width` / `image_height` columns.
img_bbox_df = pd.read_csv(os.path.join(RAW_DATA_PATH, BBOX_CSV_DATA_FILE))

In [None]:
# Get a list of image names.
# NOTE: os.listdir returns the name of every entry in IMG_PATH (in arbitrary order),
# not only image files; any other file or sub-directory there will show up as a mismatch.
img_names = os.listdir(IMG_PATH)

In [None]:
# Create a dict of validation results for the summary report.
# Each check below stores its outcome here: key = check name, value = the dict returned by a helper.
validation_results = {}

## 2. Create helper functions

In [None]:
def check_that_two_sorted_lists_are_equal(l1, l2, passed_message=''):
    """Compare two collections irrespective of the order of their elements.

    Both inputs are sorted before the comparison, so they may be any
    iterables whose items can be ordered.

    Parameters:
        l1 (iterable): the first collection
        l2 (iterable): the second collection
        passed_message (str): a message that describes a passage of the check

    Returns:
        dict: a single-item dictionary, one of
            {'PASSED': passed_message} if the sorted inputs are equal;
            {'WARNING: Duplicates!': n} if they differ and at least one input
            contains repeated elements, where n is the total number of
            surplus repeats over both inputs (in this case the non-matching
            elements are not listed);
            {'FAILED': elements} otherwise, listing the elements found only
            in l1 followed by those found only in l2, each group in sorted
            order.
    """
    l1 = sorted(l1)
    l2 = sorted(l2)

    if l1 == l2:
        return {'PASSED': passed_message}

    unique_l1 = set(l1)
    unique_l2 = set(l2)

    # Duplicates take precedence over the mismatch report.
    n_duplicates = (len(l1) - len(unique_l1)) + (len(l2) - len(unique_l2))
    if n_duplicates:
        return {'WARNING: Duplicates!': n_duplicates}

    # Walk the sorted lists instead of materialising a set difference:
    # set iteration order is arbitrary, which made the report differ
    # between runs for the same data.
    not_match = ([item for item in l1 if item not in unique_l2]
                 + [item for item in l2 if item not in unique_l1])
    return {'FAILED': not_match}

In [None]:
def check_that_series_is_less_than_or_equal_to(s1, other, comparison_sign, passed_message=''):
    """Return a dictionary of the validation status with indices with incorrect values, if any.

    The comparison is element-wise. When `other` is a Series it is aligned
    with `s1` by index label first, so a label present in only one of the
    two objects counts as a failed comparison and is reported.

    Parameters:
        s1 (pd.Series): an object to be compared
        other (pd.Series or scalar value): an object to compare with
        comparison_sign (str): must be one of '==', '<='. Otherwise raises ValueError.
        passed_message (str): a message that describes a passage of the check

    Returns:
        dict: {'PASSED': passed_message} if the comparison holds for every
            element, otherwise {'FAILED': index} where index holds the labels
            for which the comparison is false.

    Raises:
        ValueError: if comparison_sign is neither '==' nor '<='.
    """
    if comparison_sign == '==':
        comp_series_result = s1.eq(other)
    elif comparison_sign == '<=':
        comp_series_result = s1.le(other)
    else:
        raise ValueError(
            f"comparison_sign must be '==' or '<=', got {comparison_sign!r}")

    # Use .all() on the comparison result rather than comparing the number of
    # True values with len(s1): after label alignment the result can be longer
    # than s1, and counting would let labels missing from s1 pass unnoticed.
    if comp_series_result.all():
        return {'PASSED': passed_message}

    # Take the labels from the comparison result so that labels present only
    # in `other` are reported too.
    return {'FAILED': comp_series_result[~comp_series_result].index}

## 3. Check Data Consistency

### 3.1. images vs image_info.csv

In [None]:
# Check whether the names of the available images are identical to the names in image_info.csv
# The comparison is order-insensitive (the helper sorts both inputs). The outcome is stored
# in the summary dict under a key that embeds the CSV file name.
validation_results["Image Name Match Check: " + INFO_CSV_DATA_FILE] = check_that_two_sorted_lists_are_equal(
    img_info_df.Name.to_list(), 
    img_names,
    passed_message="The image names in the file correspond to the available images."
)

### 3.2. bounding_boxes.csv

In [None]:
# Check the correctness of bounding box parameters.
# Four checks are recorded, each requiring the left-hand side to be <= the image column:
#   bbox_x <= image_width,   bbox_x + bbox_width  <= image_width,
#   bbox_y <= image_height,  bbox_y + bbox_height <= image_height.
for bb_param, img_param in [('bbox_x', 'width'),
                            ('bbox_y', 'height')]:

    # The image column is the same for both inner iterations, so build its name once
    img_name_param = 'image_' + img_param

    # '' -> check bb_param alone; otherwise add the named bbox column to it first
    for add_bb_param in ('', 'bbox_' + img_param):
        if add_bb_param:
            add_values = img_bbox_df[add_bb_param]
            check_param_name = f"{bb_param} + {add_bb_param}"
        else:
            add_values = 0
            check_param_name = bb_param

        comp_bbox_img_param_result = check_that_series_is_less_than_or_equal_to(
            img_bbox_df[bb_param].add(add_values),
            img_bbox_df[img_name_param], '<=',
            passed_message=f"Correct: ({check_param_name}) <= {img_name_param}.")
        check_name = f"Bbox Parameter Correctness Check: {check_param_name}"
        validation_results[check_name] = comp_bbox_img_param_result

In [None]:
# Check the correctness of image parameters: across all bounding box rows of the same
# image_name there must be exactly one distinct image_width and one distinct image_height
uniq_img_param_df = (img_bbox_df[['image_name', 'image_width', 'image_height']]
                         .groupby('image_name', group_keys=True)
                         .nunique())

for img_param in ('image_width', 'image_height'):
    # A per-image count of distinct values other than 1 marks that image as failed
    validation_results[f"Image Parameter Correctness Check: {img_param}"] = (
        check_that_series_is_less_than_or_equal_to(
            uniq_img_param_df[img_param], 1, '==',
            passed_message="One unique value for each image."))

### 3.3. images vs bounding_boxes.csv

In [None]:
# Check whether the names of the available images are identical to the names in bounding_boxes.csv
# `.unique()` collapses repeated image names first, so repeated names in the CSV column
# do not trigger the helper's duplicates warning here.
validation_results["Image Name Match Check: " + BBOX_CSV_DATA_FILE] = check_that_two_sorted_lists_are_equal(
    img_bbox_df.image_name.unique(), 
    img_names,
    passed_message="The image names in the file correspond to the available images."
)

### 3.4. image_info.csv vs bounding_boxes.csv

In [None]:
# Check if the number of house sparrows and the number of bounding boxes match

# Expected count per image: the Number_HSparrows column indexed by image name.
# The column is selected explicitly instead of calling .squeeze(), because squeezing
# a one-row, one-column frame yields a scalar rather than a Series.
number_hsparrows = (img_info_df[['Name', 'Number_HSparrows']].sort_values(by='Name')
                                                             .set_index('Name')['Number_HSparrows'])

# Actual count per image: the number of bounding box rows carrying each image name
number_bboxes = img_bbox_df['image_name'].sort_values().value_counts(sort=False)

validation_results["Number Match Check: Number_HSparrows vs image_name"] = check_that_series_is_less_than_or_equal_to(
    number_hsparrows, number_bboxes, '==', passed_message="The numbers match.")

## 4. Summary Report

In [None]:
# Display all collected results: one entry per check, keyed by the check name
validation_results

In [None]:
# Show the result of the failed check
# NOTE(review): the row position 565 is hard-coded. Presumably it is the row reported under
# 'FAILED' for the "bbox_x + bbox_width" check in validation_results; confirm against the
# report and update it whenever the data changes.
failed_check_result = img_bbox_df[['image_name', 'bbox_x', 'bbox_width', 'image_width']].iloc[565]

# Display the selected row
failed_check_result