In [None]:
# code adapted, original source: ISEAD project (HITeC e.V.)

## Overview
<br>
This notebook computes TP, TN, FP, FN, precision, recall and F1 score for bounding-box (BBox) predictions. The scores are calculated at two levels: per image and per BBox.<br>

### Image Level
True Positives: Those are images that should have predictions and have some (regardless of whether the predicted bboxes are correct).<br>
False Positives: Those are images that should not have predictions but have some.<br>
False Negatives: Those are images that should have predictions but have none.<br>
True Negatives: Those are images that should not have predictions and have none.<br>
<br>

### BBox Level
True Positives: Those are bboxes that were correctly predicted.<br>
False Positives: Those are bboxes that were unwanted.<br>
False Negatives: Those are missing bboxes.<br>
True Negatives: Not defined at the BBox level (there is no countable set of correctly absent boxes), so this value is always reported as 0.<br>
<br>
This analysis is done by evaluating CSV files that contain the BBox information produced by either a BBox model or a segmentation model.<br>

### Steps
First, we read all necessary inputs:
- the prediction data from the CSV files,
- the ground truth data,
- and we make sure the prediction data only contains data from the TEST dataset (Test.txt is used to filter the test images).

 

## Read Files 

In [8]:
import csv
import json

# get test data filenames
def read_txt(file_path):
    """Return the lines of a text file as a list of strings.

    One trailing newline character is removed from each line; everything
    else (including other whitespace and empty lines) is kept unchanged.

    Args:
        file_path: Path of the text file to read.

    Returns:
        list[str]: One entry per line, in file order.
    """
    with open(file_path, 'r') as handle:
        return [entry.removesuffix('\n') for entry in handle]

# read prediction csv file
def read_csv(file_path, test_file, delimiter=';', confidence_threshold=0.1):
    """Read a bounding-box CSV file into a list of row dictionaries.

    The first CSV row is used as header. Every following row becomes a dict
    mapping header name to the (string) field value. From every field the
    suffixes '.jpg', '.png', '_bird._sea_80_0_0.05_0.1_0.1' and '_majority'
    are removed, in that order, so image names from different pipelines
    compare equal.

    A row is kept only if
      * ``test_file`` is falsy, or the cleaned first column is listed in
        ``test_file``, and
      * the value in the eighth column (index 7) is strictly greater than
        ``confidence_threshold``.

    The path of the CSV file is printed before it is read.

    Args:
        file_path: Path of the CSV file.
        test_file: Path of a text file listing the image names of the test
            split (read with ``read_txt``), or a falsy value to disable the
            split filter.
        delimiter: Field delimiter of the CSV file.
        confidence_threshold: Rows with a confidence less than or equal to
            this value are dropped.

    Returns:
        list[dict[str, str]]: The kept rows in file order. Empty if the file
        has no header row.

    Raises:
        IndexError: If a non-empty row has fewer than eight fields or fewer
            fields than the header.
        ValueError: If the eighth field of a row is not a number.
    """
    bg = "sea"
    anom = "bird"
    fs = "."
    # Order matters: the file extension has to go before the run suffixes.
    suffixes = ('.jpg', '.png', f'_{anom}{fs}_{bg}_80_0_0.05_0.1_0.1', '_majority')

    def strip_suffixes(value):
        for suffix in suffixes:
            value = value.removesuffix(suffix)
        return value

    # test split only relevant for 38k not negative dataset
    # (a set gives constant-time membership checks for every row)
    test_images = set(read_txt(test_file)) if test_file else None

    print(file_path)
    data = []
    # newline='' lets the csv module do its own newline handling
    with open(file_path, 'r', newline='') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=delimiter)
        headers = next(csv_reader, None)
        if headers is None:
            # completely empty file: nothing to read
            return data
        for row in csv_reader:
            if not row:
                # blank line in the CSV
                continue
            # filter test data if necessary
            if test_images is not None and strip_suffixes(row[0]) not in test_images:
                continue
            # confidence threshold
            if float(row[7]) > confidence_threshold:
                data.append({header: strip_suffixes(row[i]) for i, header in enumerate(headers)})
    return data


def get_image_dimensions(data, image_id):
    """Look up the size of an image in a COCO-style annotation dictionary.

    Args:
        data: Dictionary with an 'images' list whose entries carry the keys
            'id', 'width' and 'height'.
        image_id: Id of the image to look up.

    Returns:
        tuple: ``(width, height)`` of the first entry whose 'id' equals
        ``image_id``, or ``(None, None)`` if there is no such entry.
    """
    sizes = (
        (entry['width'], entry['height'])
        for entry in data['images']
        if entry['id'] == image_id
    )
    return next(sizes, (None, None))


# read GT json file (COCO annotations)
def read_json(json_file_paths, test_file, neg_test_file):
    """Read COCO-style ground-truth files into lists of normalised boxes.

    Every annotation is converted to a dict with the keys 'image' (file name
    of the annotated image without a trailing '.jpg', or 'Unknown' if the
    image id has no file name), 'xmin', 'ymin', 'xmax' and 'ymax'. The box
    values are the annotation's ``bbox`` ([x, y, width, height]) divided by
    the image width / height.

    Two lists are filled:
      * ``annotations_seg``: boxes of the FIRST file only. If ``test_file``
        is truthy, a box is kept only when its image is listed in
        ``test_file`` and not listed in ``neg_test_file``.
      * ``annotations_saa``: boxes of ALL files whose 'category_id' is 3
        (no split filter is applied to this list).

    The first annotation of every non-empty file is printed.

    Args:
        json_file_paths: Sequence of paths to COCO annotation JSON files.
        test_file: Path of the text file listing the test images, or a falsy
            value to disable the split filter.
        neg_test_file: Path of the text file listing the negative test
            images; only read when ``test_file`` is truthy.

    Returns:
        tuple[list[dict], list[dict]]: ``(annotations_seg, annotations_saa)``.

    Raises:
        TypeError: If an annotation refers to an image id without a known
            width / height (the division by ``None`` fails).
    """
    if test_file:
        # get test data filenames; sets give constant-time membership checks
        test = set(read_txt(test_file))
        neg_test = set(read_txt(neg_test_file))
    annotations_seg = []
    annotations_saa = []
    for file_index, json_file_path in enumerate(json_file_paths):
        with open(json_file_path, 'r') as f:
            data = json.load(f)

        # Build both lookups once per file instead of scanning the image list
        # again for every annotation.
        image_id_to_filename = {}
        image_id_to_size = {}
        for img in data['images']:
            image_id_to_filename[img['id']] = img['file_name']
            # setdefault: for duplicated ids the first entry provides the size
            image_id_to_size.setdefault(img['id'], (img.get('width'), img.get('height')))

        if data['annotations']:
            # guarded so a file without annotations does not raise IndexError
            print(data['annotations'][0])

        for annotation in data['annotations']:
            image_id = annotation['image_id']
            image_width, image_height = image_id_to_size.get(image_id, (None, None))
            bbox = annotation['bbox']

            bbox_dict = {}
            bbox_dict['image'] = image_id_to_filename.get(image_id, "Unknown").removesuffix(".jpg")
            bbox_dict['xmin'] = bbox[0]/image_width
            bbox_dict['ymin'] = bbox[1]/image_height
            bbox_dict['xmax'] = (bbox[0] + bbox[2])/image_width
            bbox_dict['ymax'] = (bbox[1] + bbox[3])/image_height

            # only the first file contributes to the segmentation ground truth
            if file_index == 0:
                image_name = bbox_dict['image']
                if not test_file or (image_name in test and image_name not in neg_test):
                    annotations_seg.append(bbox_dict)

            if annotation['category_id'] == 3:
                annotations_saa.append(bbox_dict)

    return annotations_seg, annotations_saa

## Check IoU

In [9]:
def calculate_iou(bbox1, bbox2):
    """Compute the intersection over union (IoU) of two boxes.

    Both boxes are mappings with the keys 'xmin', 'ymin', 'xmax' and 'ymax';
    the values are converted with ``float`` so numeric strings are accepted.
    A constant of 0.0001 is added to every width and height (of the boxes
    and of their overlap) before the areas are multiplied.

    Args:
        bbox1: First box.
        bbox2: Second box.

    Returns:
        float: Intersection area divided by union area.
    """
    eps = 0.0001
    corner_keys = ('xmin', 'ymin', 'xmax', 'ymax')
    left1, top1, right1, bottom1 = [float(bbox1[key]) for key in corner_keys]
    left2, top2, right2, bottom2 = [float(bbox2[key]) for key in corner_keys]

    # overlap rectangle; a side is clamped to 0 when the boxes do not overlap
    overlap_width = max(0, min(right1, right2) - max(left1, left2) + eps)
    overlap_height = max(0, min(bottom1, bottom2) - max(top1, top2) + eps)
    intersection_area = overlap_width * overlap_height

    area1 = (right1 - left1 + eps) * (bottom1 - top1 + eps)
    area2 = (right2 - left2 + eps) * (bottom2 - top2 + eps)

    return intersection_area / (area1 + area2 - intersection_area)

## BBox Level Evaluation

In [10]:
import numpy as np


def calculate_metrics_bbox_level(ground_truth, predictions, iou_threshold):
    """Compute precision, recall and F1 score on bounding-box level.

    Matching rule: for every ground-truth box, the predictions of the same
    image are checked in their original order; the first one whose IoU with
    the ground-truth box is at least ``iou_threshold`` makes the ground-truth
    box a true positive. A prediction may be matched by several ground-truth
    boxes. A ground-truth box without such a prediction is a false negative.
    Every prediction that was never matched is a false positive.

    Matched predictions are tracked by their position in ``predictions``, so
    two predictions with identical content are counted individually.

    Args:
        ground_truth: Iterable of box dicts with the keys 'image', 'xmin',
            'ymin', 'xmax' and 'ymax'.
        predictions: Iterable of box dicts with the same keys plus
            'confidence'.
        iou_threshold: Minimum IoU for a match.

    Returns:
        tuple: ``(metrics, [confidence_tp, confidence_fp])`` where
        ``metrics`` is a dict with the keys 'tp', 'fp', 'fn', 'tn' (always
        0), 'precision', 'recall' and 'f1_score', ``confidence_tp`` holds the
        confidence of the matching prediction for every true positive (in
        ground-truth order) and ``confidence_fp`` the confidences of the
        unmatched predictions (in prediction order).
    """
    predictions = list(predictions)

    # Group the prediction positions by image (order preserved) so that a
    # ground-truth box is only compared with predictions of its own image.
    prediction_indices_by_image = {}
    for index, pred_bbox in enumerate(predictions):
        prediction_indices_by_image.setdefault(pred_bbox['image'], []).append(index)

    matched_prediction_indices = set()
    true_positives = 0
    false_negatives = 0
    confidence_tp = []

    for gt_bbox in ground_truth:
        for index in prediction_indices_by_image.get(gt_bbox['image'], ()):
            pred_bbox = predictions[index]
            if calculate_iou(gt_bbox, pred_bbox) >= iou_threshold:
                true_positives += 1
                matched_prediction_indices.add(index)
                confidence_tp.append(float(pred_bbox['confidence']))
                break
        else:
            # no prediction of this image reached the IoU threshold
            false_negatives += 1

    confidence_fp = [
        float(pred_bbox['confidence'])
        for index, pred_bbox in enumerate(predictions)
        if index not in matched_prediction_indices
    ]
    false_positives = len(confidence_fp)

    precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
    recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
    f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return ({
        'tp': true_positives,
        'fp': false_positives,
        'fn': false_negatives,
        'tn': 0,
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }, [confidence_tp, confidence_fp])

## Image Level Evaluation

In [11]:
def calculate_metrics_image_level(ground_truth, predictions, num_total_images):
    """Compute precision, recall and F1 score on image level.

    An image counts as "has ground truth" / "has predictions" when at least
    one box in the respective list names it; images without boxes do not
    appear in either list, which is why the total image count is needed to
    derive the true negatives.

    Args:
        ground_truth: Iterable of box dicts with the key 'image'.
        predictions: Iterable of box dicts with the key 'image'.
        num_total_images: Number of images in the evaluated split.

    Returns:
        dict: The keys 'tp', 'fp', 'fn', 'tn', 'precision', 'recall' and
        'f1_score'.
    """
    images_with_gt = {box['image'] for box in ground_truth}
    images_with_pred = {box['image'] for box in predictions}

    # images named in both lists / only in the predictions / only in the GT
    hit_images = images_with_gt.intersection(images_with_pred)
    false_alarm_images = images_with_pred.difference(hit_images)
    missed_images = images_with_gt.difference(hit_images)

    tp = len(hit_images)
    fp = len(false_alarm_images)
    fn = len(missed_images)
    # every image that appears in neither list is a true negative
    tn = num_total_images - len(missed_images.union(hit_images, false_alarm_images))

    if (tp + fp) > 0:
        precision = tp / (tp + fp)
    else:
        precision = 0
    if (tp + fn) > 0:
        recall = tp / (tp + fn)
    else:
        recall = 0
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return dict(
        tp=tp,
        fp=fp,
        fn=fn,
        tn=tn,
        precision=precision,
        recall=recall,
        f1_score=f1_score,
    )



## Run Evaluation

### for 38k Dataset

In [13]:
# Configuration for the evaluation of the 38k dataset.
# read_csv only keeps rows whose image name is listed in the test split file,
# so both lists read below are restricted to the test images.


# paths to the prediction CSV, the ground-truth CSV and the test split list
predictions_seg_file = '/home/daniela/Documents/Thesis/Data/ISEAD/38k/birds/SegmentationMask/student_predictions/GT_birds/GT_birds_student_bird_sea_20_0.05_GT/test_prediction_2.csv'
ground_truth_test_file_csv = '/home/daniela/Documents/Thesis/Data/ISEAD/38k/birds/SegmentationMask/GT_birds/csv_GT/prediction_no_spread.csv'
test_file = "/home/daniela/Documents/Thesis/Data/ISEAD/38k/birds/SegmentationMask/GT/ImageSets/Segmentation/Test.txt"


num_total_images = 3006 # test split; needed to derive the image-level true negatives
iou_threshold = 0.005 # minimum IoU for a prediction to count as a match for a ground-truth box

# read data from csvs
# NOTE: the ground truth goes through the same reader as the predictions, so
# its rows are also subject to read_csv's confidence filter.
ground_truth_seg = read_csv(ground_truth_test_file_csv, test_file)
predictions_seg = read_csv(predictions_seg_file, test_file)



/home/daniela/Documents/Thesis/Data/ISEAD/38k/birds/SegmentationMask/GT_birds/csv_GT/prediction_no_spread.csv
/home/daniela/Documents/Thesis/Data/ISEAD/38k/birds/SegmentationMask/student_predictions/GT_birds/GT_birds_student_bird_sea_20_0.05_GT/test_prediction_2.csv
GROUND TRUTH {'image': '10k_102869', 'xmin': '0.2', 'ymin': '0.004', 'xmax': '0.22', 'ymax': '0.024', 'obj_id': '0', 'obj_class': 'ObjectClass.UNKNOWN', 'confidence': '1.0'}
10365
{'image': '10k_102869', 'xmin': '0.18359375', 'ymin': '0.0', 'xmax': '0.234375', 'ymax': '0.03515625', 'obj_id': '0', 'obj_class': 'ObjectClass.UNKNOWN', 'confidence': '1.0'}
19585


In [15]:
def print_output(metrics_image_level, metrics_bbox_level, verbose=False):
    """Print the evaluation result.

    By default only the bbox-level precision, recall and F1 score are
    printed, each rounded to three decimals. With ``verbose=True`` the full
    report (image-level counts and scores, then bbox-level counts) is printed
    in front of that summary.

    Args:
        metrics_image_level: Dict with the keys 'tp', 'fp', 'fn', 'tn',
            'precision', 'recall' and 'f1_score'; only read when ``verbose``
            is true.
        metrics_bbox_level: Dict with the same keys for the bbox level.
        verbose: Whether to print the detailed report as well.
    """
    if verbose:
        # tp, fp, fn and tn partition the evaluated images, so their sum is
        # the total image count (no dependency on a notebook global).
        total_images = sum(metrics_image_level[key] for key in ('tp', 'fp', 'fn', 'tn'))

        print()
        print("Image Level Evaluation:")
        print("-----------------------")
        print("True Positives:", metrics_image_level['tp'])
        print("Those are images that should have predictions and have some (not specified if bboxes are correct).")
        print("False Positives:", metrics_image_level['fp'])
        print("Those are images that should not have predictions but have some.")
        print("False Negatives:", metrics_image_level['fn'])
        print("Those are images that should have predictions but have none.")
        print("True Negatives:", metrics_image_level['tn'])
        print("Those are images that should not have predictions and have none.")
        print("Total Images:", total_images)
        print()
        print("Precision:", metrics_image_level['precision'])
        print("Recall:", metrics_image_level['recall'])
        print("F1 Score:", metrics_image_level['f1_score'])
        print()

        print("BBox Level Evaluation:")
        print("-----------------------")

        print("True Positives:", metrics_bbox_level['tp'])
        print("Those are bboxes that were correctly predicted.")
        print("False Positives:", metrics_bbox_level['fp'])
        print("Those are bboxes that were unwanted.")
        print("False Negatives:", metrics_bbox_level['fn'])
        print("Those are missing bboxes.")
        print("True Negatives:", metrics_bbox_level['tn'])
        print("This can't appear in our case.")
        print()

    print("Precision:", round(metrics_bbox_level['precision'], ndigits=3))
    print("Recall:", round(metrics_bbox_level['recall'], ndigits=3))
    print("F1 Score:", round(metrics_bbox_level['f1_score'], ndigits=3))

In [16]:

# Metrics for the BBox model (currently disabled)
#metrics_image_level_saa = calculate_metrics_image_level(ground_truth_saa, predictions_saa, num_total_images)
#metrics_bbox_level_saa, confidence_saa = calculate_metrics_bbox_level(ground_truth_saa, predictions_saa, iou_threshold)

# Metrics for the segmentation model: image level first, then bbox level
# (which also returns the [TP confidences, FP confidences] lists).
metrics_image_level_seg = calculate_metrics_image_level(
    ground_truth_seg,
    predictions_seg,
    num_total_images,
)
metrics_bbox_level_seg, confidence_seg = calculate_metrics_bbox_level(
    ground_truth_seg,
    predictions_seg,
    iou_threshold,
)

# Report for the segmentation model: blank line, banner, blank line, metrics.
print("", "----------------", "| SEGMENTATION |", "----------------", "", sep="\n")
print_output(metrics_image_level_seg, metrics_bbox_level_seg)




False Negative Images:
{'10k_Lachmöwe_SWIM_BIRD_3805208_SK2C107769_t10_cam1', 'task_438_Fluss--Küstenseeschwalbe_FLY_BIRD_660586_HD722173_t11_cam2', 'task_445_Heringsmöwe_FLY_BIRD_653385_HD737376_t10_cam2', 'task_439_Fluss--Küstenseeschwalbe_FLY_BIRD_675887_HD628219_t13_cam1', 'task_445_Heringsmöwe_FLY_BIRD_682922_HD648544_t8_cam1', '10k_Lachmöwe_SWIM_BIRD_3805223_SK2C107769_t10_cam1', 'task_400_Dreizehenmöwe_FLY_BIRD_3620821_HD1700_t5_cam2'}
confidence tp: 1.0
confidence fp: 1.0

----------------
| SEGMENTATION |
----------------

Precision: 0.457
Recall: 0.934
F1 Score: 0.613


In [None]:
# Show confidence distribution
# NOTE: this cell is disabled. Everything below is a single triple-quoted
# string, so none of it is executed. To use it, remove the surrounding quotes;
# the code then needs `np` and `confidence_seg` ([TP confidences, FP
# confidences]) from the evaluation cells above.
"""
import matplotlib.pyplot as plt
%matplotlib inline

print(np.max(confidence_seg[0]))
print(np.min(confidence_seg[0]))

# plot of what confidence threshold to choose

def get_amount(threshold, list):
    return np.sum([c < threshold for c in  list])/len(list)

confidence_threshold = np.arange(0.0, 1.1, 0.01)
amount_tp = [1-get_amount(t, confidence_seg[0]) for t in confidence_threshold]
amount_fp = [get_amount(t, confidence_seg[1]) for t in confidence_threshold]
plt.plot(confidence_threshold, amount_tp)
plt.plot(confidence_threshold, amount_fp)

# histograms of confidence

#plt.hist(confidence_saa[0], density=True, bins=30)  # density=False would make counts
#plt.ylabel('Frequency')
#plt.xlabel('Confidence SAA TP')
#plt.plot()

#plt.hist(confidence_saa[1], density=True, bins=30)  # density=False would make counts
#plt.ylabel('Frequency')
#plt.xlabel('Confidence SAA FP')
#plt.plot()"""