In [None]:
import pandas as pd
from PIL import Image, ImageDraw
from IPython.display import display
import numpy as np
import matplotlib.pyplot as plt

## Findings

### Duplicate rows
- Expert Data – Boom: **52 duplicate rows**
- Expert Data – Drone: **3 duplicate rows**
- Expert Data – Handheld: **0 duplicate rows**

### Bounding Boxes
- `x2` and `y2` are not necessarily greater than or equal to `x1` and `y1`, i.e. the corner order of a box is not guaranteed

### Bounding Box Sizes
There are boxes with height $=0$ but width $\neq 0$ and vice versa:

- Expert Data – Boom:
    - **Width $=0$ but height $\neq 0$: 217**
    - **Height $=0$ but width $\neq 0$: 274**
- Expert Data – Drone:
    - **Width $=0$ but height $\neq 0$: 109**
    - **Height $=0$ but width $\neq 0$: 150**
- Expert Data – Handheld:
    - **Width $=0$ but height $\neq 0$: 22**
    - **Height $=0$ but width $\neq 0$: 15**

### "Image" Column Suffix
- Expert Data – Boom: **no suffix**
- Expert Data – Drone: **.jpg**
- Expert Data – Handheld: **.JPG**

### Negative Bounding Box Coordinate Values
- Only the boom data contains boxes with negative coordinate values (4 boxes). Two of them can be removed because their width or height is 0; for the other two, the negative values can be set to 0.

## Analyse expert data

In [None]:
# Load the three expert annotation tables (boom, drone, handheld).
# All files live in the same directory and are parsed with the same CSV options.
annotations_dir = "/exchange/dspro2/M-AI-ZE/data/annotations_expert"
csv_options = dict(delimiter=",", quotechar='"')

expert_boom_data = pd.read_csv(annotations_dir + "/annotations_boom.csv", **csv_options)
expert_drone_data = pd.read_csv(annotations_dir + "/annotations_drone.csv", **csv_options)
expert_handheld_data = pd.read_csv(annotations_dir + "/annotations_handheld.csv", **csv_options)

In [None]:
# Preview the first rows of the boom annotations.
expert_boom_data.head()

In [None]:
# Preview the first rows of the drone annotations.
expert_drone_data.head()

In [None]:
# Preview the first rows of the handheld annotations.
expert_handheld_data.head()

### Check for duplicates

In [None]:
# Report, per expert annotation set, how many rows are exact duplicates of an earlier row.
for label, frame in (
    ("Boom", expert_boom_data),
    ("Drone", expert_drone_data),
    ("Handheld", expert_handheld_data),
):
    print(f"Expert Data – {label}: {frame.duplicated().sum()} duplicate rows")

In [None]:
# Show the first boom rows flagged as duplicates (each is a repeat of an earlier, identical row).
expert_boom_data[expert_boom_data.duplicated()].head()

In [None]:
# List every boom annotation row that belongs to image 'DSC00982_0'.
expert_boom_data[expert_boom_data['image'] == 'DSC00982_0']

### Draw Bounding Boxes on Images

In [None]:
def draw_bounding_box(data, img_id, img_type):
    """Draw all annotated boxes of one image and display the result inline.

    Boxes whose width or height is zero are outlined in blue, all other
    boxes in red. Corner order does not matter: the coordinates are sorted
    before drawing.

    Args:
        data: DataFrame with the columns 'image', 'x1', 'y1', 'x2' and 'y2'.
        img_id: Value of the 'image' column identifying the picture. If it
            already ends in an image extension (e.g. '.jpg' or '.JPG') it is
            used verbatim as the file name; otherwise '.jpg' is appended.
        img_type: Suffix of the image folder ('images_<img_type>'), e.g. 'boom'.

    Raises:
        FileNotFoundError: If the resolved image file does not exist
            (raised by ``Image.open``).
    """
    # Some annotation files store the image ID including its extension.
    # Appending '.jpg' unconditionally would produce names like 'abc.jpg.jpg'.
    has_extension = img_id.lower().endswith(('.jpg', '.jpeg', '.png'))
    file_name = img_id if has_extension else img_id + '.jpg'
    img_path = '/exchange/dspro2/M-AI-ZE/data/images/images_' + img_type + '/' + file_name

    data_subset = data[data['image'] == img_id]

    # The context manager closes the underlying file handle once the image is shown.
    with Image.open(img_path) as img:
        draw = ImageDraw.Draw(img)
        corner_columns = zip(
            data_subset['x1'], data_subset['y1'], data_subset['x2'], data_subset['y2']
        )
        for xa, ya, xb, yb in corner_columns:
            # Normalise the corners to (left, top, right, bottom).
            box = [min(xa, xb), min(ya, yb), max(xa, xb), max(ya, yb)]
            # Degenerate boxes (zero width or zero height) are highlighted in blue.
            colour = "blue" if (xa == xb) or (ya == yb) else "red"
            draw.rectangle(box, outline=colour, width=3)
        display(img)

In [None]:
# Display the images of ten randomly chosen boom annotation rows as a visual sanity check.
# A seeded generator keeps the selection reproducible across kernel restarts, and the
# positional `.iloc` lookup does not depend on the frame having a 0..n-1 index.
rng = np.random.default_rng(42)
for _ in range(10):
    random_image = expert_boom_data['image'].iloc[rng.integers(expert_boom_data.shape[0])]
    print(f'Image ID: {random_image}')
    draw_bounding_box(expert_boom_data, random_image, 'boom')

### Check Bounding Box Sizes

In [None]:
def get_bb_sizes(df):
    """Return a copy of ``df`` extended by the absolute box dimensions.

    Two columns are added: 'bb_width' (|x2 - x1|) and 'bb_height' (|y2 - y1|).
    The frame passed in is left untouched.
    """
    return df.assign(
        bb_width=(df['x2'] - df['x1']).abs(),
        bb_height=(df['y2'] - df['y1']).abs(),
    )

def print_bb_stats(df):
    """Print summary statistics about the bounding box sizes in ``df``.

    Reports the number of rows, the min/max/mean of box width and height,
    and how many boxes have exactly one dimension equal to zero.

    Args:
        df: DataFrame with the coordinate columns 'x1', 'y1', 'x2', 'y2'.
            It is not modified.
    """
    # get_bb_sizes works on a copy, so no extra copy is needed here.
    sized = get_bb_sizes(df)
    widths = sized['bb_width']
    heights = sized['bb_height']
    zero_width = widths == 0
    zero_height = heights == 0

    # Values are pulled into locals first: reusing the same quote character inside
    # an f-string expression is a SyntaxError on Python versions before 3.12.
    print(f'Amount of boxes: {sized.shape[0]}')
    print(f'Min width: {widths.min()} Max width: {widths.max()} Mean width: {widths.mean()}')
    print(f'Min height: {heights.min()} Max height: {heights.max()} Mean height: {heights.mean()}')
    print(f'Amount of boxes with a width of 0 but height != 0: {int((zero_width & ~zero_height).sum())}')
    print(f'Amount of boxes with a height of 0 but width != 0: {int((~zero_width & zero_height).sum())}')

def get_inconsistent_bb(df):
    """Return the boxes for which exactly one of width and height is zero.

    The result carries the additional 'bb_width' and 'bb_height' columns;
    the frame passed in is not modified.
    """
    sized = get_bb_sizes(df)
    zero_width = sized['bb_width'].eq(0)
    zero_height = sized['bb_height'].eq(0)
    # XOR keeps rows where one dimension is zero and the other is not.
    return sized[zero_width ^ zero_height]
    

In [None]:
# Bounding box size statistics for the boom annotations.
print_bb_stats(expert_boom_data)

In [None]:
# Bounding box size statistics for the drone annotations.
print_bb_stats(expert_drone_data)

In [None]:
# Bounding box size statistics for the handheld annotations.
print_bb_stats(expert_handheld_data)

In [None]:
# Preview boom boxes where exactly one of width/height is zero.
get_inconsistent_bb(expert_boom_data).head()

In [None]:
# Draw all annotated boxes of boom image 'DSC00970_1' (zero-width/height boxes appear in blue).
draw_bounding_box(expert_boom_data, 'DSC00970_1', 'boom')

### Check if the data is balanced

In [None]:
# Shape (rows, columns) of the boom rows whose four coordinates are all zero.
expert_boom_data[(expert_boom_data['x1'] == 0) & (expert_boom_data['y1'] == 0) & (expert_boom_data['x2'] == 0) & (expert_boom_data['y2'] == 0)].shape

In [None]:
# Shape (rows, columns) of the boom rows whose four coordinates are all non-zero.
# NOTE(review): this is not the complement of the all-zero filter; rows where only
# some coordinates are zero match neither condition.
expert_boom_data[(expert_boom_data['x1'] != 0) & (expert_boom_data['y1'] != 0) & (expert_boom_data['x2'] != 0) & (expert_boom_data['y2'] != 0)].shape

In [None]:
def check_class_balance(data):
    """Compute the percentage of rows with and without an annotated box.

    A row counts as "no disease" when all four coordinates ('x1', 'y1',
    'x2', 'y2') are zero; every other row counts as "disease".

    Args:
        data: DataFrame with the columns 'x1', 'y1', 'x2' and 'y2'.

    Returns:
        Tuple ``(dis_percent, no_dis_percent)`` of floats in the range 0..100.

    Raises:
        ValueError: If ``data`` has no rows, or if the two percentages do not
            add up to 100 (within floating point tolerance).
    """
    data_rows = data.shape[0]
    if data_rows == 0:
        # Without this guard the division below fails with an opaque ZeroDivisionError.
        raise ValueError("Cannot compute the class balance of an empty dataset")

    all_zero = (data[['x1', 'y1', 'x2', 'y2']] == 0).all(axis=1)
    no_disease = int(all_zero.sum())
    disease = data_rows - no_disease

    dis_percent = 100 * disease / data_rows
    no_dis_percent = 100 * no_disease / data_rows

    # Compare with a tolerance: exact float equality can fail from rounding alone.
    if abs(dis_percent + no_dis_percent - 100.0) > 1e-9:
        raise ValueError("Percentages do not add up to 100. Please check")

    return dis_percent, no_dis_percent

In [None]:
# Report, per expert annotation set, the share of rows that carry a box (rounded to 2 decimals).
for label, frame in (
    ("Boom", expert_boom_data),
    ("Drone", expert_drone_data),
    ("Handheld", expert_handheld_data),
):
    disease_share = check_class_balance(frame)[0]
    print(f"Expert Data – {label}: {round(disease_share, 2)}% with disease")

### Amount of Boxes per Image

In [None]:
def print_bb_dist(df):
    """Plot and summarise how many bounding boxes each image has.

    Only boxes with non-zero width and non-zero height are counted, so
    images without such a box do not appear in the statistics. A bar chart
    of the distribution is shown, then mean and median are printed.
    """
    sized = get_bb_sizes(df)
    has_area = (sized['bb_width'] != 0) & (sized['bb_height'] != 0)
    bbox_counts = sized[has_area].groupby('image').size().sort_values(ascending=False)

    # x axis: boxes per image, y axis: number of images with that many boxes
    images_per_count = bbox_counts.value_counts().sort_index()

    fig, ax = plt.subplots(figsize=(20, 5))
    images_per_count.plot(kind='bar', ax=ax, rot=0)
    ax.set_xlabel('Number of bounding boxes')
    ax.set_ylabel('Number of images')
    ax.set_title('Distribution of Bounding Box Counts per Image')
    plt.show()

    mean_boxes = bbox_counts.mean()
    median_boxes = bbox_counts.median()
    print(f'Mean amount of bouding boxes per image (images without boxes excluded): {mean_boxes}')
    print(f'Median amount of bouding boxes per image (images without boxes excluded): {median_boxes}')

In [None]:
# Boxes-per-image distribution for each expert annotation set, in the order boom, drone, handheld.
for frame in (expert_boom_data, expert_drone_data, expert_handheld_data):
    print_bb_dist(frame)

### Check Negative Coordinate Values

In [None]:
def get_negative_cords(df):
    """Return the rows of ``df`` where at least one box coordinate is negative."""
    coordinates = df[['x1', 'y1', 'x2', 'y2']]
    has_negative = coordinates.lt(0).any(axis=1)
    return df[has_negative]

In [None]:
# Boom rows with at least one negative coordinate.
get_negative_cords(expert_boom_data)

In [None]:
# Drone rows with at least one negative coordinate.
get_negative_cords(expert_drone_data)

In [None]:
# Handheld rows with at least one negative coordinate.
get_negative_cords(expert_handheld_data)

In [None]:
# Draw only the negative-coordinate boxes of boom image 'DSC02071_3' (the data is pre-filtered).
draw_bounding_box(get_negative_cords(expert_boom_data), 'DSC02071_3', 'boom')

In [None]:
# Draw only the negative-coordinate boxes of boom image 'DSC05511_1' (the data is pre-filtered).
draw_bounding_box(get_negative_cords(expert_boom_data), 'DSC05511_1', 'boom')

In [None]:
# Draw only the negative-coordinate boxes of boom image 'DSC05740_1' (the data is pre-filtered).
draw_bounding_box(get_negative_cords(expert_boom_data), 'DSC05740_1', 'boom')

In [None]:
# Draw only the negative-coordinate boxes of boom image 'DSC06154_0' (the data is pre-filtered).
draw_bounding_box(get_negative_cords(expert_boom_data), 'DSC06154_0', 'boom')

### Check Time Distribution